### Experiement sklearn Pipelines to prevent data leakeage

https://medium.com/@benlc77/how-to-write-clean-and-scalable-code-with-custom-transformers-sklearn-pipelines-ecb8e53fe110
https://cloud.google.com/ai-platform/prediction/docs/custom-pipeline#create_custom_transformers
https://sklearn-template.readthedocs.io/en/latest/user_guide.html
https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html


In [52]:

### System
import os
import joblib

### Set seed
import random
random.seed(42)

### Mains
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import seaborn as sns
%matplotlib inline 


# ### Models:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

### Ensemble Models:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, VotingClassifier


### Dats Splits 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

### Pipelines
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder

# # Sampling Methods
# from imblearn.over_sampling import SMOTE, RandomOverSampler
# from imblearn.under_sampling import NearMiss, RandomUnderSampler

### Metrics:
import sklearn.metrics as skm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from sklearn.base import BaseEstimator, TransformerMixin

In [39]:

source = "data_with_distance.pkl" # includes distance_customer_merchant from geopy.distance
df = pd.read_pickle(source)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181822 entries, 0 to 181821
Data columns (total 23 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   transDate                   181822 non-null  datetime64[ns]
 1   creditCardNum               181822 non-null  int64         
 2   business                    181822 non-null  object        
 3   category                    181822 non-null  object        
 4   amount                      181822 non-null  float64       
 5   firstName                   181822 non-null  object        
 6   lastName                    181822 non-null  object        
 7   gender                      181822 non-null  object        
 8   street                      181822 non-null  object        
 9   city                        181822 non-null  object        
 10  state                       181822 non-null  object        
 11  zip                         181822 non-

In [38]:
# df.sort_values(by=['creditCardNum', 'transDate'], inplace=True)
# df['numOfPrevFraudTxns'] = df.groupby('creditCardNum')['isFraud'].cumsum() - df['isFraud']
# df['historyOfFraud'] = (df['numOfPrevFraudTxns'] > 0).astype(int)

# df[df['historyOfFraud'] > 0].iloc[0]

In [69]:
class DropFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Return a new DataFrame with specified columns dropped
        return X.drop(columns=self.variables, errors='ignore')

class CombinedFraudFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, date_column='transDate', id_column='creditCardNum', target_column='isFraud', category_columns=['city', 'job', 'state']):
        # Initialization parameters
        self.date_column = date_column
        self.id_column = id_column
        self.target_column = target_column
        self.category_columns = category_columns
        self.fraud_rate_dict = {}  # Initialize an empty dictionary to store fraud rate mappings for each category

    def fit(self, X, y=None):
        # Combine X and y for the purpose of calculating fraud rates based on target variable
        X_combined = X.copy()
        X_combined[self.target_column] = y

        # Calculate fraud rate for each category in specified columns
        for column in self.category_columns:
            grouped = X_combined.groupby(column)
            total_transactions = grouped.size()
            fraud_transactions = grouped[self.target_column].sum()
            fraud_rate = (fraud_transactions / total_transactions) * 100
            # Store the fraud rate mapping for the current column
            self.fraud_rate_dict[column] = fraud_rate.to_dict()

        return self

    def transform(self, X):
        X_transformed = X.copy()

        # Ensure the DataFrame is sorted by ID and date for any date-based calculations
        X_transformed.sort_values(by=[self.id_column, self.date_column], inplace=True)

        # No target-dependent transformations should occur here

        # Apply fraud rate encoding for each specified category column based on mappings learned during fit
        for column in self.category_columns:
            fraud_rate_column = f"{column}_fraudrate"
            # Use mappings calculated in fit; if a category was unseen during fit, default to NaN or another placeholder
            X_transformed[fraud_rate_column] = X_transformed[column].apply(lambda x: self.fraud_rate_dict[column].get(x, np.nan))

            # Optionally, drop the original categorical column to finalize feature set
            X_transformed.drop(columns=[column], inplace=True)

        return X_transformed

class DateFeatureCreator(BaseEstimator, TransformerMixin):
    def __init__(self, date_column='transDate', dob_column='dateOfBirth'):
        self.date_column = date_column
        self.dob_column = dob_column
    
    def fit(self, X, y=None):
        # No fitting process needed for this transformation
        return self
    
    def transform(self, X):
        # Make sure to work on a copy to avoid changing the original DataFrame
        X_transformed = X.copy()
        
        # Convert columns to datetime if not already
        X_transformed[self.date_column] = pd.to_datetime(X_transformed[self.date_column])
        X_transformed[self.dob_column] = pd.to_datetime(X_transformed[self.dob_column])
        
        # Create the specified date-based features
        X_transformed["trans_day"] = X_transformed[self.date_column].dt.dayofyear
        X_transformed["trans_weekday"] = X_transformed[self.date_column].dt.weekday
        X_transformed["trans_hour"] = X_transformed[self.date_column].dt.hour
        X_transformed["age_at_transaction"] = X_transformed[self.date_column].dt.year - X_transformed[self.dob_column].dt.year
        
        return X_transformed
    
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns  # List of columns to encode
        self.encoders = {column: OneHotEncoder(sparse=False) for column in columns}
    
    def fit(self, X, y=None):
        # Fit an encoder for each column
        for column in self.columns:
            self.encoders[column].fit(X[[column]])
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        for column in self.columns:
            # Transform the data for each column
            encoded = self.encoders[column].transform(X_transformed[[column]])
            
            # Convert the encoded categories back to a DataFrame
            encoded_df = pd.DataFrame(
                encoded, 
                columns=self.encoders[column].get_feature_names_out([column])
            )
            
            # Reset index to ensure indices align when concatenating
            encoded_df.reset_index(drop=True, inplace=True)
            X_transformed.reset_index(drop=True, inplace=True)
            
            # Concatenate the original DataFrame with the new one-hot encoded DataFrame
            X_transformed = pd.concat([X_transformed, encoded_df], axis=1)
            
            # Drop the original column as it's now encoded
            X_transformed.drop(column, axis=1, inplace=True)
        
        return X_transformed

In [70]:


# Assume df is your DataFrame and CombinedFraudFeatures is defined as above
X = df.drop(columns=['isFraud'])
y = df['isFraud']

# Perform the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

drop_columns=[
        "business",
        "firstName",
        "lastName",
        "gender",
        "street",
        "zip",
        "unixTime",
        "creditCardNum",
        "transNum",
        "merchLatitude",
        "merchLongitude",
        "latitude",
        "longitude",
        "dateOfBirth",
        "transDate",
    ]


pipeline = Pipeline([
    ('date_features', DateFeatureCreator(date_column='transDate', dob_column='dateOfBirth')),
    ('combined_features', CombinedFraudFeatures(category_columns=['city', 'job', 'state'])),
    ('onehot_encoder', CustomOneHotEncoder(columns=['category'])), 
    ('drop_features', DropFeatureSelector(variables=drop_columns)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Perform the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)  # No need to reintroduce 'isFraud' here

# Predict using the pipeline directly on X_test
y_pred = pipeline.predict_proba(X_test)[:, 1] >= 0.479



In [71]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9927127732710024
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Confusion Matrix:
 [[36100     0]
 [  265     0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [72]:
pipelines = {
    "LogisticRegression": make_pipeline(StandardScaler(), LogisticRegression()),
    "KNeighborsClassifier": make_pipeline(StandardScaler(), KNeighborsClassifier()),
    "RandomForestClassifier": make_pipeline(StandardScaler(), RandomForestClassifier()),
    "DecisionTreeClassifier": make_pipeline(StandardScaler(), DecisionTreeClassifier()),
    "GradientBoostingClassifier": make_pipeline(StandardScaler(), GradientBoostingClassifier()),
}

# Add preprocessing steps common to all pipelines
for name, pipeline in pipelines.items():
    pipeline.steps.insert(0, ('date_features', DateFeatureCreator(date_column='transDate', dob_column='dateOfBirth')))
    pipeline.steps.insert(1, ('combined_features', CombinedFraudFeatures(category_columns=['city', 'job', 'state'])))
    pipeline.steps.insert(2, ('onehot_encoder', CustomOneHotEncoder(columns=['category'])))
    pipeline.steps.insert(3, ('drop_features', DropFeatureSelector(variables=drop_columns)))
    
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["isFraud"]),  # Use your actual DataFrame here
    df["isFraud"],
    test_size=0.2,
    random_state=42
)
from sklearn.model_selection import train_test_split, cross_val_score
fit_models = {}
for algo, pipeline in pipelines.items():
    print(f"Fitting {algo}")
    model = pipeline.fit(X_train, y_train)
    fit_models[algo] = model

# Evaluate models using cross-validation
for name, model in fit_models.items():
    print(f"Model: {name}")
    for metric in ["precision", "recall", "f1"]:
        scores = cross_val_score(model, X_test, y_test, cv=10, scoring=metric)
        mean_score = round(scores.mean() * 100, 2)
        print(f"--> {metric.capitalize()}: {mean_score}%")
    print("-" * 40)

Fitting LogisticRegression




Fitting KNeighborsClassifier




Fitting RandomForestClassifier




Fitting DecisionTreeClassifier




Fitting GradientBoostingClassifier




Model: LogisticRegression


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 419, in predict
    scores = self.decision_function(X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/pytho

--> Precision: nan%


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 419, in predict
    scores = self.decision_function(X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/pytho

--> Recall: nan%


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 419, in predict
    scores = self.decision_function(X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/pytho

--> F1: nan%
----------------------------------------
Model: KNeighborsClassifier


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/neighbors/_classification.py", line 234, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "/home/kayaba_attribution

--> Precision: nan%


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/neighbors/_classification.py", line 234, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "/home/kayaba_attribution

--> Recall: nan%


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/neighbors/_classification.py", line 234, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "/home/kayaba_attribution

--> F1: nan%
----------------------------------------
Model: RandomForestClassifier


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 820, in predict
    proba = self.predict_proba(X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/si

--> Precision: nan%


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 820, in predict
    proba = self.predict_proba(X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/si

--> Recall: nan%


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 820, in predict
    proba = self.predict_proba(X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/si

--> F1: nan%
----------------------------------------
Model: DecisionTreeClassifier


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 426, in predict
    X = self._validate_X_predict(X, check_input)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/

--> Precision: nan%


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 426, in predict
    X = self._validate_X_predict(X, check_input)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/

--> Recall: nan%


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 426, in predict
    X = self._validate_X_predict(X, check_input)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/

--> F1: nan%
----------------------------------------
Model: GradientBoostingClassifier


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/ensemble/_gb.py", line 1308, in predict
    raw_predictions = self.decision_function(X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/p

--> Precision: nan%


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/ensemble/_gb.py", line 1308, in predict
    raw_predictions = self.decision_function(X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/p

--> Recall: nan%


Traceback (most recent call last):
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/python3.8/site-packages/sklearn/ensemble/_gb.py", line 1308, in predict
    raw_predictions = self.decision_function(X)
  File "/home/kayaba_attribution/.virtualenvs/ds/lib/p

--> F1: nan%
----------------------------------------


In [31]:
X = df.drop(columns='isFraud')
y = df['isFraud']

# Sample data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    DropFeatureSelector(variables=)
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Now you can predict using the pipeline
y_pred = pipeline.predict(X_test)

TypeError: cannot unpack non-iterable DropFeatureSelector object

In [16]:
# encode city
grouped_transactions = df.groupby("city")
total_transactions = grouped_transactions.size()
fraud_transactions = grouped_transactions["isFraud"].sum()
fraud_rate = (fraud_transactions / total_transactions) * 100
result_dict = fraud_rate.to_dict()

df["city_fraudrate"] = df["city"].map(result_dict)
df.drop(columns=["city"], inplace=True)

In [None]:
# Setup cross validation folds
kf = KFold(n_splits=12, random_state=42, shuffle=True)

In [None]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class HistoricalFeaturesCreator(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Initialization code
        pass

    def fit(self, X, y=None):
        # Fit doesn't need to compute anything for this transformation
        return self
    
    def transform(self, X, y=None):
        # Ensure X is a DataFrame
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        
        # Example feature: Calculate fraudRate
        # Assuming 'creditCardNum' and 'isFraud' are columns in X
        X_sorted = X.sort_values(by=['creditCardNum', 'transactionDate'])
        X_sorted['cumulativeFraud'] = X_sorted.groupby('creditCardNum')['isFraud'].cumsum()
        X_sorted['cumulativeTransactions'] = X_sorted.groupby('creditCardNum').cumcount() + 1  # +1 to avoid division by zero
        X_sorted['fraudRate'] = X_sorted['cumulativeFraud'] / X_sorted['cumulativeTransactions']
        
        # To avoid data leakage, subtract current transaction's fraud status from cumulativeFraud count
        X_sorted['cumulativeFraud'] = X_sorted['cumulativeFraud'] - X_sorted['isFraud']
        X_sorted['fraudRate'] = X_sorted['cumulativeFraud'] / (X_sorted['cumulativeTransactions'] - 1)
        X_sorted['fraudRate'] = X_sorted['fraudRate'].fillna(0)  # Handle division by zero for the first transaction

        # Optionally drop columns that were only used for calculation
        X_transformed = X_sorted.drop(columns=['cumulativeFraud', 'cumulativeTransactions'])
        
        return X_transformed

# Assuming 'df' is your DataFrame and 'y' is your target variable
# Convert target variable into a numpy array if it's not already
y = df['isFraud'].values
X = df.drop(columns=['isFraud'])

# Create a pipeline
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('historical_features', HistoricalFeaturesCreator()),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split your data (ensure you have a way to handle temporal data correctly)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)  # Adjust based on your df

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
# Evaluation code (e.g., accuracy_score, confusion_matrix) goes here
