In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import seaborn as sns

In [2]:
from project_fraud.lib import merge_data, clean_merge_data

In [3]:
# Build the working DataFrame with the project helper imported from
# project_fraud.lib, then preview its first rows.
data = clean_merge_data()
data.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000.0,86400.0,68.5,W,13926.0,363.099769,150.0,discover,142.0,credit,...,,,,,,,,,,
1,2987001.0,86401.0,29.0,W,2755.0,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,2987002.0,86469.0,59.0,W,4663.0,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,2987003.0,86499.0,50.0,W,18132.0,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,2987004.0,86506.0,50.0,H,4497.0,514.0,150.0,mastercard,102.0,credit,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [4]:
# Dimensions of the frame as (rows, columns).
data.shape

(1097231, 254)

In [5]:
# Total number of missing cells, summed over every column of the frame.
print(data.isnull().sum().sum())

29015135


## Convert the email-domain column

In [6]:
# Purchaser e-mail domains grouped by the provider they are binned into.
# Binning idea taken from:
# https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
_domains_by_provider = {
    'google': ['gmail', 'gmail.com'],
    'att': ['att.net', 'prodigy.net.mx', 'sbcglobal.net'],
    'spectrum': ['twc.com', 'charter.net'],
    'microsoft': [
        'hotmail.co.uk', 'live.com', 'hotmail.de', 'hotmail.com', 'hotmail.fr',
        'outlook.es', 'live.fr', 'msn.com', 'outlook.com', 'hotmail.es',
        'live.com.mx',
    ],
    'yahoo': [
        'yahoo.com.mx', 'yahoo.fr', 'yahoo.es', 'yahoo.co.jp', 'yahoo.de',
        'verizon.net', 'frontier.com', 'rocketmail.com', 'frontiernet.net',
        'ymail.com', 'yahoo.co.uk', 'yahoo.com',
    ],
    'aol': ['aim.com', 'aol.com'],
    'centurylink': ['centurylink.net', 'q.com', 'embarqmail.com'],
    'apple': ['me.com', 'mac.com', 'icloud.com'],
    'other': [
        'scranton.edu', 'optonline.net', 'comcast.net', 'earthlink.net',
        'gmx.de', 'web.de', 'cfl.rr.com', 'protonmail.com', 'windstream.net',
        'servicios-ta.com', 'netzero.net', 'suddenlink.net', 'roadrunner.com',
        'sc.rr.com', 'anonymous.com', 'mail.com', 'bellsouth.net',
        'cableone.net', 'netzero.com', 'ptd.net', 'cox.net', 'juno.com',
    ],
}

# Flat {domain: provider} lookup used by Series.map below.
emails = {
    domain: provider
    for provider, domains in _domains_by_provider.items()
    for domain in domains
}


def _domain_suffix(value):
    """Return the part of ``str(value)`` that follows its last '.'.

    The value is stringified first, so a missing value (NaN) produces the
    literal string 'nan' rather than a null.
    """
    return str(value).rsplit('.', 1)[-1]


# Two engineered columns per e-mail column:
#   <col>_bin    -> provider name (NaN when the domain is not in `emails`)
#   <col>_suffix -> last dot-separated token of the domain
# The reference kernel additionally collapsed the suffixes 'gmail', 'net' and
# 'edu' into 'us'; that step is not applied here.
for email_col in ['P_emaildomain']:
    data[email_col + '_bin'] = data[email_col].map(emails)
    data[email_col + '_suffix'] = data[email_col].map(_domain_suffix)



Replace NaN values with "Unknown"

In [10]:
# Give rows without a mapped provider an explicit "Unknown" label instead of NaN.
data['P_emaildomain_bin'] = data['P_emaildomain_bin'].fillna("Unknown")

In [11]:
# Sanity check: count the missing values left in the binned provider column.
print(data['P_emaildomain_bin'].isnull().sum().sum())

0


After converting the email column, we need to check the new number of categorical columns, so we will print a list of them
and look at their missing values.

# LIST OF FEATURES TO USE 

In [None]:
# Features selected for modelling. The trailing comments are the missing-value
# counts noted by hand (out of 1 097 231 rows); they are notes, not computed here.
features_to_use = [
    'TransactionID',         # no nulls
    'card1',                 # no nulls
    'card2',                 # 17 587 nulls
    'addr1',                 # 131 315 nulls
    'TransactionAmt',        # no nulls
    'card5',                 # 8 806 nulls
    'D15',                   # 101 182 nulls
    'C13',                   # 4 748 nulls
    'D2',                    # 515 566 nulls
    'D10',                   # 88 567 nulls
    'D4',                    # 245 773 nulls
    # Engineered by us. 163 648 nulls were noted, presumably before the
    # "Unknown" fill was applied — TODO confirm.
    'P_emaildomain_bin',
    'P_emaildomain_suffix',  # engineered by us, no nulls
    'TransactionDT',         # no nulls
]

# Recompute the per-feature missing counts so the notes above can be verified.
data[features_to_use].isnull().sum()

In [None]:
# Count missing values in the engineered suffix column.
print(data['P_emaildomain_suffix'].isnull().sum())

In [None]:
# Look at the first 20 values of D2.
data['D2'].head(20)


### Check missing values

In [None]:
# Print the number of missing values in column `col`.
# NOTE(review): `col` is not assigned in any visible cell — set it to a column
# name before running this cell, otherwise it raises NameError.
print(data[col].isnull().sum().sum())

## Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# The original cell encoded a placeholder column named 'col_name'. Encode the
# two string columns engineered earlier in this notebook instead.
# NOTE(review): other string columns (e.g. ProductCD, card4, card6) are left
# untouched — extend this list if they are meant to be used as features.
categorical_cols = ['P_emaildomain_bin', 'P_emaildomain_suffix']

# One fitted encoder per column, kept so the integer codes can be mapped back
# to labels later with inverse_transform.
label_encoders = {}
for cat_col in categorical_cols:
    encoder = LabelEncoder()
    data[cat_col] = encoder.fit_transform(data[cat_col])
    label_encoders[cat_col] = encoder

data.head()

## Train Test Split

In [None]:
# Separate the target from the predictors.
# NOTE(review): X keeps every other column of `data`, including the string-valued
# ones visible in the preview (ProductCD, card4, card6, ...); most estimators
# used below need them encoded or dropped first.
target_col = 'isFraud'
y = data[target_col]
X = data.drop(columns=[target_col])

In [None]:
# Reduced copy of the data (20 000 randomly drawn rows, fixed seed) meant only
# for quick experiments.
sample_size = 20000
tmp = data.sample(n=sample_size, random_state=414)
y_small = tmp['isFraud']
X_small = tmp.drop(columns=['isFraud'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out split with a fixed seed (414) so it is reproducible.
# Known caveat: the feature engineering above ran on the whole frame before
# splitting, which leaks a little information; the split should come first.

# Full-size data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=414
)

# Small sample, split with the same seed.
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    X_small, y_small, random_state=414
)


In [None]:
# (optional) Train/eval split inside the training set itself.
# Powerful models (XGBoost, neural networks, ...) that are prone to overfitting
# the training set need an evaluation set for early stopping.
# random_state matches the outer split so that the eval sets are identical on
# every run; without it each execution produced a different split.
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
    X_train, y_train, random_state=414)
X_train_train_small, X_train_test_small, y_train_train_small, y_train_test_small = train_test_split(
    X_train_small, y_train_small, random_state=414)

👇 Combine the following steps in a Pipeline:
- Impute missing values with a KNNImputer
- Scale all the features with a MinMaxScaler
- Model a LogisticRegression with default parameters
- Use the scoring metric relevant for the task

In [None]:
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Preprocessing + model pipeline: impute -> scale -> logistic regression.
# The 'imputer' step must exist because the grid below tunes
# 'imputer__n_neighbors'; without it GridSearchCV rejects the parameter name.
pipe = Pipeline([
    ('imputer', KNNImputer()),
    ('scaler', MinMaxScaler()),
    ('model', LogisticRegression()),
])

# Grid search over the KNNImputer parameter n_neighbors, scored with recall.
grid_search = GridSearchCV(
    pipe,
    param_grid={'imputer__n_neighbors': [2, 5, 10]},
    cv=5,
    scoring="recall",
)

# Fit on the small training sample: the target of this notebook is 'isFraud'
# (there is no 'malignant' column), and nearest-neighbour imputation is
# expensive on the full frame.
# KNNImputer and MinMaxScaler only accept numeric input, so non-numeric columns
# are left out here.
X_fit_small = X_train_small.select_dtypes(include='number')
grid_search.fit(X_fit_small, y_train_small)

grid_search.best_params_


# BASELINE 

In [None]:
# Baseline model: logistic regression trained with SGD (SGDClassifier, log loss).
# Both names are imported here because no earlier cell imports them.
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate

# NOTE(review): scikit-learn 1.1 renamed loss='log' to loss='log_loss' and later
# releases removed the old spelling — adjust to the installed version.
base_model = SGDClassifier(loss='log', alpha=0.5, class_weight='balanced')

# 5-fold cross-validation reporting recall and macro F1; the cell displays the
# mean macro F1. (The plain LogisticRegression run is evaluated in the next cell;
# the copy that used to sit here discarded its result.)
cv_results_base_model = cross_validate(
    base_model, X_train, y_train, cv=5, n_jobs=1, scoring=['recall', 'f1_macro'])
cv_results_base_model['test_f1_macro'].mean()

In [None]:
# Logistic Regression model, cross-validated with the same two metrics as the
# baseline.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

log_reg_model = LogisticRegression(class_weight='balanced')

# cross_val_score accepts a single scorer only; several metrics at once require
# cross_validate, which returns a dict with one 'test_<metric>' array per scorer.
cv_results_log_reg_model = cross_validate(
    log_reg_model, X_train, y_train, cv=5, scoring=['recall', 'f1_macro'])
cv_results_log_reg_model['test_f1_macro'].mean()

## KNN


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with K=10, fitted on the training split
# (fit returns the estimator itself, so it can be chained).
knn_model = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
knn_model.score(X_test, y_test)

In [None]:
import matplotlib.pyplot as plt

# Plot the recorded score against K for K = 1..24.
# NOTE(review): `score` is only built by the loop cell that appears below this
# one, so on a fresh kernel this cell raises NameError; it also duplicates the
# plotting cell that follows the loop. Consider removing it.
plt.figure(figsize=(10,6))
plt.plot(range(1,25),score,color='blue', linestyle='dashed', marker='o',markerfacecolor='red', markersize=10)
plt.title('Score vs. K Neighbors')
plt.xlabel('K')
plt.ylabel('Accuracy')

Loop over different values of K and record the model's score for each value.

In [None]:
# Test-set accuracy for every K from 1 to 24, stored in K order.
k_values = range(1, 25)
score = []

for k in k_values:
    # Fit a fresh classifier for this K on the training split (as passed in;
    # no scaling is applied in this cell).
    knn_model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # Record its accuracy on the test split.
    score.append(knn_model.score(X_test, y_test))
    

In [None]:
import matplotlib.pyplot as plt

# Accuracy as a function of K, drawn with the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(1, 25), score,
    color='blue', linestyle='dashed',
    marker='o', markerfacecolor='red', markersize=10,
)
ax.set_title('Score vs. K Neighbors')
ax.set_xlabel('K')
ax.set_ylabel('Accuracy')

to see which value of K performs best

In [None]:
import numpy as np

# `score` has one entry per K in range(1, 25), so position 0 belongs to K=1;
# adding 1 turns the position of the best score back into its K value.
np.argmax(score)+1 # position -> K

In [None]:
# Highest score recorded across the tested K values.
max(score)

# OR 

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

# `X_train_scaled` and `scaler` are used from this cell onwards but were never
# created: fit the scaler on the training split only and keep both names for
# the cells below (the test split is transformed with the same fitted scaler).
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Mean ROC AUC of a default KNN over 5 cross-validation folds.
knn = KNeighborsClassifier()
cross_validate(knn, X_train_scaled, y_train, cv=5, scoring='roc_auc')["test_score"].mean()

## 3. Grid search

Use KNeighborsClassifier

👇 Grid search a KNN's hyperparameter k on the training data.
- Search k = [1,5,10,20]
- 5-fold cross validate
- Score with recall

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Estimator to tune.
model = KNeighborsClassifier()

# Candidate numbers of neighbours.
k_grid = {'n_neighbors': [1, 5, 10, 20]}

# Exhaustive search: 5-fold cross-validation, ROC AUC scoring, all CPU cores.
grid = GridSearchCV(
    estimator=model,
    param_grid=k_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

# Run the search on the scaled training data.
grid.fit(X_train_scaled, y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid.best_params_

In [None]:
# Best mean cross-validated score found by the search.
grid.best_score_

Extract the best model from the grid search and score its performance on the test set.

In [None]:
from sklearn.metrics import roc_auc_score

# Best estimator found by the grid search (already refitted on the training data).
model = grid.best_estimator_

# ROC AUC needs a continuous score, not hard labels: use the predicted
# probability of the positive class (second column of predict_proba).
y_test_proba = model.predict_proba(scaler.transform(X_test))[:, 1]

# Signature is roc_auc_score(y_true, y_score) — true labels come first.
roc_auc_score(y_test, y_test_proba)

# OR


# Random Search 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

model = KNeighborsClassifier()

# n_neighbors is drawn uniformly from the integers 1..39 (randint excludes the
# upper bound); p selects Manhattan (1) or Euclidean (2) distance.
search_space = {'n_neighbors': randint(1, 40), 'p': [1, 2]}

# 10 random candidates, 5-fold CV, ROC AUC scoring.
# random_state fixes the sampled candidates so that reruns give the same
# result, matching the seed used for the data splits.
search = RandomizedSearchCV(model, param_distributions=search_space,
                            n_jobs=-1, scoring='roc_auc', cv=5, n_iter=10,
                            random_state=414)

search.fit(X_train_scaled, y_train)

print(search.best_score_)
print(search.best_params_)

## SVM Classifier for non-linearly separable data

In [None]:
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC  # was used without ever being imported

# Instantiate model
model = SVC()

# Hyperparameter search space.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], not [loc, scale].
search_space = {
    'kernel': ['sigmoid'],
    'C': stats.uniform(0.01, 1000),        # [0.01, 1000.01]
    'gamma': stats.loguniform(0.001, 10),  # log-uniform between 0.001 and 10
    # NOTE(review): this samples coef0 from [-5, 0]; use stats.uniform(-5, 10)
    # if the range [-5, 5] was intended.
    'coef0': stats.uniform(-5, 5),
}

# Instantiate random search: 1000 sampled candidates, 5-fold CV, accuracy.
# random_state makes the sampled candidates reproducible.
rsearch = RandomizedSearchCV(
    model, search_space,
    n_jobs=-1, scoring='accuracy', cv=5, n_iter=1000, verbose=1,
    random_state=414)

# NOTE(review): this fits 1000 x 5 SVC models on the full X / y — consider the
# small sample or fewer iterations if it turns out to be too slow.
rsearch.fit(X,y)

In [None]:
print(rsearch.best_params_)
print(rsearch.best_score_)

# With the default refit=True the search has already refitted its best
# estimator on the full X / y, so fitting it a second time is unnecessary.
best_svm = rsearch.best_estimator_

# NOTE(review): `plot_decision_regions` is not defined or imported in any
# visible cell — presumably a plotting helper from elsewhere; TODO confirm and
# import it before running this line.
plot_decision_regions(X, y, classifier=best_svm)

In [None]:
from sklearn.model_selection import cross_val_score

# Run the 10-fold cross-validation once and derive both statistics from the
# same fold scores (it previously ran twice, once per printed number).
svm_cv_scores = cross_val_score(best_svm, X, y, cv=10)

print('CROSS VALIDATED RESULT')
print('mean accuracy', svm_cv_scores.mean())
print('std', svm_cv_scores.std())

# VIF 

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
# Variance inflation factor of every column of `Xp`, one row per column position.
# NOTE(review): `Xp` is not defined in any visible cell — presumably a numeric
# feature matrix without missing values; TODO confirm and create it first.
df = pd.DataFrame()
df["vif_index"] = [vif(Xp, i) for i in range(Xp.shape[1])]
df