In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import seaborn as sns


In [4]:
# Load the whole dataset from 'data.csv' (path is relative to the working directory).
data = pd.read_csv('data.csv')


In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(1097231, 254)

In [4]:
# Total number of missing cells in the frame: per-column null counts, summed again.
print(data.isnull().sum().sum())

29015135


## Convert the email-domain column

In [6]:
# Lookup table: raw e-mail domain -> provider group.
# Written provider-first so each group can be reviewed at a glance; the
# comprehension flattens it into the domain-keyed dict used for `.map`.
emails = {
    domain: provider
    for provider, domains in {
        'google': ['gmail', 'gmail.com'],
        'att': ['att.net', 'prodigy.net.mx', 'sbcglobal.net'],
        'spectrum': ['twc.com', 'charter.net'],
        'microsoft': [
            'hotmail.co.uk', 'live.com', 'hotmail.de', 'hotmail.com',
            'hotmail.fr', 'outlook.es', 'live.fr', 'msn.com',
            'outlook.com', 'hotmail.es', 'live.com.mx',
        ],
        'yahoo': [
            'yahoo.com.mx', 'yahoo.fr', 'yahoo.es', 'yahoo.co.jp',
            'yahoo.de', 'verizon.net', 'frontier.com', 'rocketmail.com',
            'frontiernet.net', 'ymail.com', 'yahoo.co.uk', 'yahoo.com',
        ],
        'aol': ['aim.com', 'aol.com'],
        'centurylink': ['centurylink.net', 'q.com', 'embarqmail.com'],
        'apple': ['me.com', 'mac.com', 'icloud.com'],
        'other': [
            'scranton.edu', 'optonline.net', 'comcast.net', 'earthlink.net',
            'gmx.de', 'web.de', 'cfl.rr.com', 'protonmail.com',
            'windstream.net', 'servicios-ta.com', 'netzero.net',
            'suddenlink.net', 'roadrunner.com', 'sc.rr.com', 'anonymous.com',
            'mail.com', 'bellsouth.net', 'cableone.net', 'netzero.com',
            'ptd.net', 'cox.net', 'juno.com',
        ],
    }.items()
    for domain in domains
}

##us_emails = ['gmail',  'net',  'edu']

# https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
for c in ['P_emaildomain']:
    # Provider group of the domain; domains absent from `emails` (and missing
    # values) come out as NaN.
    data[c + '_bin'] = data[c].map(emails)

    # Text after the last dot of the domain (e.g. 'gmail.com' -> 'com').
    # na_action='ignore' keeps missing domains as NaN; without it str(NaN)
    # would store the literal string 'nan', which isnull() no longer reports
    # as missing.
    data[c + '_suffix'] = data[c].map(lambda x: str(x).split('.')[-1], na_action='ignore')



In [7]:
# Preview the first two rows, including the two new e-mail columns.
data.head(2)

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,P_emaildomain_bin,P_emaildomain_suffix
0,2987000.0,86400.0,68.5,W,13926.0,363.099769,150.0,discover,142.0,credit,...,,,,,,,,,,
1,2987001.0,86401.0,29.0,W,2755.0,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,google,com


After converting the email column we need to check the new number of categorical columns, so we will print a list of them
and look at their missing values

# For every object-dtype column: print its value frequencies, then the
# number and the percentage of missing entries.
is_object_col = (data.dtypes == 'object')
categorical_cols = [name for name, flag in is_object_col.items() if flag]
for col_name in categorical_cols:
    column = data[col_name]
    n_missing = column.isnull().sum()
    print(column.value_counts())
    print(col_name, "missing values: ", n_missing)
    print(n_missing*100/len(column), "\n") # missing percent

In [14]:
# Split the column names by dtype: object columns are treated as categorical,
# every other dtype as numerical. Both lists are printed with their sizes.
dtype_is_object = (data.dtypes == 'object')
cat_id_cols = [name for name, is_obj in dtype_is_object.items() if is_obj]
num_id_cols = [name for name, is_obj in dtype_is_object.items() if not is_obj]

print(cat_id_cols, "\n")
print("number categorical identity features: ", len(cat_id_cols), "\n\n")
print(num_id_cols, "\n")
print("number numerical identity features: ", len(num_id_cols))

['ProductCD', 'card4', 'card6', 'P_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'P_emaildomain_bin', 'P_emaildomain_suffix'] 

number categorical identity features:  29 


['TransactionID', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D10', 'D11', 'D15', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60',

# LIST OF FEATURES TO USE 

# First list of features 


### from this notebook https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt

In [None]:


# First candidate feature set, stored as column NAMES.
# The previous version evaluated `data[...]` once per feature: every result
# but the last was thrown away, and the cell raised KeyError for columns that
# are only engineered further down (e.g. TransactionAmt_to_mean_card1, PCA_V_*).
# Select the columns with `data[first_feature_list]` once they all exist.
# The trailing remarks are the authors' original annotations.
first_feature_list = [
    'TransactionID',                 # noice
    'card1',                         # noice
    'card2',                         # noice; this one can be combined with ['addr1']
    'TransactionAmt_to_mean_card1',
    'addr1',                         # noice
    'TransactionAmt_to_std_card1',   # noice
    'TransactionAmt',                # noice
    'PCA_V_13',
    'TransactionAmt_to_std_card4',   # noice
    'card5',                         # in other data set was used with mean
    'D8',                            # noice
    'TransactionAmt_to_mean_card4',  # noice
    'PCA_V_29',
    'PCA_V_14',
    'PCA_V_27',
    'PCA_V_23',
    'dist1',                         # noice
    'PCA_V_17',
    'D15',                           # noice
    'PCA_V_26',
    'P_emaildomain_bin',             # engineered by us
    'P_emaildomain_suffix',          # engineered by us
]

# Second list of features 


### from this notebook https://www.kaggle.com/plasticgrammer/ieee-cis-fraud-detection-eda

In [None]:
# Second candidate feature set, stored as column NAMES.
# The previous version was a run of bare `data[...]` lookups: all results but
# the last were discarded and the cell raised KeyError because the
# underscore-prefixed columns are not created anywhere in this notebook yet.
# Select the columns with `data[second_feature_list]` once they all exist.
second_feature_list = [
    'card1',
    '_card2__addr1',
    'addr1',
    '_weekday__hour',
    '_count__card_all__addr1',
    '_count_rate',
    '_card1__card2',
    '_count_card1',
    'card2',
    '_amount_pct__card_all__addr1',
    '_P_emaildomain__addr1',
    '_card1_addr1',
    '_amount_fraction',
    'D15',
    '_amount_mean_P_emaildomain',
    '_count_card2',
    '_amount_std_P_emaildomain',
    '_vcol_pca0',
    '_amount_pct_card1',
    '_hour',
    'dist1',
    'C13',
    'D2',
    'D10',
    'D4',
]


# PLAN FOR TOMORROW FOR FEATURES 
## - generate features to std and to mean if we want to use them 
## - generate features to combine card and address if we want to 
## - PCA Not sure 


# FEATURES MEAN STD 

In [6]:
#Dropping the outlier rows with standard deviation

In [None]:
# Centre TransactionAmt on its overall mean, then scale the centred values
# by the overall standard deviation.
amount = data['TransactionAmt']
data['Trans_min_mean'] = amount - amount.mean()
data['Trans_min_std'] = data['Trans_min_mean'] / amount.std()


In [None]:
# Ratio of each amount to the mean / standard deviation of the amounts that
# share the same card1 (resp. card4) value. Columns are created in the order
# mean_card1, mean_card4, std_card1, std_card4.
for stat in ('mean', 'std'):
    for key in ('card1', 'card4'):
        group_stat = data.groupby([key])['TransactionAmt'].transform(stat)
        data['TransactionAmt_to_' + stat + '_' + key] = data['TransactionAmt'] / group_stat



A log transformation helps to handle skewed data: after the transformation, the distribution is closer to normal.


In [None]:
# Overwrite TransactionAmt with its natural logarithm.
# NOTE(review): the column is replaced in place, so running this cell a second
# time takes the log of the log. np.log also returns -inf for 0 and NaN for
# negative values — confirm the amounts are strictly positive.
data['TransactionAmt'] = np.log(data['TransactionAmt'])

### To check missing values 

In [None]:
# Print the number of missing values in the column(s) selected by `col`.
# NOTE(review): `col` is not assigned anywhere in the visible notebook — set it
# (a column name or a list of names) before running this cell.
print(data[col].isnull().sum().sum())

## Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# NOTE(review): 'col_name' looks like a placeholder — replace it with the
# categorical column to encode before running.
aspiration_encoder = LabelEncoder()

# Learn the distinct labels of the column and overwrite it with their integer codes.
data["col_name"] = aspiration_encoder.fit_transform(data['col_name'])

data.head()


## Simple Imputer

In [None]:
from sklearn.impute import SimpleImputer

# Fill the gaps of 'col_name' with its most frequent value. A single
# fit_transform call learns that value and applies it to the same column.
imputer = SimpleImputer(strategy="most_frequent")
data['col_name'] = imputer.fit_transform(data[['col_name']])

data.head()

## Train Test Split

In [None]:
# Target vector first, then the feature matrix (every column except the target)
y = data['isFraud']
X = data.drop('isFraud', axis=1)

In [None]:
# Seeded random subset of the rows, for quick experiments only
sample_size = 20000
tmp = data.sample(n=sample_size, random_state=414)
y_small = tmp['isFraud']
X_small = tmp.drop('isFraud', axis=1)

In [None]:
# Train/test split of the full data and of the small sample, both seeded with random_state=414.
# No test_size is passed, so scikit-learn's default split proportion applies.
# (Let's forget for the sake of this challenge that we are leaking data a bit here: the split should have been done earlier.)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=414)
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(X_small, y_small, random_state=414)


In [None]:
# (optional) Carve an evaluation set out of the training set itself.
# Powerful models that overfit easily (XGBoost, neural networks, ...) need an
# "early stopping" criterion, which is monitored on this held-out part.
# random_state=414 mirrors the seeded train/test split, so that re-running the
# notebook reproduces the same partitions instead of a new random one each time.
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(X_train, y_train, random_state=414)
X_train_train_small, X_train_test_small, y_train_train_small, y_train_test_small = train_test_split(X_train_small, y_train_small, random_state=414)


# BASELINE 

In [None]:
# Baseline models: a class-weighted logistic regression and an SGD-trained
# logistic regression, both evaluated by cross-validation on the training set.
# The estimators and cross_validate were used without being imported.
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_validate

log_reg_model = LogisticRegression(class_weight='balanced')
# Keep the per-fold recall: as a bare mid-cell expression it was computed and then lost.
log_reg_recall = cross_val_score(log_reg_model, X_train, y_train, cv=3, scoring='recall')
print('LogisticRegression recall per fold:', log_reg_recall)

# NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3 removed
# the 'log' spelling — switch the value if the installed version rejects it.
base_model = SGDClassifier(loss='log', alpha=0.5, class_weight='balanced')
cv_results_base_model = cross_validate(base_model, X_train, y_train, cv=5, n_jobs=1, scoring=['recall', 'f1_macro'])
cv_results_base_model['test_f1_macro'].mean()

In [None]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

log_reg_model = LogisticRegression(class_weight='balanced')
# cross_val_score accepts a single scorer only and fails on a list;
# cross_validate is the helper that evaluates several metrics at once. It
# returns a dict with one 'test_<metric>' array per requested scorer.
cv_results_log_reg_model = cross_validate(log_reg_model, X_train, y_train, cv=5, scoring=['recall', 'f1_macro'])

# Concatenating dfs to get PCA of V features

In [None]:
def PCA_change(data, col, n_components, prefix='PCA_', rand_seed=4):
    """Replace the columns ``col`` of ``data`` with their principal components.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to compress. It is left untouched; the
        result is returned as a new frame.
    col : list of str
        Names of the columns fed to the PCA. They are absent from the result.
    n_components : int
        Passed to ``PCA(n_components=...)``.
    prefix : str, default 'PCA_'
        Prepended to the component number to name the new columns
        (``PCA_0``, ``PCA_1``, ...).
    rand_seed : int, default 4
        Passed to ``PCA(random_state=...)``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``data`` followed by the component columns,
        on the same row index as ``data``.
    """
    # Imported here because the notebook never imports PCA at the top.
    from sklearn.decomposition import PCA

    pca = PCA(n_components=n_components, random_state=rand_seed)
    principalComponents = pca.fit_transform(data[col])

    # Reuse the caller's index: with a fresh 0..n-1 index, concat would align
    # on labels and produce NaN rows whenever `data` is not indexed 0..n-1
    # (e.g. after sampling or a train/test split).
    principalDf = pd.DataFrame(
        principalComponents,
        index=data.index,
        columns=[str(prefix) + str(i) for i in range(principalComponents.shape[1])],
    )

    # The original dropped `cols`, an undefined name (the parameter is `col`),
    # and did so in place on the caller's frame.
    return pd.concat([data.drop(columns=col), principalDf], axis=1)

## KNN


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Build a 10-neighbour classifier and fit it on the training split in one
# step (fit returns the estimator itself)
knn_model = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

# Accuracy on the held-out test split
knn_model.score(X_test, y_test)

In [None]:
import matplotlib.pyplot as plt

# Line plot of the recorded scores for K = 1..24.
# NOTE(review): `score` is only created by the K loop in a later cell, so this
# cell raises NameError on a fresh kernel; the identical plot is repeated
# after that loop.
plt.figure(figsize=(10,6))
plt.plot(range(1,25),score,color='blue', linestyle='dashed', marker='o',markerfacecolor='red', markersize=10)
plt.title('Score vs. K Neighbors')
plt.xlabel('K')
plt.ylabel('Accuracy')

Loop over different values of K and record the model's score for each value.

In [None]:
# One KNN per K in 1..24: fit on the training split, record the accuracy
# obtained on the test split.
score = []
for k in range(1, 25):
    knn_model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    score.append(knn_model.score(X_test, y_test))
    

In [None]:
import matplotlib.pyplot as plt

# Recorded score for each K (1..24), drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(range(1,25), score, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
ax.set_title('Score vs. K Neighbors')
ax.set_xlabel('K')
ax.set_ylabel('Accuracy')

Check which value of K performs best.

In [None]:
import numpy as np

# Position of the highest score, shifted by one: the K loop starts at 1, so
# list position i holds the score of K = i + 1.
np.argmax(score)+1 # +1

In [None]:
# Highest score recorded over all tested values of K.
max(score)

# OR 

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

# Mean ROC AUC of a default-parameter KNN over 5 cross-validation folds.
# NOTE(review): X_train_scaled is not created anywhere in the visible notebook —
# presumably the output of a scaler fitted on X_train; define it before running.
knn = KNeighborsClassifier()
cross_validate(knn, X_train_scaled, y_train, cv=5, scoring='roc_auc')["test_score"].mean()

## 3. Grid search

Use KNeighborsClassifier

👇 Grid search a KNN's hyperparameter k on the training data.
- Search k = [1,5,10,20]
- 5-fold cross validate
- Score with ROC AUC (the `scoring='roc_auc'` setting used in the cell below)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbour counts
k_grid = {'n_neighbors': [1, 5, 10, 20]}

# 5-fold grid search over K, ranked by ROC AUC, with n_jobs=-1
model = KNeighborsClassifier()
grid = GridSearchCV(estimator=model, param_grid=k_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# Run the search on the scaled training data
grid.fit(X_train_scaled, y_train)

In [None]:
# Hyper-parameter combination that obtained the best cross-validated score.
grid.best_params_

In [None]:
# Mean cross-validated score (ROC AUC, per the search's `scoring`) of that best combination.
grid.best_score_

Extract the best model from the grid search and score its performance on the test set.

In [None]:
from sklearn.metrics import roc_auc_score

model = grid.best_estimator_
# roc_auc_score expects (y_true, y_score): the ground truth comes first, then a
# continuous score. The arguments were swapped and hard 0/1 predictions were
# passed, which is not the same metric the grid search optimised.
# Column 1 of predict_proba is the probability of the class with the greater label.
# NOTE(review): `scaler` is not created in the visible notebook — it must be the
# transformer that produced X_train_scaled.
fraud_proba = model.predict_proba(scaler.transform(X_test))[:, 1]
roc_auc_score(y_test, fraud_proba)

# OR


# Random Search 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

model = KNeighborsClassifier()

# n_neighbors is drawn from the integers 1..39 (randint excludes the upper
# bound); p switches between the Manhattan (1) and Euclidean (2) distance.
search_space = {'n_neighbors': randint(1, 40), 'p': [1, 2]}

# random_state fixes which 10 candidates are sampled, so the search returns
# the same result on every run (same seed as the train/test split).
search = RandomizedSearchCV(model, param_distributions=search_space,
                            n_jobs=-1, scoring='roc_auc', cv=5, n_iter=10,
                            random_state=414)

search.fit(X_train_scaled, y_train)

print(search.best_score_)
print(search.best_params_)

## SVM Classifier for non-linearly separable data

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC  # SVC was used without ever being imported
from scipy import stats

# Instantiate model
model = SVC()

# Hyperparameter search space.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so C is drawn
# from [0.01, 1000.01] and coef0 from [-5, 0].
# NOTE(review): if coef0 was meant to span [-5, 5], use stats.uniform(-5, 10).
search_space = {
    'kernel': ['sigmoid'],
    'C': stats.uniform(0.01, 1000),
    'gamma': stats.loguniform(0.001,10),
    'coef0': stats.uniform(-5,5),
}

# Instantiate Random Search; random_state makes the sampled candidates
# reproducible (same seed as the train/test split).
rsearch = RandomizedSearchCV(
    model, search_space,
    n_jobs=-1, scoring='accuracy', cv=5, n_iter=1000, verbose=1,
    random_state=414)


rsearch.fit(X,y)

In [None]:
# Report the best sampled hyper-parameters and their mean cross-validated score.
print(rsearch.best_params_)
print(rsearch.best_score_)
# Fit the best estimator on all of X, y and keep a handle on it.
best_svm = rsearch.best_estimator_.fit(X,y)
# NOTE(review): plot_decision_regions is neither defined nor imported in the
# visible notebook — provide it before running this line.
plot_decision_regions(X, y, classifier=best_svm)

In [None]:
from sklearn.model_selection import cross_val_score

# Run the 10-fold cross-validation once and derive both statistics from the
# same fold scores. The previous version ran it twice, once for the mean and
# once for the standard deviation, on `best_svm` and `rsearch.best_estimator_`,
# which are the same object because fit() returns the estimator it was called on.
cv_scores = cross_val_score(best_svm, X, y, cv=10)
print('CROSS VALIDATED RESULT')
print('mean accuracy', cv_scores.mean())
print('std', cv_scores.std())

# VIF 

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# Variance inflation factor of every column position of Xp, collected first
# and then stored as the single column of a new frame.
vif_values = [vif(Xp, position) for position in range(Xp.shape[1])]
df = pd.DataFrame()
df["vif_index"] = vif_values
df