In [1]:
# import models
import pandas as pd
import numpy as np

#pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 

#models
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

**IMPORT DATA**

In [2]:
from project_fraud.data import cleaned_featured_data

In [3]:
df = cleaned_featured_data('~/data/')

In [4]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,hours,cardID,mean,min,max,median,dist_mean,dist_median,dist_mean_rel,dist_median_rel
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,0.0,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0.0,2755404.0150.0mastercard102.0credit,235.020796,10.0,6085.23,115.0,-206.020796,-86.0,-0.876607,-0.747826
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,0.0,4663490.0150.0visa166.0debit,96.791005,12.5,994.0,59.0,-37.791005,0.0,-0.390439,0.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,0.0,18132567.0150.0mastercard117.0debit,123.308485,6.0,3190.0,59.95,-73.308485,-9.95,-0.594513,-0.165972
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,4497514.0150.0mastercard102.0credit,96.972222,20.95,200.0,108.95,-46.972222,-58.95,-0.484388,-0.541074


In [5]:
df.shape

(590540, 239)

In [6]:
def getnulls(df):
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum() / data.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['total', 'precent'])
    return missing_data

In [7]:
missing_data_train = getnulls(train)
missing_data_train.head(350).T

NameError: name 'train' is not defined

**SPLIT DATA**

In [None]:
X = df[['TransactionID','P_emaildomain_bin','card1','card2','addr1','TransactionAmt','card5','D15','C13','D2','D10','D4','weekday','hours','dist_mean', 'dist_median','dist_mean_rel','dist_median_rel']]
y = df['isFraud']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3)

sample_size = 15000

X_small = X_train.sample(sample_size, random_state=0)

y_small = y_train.sample(sample_size, random_state=0)

X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(X_small, y_small, random_state=0)

**PIPELINES**

In [None]:
n = (X.dtypes != 'object')
num_cols = list(n[n].index)
medium_missing_num_cols = []
low_missing_num_cols =[]
for i in num_cols:
    percentage = df[i].isnull().sum() * 100 / len(df[i])
    if percentage < 15:
        low_missing_num_cols.append(i)
    elif percentage >= 15 and percentage <= 60:
        medium_missing_num_cols.append(i)

In [None]:
# trainer.py

num_transformer_low = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])
num_transformer_medium = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('one_hot', OneHotEncoder())
])
    
preprocessor = ColumnTransformer([
    ('low_num_imputer',num_transformer_low, low_missing_num_cols),
    ('medium_num_imputer', num_transformer_medium, medium_missing_num_cols),
    ('cat_transformer', cat_pipeline, ['P_emaildomain_bin','weekday','hours'])],
    remainder='drop')

**MODEL SVC**

In [None]:
# Instanciate model
from scipy import stats

svc = SVC(probability=True)

pipeline_svc = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", svc),
     ])

param_grid_3 = {'classifier__kernel': ['rbf', 'poly', 'linear'],
              'classifier__gamma' : [0.1, 10],
              'classifier__C': [0.01, 0.1, 1, 10, 100],
              'classifier__degree' : [1,2,3,4],
              'classifier__class_weight': ['balanced'],
             }

grid_search_3 = GridSearchCV(pipeline_svc, param_grid_3,
                           cv=5,
                           n_jobs= -1,
                           verbose=0,
                           scoring = 'recall')

grid_search_3.fit(X_train_small, y_train_small)

In [None]:
grid_search_3.score(X_test_small, y_test_small)

In [None]:
grid_search_3.best_params_