# Classification Project

#### This data set contains the following features:
>- TransactionDT: - timedelta from a given reference datetime (not an actual timestamp)
- TransactionAMT - transaction payment amount in USD
- ProductCD - product code, the product for each transaction
- card1 - card6 -payment card information, such as card type, card category, issue bank, country, etc
- addr -adress
- dist: -distance
- P_ and (R__) emaildomain - purchaser and recipient email domain
- C1-C14 -counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
- D1-D15 -timedelta, such as days between previous transaction, etc
- M1-M9 -match, such as names on card and address, etc.
- Vxxx -Vesta engineered rich features, including ranking, counting, and other entity relations.

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('fivethirtyeight')

# Gather Data

In [2]:
df = pd.read_csv(r'Downloads\Cat\train.csv')

In [3]:
test = pd.read_csv(r'Downloads\Cat\test.csv')

# Assess Data

In [4]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3210227,0,5261726,59.0,W,17412,451.0,150.0,visa,226.0,...,,,,,,,,,,
1,3397895,0,10374074,64.347,C,7949,142.0,185.0,visa,226.0,...,chrome 65.0,,,,F,F,T,F,desktop,Windows
2,3108818,0,2397517,1308.5,W,9500,321.0,150.0,visa,226.0,...,,,,,,,,,,
3,3244037,0,6150927,57.95,W,10112,360.0,150.0,visa,166.0,...,,,,,,,,,,
4,3385123,0,10025230,53.95,W,9500,321.0,150.0,visa,226.0,...,,,,,,,,,,


In [5]:
test.tail()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
194874,3353774,9123849,49.0,W,6174,490.0,150.0,visa,226.0,debit,...,,,,,,,,,,
194875,3269455,6897318,44.107,C,15885,545.0,185.0,visa,138.0,debit,...,chrome 64.0 for android,,,,F,F,T,F,mobile,SM-G928V Build/MMB29K
194876,3016750,754854,66.928,C,9633,296.0,185.0,visa,138.0,debit,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
194877,3255487,6491084,58.95,W,9500,321.0,150.0,visa,226.0,debit,...,,,,,,,,,,
194878,3200702,4981147,171.0,W,18103,288.0,150.0,visa,226.0,debit,...,,,,,,,,,,


In [6]:
#shape
df.shape

(395661, 434)

In [7]:
#info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395661 entries, 0 to 395660
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float64(399), int64(4), object(31)
memory usage: 1.3+ GB


In [8]:
def missing_value(df):
    mis_val = df.isnull().sum()
    mis_val_precent = (df.isnull().sum() / len(df))*100
    mis_val_table = pd.concat([mis_val,mis_val_precent],axis=1).rename(columns={0:'Missing Values', 1 : '% of Total Values'})
    return mis_val_table
nulls = missing_value(df)
nulls

Unnamed: 0,Missing Values,% of Total Values
TransactionID,0,0.000000
isFraud,0,0.000000
TransactionDT,0,0.000000
TransactionAmt,0,0.000000
ProductCD,0,0.000000
card1,0,0.000000
card2,6034,1.525043
card3,1055,0.266642
card4,1064,0.268917
card5,2874,0.726379


In [9]:
null = nulls[nulls['% of Total Values'] >= 25 ].index

In [10]:
df.drop(null,axis=1,inplace=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395661 entries, 0 to 395660
Columns: 182 entries, TransactionID to V321
dtypes: float64(174), int64(4), object(4)
memory usage: 549.4+ MB


In [12]:
missing_value(df)

Unnamed: 0,Missing Values,% of Total Values
TransactionID,0,0.000000
isFraud,0,0.000000
TransactionDT,0,0.000000
TransactionAmt,0,0.000000
ProductCD,0,0.000000
card1,0,0.000000
card2,6034,1.525043
card3,1055,0.266642
card4,1064,0.268917
card5,2874,0.726379


In [13]:
df['isFraud'].value_counts(normalize=True)

0    0.965331
1    0.034669
Name: isFraud, dtype: float64

In [14]:
for col in df.columns:
    if df[col].dtype == 'float':
        df[col].fillna(df[col].mean(),inplace=True)
    elif df[col].dtype == 'int':
        df[col].fillna(df[col].mode().values[0],inplace=True)
    else:
        df[col].fillna(df[col].mode().values[0],inplace=True)

In [15]:
missing_value(df)

Unnamed: 0,Missing Values,% of Total Values
TransactionID,0,0.0
isFraud,0,0.0
TransactionDT,0,0.0
TransactionAmt,0,0.0
ProductCD,0,0.0
card1,0,0.0
card2,0,0.0
card3,0,0.0
card4,0,0.0
card5,0,0.0


In [16]:
df.duplicated().sum()

0

In [17]:
for col in df.columns:
    if df[col].dtype == 'object':
        print(col)

ProductCD
card4
card6
P_emaildomain


In [18]:
df['P_emaildomain'].nunique()

59

In [19]:
df.drop(['P_emaildomain','TransactionID'],axis=1,inplace=True)

In [20]:
df['ProductCD'] = df['ProductCD'].astype('category').cat.codes
df['card4'] = df['card4'].astype('category').cat.codes
df['card6'] = df['card6'].astype('category').cat.codes

In [21]:
df.sample(10)

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
337604,0,15533714,28.896,0,16136,204.0,185.0,3,138.0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153920,0,15556919,57.95,4,15986,360.0,150.0,2,229.0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
163949,0,12514094,130.5,4,17363,111.0,150.0,3,226.0,2,...,549.0,0.0,0.0,0.0,323.850006,1007.849976,323.850006,0.0,0.0,0.0
339555,0,245762,25.0,4,6951,111.0,150.0,3,226.0,2,...,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
301803,0,9898006,59.0,4,12065,575.0,150.0,2,224.0,2,...,0.0,0.0,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207334,0,11378361,149.95,4,6530,206.0,150.0,2,126.0,2,...,0.0,0.0,460.850006,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14832,0,1744075,65.95,4,12577,268.0,150.0,3,166.0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.899994,140.899994,140.899994
174096,0,13102181,2.292,0,15885,545.0,185.0,3,138.0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254949,0,15187361,1849.07,4,15497,490.0,150.0,3,226.0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104253,0,585450,170.0,4,7069,111.0,150.0,2,202.0,2,...,220.0,87.0,730.0,87.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
X = df.drop('isFraud',axis=1)
y = df['isFraud']

## Feature Selection

In [23]:
from sklearn.ensemble import ExtraTreesClassifier

In [24]:
model = ExtraTreesClassifier()
model.fit(X,y)



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [25]:
feat_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

In [26]:
feat_importances

TransactionDT     0.064280
TransactionAmt    0.057472
card1             0.037542
C1                0.037042
card2             0.035020
addr1             0.031255
C2                0.027874
C14               0.026362
C13               0.024021
C11               0.020916
card5             0.019530
C6                0.017778
card6             0.013163
C12               0.012949
V86               0.012772
card4             0.012681
D15               0.012526
V87               0.012383
D1                0.011658
ProductCD         0.011146
C4                0.011124
C8                0.010875
C10               0.009856
V78               0.008640
card3             0.008465
V307              0.008316
D10               0.007987
V94               0.007749
V283              0.007349
C9                0.007072
                    ...   
V63               0.001281
V90               0.001267
V59               0.001244
V60               0.001237
V297              0.001184
V31               0.000956
V

In [27]:
X = X[feat_importances[:20].index]

In [28]:
X.sample(8)

Unnamed: 0,TransactionDT,TransactionAmt,card1,C1,card2,addr1,C2,C14,C13,C11,card5,C6,card6,C12,V86,card4,D15,V87,D1,ProductCD
355619,2057203,554.0,2455,1.0,321.0,204.0,2.0,1.0,1.0,1.0,226.0,2.0,1,0.0,1.0,3,3.0,1.0,3.0,4
61344,8106972,75.0,12758,1.0,161.0,181.0,1.0,1.0,2.0,1.0,195.0,1.0,2,0.0,1.065119,2,163.501085,1.099194,77.0,2
248185,7599285,24.0,7585,1.0,553.0,315.0,3.0,1.0,40.0,1.0,226.0,1.0,1,1.0,1.0,3,557.0,1.0,556.0,4
175863,2099108,226.0,2616,3.0,362.663447,264.0,1.0,3.0,4.0,1.0,102.0,1.0,1,0.0,1.0,1,111.0,1.0,0.0,4
183200,1828261,20.95,9995,110.0,479.0,143.0,105.0,92.0,413.0,83.0,166.0,88.0,2,0.0,1.0,3,466.0,1.0,163.0,4
108320,2079402,125.0,2803,1.0,100.0,433.0,1.0,1.0,1.0,1.0,226.0,1.0,2,0.0,1.065119,3,163.501085,1.099194,0.0,2
5116,8627614,39.0,3762,1.0,555.0,325.0,1.0,1.0,2.0,1.0,226.0,1.0,2,0.0,1.0,3,17.0,1.0,17.0,4
15374,12536134,159.95,15185,2.0,332.0,264.0,2.0,2.0,11.0,1.0,226.0,1.0,2,0.0,1.0,3,509.0,1.0,35.0,4


## Split data into training set and test set

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42,stratify = y)

## Try Random Forest Model

In [393]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid = [{'n_estimators': [10,100,500,1000], 'max_features': [10,30,50]}]
grid = GridSearchCV(rfor,param_grid,scoring='roc_auc',cv=5,return_train_score=True)

In [287]:
rfor = RandomForestClassifier(n_estimators=500)

In [283]:
from sklearn.model_selection import cross_val_score

In [275]:
scores = cross_val_score(rfor,X_train, y_train, cv=3,scoring="roc_auc")



In [276]:
scores

array([0.88959903, 0.89089224, 0.89221707])

In [288]:
rfor.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [289]:
y_for = rfor.predict(X_test)

In [290]:
roc_auc_score(y_test,y_for)

0.791100116721883

In [36]:
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score

In [53]:
print(classification_report(y_test,y_pred))
print('**********************************')
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     38195
           1       0.92      0.47      0.62      1372

    accuracy                           0.98     39567
   macro avg       0.95      0.73      0.80     39567
weighted avg       0.98      0.98      0.98     39567

**********************************
[[38141    54]
 [  734   638]]


In [54]:
roc_auc_score(y_test,y_pred)

0.7318003898209929

## Try Decision Tree Model

In [31]:
from sklearn.tree import DecisionTreeClassifier

In [47]:
from sklearn.model_selection import GridSearchCV
# Create lists of parameter for Decision Tree Classifier
criterion = ['gini', 'entropy']
max_depth = [4,6,8,12]

    # Create a dictionary of all the parameter options 
    # Note has you can access the parameters of steps of a pipeline by using '__’
parameters = dict(criterion=criterion,
                      max_depth=max_depth)

    # Conduct Parameter Optmization With Pipeline
    # Create a grid search object
clf = GridSearchCV(tree, parameters)

    # Fit the grid search
clf.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 6, 8, 12]},
             pre_d

In [174]:
tree = DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=320,
                       max_features=18, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [175]:
tree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=320,
                       max_features=18, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [176]:
tree_pred = tree.predict(X_test)

In [177]:
roc_auc_score(y_test,tree_pred)

0.8065737043886267

In [77]:
y_new = tree.predict(test)

In [48]:
clf.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=12,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

## Try Resampling the data

In [34]:
from sklearn.utils import resample

In [82]:
x = pd.concat([X_train,y_train],axis = 1)

In [83]:
fraud = x[x.isFraud == 1]
not_fraud = x[x.isFraud == 0]

In [84]:
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_fraud, fraud_upsampled])

In [85]:
upsampled.isFraud.value_counts()

1    343749
0    343749
Name: isFraud, dtype: int64

In [86]:
X=upsampled.drop('isFraud',axis=1)
y=upsampled['isFraud']

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42,stratify = y)

## Try Logistic Regression model after resampling the data

In [88]:
from sklearn.linear_model import LogisticRegression

In [89]:
log = LogisticRegression()

In [None]:
log.fit(X_train,y_train)

In [44]:
y_pred2 = log.predict(X_test)

In [46]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,y_pred2)

0.6467636363636364

In [47]:
from sklearn.model_selection import GridSearchCV

In [48]:
logreg = LogisticRegression(class_weight='balanced')
param = {'C':[0.001,0.003,0.005,0.01,0.03,0.05,0.1,0.3,0.5,1,2,3,3,4,5,10,20]}
clf = GridSearchCV(logreg,param,scoring='roc_auc',refit=True,cv=10)
clf.fit(X_train,y_train)
print('Best roc_auc: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_))











Best roc_auc: 0.7243, with best C: {'C': 0.003}


In [55]:
log = LogisticRegression(class_weight='balanced',C=0.003)

In [56]:
log.fit(X_train,y_train)



LogisticRegression(C=0.003, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [57]:
yy = log.predict(X_test)

In [58]:
roc_auc_score(y_test,yy)

0.6514181818181818

## Test Data

In [67]:
test.shape

(194879, 433)

In [68]:
test.drop(null,axis=1,inplace=True)

In [69]:
for col in test.columns:
    if test[col].dtype == 'float':
        test[col].fillna(test[col].mean(),inplace=True)
    elif test[col].dtype == 'int':
        test[col].fillna(test[col].mode().values[0],inplace=True)
    else:
        test[col].fillna(test[col].mode().values[0],inplace=True)

In [70]:
for col in test.columns:
    if test[col].dtype == 'object':
        print(col)

ProductCD
card4
card6
P_emaildomain


In [71]:
test.drop(['P_emaildomain','TransactionID'],axis=1,inplace=True)

In [72]:
test['ProductCD'] = test['ProductCD'].astype('category').cat.codes
test['card4'] = test['card4'].astype('category').cat.codes
test['card6'] = test['card6'].astype('category').cat.codes

In [73]:
test = test[feat_importances[:20].index]

In [74]:
test

Unnamed: 0,TransactionDT,TransactionAmt,card1,C1,card2,addr1,C2,C14,C13,C11,card5,C6,card6,C12,V86,card4,D15,V87,D1,ProductCD
0,12153579,724.000,7826,3.0,481.0,387.000000,1.0,2.0,2.0,1.0,224.0,1.0,2,0.0,1.00000,2,145.000000,1.000000,0.0,4
1,15005886,108.500,12544,2.0,321.0,476.000000,1.0,2.0,7.0,1.0,226.0,1.0,2,1.0,1.00000,3,347.000000,1.000000,122.0,4
2,6970178,47.950,9400,1.0,111.0,315.000000,1.0,1.0,3.0,1.0,224.0,1.0,2,0.0,1.00000,2,33.000000,1.000000,32.0,4
3,5673658,100.599,15885,2.0,545.0,290.349683,3.0,0.0,0.0,1.0,138.0,1.0,2,1.0,2.00000,3,0.000000,2.000000,0.0,0
4,6886780,107.950,15497,10.0,490.0,299.000000,14.0,9.0,43.0,10.0,226.0,8.0,2,0.0,1.00000,3,549.000000,1.000000,549.0,4
5,10444930,280.000,7919,6.0,194.0,472.000000,4.0,5.0,13.0,3.0,166.0,2.0,2,0.0,1.00000,2,0.000000,1.000000,0.0,4
6,10442147,311.950,9002,106.0,453.0,315.000000,105.0,80.0,407.0,82.0,226.0,92.0,2,0.0,1.00000,3,341.000000,1.000000,330.0,4
7,12254683,330.990,14183,2.0,555.0,184.000000,1.0,1.0,1.0,1.0,226.0,1.0,1,0.0,1.06441,3,164.239021,1.099989,0.0,4
8,9228284,10.392,9633,4.0,130.0,290.349683,5.0,1.0,1.0,1.0,138.0,2.0,2,1.0,2.00000,3,150.000000,2.000000,0.0,0
9,739585,335.000,11207,2.0,361.0,231.000000,2.0,2.0,12.0,2.0,226.0,8.0,2,0.0,1.00000,3,284.000000,1.000000,40.0,4


In [75]:
y_hat = tree.predict(test)

In [78]:
z = pd.read_csv(r'Downloads\Cat\sample_submisson.csv')

In [79]:
z['TransactionID'] = z['TransactionID'].astype('object')

In [80]:
z.isFraud = y_new

In [81]:
z.isFraud = z.isFraud.astype('float')

In [82]:
z.to_csv('sample_submisson.csv',index=False)