# 0 Set up and Data Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
id_raw_df   =pd.read_csv("train_identity.csv")
id_trans_df =pd.read_csv("train_transaction.csv")

In [3]:
train_full_df = pd.merge(id_trans_df, id_raw_df, on='TransactionID', how='left')

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
columns_def=pd.DataFrame({"na_count":train_full_df.isnull().sum().sort_values(ascending=False)})

In [6]:
#Create column of percent columns, #higher is worst
columns_def["per"]=columns_def["na_count"]/len(train_full_df)

In [7]:
limit=0.15 #only columns w/ less 15% will remain

In [8]:
#Select final columns
columns_final=columns_def[columns_def["per"]<limit].index 

train_full_df=train_full_df[train_full_df.columns.intersection(columns_final)]

## Dealing with Categorical Variables

In [9]:
card6_dummy_df=pd.get_dummies(train_full_df["card6"])

In [10]:
train_full_df=pd.concat([train_full_df,card6_dummy_df],axis=1)

In [11]:
train_full_df=train_full_df._get_numeric_data()

train_full_df=train_full_df.dropna()

In [12]:
x_full_df=train_full_df.drop(["isFraud"],axis=1)
y_full_df=train_full_df["isFraud"]


In [13]:
X_train,X_test,y_train,y_test=train_test_split(x_full_df,y_full_df,test_size=0.50)

In [14]:
X_test=X_test.reset_index(drop=True)
y_test=y_test.reset_index(drop=True)

# 03 Modeling

In [15]:
from sklearn.linear_model import LogisticRegression  
from sklearn.preprocessing import StandardScaler  

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

In [16]:
lr = LogisticRegression(solver='lbfgs')  
lr.fit(X_train, y_train)  

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
lr.score(X_test, y_test)

0.9794483900324235

In [18]:
y_pred=lr.predict(X_test)

In [19]:
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,214163,22,214185
1,4472,12,4484
All,218635,34,218669


In [20]:
results_df=pd.DataFrame({"true":y_test, "predicted":y_pred})

In [21]:
results_df[(results_df.true==1)&(results_df.predicted==1)]

Unnamed: 0,true,predicted
3100,1,1
9616,1,1
16370,1,1
27830,1,1
34252,1,1
71134,1,1
76488,1,1
84251,1,1
100279,1,1
107841,1,1


In [23]:
lr.predict(X_test.iloc[[3100]])

array([1], dtype=int64)

In [33]:
X_test.iloc[[107841]].to_json(orient='records')

'[{"TransactionID":3242147,"TransactionDT":6111303,"TransactionAmt":700.0,"card1":17480,"card2":528.0,"card3":150.0,"card5":226.0,"addr1":191.0,"addr2":87.0,"C1":1.0,"C2":3.0,"C3":0.0,"C4":0.0,"C5":0.0,"C6":6.0,"C7":0.0,"C8":0.0,"C9":1.0,"C10":0.0,"C11":2.0,"C12":0.0,"C13":6.0,"C14":1.0,"D1":15.0,"D10":12.0,"V12":1.0,"V13":1.0,"V14":1.0,"V15":0.0,"V16":0.0,"V17":0.0,"V18":0.0,"V19":0.0,"V20":4.0,"V21":0.0,"V22":0.0,"V23":1.0,"V24":1.0,"V25":1.0,"V26":2.0,"V27":0.0,"V28":0.0,"V29":0.0,"V30":0.0,"V31":0.0,"V32":0.0,"V33":0.0,"V34":0.0,"V53":1.0,"V54":1.0,"V55":1.0,"V56":3.0,"V57":0.0,"V58":0.0,"V59":0.0,"V60":0.0,"V61":3.0,"V62":3.0,"V63":0.0,"V64":0.0,"V65":1.0,"V66":1.0,"V67":2.0,"V68":0.0,"V69":0.0,"V70":0.0,"V71":0.0,"V72":0.0,"V73":0.0,"V74":0.0,"V95":0.0,"V96":12.0,"V97":8.0,"V98":0.0,"V99":2.0,"V100":2.0,"V101":0.0,"V102":9.0,"V103":5.0,"V104":0.0,"V105":1.0,"V106":1.0,"V107":1.0,"V108":1.0,"V109":1.0,"V110":1.0,"V111":1.0,"V112":1.0,"V113":1.0,"V114":1.0,"V115":1.0,"V116":1.0,"V1

In [25]:
import pickle

In [27]:
from sklearn.externals import joblib 



In [28]:
joblib.dump(lr, 'model.pkl') 
knn_from_joblib = joblib.load('model.pkl')  
knn_from_joblib.predict(X_test) 


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [31]:
knn_from_joblib.predict_proba(X_test)[0][1]


0.019081194901715486