In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression # LogisticRegression Model from sklearn
from sklearn.metrics import classification_report, confusion_matrix,roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Load the fraud-transaction records into a DataFrame.
# The relative path is resolved against the kernel's working directory.
csv_path = "fraud_txn.csv"
fraud_txn_df = pd.read_csv(csv_path)
# Last expression of the cell: render the first rows as a quick look at the data
fraud_txn_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-12-16 12:40:56,377113842678100,"fraud_O'Connell, Botsford and Hand",home,32.65,Billy,Gallagher,M,673 Delgado Burg,...,39.4055,-75.3209,804,Insurance risk surveyor,1965-03-25,ecf3cff893d47eeec287958f2dcbe9f8,1355661656,38.988831,-75.44242,0
1,1,2019-11-10 21:29:24,502038091380,"fraud_Haley, Jewess and Bechtelar",shopping_pos,819.52,Kurt,Peters,M,7290 Ashlee Keys,...,36.6704,-93.0377,1201,"Engineer, agricultural",1980-03-18,46f3f81e3f27d015db2d80f16a663b3d,1352582964,37.502359,-92.941619,1
2,2,2019-12-31 22:11:08,30181367542315,"fraud_Tillman, Fritsch and Schmitt",misc_net,808.97,Danielle,Walker,F,9824 Mitchell Run,...,44.2415,-121.0113,2016,Local government officer,1953-12-25,cf6151929f0b38c9db2181b27423cd34,1356991868,44.93175,-121.287451,1
3,3,2020-04-09 17:23:25,376445266762684,"fraud_Mueller, Gerhold and Mueller",health_fitness,64.85,Rachel,Lowe,F,372 Jeffrey Course,...,41.1558,-101.136,1789,Insurance broker,1982-02-11,5124d4971ec41f68634646067f63cd16,1365528205,41.084349,-100.922198,0
4,4,2019-12-22 15:03:23,3500969075198072,fraud_Frami Group,entertainment,63.2,Kenneth,Sanchez,M,0110 Ashley Forest,...,47.2271,-117.0819,895,Clothing/textile technologist,1999-05-31,c8b812894d9b9ca7feeff9c83facaec1,1356188603,46.411414,-116.657814,0


In [13]:
# Check data: list each column's dtype and non-null count
fraud_txn_df.info() # no missing values detected: the non-null counts shown equal the 15506-row total

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15506 entries, 0 to 15505
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             15506 non-null  int64  
 1   trans_date_trans_time  15506 non-null  object 
 2   cc_num                 15506 non-null  int64  
 3   merchant               15506 non-null  object 
 4   category               15506 non-null  object 
 5   amt                    15506 non-null  float64
 6   first                  15506 non-null  object 
 7   last                   15506 non-null  object 
 8   gender                 15506 non-null  object 
 9   street                 15506 non-null  object 
 10  city                   15506 non-null  object 
 11  state                  15506 non-null  object 
 12  zip                    15506 non-null  int64  
 13  lat                    15506 non-null  float64
 14  long                   15506 non-null  float64
 15  ci

In [14]:
# Handle missing values.
# Forward fill (forward propagation): each NaN, if any, is replaced by the last
# valid observation above it in the same column. NaNs in the leading rows have
# nothing above them and stay NaN, which is why info() is re-checked afterwards.
# NOTE(review): the rows do not appear to be sorted (see the timestamps shown by
# head()), so a filled value would be copied from an unrelated transaction.
# Revisit this strategy if the data ever actually contains NaNs.
fraud_txn_df = fraud_txn_df.ffill() # rebind the name instead of inplace=True; ffill() returns a new frame
fraud_txn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15506 entries, 0 to 15505
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             15506 non-null  int64  
 1   trans_date_trans_time  15506 non-null  object 
 2   cc_num                 15506 non-null  int64  
 3   merchant               15506 non-null  object 
 4   category               15506 non-null  object 
 5   amt                    15506 non-null  float64
 6   first                  15506 non-null  object 
 7   last                   15506 non-null  object 
 8   gender                 15506 non-null  object 
 9   street                 15506 non-null  object 
 10  city                   15506 non-null  object 
 11  state                  15506 non-null  object 
 12  zip                    15506 non-null  int64  
 13  lat                    15506 non-null  float64
 14  long                   15506 non-null  float64
 15  ci

In [None]:
# Drop unnecessary columns: 'Unnamed: 0' (looks like a row counter saved with
# the CSV), the raw timestamp string, the transaction id and the
# personal/demographic fields are not needed.
columns_to_drop = ['Unnamed: 0', 'trans_date_trans_time', 'first', 'last', 'street',
                   'city', 'state', 'job', 'dob', 'trans_num']
# errors='ignore' skips labels that are no longer present, so running this cell
# a second time does not raise KeyError. Trade-off: a misspelled label would be
# skipped silently too, so confirm the remaining columns in the info() output.
# The result is rebound to the same name rather than mutated with inplace=True.
fraud_txn_df = fraud_txn_df.drop(columns=columns_to_drop, errors='ignore')
fraud_txn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15506 entries, 0 to 15505
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   cc_num      15506 non-null  int64  
 1   merchant    15506 non-null  object 
 2   category    15506 non-null  object 
 3   amt         15506 non-null  float64
 4   gender      15506 non-null  object 
 5   zip         15506 non-null  int64  
 6   lat         15506 non-null  float64
 7   long        15506 non-null  float64
 8   city_pop    15506 non-null  int64  
 9   unix_time   15506 non-null  int64  
 10  merch_lat   15506 non-null  float64
 11  merch_long  15506 non-null  float64
 12  is_fraud    15506 non-null  int64  
dtypes: float64(5), int64(5), object(3)
memory usage: 1.5+ MB


In [17]:
# Preview the first rows of the frame after the unnecessary columns were dropped
fraud_txn_df.head()

Unnamed: 0,cc_num,merchant,category,amt,gender,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,377113842678100,"fraud_O'Connell, Botsford and Hand",home,32.65,M,8323,39.4055,-75.3209,804,1355661656,38.988831,-75.44242,0
1,502038091380,"fraud_Haley, Jewess and Bechtelar",shopping_pos,819.52,M,65680,36.6704,-93.0377,1201,1352582964,37.502359,-92.941619,1
2,30181367542315,"fraud_Tillman, Fritsch and Schmitt",misc_net,808.97,F,97753,44.2415,-121.0113,2016,1356991868,44.93175,-121.287451,1
3,376445266762684,"fraud_Mueller, Gerhold and Mueller",health_fitness,64.85,F,69165,41.1558,-101.136,1789,1365528205,41.084349,-100.922198,0
4,3500969075198072,fraud_Frami Group,entertainment,63.2,M,99033,47.2271,-117.0819,895,1356188603,46.411414,-116.657814,0


In [None]:
# Convert categorical column to numerical values - gender column
gender_codes = {'M': 0, 'F': 1}
gender_column = fraud_txn_df['gender']
# Series.map() with a dict turns every value that is not a key into NaN, so
# mapping a column that already holds 0/1 (this cell executed twice) would
# silently replace the whole column with NaN. Only map when it is still needed.
already_encoded = gender_column.isin(list(gender_codes.values())).all()
mapped_gender_series = gender_column if already_encoded else gender_column.map(gender_codes)
# Any label other than 'M'/'F' (or a missing value) shows up as NaN at this
# point; stop here with a clear message instead of passing NaN downstream.
if mapped_gender_series.isna().any():
    raise ValueError("gender column contains values other than 'M' and 'F'")
print("mapped gender values : \n ",mapped_gender_series.values)
fraud_txn_df['gender'] = mapped_gender_series
fraud_txn_df.head()

mapped gender values : 
  [0 0 1 ... 1 0 0]


Unnamed: 0,cc_num,merchant,category,amt,gender,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,377113842678100,"fraud_O'Connell, Botsford and Hand",home,32.65,0,8323,39.4055,-75.3209,804,1355661656,38.988831,-75.44242,0
1,502038091380,"fraud_Haley, Jewess and Bechtelar",shopping_pos,819.52,0,65680,36.6704,-93.0377,1201,1352582964,37.502359,-92.941619,1
2,30181367542315,"fraud_Tillman, Fritsch and Schmitt",misc_net,808.97,1,97753,44.2415,-121.0113,2016,1356991868,44.93175,-121.287451,1
3,376445266762684,"fraud_Mueller, Gerhold and Mueller",health_fitness,64.85,1,69165,41.1558,-101.136,1789,1365528205,41.084349,-100.922198,0
4,3500969075198072,fraud_Frami Group,entertainment,63.2,0,99033,47.2271,-117.0819,895,1356188603,46.411414,-116.657814,0


In [None]:
# One-hot encoding to Numerical values - Gender column