# 1.) Import the Credit Card Fraud Data From CCLE

In [1]:
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import numpy as np

In [2]:
drive.mount('/content/gdrive/', force_remount = True)

Mounted at /content/gdrive/


In [3]:
df = pd.read_csv("/content/gdrive/MyDrive/ECON 441B/fraudTest.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


# 2.) Select four columns to use as features (one just be trans_date_trans)

In [5]:
df.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [6]:
df_select = df[['trans_date_trans_time', 'category', 'amt', 'city_pop', 'is_fraud']]

In [7]:
df_select.head()

Unnamed: 0,trans_date_trans_time,category,amt,city_pop,is_fraud
0,2020-06-21 12:14:25,personal_care,2.86,333497,0
1,2020-06-21 12:14:33,personal_care,29.84,302,0
2,2020-06-21 12:14:53,health_fitness,41.28,34496,0
3,2020-06-21 12:15:15,misc_pos,60.05,54767,0
4,2020-06-21 12:15:17,travel,3.19,1126,0


# 3.) Create a unique variable out of trans_date.

In [8]:
type(df['trans_date_trans_time'][0])

str

In [9]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

In [10]:
df_select["time_var"] = [i.hour for i in df["trans_date_trans_time"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select["time_var"] = [i.hour for i in df["trans_date_trans_time"]]


In [11]:
df_select["evening"] = np.where(df_select["time_var"] > 18, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select["evening"] = np.where(df_select["time_var"] > 18, 1, 0)


In [12]:
df_select["morning"] = np.where(df_select["time_var"]< 5, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select["morning"] = np.where(df_select["time_var"]< 5, 1, 0)


In [13]:
df_select["day"] = [1 if 5< i <=18 else 0 for i in df_select["time_var"] ]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select["day"] = [1 if 5< i <=18 else 0 for i in df_select["time_var"] ]


In [14]:
dummies = pd.get_dummies(df_select["category"])
X = pd.concat([dummies, df_select[["amt", "city_pop", "time_var", "evening",	"morning",	"day"]]], axis = 1)
y = df_select["is_fraud"]

In [15]:
X.head()

Unnamed: 0,entertainment,food_dining,gas_transport,grocery_net,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,personal_care,shopping_net,shopping_pos,travel,amt,city_pop,time_var,evening,morning,day
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2.86,333497,12,0,0,1
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,29.84,302,12,0,0,1
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,41.28,34496,12,0,0,1
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,60.05,54767,12,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3.19,1126,12,0,0,1


In [16]:
y

0         0
1         0
2         0
3         0
4         0
         ..
555714    0
555715    0
555716    0
555717    0
555718    0
Name: is_fraud, Length: 555719, dtype: int64

# 4.) Oversample the data (this will be your training data).

# 5.) Train a Logistic regression.

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
split_size = int(len(X) * 0.8)

In [19]:
x_train, x_test = X[:split_size], X[split_size:]
y_train, y_test = y[:split_size], y[split_size:]

In [20]:
x_train

Unnamed: 0,entertainment,food_dining,gas_transport,grocery_net,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,personal_care,shopping_net,shopping_pos,travel,amt,city_pop,time_var,evening,morning,day
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2.86,333497,12,0,0,1
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,29.84,302,12,0,0,1
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,41.28,34496,12,0,0,1
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,60.05,54767,12,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3.19,1126,12,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444570,0,1,0,0,0,0,0,0,0,0,0,0,0,0,13.12,9521,13,0,0,1
444571,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3.08,365,13,0,0,1
444572,0,0,0,0,0,0,0,1,0,0,0,0,0,0,35.61,1423,13,0,0,1
444573,0,1,0,0,0,0,0,0,0,0,0,0,0,0,28.86,370,13,0,0,1


In [21]:
log_reg = LogisticRegression().fit(x_train,y_train)

# 6.) The company you are working for wants to target at a False Positive rate of 5% what threshold should you use? (Use oversampled data)

In [22]:
from sklearn.metrics import confusion_matrix

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Train your model
clf = LogisticRegression()
clf.fit(x_train, y_train)

# Predict probabilities for test set
probs = clf.predict_proba(x_test)

# Find the threshold that gives a False Positive rate of 5%
threshold = 0
for t in np.arange(0, 1, 0.01):
    preds = probs[:,1] > t
    cm = confusion_matrix(y_test, preds)
    fpr = cm[0,1] / (cm[0,0] + cm[0,1])
    if fpr <= 0.05:
        threshold = t
        break

In [24]:
threshold

0.01

# 7.) If the company makes .02*amt on True transactions and loses -amt on False (Use original data)

In [25]:
preds = probs[:,1] > 0.01

In [26]:
x_test

Unnamed: 0,entertainment,food_dining,gas_transport,grocery_net,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,personal_care,shopping_net,shopping_pos,travel,amt,city_pop,time_var,evening,morning,day
444575,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2.01,1190,13,0,0,1
444576,0,1,0,0,0,0,0,0,0,0,0,0,0,0,163.80,1563,13,0,0,1
444577,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2.33,1228,13,0,0,1
444578,0,0,0,0,0,0,0,0,1,0,0,0,0,0,163.97,564,13,0,0,1
444579,0,0,0,0,0,0,0,0,0,0,0,0,0,1,9.08,6284,13,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,0,0,0,0,0,1,0,0,0,0,0,0,0,0,43.77,519,23,1,0,0
555715,0,0,0,0,0,0,0,1,0,0,0,0,0,0,111.84,28739,23,1,0,0
555716,0,0,0,0,0,0,0,1,0,0,0,0,0,0,86.88,3684,23,1,0,0
555717,0,0,0,0,0,0,0,0,0,0,0,0,0,1,7.99,129,23,1,0,0


In [27]:
x_test["Preds"] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test["Preds"] = preds


In [28]:
x_test["signal"] = x_test["Preds"] * y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test["signal"] = x_test["Preds"] * y_test


In [29]:
df_compare = pd.concat([x_test, y_test], axis = 1)

In [30]:
df_compare["Preds"] = [0 if i == False else 1 for i in df_compare["Preds"]]

In [31]:
df_compare["FN"] = (df_compare["is_fraud"] == 1) & (df_compare["Preds"] == 0)

In [32]:
df_compare

Unnamed: 0,entertainment,food_dining,gas_transport,grocery_net,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,...,amt,city_pop,time_var,evening,morning,day,Preds,signal,is_fraud,FN
444575,0,0,0,0,0,0,1,0,0,0,...,2.01,1190,13,0,0,1,0,0,0,False
444576,0,1,0,0,0,0,0,0,0,0,...,163.80,1563,13,0,0,1,0,0,0,False
444577,0,0,0,0,0,0,0,0,0,0,...,2.33,1228,13,0,0,1,0,0,0,False
444578,0,0,0,0,0,0,0,0,1,0,...,163.97,564,13,0,0,1,0,0,0,False
444579,0,0,0,0,0,0,0,0,0,0,...,9.08,6284,13,0,0,1,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,0,0,0,0,0,1,0,0,0,0,...,43.77,519,23,1,0,0,0,0,0,False
555715,0,0,0,0,0,0,0,1,0,0,...,111.84,28739,23,1,0,0,0,0,0,False
555716,0,0,0,0,0,0,0,1,0,0,...,86.88,3684,23,1,0,0,0,0,0,False
555717,0,0,0,0,0,0,0,0,0,0,...,7.99,129,23,1,0,0,0,0,0,False


In [33]:
df_compare["profit"] = df_compare["signal"] * 0.02 * df_compare["amt"] + df_compare["FN"] * -df_compare["amt"]

In [34]:
df_compare["profit"].sum()

-22745.207799999996

In [35]:
# x1 * sum(TP) + x2 * sum(fp) + x3* sum(FN) + x4 * sum(TN)

# 8.) Using Logistic Regression Lasso to inform you. Would you use the selected features in a trusted prediction model?

In [36]:
model = LogisticRegression(solver = "liblinear", penalty='l1').fit(x_train, y_train)

In [37]:
model.coef_

array([[-1.84296001e+00, -1.97053645e+00,  2.26977821e-01,
         7.49731540e-02,  1.27540828e+00, -2.03790157e+00,
        -2.27472300e+00, -2.10652645e+00,  6.48830009e-01,
        -1.11810691e+00, -1.76610566e+00,  2.52883193e-02,
        -1.15797203e+00, -5.78100676e+00,  2.40187847e-03,
        -7.10687359e-07,  2.30286240e-01, -7.66226115e-02,
         2.58022925e+00, -1.54915662e+00]])

In [38]:
# see if the variables all got to 0 -> input data is terrible

In [39]:
#As we can see from the coefficients, they are not very close to 0 
#so we would say that the selected features could be used in a trusted prediction model.