# **Business Understanding**

Goal: Detect whether each transaction is legitimate or fraudulent.
Objective: Build predictive models that classify transactions as legitimate or fraudulent.



In [None]:
import numpy as np
import pandas as pd
from termcolor import colored as cl # text customization


In [None]:
# Read the transactions CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
df = pd.read_csv("creditcard.csv")
df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


The dataset is heavily imbalanced in its target (`Class`): fraudulent transactions are very rare. Check the class distribution.

In [None]:
# Class balance check: Class == 0 is a normal transaction, Class == 1 is fraudulent.
Total_transactions = len(df)
# Boolean-mask sums count matching rows; rows whose Class is missing match neither mask.
normal = int((df['Class'] == 0).sum())
fraudulent = int((df['Class'] == 1).sum())
# Percentage of fraud among ALL transactions. Dividing by the normal count instead
# would give the fraud-to-normal ratio, which is not what the message below reports.
# The guard avoids a ZeroDivisionError on an empty frame.
fraud_percentage = round(fraudulent / Total_transactions * 100, 2) if Total_transactions else 0.0
print(cl('Total number of Transactions are {}'.format(Total_transactions), attrs = ['bold']))
print(cl('Number of Normal Transactions are {}'.format(normal), attrs = ['bold']))
print(cl('Number of fraudulent Transactions are {}'.format(fraudulent), attrs = ['bold']))
print(cl('Percentage of fraud Transactions is {}'.format(fraud_percentage), attrs = ['bold']))

[1mTotal number of Transactions are 81299[0m
[1mNumber of Normal Transactions are 81100[0m
[1mNumber of fraudulent Transactions are 198[0m
[1mPercentage of fraud Transactions is 0.24[0m


In [None]:
# Print column dtypes and non-null counts; a column whose non-null count is below
# the number of entries contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81299 entries, 0 to 81298
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    81299 non-null  int64  
 1   V1      81299 non-null  float64
 2   V2      81299 non-null  float64
 3   V3      81299 non-null  float64
 4   V4      81299 non-null  float64
 5   V5      81299 non-null  float64
 6   V6      81298 non-null  float64
 7   V7      81298 non-null  float64
 8   V8      81298 non-null  float64
 9   V9      81298 non-null  float64
 10  V10     81298 non-null  float64
 11  V11     81298 non-null  float64
 12  V12     81298 non-null  float64
 13  V13     81298 non-null  float64
 14  V14     81298 non-null  float64
 15  V15     81298 non-null  float64
 16  V16     81298 non-null  float64
 17  V17     81298 non-null  float64
 18  V18     81298 non-null  float64
 19  V19     81298 non-null  float64
 20  V20     81298 non-null  float64
 21  V21     81298 non-null  float64
 22

In [None]:
# Range of the transaction amounts. Series.min()/max() skip missing values and are
# vectorised; the built-in min()/max() compare element by element, so a NaN in the
# column can make their result depend on row order.
float(df['Amount'].min()), float(df['Amount'].max())

(0.0, 19656.53)

The gap between the minimum and maximum amounts is huge, which can distort our results. Use a standard scaler to bring `Amount` onto a comparable scale.

In [None]:
# Imported in this cell because the notebook's bulk sklearn import cell only runs
# further down; without it a fresh-kernel "Run All" fails here with a NameError.
from sklearn.preprocessing import StandardScaler

# Standardise Amount to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split,
# so test-set statistics influence the transform -- consider fitting on the training
# rows only.
sc = StandardScaler()
amount = df['Amount'].values
# The scaler needs a 2-D input, hence reshape(-1, 1); ravel() flattens the (n, 1)
# result back to the 1-D shape of a column.
df['Amount'] = sc.fit_transform(amount.reshape(-1, 1)).ravel()

Drop the `Time` feature.

In [None]:
# Remove the Time column. errors='ignore' keeps the cell idempotent: re-running it
# after the column is already gone is a no-op instead of a KeyError.
df = df.drop(columns=['Time'], errors='ignore')

Check for duplicate transactions. The full dataset contains 284807 transactions, but only 81299 rows are loaded here (see `df.shape` below). Remove the duplicates and observe the change in shape.

In [None]:
# (rows, columns) before removing duplicate rows.
df.shape

(81299, 30)

In [None]:
# Remove fully identical rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# (rows, columns) after removing duplicate rows.
df.shape

(79511, 30)

In [None]:
# Treat infinities as missing, then drop every incomplete row. Converting inf to NaN
# alone leaves the missing values in the frame, and the estimators fitted later
# (KNeighborsClassifier, LogisticRegression) reject NaN inputs with a ValueError.
# reset_index gives the cleaned frame a contiguous 0..n-1 index again.
df = (
    df.replace([np.inf, -np.inf], np.nan)
      .dropna()
      .reset_index(drop=True)
)


In [None]:
# (rows, columns) after the infinite-value clean-up above.
df.shape

(79511, 30)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.preprocessing import  PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import BaggingClassifier, BaggingRegressor,RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor 
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

Train & Test Split

In [None]:
# Separate the target column from the features, both as plain NumPy arrays.
y = df['Class'].to_numpy()
X = df.drop(columns=['Class']).to_numpy()

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the fraud class is very rare -- consider passing stratify=y (once y is
# free of missing values) so both partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

Modelling

K-Nearest Neighbors

In [None]:
# K-Nearest Neighbors baseline: fit on the training split, predict the held-out split.
n = 7
KNN = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)  # fit() returns the estimator itself
knn_yhat = KNN.predict(X_test)

# NOTE(review): with so few fraud cases, accuracy is dominated by the normal class;
# an F1 score or a confusion matrix would be more informative.
knn_accuracy = accuracy_score(y_test, knn_yhat)
print('Accuracy score of the K-Nearest Neighbors model is {}'.format(knn_accuracy))

ValueError: ignored

In [None]:
# classification_report is not part of the notebook's earlier sklearn import cell,
# so it is imported here; without it the print below raises a NameError.
from sklearn.metrics import classification_report

# Logistic regression baseline: fit on the training split and report per-class
# precision / recall / F1 on the held-out split.
model = LogisticRegression()
model.fit(X_train, y_train)
y_preds = model.predict(X_test)
print(classification_report(y_test, y_preds))

ValueError: ignored

AttributeError: ignored