In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [5]:
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [6]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
87235,61595,-0.788289,-0.270922,1.426067,-1.557292,0.326118,-1.139993,0.191611,0.055201,1.245247,...,0.138402,0.528514,-0.099555,0.436539,0.305689,-0.779462,0.082087,0.048889,9.95,0.0
87236,61595,1.510898,-1.237081,0.055205,-1.596874,-1.328718,-0.290028,-1.184932,0.088371,-1.669287,...,-0.238966,-0.474265,-0.059932,-0.588342,0.378212,-0.165141,0.006521,-0.00102,25.0,0.0
87237,61595,-0.903199,0.920795,1.087351,1.206105,-0.123057,0.67309,0.369554,0.678393,-0.83088,...,0.097581,0.188902,-0.061636,-0.304928,0.168362,-0.186627,-0.007386,0.033238,80.51,0.0
87238,61596,-1.136232,0.825319,0.466206,-1.296195,0.493619,-0.109541,0.571839,0.355682,-0.873281,...,0.100479,0.027285,-0.329304,-0.868148,0.200414,0.944678,-0.269153,-0.013332,51.34,0.0
87239,61596,-6.759521,5.572257,-0.951531,-3.194834,,,,,,...,,,,,,,,,,


In [7]:
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87240 entries, 0 to 87239
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    87240 non-null  int64  
 1   V1      87240 non-null  float64
 2   V2      87240 non-null  float64
 3   V3      87240 non-null  float64
 4   V4      87240 non-null  float64
 5   V5      87239 non-null  float64
 6   V6      87239 non-null  float64
 7   V7      87239 non-null  float64
 8   V8      87239 non-null  float64
 9   V9      87239 non-null  float64
 10  V10     87239 non-null  float64
 11  V11     87239 non-null  float64
 12  V12     87239 non-null  float64
 13  V13     87239 non-null  float64
 14  V14     87239 non-null  float64
 15  V15     87239 non-null  float64
 16  V16     87239 non-null  float64
 17  V17     87239 non-null  float64
 18  V18     87239 non-null  float64
 19  V19     87239 non-null  float64
 20  V20     87239 non-null  float64
 21  V21     87239 non-null  float64
 22

In [8]:
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,1
V6,1
V7,1
V8,1
V9,1


In [9]:
# Drop rows with any missing values
credit_card_data_cleaned = credit_card_data.dropna()


In [11]:
credit_card_data_cleaned = credit_card_data.dropna(subset=['V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28'])


In [12]:
credit_card_data.dropna(inplace=True)


In [14]:
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [15]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,87034
1.0,205


In [16]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [17]:
print(legit.shape)
print(fraud.shape)

(87034, 31)
(205, 31)


In [18]:
# statistical measures of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,87034.0
mean,98.392549
std,267.475633
min,0.0
25%,7.68
50%,27.0
75%,89.79
max,19656.53


In [19]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,205.0
mean,101.984439
std,228.446986
min,0.0
25%,1.0
50%,7.58
75%,99.99
max,1809.68


In [20]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,39219.680136,-0.249921,-0.05018,0.700945,0.15149,-0.267417,0.099006,-0.0946,0.047273,-0.018269,...,0.040984,-0.031313,-0.105935,-0.03658,0.009933,0.132438,0.026564,-0.000518,0.002005,98.392549
1.0,34268.526829,-6.38594,4.394241,-8.432887,5.084061,-4.597691,-1.9551,-6.705734,2.927156,-3.042642,...,0.389325,0.752293,-0.146696,-0.238003,-0.095301,0.223491,0.093696,0.560829,0.044682,101.984439


In [21]:
legit_sample = legit.sample(n=492)

In [23]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [24]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
32980,37040,1.480963,-0.98359,0.569513,-1.394963,-1.445784,-0.534671,-1.063592,-0.053352,-1.785451,...,-0.435421,-0.879569,0.253874,0.008201,0.037692,-0.448887,0.051244,0.022871,5.76,0.0
71820,54456,1.379716,0.024631,0.039978,-0.402325,0.003408,-0.192715,-0.150522,-0.147705,0.300537,...,-0.210598,-0.528729,-0.092277,-0.919666,0.346391,0.986794,-0.064657,-0.001759,3.8,0.0
21203,31534,-0.473954,0.772266,0.932299,-0.674218,1.585655,0.84773,1.045921,-0.255888,-0.388362,...,-0.061179,0.281314,-0.495129,-1.655042,0.09087,0.404699,-0.137863,-0.288349,13.0,0.0
38044,39224,-0.812919,0.628536,-0.408347,-1.062627,0.862254,-1.022522,0.533009,0.297109,-0.78805,...,0.18534,0.350516,-0.030257,-0.41101,-0.76797,0.770434,0.228444,0.113926,14.95,0.0
48654,43727,1.197258,0.174998,0.377905,0.373984,-0.035526,0.00044,-0.094563,0.075063,-0.265168,...,-0.194278,-0.540522,0.107416,-0.300933,0.173624,0.125592,-0.013868,0.003123,1.98,0.0


In [25]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
83053,59669,0.326007,1.286638,-2.007181,2.419675,-1.532902,-1.432803,-2.45953,0.617738,-1.125861,...,0.556895,0.169776,-0.174357,0.308061,0.710996,-0.23103,0.580495,0.300984,6.27,1.0
83297,59777,-8.257111,-4.814461,-5.365307,1.20423,-3.34742,-1.331601,-1.967893,1.295438,-1.674415,...,0.43639,-0.077553,-3.091624,-0.390201,-0.288689,-0.340004,0.039819,-1.0079,319.2,1.0
83417,59840,-3.215382,-0.364223,-1.261883,3.794949,0.711206,-1.316476,-5.165141,0.625278,-1.582301,...,0.401341,0.152191,-0.934675,-0.256148,-0.469403,-0.282496,0.866077,-0.433466,5.91,1.0
84543,60353,-3.975216,0.581573,-1.880372,4.319241,-3.02433,1.240793,-1.909559,0.660718,-2.752611,...,0.578984,1.397311,1.045322,-0.304,0.005295,0.235435,0.962015,-0.673557,454.82,1.0
86155,61108,-2.756007,0.683821,-1.390169,1.501887,-1.165614,-0.131207,-1.478741,-0.246922,-0.100523,...,0.320474,0.611027,0.174864,-0.502151,-0.174713,1.179242,-1.166315,0.821215,101.5,1.0


In [26]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,492
1.0,205


In [27]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,39375.270325,-0.313967,0.023472,0.62306,0.134215,-0.304168,0.053964,-0.137111,0.053333,0.006274,...,0.04842,-0.046289,-0.12415,-0.034326,0.018629,0.139012,0.039058,-0.000598,-0.014786,93.07498
1.0,34268.526829,-6.38594,4.394241,-8.432887,5.084061,-4.597691,-1.9551,-6.705734,2.927156,-3.042642,...,0.389325,0.752293,-0.146696,-0.238003,-0.095301,0.223491,0.093696,0.560829,0.044682,101.984439


In [28]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [29]:
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
32980  37040  1.480963 -0.983590  0.569513 -1.394963 -1.445784 -0.534671   
71820  54456  1.379716  0.024631  0.039978 -0.402325  0.003408 -0.192715   
21203  31534 -0.473954  0.772266  0.932299 -0.674218  1.585655  0.847730   
38044  39224 -0.812919  0.628536 -0.408347 -1.062627  0.862254 -1.022522   
48654  43727  1.197258  0.174998  0.377905  0.373984 -0.035526  0.000440   
...      ...       ...       ...       ...       ...       ...       ...   
83053  59669  0.326007  1.286638 -2.007181  2.419675 -1.532902 -1.432803   
83297  59777 -8.257111 -4.814461 -5.365307  1.204230 -3.347420 -1.331601   
83417  59840 -3.215382 -0.364223 -1.261883  3.794949  0.711206 -1.316476   
84543  60353 -3.975216  0.581573 -1.880372  4.319241 -3.024330  1.240793   
86155  61108 -2.756007  0.683821 -1.390169  1.501887 -1.165614 -0.131207   

             V7        V8        V9  ...       V20       V21       V22  \
32980 -1.0635

In [30]:
print(Y)

32980    0.0
71820    0.0
21203    0.0
38044    0.0
48654    0.0
        ... 
83053    1.0
83297    1.0
83417    1.0
84543    1.0
86155    1.0
Name: Class, Length: 697, dtype: float64


In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [32]:
print(X.shape, X_train.shape, X_test.shape)

(697, 30) (557, 30) (140, 30)


In [33]:
model = LogisticRegression()

In [36]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000000)  # Increase the number of iterations
model.fit(X_train, Y_train)


In [38]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train the logistic regression model on the scaled data
model = LogisticRegression(max_iter=1000000)
model.fit(X_train_scaled, Y_train)

In [39]:
model = LogisticRegression(solver='liblinear', max_iter=1000000)
model.fit(X_train, Y_train)


In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train the logistic regression model on the scaled data
model = LogisticRegression(max_iter=1000000)
model.fit(X_train_scaled, Y_train)

In [42]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

In [43]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [44]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9784560143626571


In [45]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [46]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.95
