In [None]:
import numpy as np #numpy is useful for making arrays 
import pandas as pd #to make dataframes(structured table)
from sklearn.model_selection import train_test_split #to allow training data and test data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score #to check performance of model


In [None]:
# Read the credit-card transactions CSV into a pandas DataFrame
credit_card_data = pd.read_csv(
    filepath_or_buffer="../input/creditcardfraud/creditcard.csv",
)

In [None]:
# Preview the first five rows of the data set
credit_card_data.head(n=5)

In [None]:
# Preview the last five rows of the data set
credit_card_data.tail(n=5)

In [None]:
# Dataset information: print a summary of the dataframe
# (column names, non-null counts and dtypes)
credit_card_data.info()

In [None]:
# Count the missing values in each column
credit_card_data.isna().sum()

We don't have any missing values in this dataset.

In [None]:
# Number of rows per class label: legitimate vs. fraudulent transactions
credit_card_data.Class.value_counts()

This dataset is highly imbalanced.

0 --> Normal transaction
1 --> Fraudulent transaction

In [None]:
# separating the data by class for analysis
# Class == 0 rows are the normal transactions, Class == 1 rows the fraudulent ones
legit = credit_card_data.loc[credit_card_data['Class'] == 0]
fraud = credit_card_data.loc[credit_card_data['Class'] == 1]

In [None]:
# (rows, columns) of the legitimate subset, then of the fraudulent subset, one per line
print(legit.shape, fraud.shape, sep='\n')

In [None]:
# Summary statistics of the Amount column for legitimate transactions
legit['Amount'].describe()

In [None]:
# Summary statistics of the Amount column for fraudulent transactions
fraud['Amount'].describe()

The mean transaction amount is higher for fraudulent transactions than for legitimate ones.

In [None]:
# Per-class mean of every column, to compare legitimate and fraudulent transactions
credit_card_data.groupby(by='Class').mean()

In [None]:
# Dealing with imbalanced data

UNDER-SAMPLING 

Build a sample dataset containing a similar number of normal and fraudulent transactions.

Number of fraudulent transactions --> 492

In [None]:
# Under-sample the legitimate transactions down to the number of fraudulent ones.
# The sample size is derived from `fraud` instead of being hard-coded, and random_state is
# fixed so the same rows are drawn (and the same results obtained) on every run.
legit_sample = legit.sample(n=len(fraud), random_state=2)

In [None]:
# Stack the sampled legitimate rows on top of the fraud rows
# (row-wise concatenation is the default, axis=0)
new_dataset = pd.concat(objs=[legit_sample, fraud])

In [None]:
# First five rows of the combined sample
new_dataset.head(n=5)

In [None]:
# Last five rows of the combined sample
new_dataset.tail(n=5)

In [None]:
# Number of rows per class in the combined sample
new_dataset.Class.value_counts()

In [None]:
# Per-class column means of the sampled dataset. To judge whether the sample is good, compare
# these with the per-class means of the full dataset: a good sample has similar means.
new_dataset.groupby(by='Class').mean()

Splitting data into features and targets

In [None]:
# x holds every column except the label; y holds the label column
x = new_dataset.drop(columns='Class')
y = new_dataset.Class

In [None]:
# Show the feature table as the cell's last expression so the notebook renders it with its
# rich (truncated) table display instead of the plain-text dump produced by print()
x

In [None]:
# Show the label column as the cell's last expression rather than wrapping it in print()
y

Split the data into training data and test data 

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the same class
# proportions, and fix random_state so the split is the same on every run
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [None]:
''' train_test_split returns four objects: x_train and y_train hold 80% of the rows and their
corresponding labels, while x_test and y_test hold the remaining 20% and their corresponding labels '''


In [None]:
''' test_size = 0.2 means 20% of the data is used as test data;
stratify = y keeps the class proportions the same in the training set and the test set '''

In [None]:
# Shapes of the full feature table, the training split and the test split, on one line
print(*(frame.shape for frame in (x, x_train, x_test)))

Model Training

Logistic Regression

In [None]:
# Raise the iteration cap above scikit-learn's default of 100: the features are fed in
# unscaled, and the default lbfgs solver may stop before converging (ConvergenceWarning)
# when it hits that limit.
model = LogisticRegression(max_iter=1000)

In [None]:
# Fit the logistic regression model on the training features and their labels
model.fit(X=x_train, y=y_train)

Model Evaluation

Accuracy score

In [None]:
# accuracy on training data
# Predict a class label for every row of x_train
x_train_prediction = model.predict(x_train)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
# Accuracy itself is symmetric, but keeping the documented order avoids silently wrong
# results if this call is later swapped for an asymmetric metric (precision, recall, ...).
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [None]:
# Report the fraction of training rows the model classified correctly
print(
    'Accuracy on training data:',
    training_data_accuracy,
)

In [None]:
# accuracy on test data
# Predict a class label for every held-out row of x_test
x_test_prediction = model.predict(x_test)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
# Accuracy itself is symmetric, but keeping the documented order avoids silently wrong
# results if this call is later swapped for an asymmetric metric (precision, recall, ...).
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [None]:
# Report the fraction of held-out test rows the model classified correctly
print(
    'Accuracy on Test Data:',
    test_data_accuracy,
)