## Exploring the data

In [849]:
import numpy as np
import pandas as pd

In [850]:
df = pd.read_csv("/content/Creditcard_data.csv")

In [851]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [852]:
df.shape

(772, 31)

In [853]:
df.Time.value_counts()

145    5
427    5
284    5
190    5
140    5
      ..
304    1
305    1
308    1
318    1
581    1
Name: Time, Length: 415, dtype: int64

In [854]:
df.Class.value_counts(normalize=True)

0    0.988342
1    0.011658
Name: Class, dtype: float64

In [855]:
from sklearn.model_selection import train_test_split

In [856]:
inputs = df.drop('Class',axis=1)
targets = df['Class']

In [857]:
inputs.shape

(772, 30)

In [858]:
targets.shape

(772,)

In [859]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs,targets, test_size=0.3,random_state=42)

In [860]:
train_inputs.shape

(540, 30)

In [861]:
val_inputs.shape

(232, 30)

In [862]:
train_targets.shape

(540,)

In [863]:
val_targets.shape

(232,)

# Checking with linear relationsip

### Applying Logistic Regression without any sampling technique used


In [864]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [865]:
model1 = LogisticRegression(class_weight='balanced',random_state=11)

In [866]:
model1.fit(train_inputs,train_targets)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight='balanced', random_state=11)

In [867]:
val_preds = model1.predict(val_inputs)

In [868]:
accuracy_score(val_preds,val_targets)

0.9094827586206896

### Applying Logistic Regression Model with default parameters

In [869]:
model2 = LogisticRegression(random_state=11)

In [870]:
model2.fit(train_inputs,train_targets)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=11)

In [871]:
val_preds2 = model2.predict(val_inputs)

In [872]:
accuracy_score(val_preds2,val_targets)

0.9827586206896551

# Checking with non-linear relationship

## Tree based classifier

In [873]:
from sklearn.ensemble import ExtraTreesClassifier

In [874]:
model3 = ExtraTreesClassifier(random_state=11)

In [875]:
model3.fit(train_inputs,train_targets)

ExtraTreesClassifier(random_state=11)

In [876]:
val_preds3 = model3.predict(val_inputs)

In [877]:
accuracy_score(val_preds3,val_targets)

0.9870689655172413

**Tree based model worked slightly better because it can classify non-linear relationship better**


# Now applying sampling techniques

## Random Under Sampling
#### Removing data from the class with more value counts

In [878]:
train_targets.value_counts()

0    534
1      6
Name: Class, dtype: int64

In [879]:
val_targets.value_counts()

0    229
1      3
Name: Class, dtype: int64

In [880]:
class_0_inputs = train_inputs[train_targets == 0]
class_1_inputs = train_inputs[train_targets == 1]

class_0_targets = train_targets[train_targets == 0]
class_1_targets = train_targets[train_targets == 1]

class_0 = pd.concat([class_0_inputs,class_0_targets],axis=1)
class_1 = pd.concat([class_1_inputs,class_1_targets],axis=1)

In [881]:
class_0

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
333,244,-1.118946,-0.071366,2.807769,1.025675,-0.100748,0.508680,0.620313,-0.213137,0.333039,...,-0.045782,0.455553,0.170942,0.076211,0.197637,-0.286674,-0.230530,-0.405084,100.37,0
306,221,-0.342871,-0.199546,1.976353,-0.003495,-1.170366,0.883501,-0.151879,0.160106,0.137973,...,-0.313443,0.086207,0.109600,-0.098951,-0.943009,-0.618657,0.253306,0.240271,99.82,0
507,375,-0.837689,0.777698,1.841252,3.056892,0.303627,0.615335,0.531504,-0.081955,-0.522527,...,-0.070069,0.556788,0.217681,0.100721,-0.332479,0.252526,0.138865,-0.085152,29.18,0
338,248,-0.216867,0.900896,1.502850,0.812492,0.193952,-0.031488,0.490795,0.120991,-0.907336,...,0.147497,0.463470,-0.045124,0.224126,-0.275402,-0.415339,0.108635,0.052981,10.00,0
139,85,-4.257597,1.649489,-1.591242,-1.374317,-1.180136,-0.331040,-0.117992,1.592032,0.415996,...,-0.452568,-0.671703,0.001422,-0.876377,0.070098,0.729582,-0.002942,-0.481497,124.67,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,46,-0.378245,0.732925,-0.120154,0.185755,2.594269,3.797183,0.059088,0.976768,-0.412661,...,-0.107582,-0.157140,-0.194659,1.013897,0.145503,-0.237620,0.411372,0.202788,11.45,0
106,70,-0.426072,-0.060304,2.220828,0.024742,-0.584964,0.460623,-0.322526,0.434776,1.252404,...,0.149485,0.769878,-0.092634,0.150536,-0.234230,0.504710,0.069158,0.041024,21.80,0
270,190,-0.549414,0.676861,2.151950,1.014523,-0.620012,0.076154,0.041578,0.342672,0.124723,...,0.212024,0.850203,-0.185597,0.544990,-0.130609,-0.196374,0.422119,0.203313,20.70,0
435,313,-0.907420,1.103912,1.288489,1.243612,-0.068032,0.214040,0.324000,0.436037,-0.437409,...,0.022520,0.399523,-0.049081,0.220258,-0.162924,-0.286994,0.015071,-0.104668,15.08,0


In [882]:
len(class_0_inputs)

534

In [883]:
len(class_1_inputs)

6

In [884]:
class_0_under = class_0.sample(len(class_1),random_state=11)
len(class_0_under)

6

In [885]:
class_0_under_inputs = class_0_under.drop('Class',axis=1)
class_0_under_targets = class_0_under['Class']

In [886]:
class_under_sampled_inputs = pd.concat([class_0_under_inputs,class_1_inputs],axis=0)
class_under_sampled_targets = pd.concat([class_0_under_targets,class_1_targets],axis=0)

In [887]:
len(class_under_sampled_inputs)
len(class_under_sampled_targets)

12

#### Applying Tree Based Classifier

In [888]:
model = ExtraTreesClassifier(random_state=11)

In [889]:
model.fit(class_under_sampled_inputs,class_under_sampled_targets)

ExtraTreesClassifier(random_state=11)

In [890]:
val_preds = model.predict(val_inputs)

In [891]:
accuracy_score(val_preds,val_targets)

0.7155172413793104

**Under Sampling have performed really bad. This happened because, when we are doing under sampling there is loss of valuable data from the majority class. Also, the data formed after performing Random Under Sampling is too small to make accurate predictions.**

## Random Over Sampling

#### Adding data to the class with lower value counts

In [892]:
class_1.Class.value_counts()

1    6
Name: Class, dtype: int64

In [893]:
class_1_over = class_1.sample(len(class_0),replace=True)
len(class_1_over)

534

In [894]:
class_1_over_inputs = class_1_over.drop(['Class'],axis=1)
class_1_over_targets = class_1_over['Class']

In [895]:
class_over_sampled_inputs = pd.concat([class_1_over_inputs,class_0_inputs],axis=0)
class_over_sampled_targets = pd.concat([class_1_over_targets,class_0_targets],axis=0)

In [896]:
len(class_over_sampled_inputs)

1068

In [897]:
len(class_over_sampled_targets)

1068

In [898]:
class_over_sampled_targets.value_counts()

1    534
0    534
Name: Class, dtype: int64

#### Applying Tree Based Classifier

In [899]:
model.fit(class_over_sampled_inputs,class_over_sampled_targets)

ExtraTreesClassifier(random_state=11)

In [900]:
val_preds = model.predict(val_inputs)

In [901]:
accuracy_score(val_preds,val_targets)

0.9870689655172413

In [902]:
import xgboost as xgb

In [903]:
modelXGB = xgb.XGBClassifier()

In [904]:
modelXGB.fit(class_over_sampled_inputs,class_over_sampled_targets)

XGBClassifier()

In [905]:
val_preds = modelXGB.predict(val_inputs)

In [906]:
accuracy_score(val_preds,val_targets)

0.9655172413793104

#### Above is the self built implementation of sampling

#### Below lets use the inbuilt resampling

#### Random Over Sampling using inbuilt class

In [907]:
from imblearn.over_sampling import RandomOverSampler

In [908]:
ros = RandomOverSampler(random_state=42)

In [909]:
resampled_inputs, resampled_targets = ros.fit_resample(inputs,targets)

In [910]:
resampled_targets.value_counts()

0    763
1    763
Name: Class, dtype: int64

In [911]:
modelXGB.fit(resampled_inputs,resampled_targets)

XGBClassifier()

In [912]:
val_preds = modelXGB.predict(val_inputs)

In [913]:
accuracy_score(val_preds,val_targets)

1.0

In [914]:
modelETC = ExtraTreesClassifier()

In [915]:
modelETC.fit(resampled_inputs,resampled_targets)

ExtraTreesClassifier()

In [916]:
val_preds = modelETC.predict(val_inputs)

In [917]:
accuracy_score(val_preds,val_targets)

1.0

In [918]:
modelLR = LogisticRegression()

In [919]:
modelLR.fit(resampled_inputs,resampled_targets)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [920]:
val_preds = modelLR.predict(val_inputs)

In [921]:
accuracy_score(val_preds,val_targets)

0.8362068965517241

#### Here we can clearly see that after performing Random Over Sampling and then applying tree based classifier for classification. The accuracy score is coming to be exactly 1.
#### After performing resampling also Tree Based Classifiers are performing better then the linear classifier logistic regression

#### Now dataset is properly balanced as it gives good result with the classifier. Now, i will use resampled_inputs as my new input and resampled_targets as my new targets 

### **Till here i have properly balanced the data**
##### **Total rows are 1526 after balancing having 763 rows for each class** 

# Creating Five Samples of my new input data using Sample Size Detection Formula for Simple Random Sampling

In [922]:
resampled_inputs.shape

(1526, 30)

In [923]:
resampled_targets.shape

(1526,)

In [924]:
resampled = pd.concat([resampled_inputs,resampled_targets],axis=1)

In [925]:
resampled.Class.value_counts()

0    763
1    763
Name: Class, dtype: int64

In [926]:

## Assuming z-score = 1.96 (for 95% confidence)
## Assuming p = 0.5
## Taking margin of error = 5%
## These are the default parameters and can be changed according to the input data

#Size detection for Simple Random Sampling
def sample_size_detect_simple(zs=1.96,p=0.5,e=0.05):
  q = 1-p
  return ((zs**2)*p*q)/(e**2)

#Size Detection for Stratified Sampling
def sample_size_detect_stratified(zs=1.96,p=0.5,e=0.05,no_class=2):
  q = 1-p
  return ((zs**2)*p*q)/((e/no_class)**2)

#Size Detection for Cluster Sampling 
def sample_size_detect_cluster(c,zs=1.96,p=0.5,e=0.05):
  q = 1-p
  return ((zs**2)*p*q)/((e/c)**2)


In [927]:
import math

In [928]:
n1 = sample_size_detect_simple()
n1 = math.ceil(n1)

n2 = sample_size_detect_stratified()
n2 = math.ceil(n2)

In [929]:
## Simple Random Sampling
sample1 = resampled.sample(n=n1,replace=True,random_state=1)

In [930]:
## Stratified Sampling
resampled_0 = resampled[resampled.Class == 0]
resampled_1 = resampled[resampled.Class == 1]

##as both classes have same no. of elements the proportion for stratified sampling will be half 
resampled_0_stratified = resampled_0.sample(n=n2//2,replace=True,random_state=2)
resampled_1_stratified = resampled_1.sample(n=n2//2,replace=True,random_state=3)

sample2 = pd.concat([resampled_0_stratified,resampled_1_stratified],axis=0)


In [931]:
## Cluster Sampling

from sklearn.cluster import KMeans
k = 10
kmeans = KMeans(n_clusters=15)
kmeans.fit(resampled)

print(kmeans.inertia_)
resampled['labels'] = kmeans.labels_
resampled

2175341.135630363


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class,labels
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0,12
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1,2
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0,7
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0,12
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521,529,-2.000567,-2.495484,2.467149,1.140053,2.462010,0.594262,-2.110183,0.788347,0.958809,...,1.195394,0.297836,-0.857105,-0.219322,0.861019,-0.124622,-0.171060,1.50,1,3
1522,164,0.073497,0.551033,0.451890,0.114964,0.822947,0.251480,0.296319,0.139497,-0.123050,...,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.231750,0.230171,0.99,1,5
1523,164,0.073497,0.551033,0.451890,0.114964,0.822947,0.251480,0.296319,0.139497,-0.123050,...,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.231750,0.230171,0.99,1,5
1524,539,-1.738582,0.052740,1.187057,-0.656652,0.920623,-0.291788,0.269083,0.140631,0.023464,...,-0.192036,-0.261879,-0.237477,-0.335040,0.240323,-0.345129,-0.383563,1.00,1,3


In [932]:
size=0
clusters = []
for i in range(0,k):
  print(i," ", len(resampled[resampled['labels'] == i]))
  size = size + len(resampled[resampled['labels'] == i])
  clusters.append(resampled[resampled['labels'] == i])

avg_size = size//k
print(avg_size)

0   183
1   102
2   177
3   320
4   1
5   180
6   6
7   13
8   127
9   162
127


In [933]:
import random
random_cluster = random.sample(clusters,k//2)

sample3 = pd.DataFrame()
for sam in random_cluster:
  sample3 = pd.concat([sample3,sam],axis=0)

sample3.Class.value_counts()

1    366
0    282
Name: Class, dtype: int64

In [934]:
sample3 = sample3.sample(n=n2,replace=True,random_state=20)

In [935]:
sample3.Class.value_counts()

1    838
0    699
Name: Class, dtype: int64

In [936]:
resampled.drop('labels',axis=1,inplace=True)
resampled

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521,529,-2.000567,-2.495484,2.467149,1.140053,2.462010,0.594262,-2.110183,0.788347,0.958809,...,0.422452,1.195394,0.297836,-0.857105,-0.219322,0.861019,-0.124622,-0.171060,1.50,1
1522,164,0.073497,0.551033,0.451890,0.114964,0.822947,0.251480,0.296319,0.139497,-0.123050,...,-0.128758,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.231750,0.230171,0.99,1
1523,164,0.073497,0.551033,0.451890,0.114964,0.822947,0.251480,0.296319,0.139497,-0.123050,...,-0.128758,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.231750,0.230171,0.99,1
1524,539,-1.738582,0.052740,1.187057,-0.656652,0.920623,-0.291788,0.269083,0.140631,0.023464,...,-0.179545,-0.192036,-0.261879,-0.237477,-0.335040,0.240323,-0.345129,-0.383563,1.00,1


In [937]:
len(resampled)

1526

In [938]:
## Stratified Sampling
shuffled_resampled = resampled.sample(frac=1.0, random_state=100)
n = len(shuffled_resampled)
print(n)
k = int(math.sqrt(n))
print(k)
sample4 = shuffled_resampled.iloc[::k]
sample4.Class.value_counts()

1526
39


0    25
1    15
Name: Class, dtype: int64

In [939]:
## Stratifies systematic sampling
### Here i am going to divide my dataset into 2 stratas according to the 'Class' label and then i am going to apply systematic sampling on
### each strata and after applying applying systematic sampling to each strata i am going to create a sample of size n2(size calculated
### above for stratified sampling) 
#### This technique is a good option for selecting samples as it ensures that each subgroup of the population is represented in the sample
#### with randomness. Randomness decreases biasness, increases diversity, improves generalisation

resampled_0 = resampled[resampled.Class == 0]
resampled_1 = resampled[resampled.Class == 1]

n = len(resampled_0)
k = int(math.sqrt(n))
a = resampled_0.iloc[::k]

n_ = len(resampled_1)
k_ = int(math.sqrt(n_))
b = resampled_1.iloc[::k_]

ab = pd.concat([a,b],axis=0)

sample5 = ab.sample(n=n2,replace=True,random_state=60)
sample5.Class.value_counts()


0    789
1    748
Name: Class, dtype: int64

In [940]:
sample1_inputs = sample1.drop('Class',axis=1)
sample1_targets = sample1['Class']

sample2_inputs = sample2.drop('Class',axis=1)
sample2_targets = sample2['Class']

sample3_inputs = sample3.drop('Class',axis=1)
sample3_targets = sample3['Class']
sample3_inputs.drop('labels',axis=1,inplace=True)

sample4_inputs = sample4.drop('Class',axis=1)
sample4_targets = sample4['Class']

sample5_inputs = sample5.drop('Class',axis=1)
sample5_targets = sample5['Class']

# Applying different ML Model on the different samples created

##### The Five ML Models that i am going to apply on each sample are:
##### 1) Logistic Regression
##### 2) Random Forest Classifier
##### 3) XGBoost Classifier
##### 4) ExtraTreesClassifier
##### 5) KNN

In [941]:
# importing the necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [942]:
model1 = LogisticRegression()
model2 = ExtraTreesClassifier()
model3 = RandomForestClassifier()
model4 = xgb.XGBClassifier()
model5 = KNeighborsClassifier(n_neighbors=5)

## Sample 1

In [943]:
model1.fit(sample1_inputs,sample1_targets)
model2.fit(sample1_inputs,sample1_targets)
model3.fit(sample1_inputs,sample1_targets)
model4.fit(sample1_inputs,sample1_targets)
model5.fit(sample1_inputs,sample1_targets)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNeighborsClassifier()

In [944]:
val_preds1 = model1.predict(val_inputs)
val_preds2 = model2.predict(val_inputs)
val_preds3 = model3.predict(val_inputs)
val_preds4 = model4.predict(val_inputs)
val_preds5 = model5.predict(val_inputs)

In [945]:
as1 = accuracy_score(val_preds1,val_targets) * 100
as2 = accuracy_score(val_preds2,val_targets) * 100
as3 = accuracy_score(val_preds3,val_targets) * 100
as4 = accuracy_score(val_preds4,val_targets) * 100
as5 = accuracy_score(val_preds5,val_targets) * 100

In [946]:
print("Accuracy Score for model 1 : ",as1)
print("Accuracy Score for model 2 : ",as2)
print("Accuracy Score for model 3 : ",as3)
print("Accuracy Score for model 4 : ",as4)
print("Accuracy Score for model 5 : ",as4)

Accuracy Score for model 1 :  79.74137931034483
Accuracy Score for model 2 :  99.56896551724138
Accuracy Score for model 3 :  99.13793103448276
Accuracy Score for model 4 :  95.6896551724138
Accuracy Score for model 5 :  95.6896551724138


In [947]:
acc_sample1 = pd.Series([as1,as2,as3,as4,as5])
print(acc_sample1)

0    79.741379
1    99.568966
2    99.137931
3    95.689655
4    92.672414
dtype: float64


## Sample 2

In [948]:
model1.fit(sample2_inputs,sample2_targets)
model2.fit(sample2_inputs,sample2_targets)
model3.fit(sample2_inputs,sample2_targets)
model4.fit(sample2_inputs,sample2_targets)
model5.fit(sample2_inputs,sample2_targets)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNeighborsClassifier()

In [949]:
val_preds1 = model1.predict(val_inputs)
val_preds2 = model2.predict(val_inputs)
val_preds3 = model3.predict(val_inputs)
val_preds4 = model4.predict(val_inputs)
val_preds5 = model5.predict(val_inputs)

In [950]:
as1 = accuracy_score(val_preds1,val_targets) * 100
as2 = accuracy_score(val_preds2,val_targets) * 100
as3 = accuracy_score(val_preds3,val_targets) * 100
as4 = accuracy_score(val_preds4,val_targets) * 100
as5 = accuracy_score(val_preds5,val_targets) * 100

In [951]:
print("Accuracy Score for model 1 : ",as1)
print("Accuracy Score for model 2 : ",as2)
print("Accuracy Score for model 3 : ",as3)
print("Accuracy Score for model 4 : ",as4)
print("Accuracy Score for model 5 : ",as4)

Accuracy Score for model 1 :  84.91379310344827
Accuracy Score for model 2 :  100.0
Accuracy Score for model 3 :  100.0
Accuracy Score for model 4 :  99.13793103448276
Accuracy Score for model 5 :  99.13793103448276


In [952]:
acc_sample2 = pd.Series([as1,as2,as3,as4,as5])
print(acc_sample2)

0     84.913793
1    100.000000
2    100.000000
3     99.137931
4     99.568966
dtype: float64


## Sample 3

In [953]:
model1.fit(sample3_inputs,sample3_targets)
model2.fit(sample3_inputs,sample3_targets)
model3.fit(sample3_inputs,sample3_targets)
model4.fit(sample3_inputs,sample3_targets)
model5.fit(sample3_inputs,sample3_targets)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNeighborsClassifier()

In [954]:
val_preds1 = model1.predict(val_inputs)
val_preds2 = model2.predict(val_inputs)
val_preds3 = model3.predict(val_inputs)
val_preds4 = model4.predict(val_inputs)
val_preds5 = model5.predict(val_inputs)

In [955]:
as1 = accuracy_score(val_preds1,val_targets) * 100
as2 = accuracy_score(val_preds2,val_targets) * 100
as3 = accuracy_score(val_preds3,val_targets) * 100
as4 = accuracy_score(val_preds4,val_targets) * 100
as5 = accuracy_score(val_preds5,val_targets) * 100

In [956]:
print("Accuracy Score for model 1 : ",as1)
print("Accuracy Score for model 2 : ",as2)
print("Accuracy Score for model 3 : ",as3)
print("Accuracy Score for model 4 : ",as4)
print("Accuracy Score for model 5 : ",as4)

Accuracy Score for model 1 :  93.53448275862068
Accuracy Score for model 2 :  99.56896551724138
Accuracy Score for model 3 :  99.56896551724138
Accuracy Score for model 4 :  99.56896551724138
Accuracy Score for model 5 :  99.56896551724138


In [957]:
acc_sample3 = pd.Series([as1,as2,as3,as4,as5])
print(acc_sample3)

0    93.534483
1    99.568966
2    99.568966
3    99.568966
4    99.568966
dtype: float64


## Sample 4

In [958]:
model1.fit(sample4_inputs,sample4_targets)
model2.fit(sample4_inputs,sample4_targets)
model3.fit(sample4_inputs,sample4_targets)
model4.fit(sample4_inputs,sample4_targets)
model5.fit(sample4_inputs,sample4_targets)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNeighborsClassifier()

In [959]:
val_preds1 = model1.predict(val_inputs)
val_preds2 = model2.predict(val_inputs)
val_preds3 = model3.predict(val_inputs)
val_preds4 = model4.predict(val_inputs)
val_preds5 = model5.predict(val_inputs)

In [960]:
as1 = accuracy_score(val_preds1,val_targets) * 100
as2 = accuracy_score(val_preds2,val_targets) * 100
as3 = accuracy_score(val_preds3,val_targets) * 100
as4 = accuracy_score(val_preds4,val_targets) * 100
as5 = accuracy_score(val_preds5,val_targets) * 100

In [961]:
print("Accuracy Score for model 1 : ",as1)
print("Accuracy Score for model 2 : ",as2)
print("Accuracy Score for model 3 : ",as3)
print("Accuracy Score for model 4 : ",as4)
print("Accuracy Score for model 5 : ",as4)

Accuracy Score for model 1 :  64.65517241379311
Accuracy Score for model 2 :  93.10344827586206
Accuracy Score for model 3 :  89.65517241379311
Accuracy Score for model 4 :  87.06896551724138
Accuracy Score for model 5 :  87.06896551724138


In [962]:
acc_sample4 = pd.Series([as1,as2,as3,as4,as5])
print(acc_sample4)

0    64.655172
1    93.103448
2    89.655172
3    87.068966
4    68.965517
dtype: float64


## Sample 5

In [963]:
model1.fit(sample5_inputs,sample5_targets)
model2.fit(sample5_inputs,sample5_targets)
model3.fit(sample5_inputs,sample5_targets)
model4.fit(sample5_inputs,sample5_targets)
model5.fit(sample5_inputs,sample5_targets)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNeighborsClassifier()

In [964]:
val_preds1 = model1.predict(val_inputs)
val_preds2 = model2.predict(val_inputs)
val_preds3 = model3.predict(val_inputs)
val_preds4 = model4.predict(val_inputs)
val_preds5 = model5.predict(val_inputs)

In [965]:
as1 = accuracy_score(val_preds1,val_targets) * 100
as2 = accuracy_score(val_preds2,val_targets) * 100
as3 = accuracy_score(val_preds3,val_targets) * 100
as4 = accuracy_score(val_preds4,val_targets) * 100
as5 = accuracy_score(val_preds5,val_targets) * 100

In [966]:
print("Accuracy Score for model 1 : ",as1)
print("Accuracy Score for model 2 : ",as2)
print("Accuracy Score for model 3 : ",as3)
print("Accuracy Score for model 4 : ",as4)
print("Accuracy Score for model 5 : ",as4)

Accuracy Score for model 1 :  80.60344827586206
Accuracy Score for model 2 :  95.6896551724138
Accuracy Score for model 3 :  95.25862068965517
Accuracy Score for model 4 :  82.32758620689656
Accuracy Score for model 5 :  82.32758620689656


In [967]:
acc_sample5 = pd.Series([as1,as2,as3,as4,as5])
print(acc_sample5)

0    80.603448
1    95.689655
2    95.258621
3    82.327586
4    87.068966
dtype: float64


## Final Results

In [968]:
final_results = pd.concat([acc_sample1,acc_sample2,acc_sample3,acc_sample4,acc_sample5],axis=1)

In [969]:
final_results.columns = ['Sample 1','Sample 2','Sample 3','Sample 4','Sample 5']
final_results.index = ['Model 1','Model 2','Model 3','Model 4','Model 5']

In [970]:
final_results

Unnamed: 0,Sample 1,Sample 2,Sample 3,Sample 4,Sample 5
Model 1,79.741379,84.913793,93.534483,64.655172,80.603448
Model 2,99.568966,100.0,99.568966,93.103448,95.689655
Model 3,99.137931,100.0,99.568966,89.655172,95.258621
Model 4,95.689655,99.137931,99.568966,87.068966,82.327586
Model 5,92.672414,99.568966,99.568966,68.965517,87.068966


##### Sample 1 gives best result when Model 2(Extra Tree Classifier) was applied.
##### Sample 2 gives best result when Model 2(Extra Tree Classifier) and Model 3(Random Forest Classifier) were applied.
##### Sample 3 gives best result when Model 2(Extra Tree Classifier), Model 3(Random Forest Classifier), Model 4(XG Boost), Model 5(KNN) was applied. 	
##### Sample 4 gives best result when Model 2(Extra Tree Classifier).
##### Sample 5 gives best result when Model 3(Random Forest Classifier) was applied.


## Result
#### Highest accuracy is coming for 3 cases:
#### 1) When Model 2(Extra Tree Classifier) is applied on the Sample 1.
#### 2) When Model 2(Extra Tree Classifier) is applied on the Sample 2.
#### 3) When Model 3(Random Forest Classifier) is applied on the Sample 2.