## Importing Libraries

In [1]:
## libraries for reading and manipulating data
import pandas as pd
import numpy as np

## libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

## libraries for splitting data
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

## metrics for evaluating model performance
from sklearn.metrics import (f1_score, accuracy_score, recall_score, precision_score,
                            confusion_matrix, roc_auc_score, confusion_matrix)

## library for data scaling
from sklearn.preprocessing import StandardScaler

## library for imputing missing values
from sklearn.impute import SimpleImputer

## library for hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

## creating pipelines
from sklearn.pipeline import Pipeline

## model building 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ( AdaBoostClassifier, GradientBoostingClassifier, 
                              RandomForestClassifier, BaggingClassifier
)
from xgboost import XGBClassifier

## suppress warnings
## NOTE: no category or module filter is given, so every warning raised after this
## call is silenced for the rest of the session (deprecation notices included)
import warnings
warnings.filterwarnings("ignore")

## Loading Data

In [2]:
## loading the train and test datasets with pandas
## the paths are relative, so they resolve against the current working directory
## NOTE(review): the doubled ".csv.csv" extension looks unintentional -- confirm against the files on disk
train_generator_data = pd.read_csv("../dataset/Train.csv.csv")
test_generator_data = pd.read_csv("../dataset/Test.csv.csv")

In [3]:
## work on copies so the frames loaded from disk are kept as-is
train_df, test_df = (
    train_generator_data.copy(),
    test_generator_data.copy(),
)

## Data Overview

In [5]:
## preview the first five rows of the train data
train_df.head(n=5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V32,V33,V34,V35,V36,V37,V38,V39,V40,Target
0,-4.464606,-4.679129,3.101546,0.50613,-0.221083,-2.032511,-2.91087,0.050714,-1.522351,3.761892,...,3.0597,-1.69044,2.846296,2.235198,6.667486,0.443809,-2.369169,2.950578,-3.480324,0
1,3.365912,3.653381,0.909671,-1.367528,0.332016,2.358938,0.7326,-4.332135,0.565695,-0.10108,...,-1.795474,3.03278,-2.467514,1.894599,-2.29778,-1.731048,5.908837,-0.386345,0.616242,0
2,-3.831843,-5.824444,0.634031,-2.418815,-1.773827,1.016824,-2.098941,-3.173204,-2.08186,5.392621,...,-0.257101,0.80355,4.086219,2.292138,5.36085,0.351993,2.940021,3.83916,-4.309402,0
3,1.618098,1.888342,7.046143,-1.147285,0.08308,-1.52978,0.207309,-2.493629,0.344926,2.118578,...,-3.584425,-2.577474,1.363769,0.622714,5.5501,-1.526796,0.138853,3.10143,-1.277378,0
4,-0.11144,3.872488,-3.758361,-2.982897,3.792714,0.54496,0.205433,4.848994,-1.85492,-6.220023,...,8.265896,6.629213,-10.068689,1.222987,-3.229763,1.686909,-2.163896,-3.644622,6.510338,0


In [6]:
## preview the first five rows of the test data
test_df.head(n=5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V32,V33,V34,V35,V36,V37,V38,V39,V40,Target
0,-0.613489,-3.81964,2.202302,1.30042,-1.184929,-4.495964,-1.835817,4.722989,1.20614,-0.341909,...,2.291204,-5.411388,0.870073,0.574479,4.157191,1.428093,-10.511342,0.454664,-1.448363,0
1,0.389608,-0.512341,0.527053,-2.576776,-1.016766,2.235112,-0.441301,-4.405744,-0.332869,1.966794,...,-2.474936,2.493582,0.315165,2.059288,0.683859,-0.485452,5.12835,1.720744,-1.488235,0
2,-0.874861,-0.640632,4.084202,-1.590454,0.525855,-1.957592,-0.695367,1.347309,-1.732348,0.4665,...,-1.318888,-2.997464,0.459664,0.619774,5.631504,1.323512,-1.752154,1.808302,1.675748,0
3,0.238384,1.458607,4.014528,2.534478,1.196987,-3.11733,-0.924035,0.269493,1.322436,0.702345,...,3.517918,-3.074085,-0.28422,0.954576,3.029331,-1.367198,-3.41214,0.906,-2.450889,0
4,5.828225,2.76826,-1.23453,2.809264,-1.641648,-1.406698,0.568643,0.965043,1.918379,-2.774855,...,1.773841,-1.501573,-2.226702,4.77683,-6.559698,-0.805551,-0.276007,-3.858207,-0.537694,0


## Data Understanding

In [7]:
## check the number of rows and columns in the train data
## both counts are read from train_df so the report describes a single frame
print(f"Total Number Of Rows: {train_df.shape[0]} -> Total Number Of Cols: {train_df.shape[1]}")

Total Number Of Rows: 20000 -> Total Number Of Cols: 41


In [8]:
## check the number of rows and columns in the test data
print(f"Total Number Of Rows: {test_df.shape[0]} -> Total Number Of Cols: {test_df.shape[1]}")

Total Number Of Rows: 5000 -> Total Number Of Cols: 41


In [9]:
## finding a summary description of the train data (non-null counts and dtype per column)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 41 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      19982 non-null  float64
 1   V2      19982 non-null  float64
 2   V3      20000 non-null  float64
 3   V4      20000 non-null  float64
 4   V5      20000 non-null  float64
 5   V6      20000 non-null  float64
 6   V7      20000 non-null  float64
 7   V8      20000 non-null  float64
 8   V9      20000 non-null  float64
 9   V10     20000 non-null  float64
 10  V11     20000 non-null  float64
 11  V12     20000 non-null  float64
 12  V13     20000 non-null  float64
 13  V14     20000 non-null  float64
 14  V15     20000 non-null  float64
 15  V16     20000 non-null  float64
 16  V17     20000 non-null  float64
 17  V18     20000 non-null  float64
 18  V19     20000 non-null  float64
 19  V20     20000 non-null  float64
 20  V21     20000 non-null  float64
 21  V22     20000 non-null  float64
 22

In [10]:
## finding a summary description of the test data (non-null counts and dtype per column)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 41 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      4995 non-null   float64
 1   V2      4994 non-null   float64
 2   V3      5000 non-null   float64
 3   V4      5000 non-null   float64
 4   V5      5000 non-null   float64
 5   V6      5000 non-null   float64
 6   V7      5000 non-null   float64
 7   V8      5000 non-null   float64
 8   V9      5000 non-null   float64
 9   V10     5000 non-null   float64
 10  V11     5000 non-null   float64
 11  V12     5000 non-null   float64
 12  V13     5000 non-null   float64
 13  V14     5000 non-null   float64
 14  V15     5000 non-null   float64
 15  V16     5000 non-null   float64
 16  V17     5000 non-null   float64
 17  V18     5000 non-null   float64
 18  V19     5000 non-null   float64
 19  V20     5000 non-null   float64
 20  V21     5000 non-null   float64
 21  V22     5000 non-null   float64
 22  

In [11]:
## summary statistics of the train data, rounded to whole numbers for readability
train_df.describe().round(decimals=0)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V32,V33,V34,V35,V36,V37,V38,V39,V40,Target
count,19982.0,19982.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,...,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,-0.0,0.0,2.0,-0.0,-0.0,-1.0,-1.0,-1.0,-0.0,-0.0,...,0.0,0.0,-0.0,2.0,2.0,0.0,-0.0,1.0,-1.0,0.0
std,3.0,3.0,3.0,3.0,2.0,2.0,2.0,3.0,2.0,2.0,...,6.0,4.0,3.0,3.0,4.0,2.0,4.0,2.0,3.0,0.0
min,-12.0,-12.0,-11.0,-15.0,-9.0,-10.0,-8.0,-16.0,-9.0,-10.0,...,-20.0,-17.0,-18.0,-15.0,-15.0,-5.0,-17.0,-6.0,-11.0,0.0
25%,-3.0,-2.0,0.0,-2.0,-2.0,-2.0,-2.0,-3.0,-1.0,-1.0,...,-3.0,-2.0,-2.0,0.0,-1.0,-1.0,-3.0,-0.0,-3.0,0.0
50%,-1.0,0.0,2.0,-0.0,-0.0,-1.0,-1.0,-0.0,-0.0,0.0,...,0.0,-0.0,-0.0,2.0,2.0,-0.0,-0.0,1.0,-1.0,0.0
75%,2.0,3.0,5.0,2.0,1.0,0.0,0.0,2.0,1.0,1.0,...,4.0,2.0,1.0,4.0,4.0,1.0,2.0,2.0,1.0,0.0
max,15.0,13.0,17.0,13.0,8.0,7.0,8.0,12.0,8.0,8.0,...,24.0,17.0,14.0,15.0,19.0,7.0,15.0,8.0,11.0,1.0


In [12]:
## summary statistics of the test data, rounded to whole numbers for readability
test_df.describe().round(decimals=0)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V32,V33,V34,V35,V36,V37,V38,V39,V40,Target
count,4995.0,4994.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,-0.0,0.0,3.0,-0.0,-0.0,-1.0,-1.0,-1.0,0.0,0.0,...,0.0,-0.0,-0.0,2.0,2.0,0.0,-0.0,1.0,-1.0,0.0
std,3.0,3.0,3.0,3.0,2.0,2.0,2.0,3.0,2.0,2.0,...,6.0,4.0,3.0,3.0,4.0,2.0,4.0,2.0,3.0,0.0
min,-12.0,-11.0,-9.0,-15.0,-8.0,-9.0,-8.0,-12.0,-7.0,-8.0,...,-17.0,-15.0,-15.0,-12.0,-13.0,-5.0,-15.0,-5.0,-10.0,0.0
25%,-3.0,-2.0,0.0,-2.0,-2.0,-2.0,-2.0,-3.0,-1.0,-1.0,...,-4.0,-2.0,-2.0,0.0,-1.0,-1.0,-3.0,-0.0,-3.0,0.0
50%,-1.0,0.0,2.0,-0.0,-0.0,-1.0,-1.0,-0.0,-0.0,0.0,...,-0.0,-0.0,-0.0,2.0,2.0,-0.0,-0.0,1.0,-1.0,0.0
75%,2.0,2.0,5.0,2.0,1.0,0.0,0.0,2.0,1.0,2.0,...,4.0,2.0,1.0,4.0,4.0,1.0,2.0,2.0,1.0,0.0
max,14.0,14.0,15.0,12.0,8.0,5.0,8.0,10.0,9.0,7.0,...,27.0,13.0,12.0,13.0,17.0,7.0,13.0,7.0,9.0,1.0


In [13]:
## count the duplicated rows in the train data
train_df.duplicated().sum()

0

In [14]:
## count the duplicated rows in the test data
test_df.duplicated().sum()

0

In [15]:
## number of missing values per column in the train data
train_df.isna().sum()

V1        18
V2        18
V3         0
V4         0
V5         0
V6         0
V7         0
V8         0
V9         0
V10        0
V11        0
V12        0
V13        0
V14        0
V15        0
V16        0
V17        0
V18        0
V19        0
V20        0
V21        0
V22        0
V23        0
V24        0
V25        0
V26        0
V27        0
V28        0
V29        0
V30        0
V31        0
V32        0
V33        0
V34        0
V35        0
V36        0
V37        0
V38        0
V39        0
V40        0
Target     0
dtype: int64

In [16]:
## number of missing values per column in the test data
test_df.isna().sum()

V1        5
V2        6
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
V29       0
V30       0
V31       0
V32       0
V33       0
V34       0
V35       0
V36       0
V37       0
V38       0
V39       0
V40       0
Target    0
dtype: int64