## Importing libraries
_____

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Loading Model libraries
import sys
!{sys.executable} -m pip install xgboost

from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from lightgbm import LGBMClassifier
from scipy.special import erfc
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,accuracy_score,classification_report,confusion_matrix , recall_score, precision_score
from sklearn.metrics import roc_curve, roc_auc_score, log_loss

from keras.layers import Input, Dense
from keras.models import Model

np.random.seed(2017)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Reading Files
_____

In [None]:
# Mount Google Drive at /content/drive so the CSV files stored there can be
# read by the cells below (the `google.colab` module is Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the training CSV inside the mounted Drive folder
train_csv_path = "/content/drive/MyDrive/School/DataScience/santander-value-prediction-challenge/train.csv"
train_data = pd.read_csv(train_csv_path)

# Report (rows, columns), then show the first five rows as the cell output
print(f'The shape of the dataset is: {train_data.shape}')
train_data.head()

The shape of the dataset is: (4459, 4993)


Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [None]:
# Location of the test CSV inside the mounted Drive folder
test_csv_path = "/content/drive/MyDrive/School/DataScience/santander-value-prediction-challenge/test.csv"
test_data = pd.read_csv(test_csv_path)

# Report (rows, columns), then show the first five rows as the cell output
print(f'The shape of the dataset is: {test_data.shape}')
test_data.head()

The shape of the dataset is: (49342, 4992)


Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Observations**: 
* We are provided with an anonymized dataset containing numeric feature variables, a numeric `target` column, and a string `ID` column.
* The train data has 4993 columns and the test data has 4992 columns (the test data has no `target` column).
* The train data has 4459 rows.
* The test data has 49342 rows.
* In the train data, the number of columns is greater than the number of rows.
* In the test data, the number of rows is almost 10 times the number of columns.

## DATA UNDERSTANDING 

In [None]:
# Print a summary of the train DataFrame (index range, column dtypes, memory usage)
train_data.info()

In [None]:
# Print a summary of the test DataFrame (index range, column dtypes, memory usage)
test_data.info()

In [None]:
# Summary statistics of the numeric train columns, transposed so that each
# feature is a row and each statistic a column
train_data.describe().transpose()

In [None]:
# Summary of the object (string) columns in the train data, one row per column
train_data.describe(include="O").transpose()

In [None]:
# Summary statistics of the numeric test columns, transposed so that each
# feature is a row and each statistic a column
test_data.describe().transpose()

In [None]:
# Summary of the object (string) columns in the test data, one row per column
test_data.describe(include="O").transpose()

### Checking for nulls and duplicates

In [None]:
# Count the missing values in every train column, largest counts first
train_null_counts = train_data.isna().sum()
train_null_counts.sort_values(ascending=False)

In [None]:
# Count the missing values in every test column, largest counts first
test_null_counts = test_data.isna().sum()
test_null_counts.sort_values(ascending=False)

In [None]:
# Number of train rows that are exact duplicates of an earlier row
train_data.duplicated().sum()

In [None]:
# Number of test rows that are exact duplicates of an earlier row
test_data.duplicated().sum()

In [None]:
# The column names are anonymous, so confirm that none of them repeats.
# `duplicated()` returns a boolean array flagging every repeated column name.
duplicate_col = train_data.columns.duplicated()

# Report whether at least one train column name was flagged
print(
    "There are duplicate column names"
    if duplicate_col.any()
    else "All column names are unique"
)

In [None]:
# Same check for the test data: boolean array flagging every repeated column name
duplicate_col = test_data.columns.duplicated()

# Report whether at least one test column name was flagged
print(
    "There are duplicate column names"
    if duplicate_col.any()
    else "All column names are unique"
)

**Observations**: 
1. The train and test datasets consist mostly of zeros.
2. The train and test datasets have no missing values.
3. The train and test datasets have no duplicate rows.
4. All the column names are unique.

## DATA PREPARATION

## Reducing dimensionality using Autoencoder
_________

Dimensionality reduction is the process of reducing the number of features or variables in a dataset while preserving as much of the important information as possible. This can be useful for visualizing high-dimensional data, reducing the computational cost of modeling, and avoiding overfitting. 



Autoencoder is an unsupervised neural network that learns to reconstruct the input data by compressing it into a lower-dimensional representation (encoding) and then decompressing it back to its original form (decoding). It can be used for dimensionality reduction by using the encoded representation as a new feature space.

**Steps involved**: 
* Prepare Data
* Design Auto Encoder
* Train Auto Encoder
* Use Encoder to obtain reduced dimensionality data for train and test sets


In [None]:
# lets first create a copy of the train and test data 
train_df = train_data.copy()
test_df = test_data.copy()

In [None]:
# Separate the feature columns from the identifier and the label so that only
# the features go through scaling and encoding.
# The frames are rebuilt from the untouched `train_data` / `test_data` instead
# of being dropped in place: an in-place drop raises a KeyError when the cell
# is run a second time, because the columns are already gone.
train_df = train_data.drop(columns=['ID', 'target'])
test_df = test_data.drop(columns=['ID'])
print(train_df.shape)
print(test_df.shape)

(4459, 4991)
(49342, 4991)


In [None]:
# Standardise the features (zero mean, unit variance per column) before
# feeding them to the neural network.
scaler = StandardScaler()
# Learn the per-column mean and standard deviation from the train data only
train_scaled = scaler.fit_transform(train_df)
# Apply the statistics learned on the train data to the test data. Calling
# fit_transform here would refit the scaler on the test set, so the same raw
# value would be mapped to different scaled values in train and test.
# The test columns are selected in train order so they line up with the
# columns the scaler was fitted on.
test_scaled = scaler.transform(test_df[train_df.columns])

In [None]:
# Hold out 10% of the scaled train rows to validate the autoencoder's reconstruction.
np.random.seed(2017)
# `np.random.seed()` returns None, so passing its result as `random_state`
# is the same as passing no seed at all; give the integer seed directly so the
# split is reproducible regardless of the global RNG state.
X_train, X_test = train_test_split(train_scaled, train_size = 0.9, random_state = 2017)


**Defining the input layer** 
* We define a separate input layer (and a separate autoencoder) for the train and test datasets.
* After dropping `ID` and `target`, both datasets have the same number of feature columns (4991), so the two input layers have the same dimension.


In [None]:
# Build a symmetric, fully connected autoencoder for the scaled train features.
train_col_no = train_scaled.shape[1]  # number of feature columns = width of the input layer
train_input_dim = Input(shape = (train_col_no, ))  # input layer of that width

# Size of the bottleneck, i.e. the number of features kept after reduction
encoding_dim = 200

# Encoder: a stack of ReLU layers, each fed by the previous one, narrowing
# from 3000 units down to `encoding_dim`
encoded1 = Dense(3000, activation = 'relu')(train_input_dim)
encoded2 = Dense(2750, activation = 'relu')(encoded1)
encoded3 = Dense(2500, activation = 'relu')(encoded2)
encoded4 = Dense(2250, activation = 'relu')(encoded3)
encoded5 = Dense(2000, activation = 'relu')(encoded4)
encoded6 = Dense(1750, activation = 'relu')(encoded5)
encoded7 = Dense(1500, activation = 'relu')(encoded6)
encoded8 = Dense(1250, activation = 'relu')(encoded7)
encoded9 = Dense(1000, activation = 'relu')(encoded8)
encoded10 = Dense(750, activation = 'relu')(encoded9)
encoded11 = Dense(500, activation = 'relu')(encoded10)
encoded12 = Dense(250, activation = 'relu')(encoded11)
encoded13 = Dense(encoding_dim, activation = 'relu')(encoded12)  # bottleneck output

# Decoder: mirror image of the encoder, widening back to the input width
decoded1 = Dense(250, activation = 'relu')(encoded13)
decoded2 = Dense(500, activation = 'relu')(decoded1)
decoded3 = Dense(750, activation = 'relu')(decoded2)
decoded4 = Dense(1000, activation = 'relu')(decoded3)
decoded5 = Dense(1250, activation = 'relu')(decoded4)
decoded6 = Dense(1500, activation = 'relu')(decoded5)
decoded7 = Dense(1750, activation = 'relu')(decoded6)
decoded8 = Dense(2000, activation = 'relu')(decoded7)
decoded9 = Dense(2250, activation = 'relu')(decoded8)
decoded10 = Dense(2500, activation = 'relu')(decoded9)
decoded11 = Dense(2750, activation = 'relu')(decoded10)
decoded12 = Dense(3000, activation = 'relu')(decoded11)
# The inputs were standardised, so the values to reconstruct are unbounded and
# can be negative. A sigmoid output can only emit values in (0, 1) and could
# never reproduce them, so the reconstruction layer is linear.
decoded13 = Dense(train_col_no, activation = 'linear')(decoded12)

# The autoencoder maps the input layer to its reconstruction (encoder + decoder)
train_autoencoder = Model(inputs = train_input_dim, outputs = decoded13)

# Binary cross-entropy assumes targets in [0, 1]; mean squared error is the
# matching loss for reconstructing real-valued, standardised features.
train_autoencoder.compile(optimizer = 'adadelta', loss = 'mse')

In [None]:
# Print the layer-by-layer output shapes and parameter counts of the train autoencoder
train_autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 4991)]            0         
                                                                 
 dense (Dense)               (None, 3000)              14976000  
                                                                 
 dense_1 (Dense)             (None, 2750)              8252750   
                                                                 
 dense_2 (Dense)             (None, 2500)              6877500   
                                                                 
 dense_3 (Dense)             (None, 2250)              5627250   
                                                                 
 dense_4 (Dense)             (None, 2000)              4502000   
                                                                 
 dense_5 (Dense)             (None, 1750)              350175

In [None]:
# Train the compiled autoencoder to reproduce its own input: the same matrix
# is passed as both the input and the reconstruction target, and the held-out
# rows are used for validation.
train_autoencoder.fit(
    x = X_train,
    y = X_train,
    validation_data = (X_test, X_test),
    epochs = 10,
    batch_size = 32,
    shuffle = False,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f893d6a5460>

Using the encoder to reduce dimensionality:
* Once the autoencoder is trained, you can use the encoder part of the autoencoder to reduce the dimensionality of the dataset. By calling the predict() function on the encoder, you can transform the input data to a lower-dimensional representation.

In [None]:
# Build the encoder half of the autoencoder: a model from the input layer to
# the 200-unit bottleneck (`encoded13`), reusing the layers defined for
# `train_autoencoder`. Its output is the reduced-dimension representation.
train_encoder = Model(inputs = train_input_dim, outputs = encoded13)
# Input layer with the size of the encoded representation.
# NOTE(review): not used by any cell visible here — confirm whether it is still needed.
train_encoded_input = Input(shape = (encoding_dim, ))

In [None]:
# Run the whole scaled train matrix through the encoder and wrap the encoded
# output in a DataFrame whose columns are named feature_0, feature_1, ...
new_train = pd.DataFrame(train_encoder.predict(train_scaled)).add_prefix('feature_')


**We repeat the same process for the test dataset**

In [None]:
# Hold out 10% of the scaled test rows to validate the test autoencoder's reconstruction.
# NOTE(review): `Y_train` / `Y_test` are feature matrices, not labels.
np.random.seed(2017)
# `np.random.seed()` returns None, so passing its result as `random_state`
# is the same as passing no seed at all; give the integer seed directly so the
# split is reproducible regardless of the global RNG state.
Y_train, Y_test = train_test_split(test_scaled, train_size = 0.9, random_state = 2017)


In [None]:
# Build a symmetric, fully connected autoencoder for the scaled test features.
# NOTE(review): this autoencoder is trained independently of the train-set one,
# so its 200 encoded features are not guaranteed to mean the same thing as the
# train-set features — confirm this is intended before fitting one model on
# the encoded train data and applying it to the encoded test data.
test_col_no = test_scaled.shape[1]  # number of feature columns = width of the input layer
test_input_dim = Input(shape = (test_col_no, ))  # input layer of that width

# Size of the bottleneck, i.e. the number of features kept after reduction
encoding_dim = 200

# Encoder: a stack of ReLU layers narrowing from 3000 units down to `encoding_dim`
encoded1 = Dense(3000, activation = 'relu')(test_input_dim)
encoded2 = Dense(2750, activation = 'relu')(encoded1)
encoded3 = Dense(2500, activation = 'relu')(encoded2)
encoded4 = Dense(2250, activation = 'relu')(encoded3)
encoded5 = Dense(2000, activation = 'relu')(encoded4)
encoded6 = Dense(1750, activation = 'relu')(encoded5)
encoded7 = Dense(1500, activation = 'relu')(encoded6)
encoded8 = Dense(1250, activation = 'relu')(encoded7)
encoded9 = Dense(1000, activation = 'relu')(encoded8)
encoded10 = Dense(750, activation = 'relu')(encoded9)
encoded11 = Dense(500, activation = 'relu')(encoded10)
encoded12 = Dense(250, activation = 'relu')(encoded11)
encoded13 = Dense(encoding_dim, activation = 'relu')(encoded12)  # bottleneck output

# Decoder: mirror image of the encoder, widening back to the input width
decoded1 = Dense(250, activation = 'relu')(encoded13)
decoded2 = Dense(500, activation = 'relu')(decoded1)
decoded3 = Dense(750, activation = 'relu')(decoded2)
decoded4 = Dense(1000, activation = 'relu')(decoded3)
decoded5 = Dense(1250, activation = 'relu')(decoded4)
decoded6 = Dense(1500, activation = 'relu')(decoded5)
decoded7 = Dense(1750, activation = 'relu')(decoded6)
decoded8 = Dense(2000, activation = 'relu')(decoded7)
decoded9 = Dense(2250, activation = 'relu')(decoded8)
decoded10 = Dense(2500, activation = 'relu')(decoded9)
decoded11 = Dense(2750, activation = 'relu')(decoded10)
decoded12 = Dense(3000, activation = 'relu')(decoded11)
# The inputs were standardised, so the values to reconstruct are unbounded and
# can be negative. A sigmoid output can only emit values in (0, 1) and could
# never reproduce them, so the reconstruction layer is linear.
decoded13 = Dense(test_col_no, activation = 'linear')(decoded12)

# The autoencoder maps the input layer to its reconstruction (encoder + decoder)
test_autoencoder = Model(inputs = test_input_dim, outputs = decoded13)

# Binary cross-entropy assumes targets in [0, 1]; mean squared error is the
# matching loss for reconstructing real-valued, standardised features.
test_autoencoder.compile(optimizer = 'adadelta', loss = 'mse')

In [None]:
# Print the layer-by-layer output shapes and parameter counts of the test autoencoder
test_autoencoder.summary()

In [None]:
# Train the test autoencoder to reproduce its own input: the same matrix is
# passed as both the input and the reconstruction target, and the held-out
# rows are used for validation.
test_autoencoder.fit(
    x = Y_train,
    y = Y_train,
    validation_data = (Y_test, Y_test),
    epochs = 10,
    batch_size = 32,
    shuffle = False,
)

In [None]:
# Build the encoder half of the test autoencoder: a model from the input layer
# to the 200-unit bottleneck (`encoded13`), reusing the layers defined for
# `test_autoencoder`.
test_encoder = Model(inputs = test_input_dim, outputs = encoded13)
# Input layer with the size of the encoded representation.
# NOTE(review): not used by any cell visible here — confirm whether it is still needed.
test_encoded_input = Input(shape = (encoding_dim, ))

In [None]:
# Encode the test set with the encoder that was fitted on the train set.
# An encoder trained separately on the test data learns its own, unrelated
# latent space: column k of its output would not correspond to column k of
# `new_train`, so a model fitted on `new_train` could not meaningfully score
# `new_test`. Both scaled matrices have the same number of feature columns,
# so the train encoder accepts the test matrix as input.
new_test = pd.DataFrame(train_encoder.predict(test_scaled))
# Name the encoded columns feature_0, feature_1, ... to match `new_train`
new_test = new_test.add_prefix('feature_')

**Adding back the columns dropped before encoding**

In [None]:
# Re-attach the ID and target columns that were dropped before encoding,
# placing them to the left of the encoded features.
train_df_1 = pd.concat(
    [train_data[['ID', 'target']], new_train],
    axis='columns',
)
print(train_df_1.shape)
train_df_1.head()

In [None]:
# Re-attach the ID column that was dropped before encoding, placing it to the
# left of the encoded test features, then show the shape and first rows.
test_df_1 = pd.concat(
    [test_data[['ID']], new_test],
    axis='columns',
)
print(test_df_1.shape)
test_df_1.head()


## Exporting the reduced Datasets

In [None]:
# Write both reduced datasets to CSV (without the index column) so they can be
# picked up for further preprocessing.
for reduced_frame, csv_name in (
    (train_df_1, r'reduced_train.csv'),
    (test_df_1, r'reduced_test.csv'),
):
    reduced_frame.to_csv(csv_name, index=False)