# Predicting Customer Satisfaction Level from Santander 

This project is based on a Kaggle competition (link below). The dataset is anonymized and consists of a large number of numeric variables.

https://www.kaggle.com/c/santander-customer-satisfaction

In [113]:
# Importing libraries and frameworks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats

## Importing dataset

In [4]:
# Load the training split from the local data directory and preview the first rows
df_train = pd.read_csv(filepath_or_buffer="data/train.csv")
df_train.head(n=5)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [6]:
# Load the test features and the sample submission, then join them on "ID"
# so the test frame ends with a TARGET column.
# NOTE(review): TARGET here comes from sample_submission.csv, which is presumably
# a placeholder rather than ground-truth labels — confirm before evaluating on it.
df_data_test = pd.read_csv(filepath_or_buffer="data/test.csv")
df_result_test = pd.read_csv(filepath_or_buffer="data/sample_submission.csv")
df_test = pd.merge(df_data_test, df_result_test, on="ID")
df_test.head(n=5)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,2,2,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40532.1,0
1,5,2,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45486.72,0
2,6,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46993.95,0
3,7,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187898.61,0
4,9,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,73649.73,0


## Feature Engineering

In [9]:
# Row counts of the training and test frames, one per line
print(len(df_train), len(df_test), sep="\n")

76020
75818


In [16]:
# Keep a reference to the original column names before the columns are renamed
original_col_names = df_train.columns

In [47]:
# Replace the anonymized column names with positional ones ("var1", "var2", ...)
# to make data manipulation easier. The first column stays "ID" and the last
# stays "TARGET"; the number of feature columns is taken from the frame itself
# instead of being hard-coded.
#
# The renaming is purely positional, so the test frame must have exactly the
# same columns, in the same order, as the training frame. Otherwise features
# would be silently mislabelled, hence the explicit check.
if list(df_test.columns) != list(df_train.columns):
    raise ValueError(
        "df_train and df_test columns differ; positional renaming would mislabel features"
    )

n_features = df_train.shape[1] - 2  # every column except "ID" and "TARGET"
new_col_names = ["ID"]
new_col_names = new_col_names + ["var" + str(i) for i in range(1, n_features + 1)]
new_col_names = new_col_names + ["TARGET"]

df_train.columns = new_col_names
df_test.columns = new_col_names

Because the number of features is very large, I applied Principal Component Analysis (PCA) to reduce the dimensionality of the dataset and to facilitate the analyses and transformations that precede the creation of the predictive model.

In [53]:
# Look for missing values anywhere in either frame.
# isnull is an alias of isna, so the second pair repeats the first check.
print(df_train.isna().any().any())
print(df_test.isna().any().any())
print(df_train.isnull().any().any())
print(df_test.isnull().any().any())

False
False
False
False


### Removing Data Columns with Too Many Zero Values

I saved the dataset before applying any transformations. Then I dropped the "ID" variable and converted all independent variables to float. Finally, I kept only the features whose proportion of non-zero values is greater than 60%.

In [64]:
# Snapshot both frames to disk as CSV (the row index is written too, as by default)
df_train.to_csv("data/df_train.csv", index=True)
df_test.to_csv("data/df_test.csv", index=True)

In [73]:
# Drop the "ID" column and convert every independent variable (all columns
# except the last one, TARGET) to float.
#
# The conversion uses astype() with a per-column mapping and rebinds the frame.
# Assigning the converted block back through `.iloc[:, :-1] = ...` writes into
# the existing columns in place on pandas >= 2.0, which leaves integer columns
# as integers and makes the conversion a no-op.
df1_train = df_train.drop(columns="ID")
df1_test = df_test.drop(columns="ID")

df1_train = df1_train.astype(dict.fromkeys(df1_train.columns[:-1], float))
df1_test = df1_test.astype(dict.fromkeys(df1_test.columns[:-1], float))

In [86]:
# Share of exact zeros per column of the training frame; keep the columns where
# that share is below 40% (i.e. more than 60% of the values are non-zero).
# NOTE(review): `vars` shadows the Python builtin of the same name; the name is
# kept here so that any other cell relying on it keeps working.
vars = df1_train.eq(0.0).sum().div(len(df1_train))
var_filtered = vars.loc[vars.lt(0.4)]

In [109]:
# Restrict both frames to the retained feature columns plus the TARGET column
vars_list = [*var_filtered.index, "TARGET"]
df_reduced_train = df1_train.loc[:, vars_list]
df_reduced_test = df1_test.loc[:, vars_list]

In [112]:
# Number of columns left in the reduced test frame
df_reduced_test.shape[1]

30

In [None]:
# Write the reduced frames to disk as CSV (the row index is written too, as by default).
# NOTE(review): these are the same paths used earlier for the snapshot of the
# full frames, so running this cell overwrites that snapshot — confirm this is
# intended or pick distinct file names.
df_reduced_train.to_csv("data/df_train.csv", index=True)
df_reduced_test.to_csv("data/df_test.csv", index=True)

### Reducing Highly Correlated Columns

In [126]:
# Names of the columns of the reduced training frame, with TARGET left out
col = df_reduced_train.drop(columns=["TARGET"]).columns


[('var24', 'var90', 0.9337194030695292),
 ('var25', 'var64', 0.8483377284983373),
 ('var25', 'var89', 0.6504750976567577),
 ('var25', 'var91', 0.9975002244913979),
 ('var25', 'var139', 0.7900380759734712),
 ('var25', 'var159', 0.857746285932058),
 ('var25', 'var165', 0.8374652421920914),
 ('var25', 'var281', 0.8761550509748453),
 ('var25', 'var329', 0.6675351595659501),
 ('var25', 'var331', 0.800429734294878),
 ('var25', 'var332', 0.7914452050929008),
 ('var64', 'var25', 0.8483377284983373),
 ('var64', 'var89', 0.8079478202215183),
 ('var64', 'var91', 0.846217074621614),
 ('var64', 'var139', 0.9357796710872539),
 ('var64', 'var148', 0.8053624445577469),
 ('var64', 'var159', 0.9199366882620539),
 ('var64', 'var165', 0.7104533612575488),
 ('var64', 'var183', 0.7761459813550806),
 ('var64', 'var191', 0.7574701438942257),
 ('var64', 'var281', 0.8119061005232864),
 ('var64', 'var329', 0.6899094159803387),
 ('var64', 'var331', 0.6960663367822173),
 ('var64', 'var332', 0.6893844775683339),
 (

In [None]:
# Spearman rank correlation coefficient between x and y.
# NOTE(review): x and y are not defined in any cell visible in this section —
# TODO confirm where they come from before running this cell.
scipy.stats.spearmanr(x, y).correlation

In [123]:
# Display the column names stored in `col`
col

Index(['var1', 'var2', 'var24', 'var25', 'var63', 'var64', 'var77', 'var80',
       'var89', 'var90', 'var91', 'var138', 'var139', 'var148', 'var152',
       'var155', 'var158', 'var159', 'var165', 'var183', 'var191', 'var194',
       'var281', 'var290', 'var329', 'var330', 'var331', 'var332', 'var369'],
      dtype='object')