In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import Counter
# Show every column when a wide frame is displayed. Uses the documented
# top-level pd.set_option instead of the incidental pd.pandas alias.
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw dataset from the Excel workbook and report its (rows, columns) shape.
train_data = pd.read_excel("../data/bank_telemaketing.xlsx")
print(train_data.shape)

(41188, 21)


In [7]:
# Print the per-column summary: non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [3]:
# EDA found no zero-variance features.
# EDA found no missing values.

In [4]:
# Encode the target as 1 ('yes') / 0 ('no').
# The identity entries for 1 and 0 make the cell idempotent: with only the
# 'yes'/'no' keys, running the cell a second time mapped the already-encoded
# column to NaN.
train_data["y"] = train_data['y'].map({'yes': 1, 'no': 0, 1: 1, 0: 0})

In [9]:
# Names of all columns that are currently stored with object dtype.
categorical_features = train_data.columns[(train_data.dtypes == 'O').values].tolist()

In [10]:
# Replace every categorical label by its rank when the labels of that column
# are sorted by ascending mean of the target `y` (rank 0 = lowest mean).
# NOTE(review): the ranks are derived from `y` over the whole frame, i.e. before
# the train/test split performed further down — confirm this is acceptable.
for feature in categorical_features:
    ranked = train_data.groupby([feature])['y'].mean().sort_values().index
    labels_ordered = dict(zip(ranked, range(len(ranked))))
    train_data[feature] = train_data[feature].map(labels_ordered)

In [11]:
# Preview the first rows now that the categorical columns hold integer ranks.
train_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,3,0,2,2,1,2,0,0,0,261,1,999,0,0,1.1,93.994,-36.4,4.857,5191.0,0
1,57,1,0,3,1,1,2,0,0,0,149,1,999,0,0,1.1,93.994,-36.4,4.857,5191.0,0
2,37,1,0,3,2,2,2,0,0,0,226,1,999,0,0,1.1,93.994,-36.4,4.857,5191.0,0
3,40,8,0,1,2,1,2,0,0,0,151,1,999,0,0,1.1,93.994,-36.4,4.857,5191.0,0
4,56,1,0,3,2,1,1,0,0,0,307,1,999,0,0,1.1,93.994,-36.4,4.857,5191.0,0


In [12]:
def coorilation_detection(data, features=None, threshold=0.75):
    """Group columns whose absolute pairwise correlation reaches ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to inspect.
    features : iterable of column labels, optional
        Columns to compare. When omitted, every column whose dtype is one of
        int32, int64, float32 or float64 is used.
    threshold : float, default 0.75
        Minimum absolute correlation (as computed by ``DataFrame.corr``) for
        two columns to be reported together.

    Returns
    -------
    dict
        Maps a column to the list of columns correlated with it. Columns are
        scanned in the given order; a partner is recorded under the current
        column unless that partner is already a key of the result. A column
        may therefore appear both inside another column's list and as a key
        of its own.
    """
    if features is None:
        columns = [
            i for i in data.columns
            if data[i].dtype in ['int32', 'int64', 'float32', 'float64']
        ]
    else:
        # `is None` (not `!= None`) so array-likes such as a pandas Index or a
        # NumPy array are accepted instead of raising on an ambiguous truth value.
        columns = list(features)

    # Correlate only the inspected columns: unrelated non-numeric columns in
    # `data` would otherwise make DataFrame.corr() fail on recent pandas.
    corrmat = data[list(dict.fromkeys(columns))].corr()

    coorilated_variables = {}
    for variable in columns:
        for variable1 in columns:
            if variable == variable1:
                continue
            # Written as `not (>=)` so NaN correlations are skipped as well.
            if not abs(corrmat.at[variable1, variable]) >= threshold:
                continue
            # A partner that already heads its own group is not re-listed.
            if variable1 in coorilated_variables:
                continue
            coorilated_variables.setdefault(variable, []).append(variable1)
    return coorilated_variables

In [14]:
# Every non-object column except the target is checked for high correlation.
lst = [
    column
    for column, dtype in train_data.dtypes.items()
    if column != 'y' and dtype != 'O'
]
coorilation_detection(train_data, lst)

{'previous': ['poutcome'],
 'emp.var.rate': ['cons.price.idx', 'euribor3m', 'nr.employed'],
 'euribor3m': ['nr.employed']}

In [15]:
# Remove three of the columns reported as highly correlated by the check above.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: the in-place form raised KeyError once the columns were gone.
train_data = train_data.drop(
    columns=['previous', 'nr.employed', 'emp.var.rate'], errors='ignore'
)
train_data.shape

(41188, 18)

In [16]:
def detect_outliers(df, features, iqr_multiplier=1, min_outlier_features=2):
    """Return the index labels of rows that are IQR outliers in several features.

    A value of column ``c`` is an outlier when it lies below
    ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``, where
    Q1/Q3 are the 25th/75th percentiles of ``c`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.
    features : iterable of column labels
        Numeric columns to test.
    iqr_multiplier : float, default 1
        Width of the fence in IQR units (1.5 is the classic Tukey fence; the
        default of 1 is stricter and flags more rows).
    min_outlier_features : int, default 2
        A row is returned only if it is an outlier in at least this many of
        ``features``.

    Returns
    -------
    list
        Index labels of the selected rows, in order of first detection.
    """
    outlier_indices = Counter()

    for c in features:
        # nanpercentile ignores missing values; plain np.percentile returns NaN
        # for such a column, which made every comparison below False and
        # silently disabled outlier detection for that feature.
        Q1, Q3 = np.nanpercentile(df[c], [25, 75])
        outlier_step = (Q3 - Q1) * iqr_multiplier
        is_outlier = (df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)
        # Count, per index label, in how many features the row was flagged.
        outlier_indices.update(df.index[is_outlier.values])

    return [i for i, v in outlier_indices.items() if v >= min_outlier_features]

In [17]:
# Outliers are searched only in columns that are neither the target nor one of
# the encoded categorical features; report how many rows would be removed.
excluded = {'y', *categorical_features}
lst = [column for column in train_data.columns if column not in excluded]
print(len(detect_outliers(train_data, lst)))

1208


In [18]:
# Remove the rows flagged by detect_outliers and renumber the index from 0.
outlier_rows = detect_outliers(train_data, lst)
train_data = train_data.drop(index=outlier_rows).reset_index(drop=True)

In [20]:
# Check the (rows, columns) shape after the outlier rows were dropped.
train_data.shape

(39980, 18)

In [31]:
# Save the processed frame as CSV; index=False leaves the row index out of the file.
train_data.to_csv('../data/train_processed_data.csv',index=False)

In [19]:
# Next step: apply SMOTE upsampling.

In [22]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
Collecting scikit-learn>=0.24
  Downloading scikit_learn-0.24.2-cp38-cp38-win_amd64.whl (6.9 MB)
Installing collected packages: scikit-learn, imbalanced-learn, imblearn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.2
    Uninstalling scikit-learn-0.23.2:
      Successfully uninstalled scikit-learn-0.23.2
Successfully installed imbalanced-learn-0.8.0 imblearn-0.0 scikit-learn-0.24.2


In [27]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the target (both as NumPy arrays), then
# hold out 30% of the rows for testing with a fixed seed.
x = train_data.drop(columns=['y']).values
y = train_data['y'].values
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=303
)

In [28]:
# Report the size of each split and the share of positive targets in each.
for label, value in (
    ("Shape of Training Data", x_train.shape),
    ("Shape of Testing Data", x_test.shape),
    ("Response Rate in Training Data", y_train.mean()),
    ("Response Rate in Testing Data", y_test.mean()),
):
    print(label, value)

Shape of Training Data (27986, 17)
Shape of Testing Data (11994, 17)
Response Rate in Training Data 0.09904952476238119
Response Rate in Testing Data 0.09971652492913123


In [30]:
# Display the held-out feature matrix.
x_test

array([[ 55.   ,   0.   ,   0.   , ...,  93.918, -42.7  ,   4.96 ],
       [ 30.   ,   8.   ,   2.   , ...,  94.027, -38.3  ,   0.886],
       [ 32.   ,   7.   ,   2.   , ...,  92.963, -40.8  ,   1.281],
       ...,
       [ 58.   ,   8.   ,   0.   , ...,  93.444, -36.1  ,   4.965],
       [ 29.   ,   5.   ,   0.   , ...,  93.444, -36.1  ,   4.965],
       [ 51.   ,   0.   ,   0.   , ...,  93.918, -42.7  ,   4.961]])

In [23]:
from imblearn.over_sampling import SMOTE

# Resample with SMOTE (fixed seed) using every column except `y` as features
# and the single-column frame `y` as the label.
# NOTE(review): the resampler is fitted on all of train_data rather than on a
# training split — confirm that evaluation data is held out before this step.
oversampler = SMOTE(random_state=0)
feature_columns = [column for column in train_data.columns if column != 'y']
os_features, os_labels = oversampler.fit_resample(
    train_data[feature_columns], train_data[['y']]
)