# Data Preparation and Imputation

<b> Having completed the Exploratory Data Analysis, we now have an overview of the specifics of each attribute in the dataset.
Next, we handle the outliers and missing values that the EDA revealed in a few attributes.</b>

## Table of Contents:
* [Understanding Data](#first-bullet)
* [Basic Data Analysis](#second-bullet)
* [Effects of banking data on Term Deposit](#third-bullet)
* [Effect of Campaign on Term Deposit](#four-bullet)
* [Additional Attribute Effects](#five-bullet)
* [Data Preprocessing](#six-bullet)
* [Data Modelling](#seven-bullet)
* [Model Analysis](#eight-bullet)
* [Results](#nine-bullet)
* [Future Leads to Marketing Campaigns](#ten-bullet)


In [None]:
# Importing Data

In [None]:
# import basic libraries
import pandas as pd
import numpy as np
import seaborn as sns
import random
import warnings

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [None]:
# Root folder of the project on the local machine.
path = "C:/Users/MANEET/bank_marketing"

# Sub-folder suffixes; each starts with "/" so it can be appended to `path`.
data_add = "/data"
report_add = "/report"
figures_add = "/figures"
experiment_add = "/experiments"

In [None]:
# Load the raw, semicolon-separated dataset.
# The file suffix must start with '/' like every other path in this file;
# the previous './raw/...' produced '<path>/data./raw/...'.
df = pd.read_csv(path + data_add + '/raw/bank-additional-full.csv', delimiter = ';')

In [None]:
# Preview the first rows of the loaded data.
df.head()

## Data Imputation

In [None]:
# Job distribution among clients older than 65.
df.loc[df['age'] > 65, 'job'].value_counts()

In [None]:
# Clients older than 65 whose job is recorded as 'unknown' or 'unemployed'
# are relabelled as 'retired'.
df.loc[(df["age"] > 65) & df["job"].isin(['unknown', 'unemployed']), 'job'] = 'retired'

Since the dataset comes from real-world instances, we can derive a relationship between education and occupation. As there are many "unknown" instances in job and education, we can infer education from job and vice versa.

In [None]:
# Contingency table of counts: one row per job, one column per education level.
job_education_ct = pd.crosstab(index=df['job'], columns=df['education'])

In [None]:
# Display the job x education count table.
job_education_ct

In [None]:
# For each job (row), the education level (column) with the highest count.
# Indexed by job; used to predict education from job.
job_education_max = job_education_ct.idxmax(axis='columns')
# For each education level (column), the job (row) with the highest count.
# Indexed by education; used to predict job from education.
education_job_max = job_education_ct.idxmax(axis='index')

In [None]:
# Remove the 'unknown' key from both lookups so it is not used as a key
# during imputation. `pop` mutates the Series in place, so
# re-running this cell raises KeyError once the key is gone.
education_job_max.pop('unknown')
job_education_max.pop('unknown')

In [None]:
# Rows where both job and education are 'unknown' all receive the same job,
# picked at random from the most common job of each education level.
# `random.choice` indexes its argument by integer position, which is not
# reliable on a label-indexed Series, so pick from a plain list of the values.
df.loc[
    (df['education'] == 'unknown') & (df['job'] == 'unknown'),
    'job',
] = random.choice(education_job_max.tolist())

In [None]:
# Fill unknown education from the job. job_education_max is indexed by job and
# holds the most common education level for that job. The previous loop
# iterated education_job_max, whose keys are education levels, and compared
# them with df['job'], so the mask was always empty and nothing was imputed.
for job, education in job_education_max.items():
    df.loc[(df['education'] == 'unknown') & (df['job'] == job), 'education'] = education

In [None]:
# Fill unknown job from the education. education_job_max is indexed by
# education level and holds the most common job for that level. The previous
# loop iterated job_education_max, whose keys are jobs, and compared them with
# df['education'], so the mask was always empty and nothing was imputed.
for education, job in education_job_max.items():
    df.loc[(df['job'] == 'unknown') & (df['education'] == education), 'job'] = job

The `pdays` attribute has positive values in fewer than 5% of the records, so we drop the column as an edge case.

In [None]:
# Remove the 'pdays' column; `df_copy` and `df` refer to the same reduced frame.
df_copy = df.drop(columns=['pdays'])
df = df_copy

In [None]:
# Count missing (NaN) values per column.
df.isna().sum()

In [None]:
# Persist the imputed dataset.
df.to_csv(f'{path}{data_add}/processed/data_imputation.csv')

## Data Preprocessing <a class="anchor" id="six-bullet"></a>

In [None]:
# Split the frame into features (every column except 'y') and the label
# column 'y' (kept as a one-column DataFrame), then save both.
train_df = df.loc[:, ~df.columns.isin(['y'])]
target_df = df.loc[:, df.columns.isin(['y'])]
train_df.to_csv(f'{path}{data_add}/processed/Raw_sample_features.csv')
target_df.to_csv(f'{path}{data_add}/processed/Raw_sample_labels.csv')

In [None]:
def find_cat_cols(df):
    """Split the columns of *df* into numerical and categorical names.

    Columns with a numeric or boolean dtype are reported as numerical; every
    other column is reported as categorical. Both lists follow the column
    order of *df*, so anything derived from them (e.g. one-hot column order)
    is reproducible across runs; building the categorical list from a ``set``
    gave an arbitrary order.

    Args:
        df: DataFrame to inspect.

    Returns:
        Tuple ``(num_cols, cat_cols)`` of column-name lists.
    """
    num_cols = list(df.select_dtypes(include=['number', 'bool']).columns)
    numeric_names = set(num_cols)  # O(1) membership test only; order comes from df
    cat_cols = [col for col in df.columns if col not in numeric_names]
    print("Found {0} Numerical columns in DataFrame".format(len(num_cols)))
    print("Found {0} Categorical columns in DataFrame".format(len(cat_cols)))
    return num_cols, cat_cols
    

In [None]:
# Column names of the feature frame, split into numerical and categorical.
numeric_features, categorical_features = find_cat_cols(train_df)

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [None]:
# Resample with RandomOverSampler (strategy 'minority') and save the result.
# NOTE(review): no random_state is set, so the saved files differ between runs.
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(train_df, target_df)
X_over.to_csv(f'{path}{data_add}/processed/Over_sample_features.csv')
y_over.to_csv(f'{path}{data_add}/processed/Over_sample_labels.csv')

In [None]:

# Resample with RandomUnderSampler (strategy 'majority') and save the result.
# The undersampled frames are written here; previously the oversampled
# X_over / y_over were saved under the Under_sample_* file names.
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under,y_under = undersample.fit_resample(train_df,target_df)
X_under.to_csv(path + data_add + '/processed/Under_sample_features.csv')
y_under.to_csv(path + data_add + '/processed/Under_sample_labels.csv')

In [None]:
# Report the number of rows in each resampled feature frame.
print(f'Oversampled dataset shape {len(X_over)}')
print(f'Undersampled dataset shape {len(X_under)}')

In [None]:
def preprocess_data(X,y):
    """Scale numeric features, one-hot encode categoricals and encode labels.

    Numeric columns of *X* are rescaled with ``MinMaxScaler``; categorical
    columns are replaced by their ``pd.get_dummies`` indicator columns, which
    are appended after the numeric columns. *y* is encoded to integer codes
    with ``LabelEncoder``.

    The caller's *X* is left untouched: scaling is applied to a copy instead
    of being written back into the input frame.

    Args:
        X: Feature DataFrame.
        y: Labels; a Series, 1-D array or single-column DataFrame.

    Returns:
        Tuple ``(X_preprocessed, y_preprocessed)``: the transformed feature
        DataFrame and a Series of encoded labels with a fresh default index.
    """
    numeric_features, categorical_features = find_cat_cols(X)

    # Copy first so the scaled values never leak into the caller's frame.
    X_scaled = X.copy()
    if numeric_features:  # the scaler rejects a frame with zero columns
        X_scaled[numeric_features] = MinMaxScaler().fit_transform(X_scaled[numeric_features])

    # LabelEncoder expects a 1-D target; flatten single-column frames.
    y_preprocessed = pd.Series(LabelEncoder().fit_transform(np.ravel(y)))

    dummies = pd.get_dummies(X_scaled[categorical_features])
    X_preprocessed = pd.concat(
        [X_scaled.drop(columns=categorical_features), dummies], axis=1
    )
    return X_preprocessed, y_preprocessed

In [None]:
# Preprocess the oversampled data, report the resulting shapes and save it.
X_over_preprocessed, y_over_preprocessed = preprocess_data(X_over, y_over)
print(X_over_preprocessed.shape, y_over_preprocessed.shape)
X_over_preprocessed.to_csv(f'{path}{data_add}/processed/Over_processed_features.csv')
y_over_preprocessed.to_csv(f'{path}{data_add}/processed/Over_processed_labels.csv')

In [None]:
# Preprocess the undersampled data, report the resulting shapes and save it.
X_under_preprocessed, y_under_preprocessed = preprocess_data(X_under, y_under)
print(X_under_preprocessed.shape, y_under_preprocessed.shape)
X_under_preprocessed.to_csv(f'{path}{data_add}/processed/Under_processed_features.csv')
y_under_preprocessed.to_csv(f'{path}{data_add}/processed/Under_processed_labels.csv')

In [None]:
# Preprocess the original (non-resampled) data, report the shapes and save it.
X_raw_preprocessed, y_raw_preprocessed = preprocess_data(train_df, target_df)
print(X_raw_preprocessed.shape, y_raw_preprocessed.shape)
X_raw_preprocessed.to_csv(f'{path}{data_add}/processed/Raw_processed_features.csv')
y_raw_preprocessed.to_csv(f'{path}{data_add}/processed/Raw_processed_labels.csv')