## Import Packages

In [12]:
# Import Packages
import time
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

#Just to suppress some warnings about copying data on a slice of a dataframe
import warnings
warnings.filterwarnings("ignore")

% matplotlib inline

## Import Data

In [13]:
df = pd.read_csv('./bank-additional/bank-additional-full.csv',delimiter=';')
df = df.drop('duration',axis=1)
print("Size of dataset - ",len(df))
print(" Columns - ", list(df.columns))
#encoding y
df['y'] = df['y'].map({'yes':1,'no':0})

41188
['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']


In [None]:
df.job.v

### Classifiying the columns with data-types -
#### 1.Numerical Data
#### 2. Categorical Data
##### a) Two Categories (default, housing, loan)
    Can be encoded by mapping 0 and 1 directly
##### b) Multiple Categories - no hierarcy (job, marital, contact, month, day_of_week, poutcome)
    Can be encoded by create a separate column for each category
##### c) Multiple Categories - hierarcy (education)
    Can be encoded by assigning levels in a single category (0,1,2,etc)

## Convert pdays into Categorical

In [14]:
df.loc[df["pdays"] == 999, "pdayscat"] = 0 #cutomer was not contacted for previous campaign
df.loc[df["pdays"] != 999, "pdayscat"] = 1 #cutomer was contacted for previous campaign

## Convert age into Categorical

In [15]:
l1 = 23
l2 = 62
df.loc[df["age"] <= l1, "agecat"] = 'young'
df.loc[((df["age"] > l1) & (df["age"] < l2)), "agecat"] = 'adult'
df.loc[df["age"] >= l2, "agecat"] = 'senior'

l = df.agecat.value_counts()
a = l.index

for i in a:
    n = df.y[df.agecat == i].sum() * 100 / l[i] #% subsription for each of the categories
    print(n)

10.2853117295
45.6391875747
28.0991735537


## Convert Campaign into Categorical

In [16]:
l1 = 12
print (len(df.loc[df.campaign > l1]))

df.loc[df["campaign"] <= l1, "campcat"] = 'low'
df.loc[df["campaign"] > l1, "campcat"] = 'high'

l = df.campcat.value_counts()
a = l.index

for i in a:
    n = df.y[df.campcat == i].sum() * 100 / l[i] #% subsription for each of the categories
    print(n)

567
11.3931217843
2.1164021164


In [17]:
numeric = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 
           'euribor3m', 'nr.employed']
binary = ['default', 'housing', 'loan','contact','campcat']
categorical = ['poutcome', 'job', 'marital', 'month','day_of_week','agecat']
hierarchy = ['education']

## Unknowns - dealing with null values

To start with let's calculate how much is 1% of the data. If a column contains null values less than that, we'll simply delete those rows.

### Deleting smaller unknowns

<pre><code>for i in binary+categorical+hierarchy:
    print(df[i].value_counts())</code></pre>
    
It was observed from this code that there are unknown values in the categorical columns, which is essentially null values. Let us only entertain the unknowns if the total number unknowns is more than 1% of that data (round that up to 500)

In [18]:
null_cols = []
for column in binary+categorical+hierarchy:
    missing_count = len(df.loc[df[column] == 'unknown']['y'])
    print ("Null Values - \n")
    #Delete few unknowns
    if missing_count < 500:
        print (column, " - ", missing_count)
        df = df[df[column] != 'unknown']
    elif missing_count > 500:
        #Find which columns have still retained unkowns
        number = missing_count
        print("{} - {}".format(column,number))
        #Turn them into np.nan
#         df.loc[df[column] == 'unknown',column] = np.nan
        null_cols.append(column)

Column default - still contains 8597 unknowns
Column housing - still contains 990 unknowns
Column loan - still contains 990 unknowns
contact  -  0
campcat  -  0
poutcome  -  0
job  -  330
marital  -  71
month  -  0
day_of_week  -  0
agecat  -  0
Column education - still contains 1596 unknowns


### Encoding categorical variables
#### 1. Hierarchical

In [8]:
""" This is specific for education. We need a manually created hierarcy as an input here."""
values = ["illiterate", "basic.4y", "basic.6y", "basic.9y", "high.school",  "professional.course", "university.degree"]
levels = range(1,len(values)+1)
dictionary = dict(zip(values,levels))
df['education']=df['education'].map(dictionary)
df['education'] = df['education'].replace('unknown', np.nan, regex=True)
#df['education']

** 2. Categorical **
* Option 1

In [9]:
def categorical_encoding(df,categorical=categorical):
    for column in categorical:
        
        dummies = pd.get_dummies(df[column])
        dummies = dummies.rename(columns = lambda x: column + '_' + str(x))
        
        df = pd.concat([df,dummies],axis=1)
        df = df.drop(column,axis=1)
    return df

df = categorical_encoding(df)


#list(df.columns)

* Option 2

In [115]:
from sklearn.preprocessing import LabelEncoder

for column in categorical:
    
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])

** 3. Binary **

In [10]:
from sklearn.preprocessing import LabelEncoder

for column in binary:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    
    #Convert 'unknown' into np.nan
    l = list(le.classes_)
    try:
        i = l.index('unknown')
        df.loc[df[column] == i, column] = np.nan
    except:
        pass

Let's indentify the unknown category and replace the values with np.nan

### Investigate for null values in numerical columns

In [11]:
print ("Null Values in numerical columns - ")
for column in numeric:
    null = sum(df[column].isnull())
    print (" {} - {}".format(column, str(null)))
    if null > 0:
        null_cols.append(column)

Null Values - 
 age - 0
 campaign - 0
 pdays - 0
 previous - 0
 emp.var.rate - 0
 cons.price.idx - 0
 cons.conf.idx - 0
 euribor3m - 0
 nr.employed - 0


Now, we can move forward and predict the missing data using random forest.

### Use random forest to fill null values in default, housing, loan and education

Before we start this process. We have to encode y as well.

In [13]:
#encoding y
# df['y'] = df['y'].map({'yes':1,'no':0})

In [12]:
def predict_unknown(trainX, trainY, testX):
    """ Predicting unknown data using random forest"""
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(trainX, trainY)
    test_predictY = forest.predict(testX).astype(int)
    return pd.DataFrame(test_predictY,index=testX.index)

In [57]:
df.education.isnull().sum()

1596

In [13]:
print (null_cols)
for column in null_cols:
    
    test_data = df[df[column].isnull()]
    testX = test_data.drop(null_cols, axis=1)
    train_data = df[df[column].notnull()]        
    trainY = train_data[column]
    trainX = train_data.drop(null_cols, axis=1)
    #print(trainX.isnull().sum())
    #print(trainY.value_counts())
    test_data[column] = predict_unknown(trainX, trainY, testX)
    df = pd.concat([train_data, test_data])
    print(column, end=' ')

['default', 'housing', 'loan', 'education']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


default 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


housing 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


loan education 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Scaling

I have moved this part in the inference notebook as I think the test set should not be included while defining the normalization parameters

In [17]:
# def normalizer(data):
    
#     normalized = (data-min(data))/(max(data)-min(data))
#     return normalized


# for column in numeric+binary+hierarchy:
#     df[column] = normalizer(df[column])

# df.education = normalizer(df.education)

## Here's is how features looks like -

In [15]:
# df = df.drop('pdays',axis=1)
df.head()

Unnamed: 0,age,education,default,housing,loan,contact,campaign,pdays,previous,emp.var.rate,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,agecat_adult,agecat_senior,agecat_young
0,56,2.0,0.0,0.0,0.0,1,1,999,0,1.1,...,0,0,0,1,0,0,0,1,0,0
2,37,5.0,0.0,2.0,0.0,1,1,999,0,1.1,...,0,0,0,1,0,0,0,1,0,0
3,40,3.0,0.0,0.0,0.0,1,1,999,0,1.1,...,0,0,0,1,0,0,0,1,0,0
4,56,5.0,0.0,0.0,2.0,1,1,999,0,1.1,...,0,0,0,1,0,0,0,1,0,0
6,59,6.0,0.0,0.0,0.0,1,1,999,0,1.1,...,0,0,0,1,0,0,0,1,0,0


In [17]:
len(df.columns)

52

In [18]:
df.to_csv("clean_data_52_features.csv",index=False)