In [1]:
# Importing the needed modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
warnings.simplefilter('ignore', FutureWarning)

# To display plots inside the iPython Notebook itself
%matplotlib inline

In [3]:
# To verify how data is orgainzed in file(to find the delimiter) and then
# use corresponding function to open the file. eg
# data could be in .csv. .tsv, excel format etc.
pathOfDataFile = "data/bank-full.csv"
firstFewLines = list()
noOfLinesToView = 5

with open(pathOfDataFile) as dataFile:
    firstFewLines = [next(dataFile) for i in range(noOfLinesToView)]
    for line in firstFewLines:
        print(line)

# Import the semi-colon delimited data file into pandas dataFrame
bankPromo_df = pd.read_csv(pathOfDataFile, sep = ";")

# Rename the Target/Final Outcome column from "y" to "Subscribed" as based on data description.
bankPromo_df = bankPromo_df.rename(columns={"y":"Subscribed"})


"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"

58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"

44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"

33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"

47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"



In [4]:
bankPromo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age           45211 non-null int64
job           45211 non-null object
marital       45211 non-null object
education     45211 non-null object
default       45211 non-null object
balance       45211 non-null int64
housing       45211 non-null object
loan          45211 non-null object
contact       45211 non-null object
day           45211 non-null int64
month         45211 non-null object
duration      45211 non-null int64
campaign      45211 non-null int64
pdays         45211 non-null int64
previous      45211 non-null int64
poutcome      45211 non-null object
Subscribed    45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
# Get the unique values(Levels) for categorical variables.
# List to hold names of categorical variables
categoricalVars = list()
# List to hold names of numerical variables
numericalVars = list()

for colName in bankPromo_df.columns:
    if bankPromo_df[colName].dtype == np.int64:
        numericalVars.append(colName)
    elif bankPromo_df[colName].dtype == np.object:
        categoricalVars.append(colName)
    else:
        pass
    
# Remove Target column from final categorical Var list
categoricalVars.remove('default')

print(numericalVars)
print(categoricalVars)

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
['job', 'marital', 'education', 'housing', 'loan', 'contact', 'month', 'poutcome', 'Subscribed']


In [7]:
bankPromo_df['dummyCat'] = np.random.choice([0,1], size=(len(bankPromo_df),), p=[0.5,0.5])


AttributeError: 'list' object has no attribute 'add'

In [17]:
categoricalVars.append('dummyCat')

In [24]:
import scipy.stats as stats
from scipy.stats import chi2_contingency

class ChiSquare:
    def __init__(self, dataframe):
        self.df = dataframe
        self.p = None #P-Value
        self.chi2 = None #Chi Test Statistic
        self.dof = None
        
        self.dfTabular = None
        self.dfExpected = None
        
    def TestIndependence(self,colX,colY, alpha=0.05):
        X = self.df[colX].astype(str)
        Y = self.df[colY].astype(str)
        
        self.dfObserved = pd.crosstab(Y,X) 
        chi2, p, dof, expected = stats.chi2_contingency(self.dfObserved.values)
        self.p = p
        self.chi2 = chi2
        self.dof = dof 
        
        self.dfExpected = pd.DataFrame(expected, columns=self.dfObserved.columns, index = self.dfObserved.index)
        
        self._print_chisquare_result(colX, alpha)
    
    def _print_chisquare_result(self, colX, alpha):
        result = ""
        if self.p<alpha:
            result="{0} is IMPORTANT for Prediction".format(colX)
        else:
            result="{0} is NOT an important predictor. (Discard {0} from model)".format(colX)

        print(result)

In [27]:


#Initialize ChiSquare Class
cT = ChiSquare(bankPromo_df)

#Feature Selection
print("Chi-sq test for default")
print("=======================")
for var in categoricalVars:
    cT.TestIndependence(colX=var,colY="default" )  

Chi-sq test for default
job is IMPORTANT for Prediction
marital is IMPORTANT for Prediction
education is IMPORTANT for Prediction
housing is NOT an important predictor. (Discard housing from model)
loan is IMPORTANT for Prediction
contact is IMPORTANT for Prediction
month is IMPORTANT for Prediction
poutcome is IMPORTANT for Prediction
Subscribed is IMPORTANT for Prediction


In [29]:

#Initialize ChiSquare Class
cT = ChiSquare(bankPromo_df)

#Feature Selection
print("Chi-sq test for housing")
print("=======================")
for var in categoricalVars:
    cT.TestIndependence(colX=var,colY="housing" )  

Chi-sq test for housing
job is IMPORTANT for Prediction
marital is IMPORTANT for Prediction
education is IMPORTANT for Prediction
housing is IMPORTANT for Prediction
loan is IMPORTANT for Prediction
contact is IMPORTANT for Prediction
month is IMPORTANT for Prediction
poutcome is IMPORTANT for Prediction
Subscribed is IMPORTANT for Prediction


In [30]:

#Initialize ChiSquare Class
cT = ChiSquare(bankPromo_df)

#Feature Selection
print("Chi-sq test for loan")
print("=======================")
for var in categoricalVars:
    cT.TestIndependence(colX=var,colY="loan" )  

Chi-sq test for loan
job is IMPORTANT for Prediction
marital is IMPORTANT for Prediction
education is IMPORTANT for Prediction
housing is IMPORTANT for Prediction
loan is IMPORTANT for Prediction
contact is IMPORTANT for Prediction
month is IMPORTANT for Prediction
poutcome is IMPORTANT for Prediction
Subscribed is IMPORTANT for Prediction
