# Exercise 4 -  Machine Learning with Python

## Setups

We will start by importing the required packages and setting up our environment.

In [37]:
%pylab inline
import pandas as pd
from sklearn import metrics
from sklearn import model_selection
from sklearn import svm
from sklearn.cross_validation import KFold
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# Read the training and test CSV files (in that order) from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


we will use the generic function from the tutorial to estimate our accuracy

In [38]:
def classification_model(model, data, predictors, outcome):
    """Fit `model`, then print its training accuracy and 5-fold CV score.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``.
    data : pandas.DataFrame holding both the predictor and outcome columns.
    predictors : list of column names used as features.
    outcome : name of the target column.

    Returns
    -------
    None. The scores are printed, and `model` is left fitted on all of
    `data` so the caller can use it for predictions afterwards.
    """
    features = data[predictors]
    target = data[outcome]

    # Fit on the full data and report the (optimistic) training accuracy.
    model.fit(features, target)
    predictions = model.predict(features)
    # accuracy_score expects (y_true, y_pred).
    accuracy = metrics.accuracy_score(target, predictions)
    print("Accuracy : {0:.3%}".format(accuracy))

    # 5-fold cross-validation over contiguous, unshuffled folds.
    # model_selection.KFold replaces the removed cross_validation.KFold and
    # yields the same index splits when shuffling is off.
    fold_scores = []
    for train_idx, test_idx in model_selection.KFold(n_splits=5).split(features):
        # Train on four folds, score (mean accuracy) on the held-out fold.
        model.fit(features.iloc[train_idx, :], target.iloc[train_idx])
        fold_scores.append(
            model.score(features.iloc[test_idx, :], target.iloc[test_idx])
        )

    print("Cross-Validation Score : {0:.3%}".format(np.mean(fold_scores)))

    # The loop left the model fitted on a single fold's training subset;
    # refit on everything so it can be used outside the function.
    model.fit(features, target)

## Handling missing values

First, let's see which columns have missing values, and how many:

In [39]:
 # Number of missing entries in each column of the training set.
 train_df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [40]:
 # Number of missing entries in each column of the test set.
 test_df.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

Since most of the applicants are not self-employed (86%), it is safe to fill the missing values in this column with 'No':

In [41]:
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the selected column is chained assignment: recent pandas warns about it and,
# with Copy-on-Write enabled, it only modifies a temporary object.
train_df['Self_Employed'] = train_df['Self_Employed'].fillna('No')
test_df['Self_Employed'] = test_df['Self_Employed'].fillna('No')

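The same principle (filling missing entries with the most common value) works for the other columns with missing values. Note that `Loan_Amount_Term` and `Credit_History` are stored as numbers: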

In [42]:
# Replacement value for the missing entries of each column; the same values
# are applied to the training and the test set.
fill_values = {
    'Loan_Amount_Term': 360,
    'Credit_History': 1,
    'Married': 'Yes',
    'Dependents': 0,  # cast to the string '0' later, when the column is label-encoded
    'Gender': 'Male',
}

for frame in (train_df, test_df):
    for column, value in fill_values.items():
        # Assign back instead of fillna(inplace=True) on the selected column:
        # the chained in-place form is not guaranteed to reach the frame
        # (it is a no-op under pandas Copy-on-Write).
        frame[column] = frame[column].fillna(value)

we will fill the 'LoanAmount' column using the hypothesis from the tutorial

In [43]:
# Median LoanAmount for every (Self_Employed, Education) combination, computed
# on the training set only. The 'median' alias selects pandas' own
# NaN-skipping median; passing the np.median callable is deprecated.
table = train_df.pivot_table(values='LoanAmount', index='Self_Employed',
                             columns='Education', aggfunc='median')


def fage(x):
    """Return the training-set median LoanAmount for row `x`'s group.

    `x` is a row (Series) providing 'Self_Employed' and 'Education'; both
    must be present as labels in `table`.
    """
    return table.loc[x['Self_Employed'], x['Education']]


# Impute train and test alike from the training-set medians.
for frame in (train_df, test_df):
    missing = frame['LoanAmount'].isnull()
    # Skip frames with nothing to fill: apply() on an empty selection does not
    # return a Series that could be assigned back.
    if missing.any():
        # Write through .loc on the frame itself rather than a chained
        # fillna(inplace=True) on the selected column.
        frame.loc[missing, 'LoanAmount'] = frame.loc[missing].apply(fage, axis=1)

## taking care of extreme values

In [44]:
# Add the same three derived columns to both frames, in the same order:
# log of the loan amount, combined income, and log of the combined income.
for frame in (train_df, test_df):
    frame['LoanAmount_log'] = np.log(frame['LoanAmount'])
    frame['TotalIncome'] = frame['ApplicantIncome'] + frame['CoapplicantIncome']
    frame['TotalIncome_log'] = np.log(frame['TotalIncome'])

##  Converting categorical data to  numeric

In [45]:
var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']
le = LabelEncoder()
for i in var_mod:
    train_values = train_df[i].astype(str)
    if i == 'Loan_Status':
        # The target exists only in the training set.
        train_df[i] = le.fit_transform(train_values)
        continue

    test_values = test_df[i].astype(str)
    # Fit once on the categories of BOTH frames so that a given category maps
    # to the same integer in train and test. Fitting separately on each frame
    # yields different codes whenever their category sets differ.
    le.fit(pd.concat([train_values, test_values]))
    train_df[i] = le.transform(train_values)
    test_df[i] = le.transform(test_values)

# Algorithm #1 - SVM

For our first attempt, we try to predict `Loan_Status` using all the other fields.

In [46]:
# Features fed to the classifier; the log-transformed columns stand in for
# the raw loan amount and income columns.
predictor_var = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
    'LoanAmount_log',
    'TotalIncome_log',
]
outcome_var = 'Loan_Status'

# Support-vector classifier with scikit-learn's default settings.
model = svm.SVC()
classification_model(model, train_df, predictor_var, outcome_var)

Accuracy : 80.782%
Cross-Validation Score : 79.807%


Dropping the unnecessary columns and making the predictions:

In [47]:
# Drop the identifier and the raw columns that were replaced by log features.
# Naming the columns (instead of positions 0, 6, 7, 8, 13) keeps this correct
# even if the column order of test_df ever changes.
test_df2 = test_df.drop(
    ['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'TotalIncome'],
    axis=1,
)

In [48]:
# Predict the encoded Loan_Status for every row of the reduced test frame.
pred = model.predict(test_df2)
def rep(x):
    """Map an encoded prediction to its submission label: 1 -> 'Y', anything else -> 'N'."""
    return 'Y' if x == 1 else 'N'
# Convert every encoded prediction to its 'Y'/'N' label.
fixed_pred = [rep(label) for label in pred]

In [49]:
# Loan identifiers of the test rows, used as the key column of each submission file.
IDs = test_df['Loan_ID']

In [50]:
# Build the two-column submission (Loan_ID, Loan_Status) and write it without the index.
submission1 = pd.DataFrame({'Loan_ID': IDs, 'Loan_Status': fixed_pred})
submission1.to_csv('submission1.csv', index=False)

The results are:

![first submission](attempt 1.PNG)

## Algorithm #2 - Naive Bayes

we can infer from our last attempt that using all fields is not necessarily the way to go. 
this time, we will use feature selection technique to improve our results.

### Feature Engineering

We will start by dropping the fields that are not relevant:

In [51]:
# Keep only the candidate features: drop the identifier, the raw columns
# replaced by log features, and the target. Naming the columns (instead of
# positions 0, 6, 7, 8, 12, 14) keeps this correct if the column order changes.
train_df2 = train_df.drop(
    ['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
     'Loan_Status', 'TotalIncome'],
    axis=1,
)

Then, using the sklearn feature selection, we will choose the best 3 out of the entire 10 features

In [52]:
# Score each candidate feature against the target with the ANOVA F-test and
# keep the three highest-scoring ones.
selector = SelectKBest(f_classif, k=3)
selector.fit(train_df2, train_df['Loan_Status'])

# Training matrix reduced to the selected features.
transformed = selector.transform(train_df2)

Now, we will train the model

In [53]:
# Gaussian Naive Bayes trained on the selected features only.
model2 = GaussianNB()
# fit() returns the estimator, so the notebook displays its repr below.
model2.fit(transformed,train_df['Loan_Status'])

GaussianNB(priors=None)

Checking accuracy and K-fold score

In [54]:
# Target column and the selected features wrapped in a DataFrame (built once,
# so the folds below can be sliced with .iloc).
target = train_df['Loan_Status']
selected_features = pd.DataFrame(data=transformed)

# Make predictions on the training set:
predictions = model2.predict(transformed)

# Print accuracy (accuracy_score expects y_true first, then y_pred)
accuracy = metrics.accuracy_score(target, predictions)
print("Accuracy : {0:.3%}".format(accuracy))

# Perform k-fold cross-validation with 5 contiguous, unshuffled folds.
# model_selection.KFold replaces the removed cross_validation.KFold.
kf = model_selection.KFold(n_splits=5)
error = []  # despite the name, this collects the accuracy score of each fold
for train, test in kf.split(selected_features):
    # Filter training data
    train_predictors = selected_features.iloc[train, :]

    # The target we're using to train the algorithm.
    train_target = target.iloc[train]

    # Training the algorithm using the predictors and target.
    model2.fit(train_predictors, train_target)

    # Record the score from each cross-validation run
    error.append(model2.score(selected_features.iloc[test, :], target.iloc[test]))

print("Cross-Validation Score : {0:.3%}".format(np.mean(error)))

# The loop leaves model2 fitted on the last fold's training subset only.
# Refit on the full training set so later predictions use all the data
# (the trailing semicolon suppresses the estimator repr in the cell output).
model2.fit(transformed, target);

Accuracy : 80.945%
Cross-Validation Score : 80.946%


Now, let's try to use this model to make a better submission:

In [55]:
# Reduce the test frame to the features chosen by the selector, then predict.
transformed2 = selector.transform(test_df2)
pred2 = model2.predict(transformed2)

# Convert the encoded predictions to 'Y'/'N' and write the submission file.
fixed_pred2 = [rep(label) for label in pred2]
submission2 = pd.DataFrame({'Loan_ID': IDs, 'Loan_Status': fixed_pred2})
submission2.to_csv('submission2.csv', index=False)

![second submission](attempt2.PNG)

Our results:

This score is a little better than the previous one.