In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

### Import the data

In [2]:
df = pd.read_csv("clean_data_basic_pdayscat_agecat.csv",index_col=False)

In [3]:
df.head()

Unnamed: 0,age,education,default,housing,loan,contact,campaign,pdays,previous,emp.var.rate,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,agecat_adult,agecat_senior,agecat_young
0,56,2.0,0.0,0.0,0.0,1,1,999,0,1.1,...,0,0,0,1,0,0,0,1,0,0
1,37,5.0,0.0,2.0,0.0,1,1,999,0,1.1,...,0,0,0,1,0,0,0,1,0,0
2,40,3.0,0.0,0.0,0.0,1,1,999,0,1.1,...,0,0,0,1,0,0,0,1,0,0
3,56,5.0,0.0,0.0,2.0,1,1,999,0,1.1,...,0,0,0,1,0,0,0,1,0,0
4,59,6.0,0.0,0.0,0.0,1,1,999,0,1.1,...,0,0,0,1,0,0,0,1,0,0


In [None]:
# df.columns

In [4]:
#Dropping one column each from categorical variables to avoid redunduncy
# df = df.drop([
#  'poutcome_success',
#  'job_unemployed',
#  'marital_divorced',
#  'month_apr',
#  'day_of_week_fri'],axis=1)
# df = df.drop([])
# df = df.drop('Unnamed: 0',axis=1)
features = df.drop('y',axis=1)

### Split the data into training and test sets for final testing

In [5]:
X, Xtest, y, ytest = train_test_split(features, df['y'],random_state=5,test_size=0.3)

## Normalization

In [6]:
def normalizer(traindata,testdata):
    
    trainnormalized = (traindata-min(traindata))/(max(traindata)-min(traindata))
    testnormalized = (testdata-min(traindata))/(max(traindata)-min(traindata))
    return trainnormalized, testnormalized


for column in X:
    X[column], Xtest[column] = normalizer(X[column],Xtest[column])

# Xtest.head()
# df.education = normalizer(df.education)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Tuning

In [7]:
def cv_score(clf, x, y, score_func=accuracy_score): #Apply k-fold cross velidation
    result = 0
    nfold = 5
    for train, test in KFold(nfold,random_state=0).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        result += score_func(clf.predict(x[test]), y[test]) # evaluate score function on held-out data
    return result / nfold # average

In [None]:
y.shape

In [None]:
clf = LogisticRegression(random_state=0)
score = cv_score(clf, X.values, y.values)
print(score)

#### Tune C

In [None]:
#the grid of parameters to search over
Cs = [0.001,0.01, 0.1, 1, 10, 100]
max_score = 0
# your turn
for C in Cs:
    clf = LogisticRegression(C=C,random_state=1)
    score = cv_score(clf, X.values, y.values)
#     print (score)
    if score > max_score:
        max_score = score
        Cfinal = C
print ("Maximum score of {} is achieved at C = {}".format(max_score,Cfinal))

In [None]:
# your turn
clf = LogisticRegression(C=Cfinal,random_state=1)
clf.fit(X, y)
print(accuracy_score(clf.predict(Xtest), ytest))
#clf.score(Xtestlr,ytestlr)

* **Attempt 1 : Basic Clean Data                : 0.901201274822**
* **Attempt 2 : Basic Clean Data + Categorical age and pdays              : 0.901201274822**
* **Attempt 3 : Basic Clean Data + Normalization : 0.902263626706**
* **Attempt 4 : Basic Clean Data + Normalization + Categorical age and pdays: 0.902263626706**

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, clf.predict(Xtest))
cm

## SVC

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#Initial Run
svc = SVC()
score = cv_score(svc, X.values, y.values)
print(score)

0.8976882662


In [9]:
#Tuning Using GridSearchCV

parameters = {'kernel':('linear', 'rbf'),'C':[1,5,10],  'gamma': 
              [0.01,0.10,0.5]}
# 
svr = SVC()
grid = GridSearchCV(svr, parameters)
grid.fit(X.values, y.values)
predicted = grid.predict(Xtest)
score = grid.score(Xtest)
print(score)
print(accuracy_score(predicted, ytest))
cnf_matrix = confusion_matrix(ytest, predicted)
print(cnf_matrix)

ValueError: Expected array-like (array or non-string sequence), got None

In [12]:
print(accuracy_score(predicted, ytest))

0.898504535425


In [13]:
grid.best_estimator_
grid.
# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#   decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
#   max_iter=-1, probability=False, random_state=None, shrinking=True,
#   tol=0.001, verbose=False)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
# your turn
svcfinal = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
svcfinal.fit(X, y)

NameError: name 'clf' is not defined

In [15]:
print(accuracy_score(svcfinal.predict(Xtest), ytest))
#clf.score(Xtestlr,ytestlr)

0.898504535425


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(n_estimators=50)
rf.fit(X,y)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

predictors = list(features.columns)

importances=rf.feature_importances_

indices = np.argsort(importances)[::-1]

sorted_important_features=[]
for i in indices:
    sorted_important_features.append(predictors[i])
    
plt.figure()
plt.title("Feature Importances By Random Forest Model")
plt.bar(range(np.size(predictors)), importances[indices],
       color="r")
plt.xticks(range(np.size(predictors)), sorted_important_features, rotation='vertical')

plt.xlim([-1, np.size(predictors)]);

In [None]:
features=features[sorted_important_features[:-5]]
sorted_important_features[-5:]

In [None]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()

parameters = {'n_estimators':[20,30,50]}

gs = GridSearchCV(rf,parameters)
gs.fit(X,y)