In [1]:
import numpy as np
import pandas as pd
import os
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import IPython.parallel
import psutil

import warnings
warnings.simplefilter('ignore', DeprecationWarning)

In [5]:
# Get the data
%run fetch_data.py

Creating datasets folder: C:\Users\Dan\1) Python Notebooks\PyCon2015\datasets
Checking availability of the 20 newsgroups dataset
Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)
Decompressing C:\Users\Dan\1) Python Notebooks\PyCon2015\datasets\20news-bydate.tar.gz
Checking that the 20 newsgroups files exist...
=> Success!


In [8]:
os.listdir(os.getcwd())

['.ipynb_checkpoints',
 'datasets',
 'fetch_data.py',
 'Machine Learning PyCon - 2015.ipynb']

In [7]:
for fname in os.listdir('../datasets/'):
    print(fname)

ad-dataset
movie_review_train.tsv
SMSSpamCollection
winequality-red.csv
winequality-white.csv


## TITANIC DATASET

In [2]:
#data = pd.read_csv('https://dl.dropboxusercontent.com/u/5743203/data/titanic/titanic_train.csv')
#data = pd.read_csv('titanic_train.csv')
#data.to_csv("datasets\\Titanic_Dataset.csv")
data = pd.read_csv("datasets\\Titanic_Dataset.csv")

In [3]:
data.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S


In [5]:
#Provides a count of values where the information is not null
data.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [5]:
data.groupby('Survived').count()

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,549,549,549,549,424,549,549,549,549,68,549
1,342,342,342,342,290,342,342,342,342,136,340


In [6]:
numerical_features = data[['Fare','Pclass','Age']]
# Need to handle values where Age is missing
median_features = numerical_features.dropna().median()
imputed_features = numerical_features.fillna(median_features)
features_array = imputed_features.values

In [12]:
survived_column = data['Survived'] 
target = survived_column.values

In [8]:
from sklearn.cross_validation import train_test_split
# Use 80% for training and 20% for testing model
features_train, features_test, target_train, target_test = train_test_split(
    features_array, target, test_size=0.20, random_state=0)

In [9]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1)
logreg.fit(features_train, target_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [10]:
target_predicted = logreg.predict(features_test)

In [11]:
from sklearn.metrics import classification_report

print(classification_report(target_test, target_predicted,
                            target_names=['not survived', 'survived']))

              precision    recall  f1-score   support

not survived       0.73      0.89      0.80       110
    survived       0.73      0.48      0.58        69

 avg / total       0.73      0.73      0.72       179



In [12]:
from sklearn.cross_validation import cross_val_score

scores = cross_val_score(logreg, features_array, target, cv=5)
print 'Primary'
print 'min - ', scores.min(), 'mean - ', scores.mean(), 'max - ', scores.max()

print 'roc_auc'
scores = cross_val_score(logreg, features_array, target, cv=5, scoring='roc_auc')
print 'min - ', scores.min(), 'mean - ', scores.mean(), 'max - ', scores.max()

print 'precision'
scores = cross_val_score(logreg, features_array, target, cv=5, scoring='precision')
print 'min - ', scores.min(), 'mean - ', scores.mean(), 'max - ', scores.max()

print 'recall'
scores = cross_val_score(logreg, features_array, target, cv=5, scoring='recall')
print 'min - ', scores.min(), 'mean - ', scores.mean(), 'max - ', scores.max()

print 'f1'
scores = cross_val_score(logreg, features_array, target, cv=5, scoring='f1')
print 'min - ', scores.min(), 'mean - ', scores.mean(), 'max - ', scores.max()

Primary
min -  0.631284916201 mean -  0.693706829629 max -  0.730337078652
roc_auc
min -  0.61093544137 mean -  0.721231816511 max -  0.787767379679
precision
min -  0.538461538462 mean -  0.66517008141 max -  0.75
recall
min -  0.304347826087 mean -  0.409505541347 max -  0.470588235294
f1
min -  0.388888888889 mean -  0.504923635951 max -  0.561403508772


# Encoding text categorical variables (i.e. 1 hot encoding)

In [8]:
pd.get_dummies(data['Sex'], prefix='Sex').head(5)

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [9]:
pd.get_dummies(data['Embarked'], prefix='Embarked').head(5)

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [3]:
rich_features = pd.concat([data[['Fare', 'Pclass', 'Age']],
                           pd.get_dummies(data['Sex'], prefix='Sex'),
                           pd.get_dummies(data['Embarked'], prefix='Embarked')],
                          axis=1)
rich_features.head(5)

Unnamed: 0,Fare,Pclass,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,7.25,3,22,0,1,0,0,1
1,71.2833,1,38,1,0,1,0,0
2,7.925,3,26,1,0,0,0,1
3,53.1,1,35,1,0,0,0,1
4,8.05,3,35,0,1,0,0,1


In [4]:
# Because sex can only be male or female, we can drop the Sex_male column
rich_features_no_male = rich_features.drop('Sex_male', 1)
rich_features_no_male.head(5)

#Again adding age
rich_features_final = rich_features_no_male.fillna(rich_features_no_male.dropna().median())
rich_features_final.head(5)


Unnamed: 0,Fare,Pclass,Age,Sex_female,Embarked_C,Embarked_Q,Embarked_S
0,7.25,3,22,0,0,0,1
1,71.2833,1,38,1,1,0,0
2,7.925,3,26,1,0,0,1
3,53.1,1,35,1,0,0,1
4,8.05,3,35,0,0,0,1


In [5]:
#look at median age by class and sex
rich_features_no_male.groupby(['Pclass','Sex_female']).median()

Unnamed: 0_level_0,Unnamed: 1_level_0,Fare,Age,Embarked_C,Embarked_Q,Embarked_S
Pclass,Sex_female,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,41.2625,40.0,0,0,1
1,1,82.66455,35.0,0,0,1
2,0,13.0,30.0,0,0,1
2,1,22.0,28.0,0,0,1
3,0,7.925,25.0,0,0,1
3,1,12.475,21.5,0,0,1


In [6]:
rich_features_no_male.head(1).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 0 to 0
Data columns (total 7 columns):
Fare          1 non-null float64
Pclass        1 non-null int64
Age           1 non-null float64
Sex_female    1 non-null float64
Embarked_C    1 non-null float64
Embarked_Q    1 non-null float64
Embarked_S    1 non-null float64
dtypes: float64(6), int64(1)
memory usage: 64.0 bytes


In [7]:
rich_features_final = rich_features_no_male

In [8]:
# Populate the Median Age info based on above groupings

rich_features_final[(rich_features_final.Pclass == 1) & (rich_features_final.Sex_female == 0)] = rich_features_final[(rich_features_final.Pclass == 1) & (rich_features_final.Sex_female == 0)].fillna(40)
rich_features_final[(rich_features_final.Pclass == 1) & (rich_features_final.Sex_female == 1)] = rich_features_final[(rich_features_final.Pclass == 1) & (rich_features_final.Sex_female == 1)].fillna(35)
rich_features_final[(rich_features_final.Pclass == 2) & (rich_features_final.Sex_female == 0)] = rich_features_final[(rich_features_final.Pclass == 2) & (rich_features_final.Sex_female == 0)].fillna(30)
rich_features_final[(rich_features_final.Pclass == 2) & (rich_features_final.Sex_female == 1)] = rich_features_final[(rich_features_final.Pclass == 2) & (rich_features_final.Sex_female == 1)].fillna(28)
rich_features_final[(rich_features_final.Pclass == 3) & (rich_features_final.Sex_female == 0)] = rich_features_final[(rich_features_final.Pclass == 3) & (rich_features_final.Sex_female == 0)].fillna(25)
rich_features_final[(rich_features_final.Pclass == 3) & (rich_features_final.Sex_female == 1)] = rich_features_final[(rich_features_final.Pclass == 3) & (rich_features_final.Sex_female == 1)].fillna(22)

In [67]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score

logreg = LogisticRegression(C=1)
scores = cross_val_score(logreg, rich_features_final, target, cv=5, scoring='accuracy')
print("Logistic Regression CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
    scores.min(), scores.mean(), scores.max()))

Logistic Regression CV scores:
min: 0.775, mean: 0.789, max: 0.804
Wall time: 46 ms


In [17]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score

logreg = LogisticRegression(C=1)
scores = cross_val_score(logreg, rich_features_final, target, cv=5, scoring='accuracy')
print("Logistic Regression CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
    scores.min(), scores.mean(), scores.max()))

Logistic Regression CV scores:
min: 0.770, mean: 0.786, max: 0.810
Wall time: 13 ms


In [68]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score

logreg = LogisticRegression()
scores = cross_val_score(logreg, rich_features_final, target, cv=5, scoring='accuracy')
print("Logistic Regression CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
    scores.min(), scores.mean(), scores.max()))

Logistic Regression CV scores:
min: 0.775, mean: 0.789, max: 0.804
Wall time: 12 ms


In [None]:
# Non linear models work well if there are not too many features

In [69]:
%%time
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(rf, rich_features_final, target, cv=5, scoring='accuracy')
print("Random Forest CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
    scores.min(), scores.mean(), scores.max()))

Random Forest CV scores:
min: 0.781, mean: 0.801, max: 0.843
Wall time: 472 ms


In [71]:
%%time
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.3,subsample=.8, max_features=.5)
scores = cross_val_score(gb, rich_features_final, target, cv=5, scoring='accuracy')
print("Gradient Boosted Trees CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(scores.min(), scores.mean(), scores.max()))

Gradient Boosted Trees CV scores:
min: 0.816, mean: 0.829, max: 0.860
Wall time: 324 ms


In [31]:
# Automated Parameter Tuning

In [72]:
from sklearn.grid_search import GridSearchCV

gb = GradientBoostingClassifier(n_estimators=100, subsample=.8)

params = {
    'learning_rate': [0.05, 0.1, 0.5],
    'max_features': [0.5, 1],
    'max_depth': [3, 4, 5],
}
gs = GridSearchCV(gb, params, cv=5, scoring='roc_auc')
gs.fit(rich_features_final, target)

GridSearchCV(cv=5,
       estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2, n_estimators=100,
              random_state=None, subsample=0.8, verbose=0,
              warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'max_features': [0.5, 1], 'learning_rate': [0.05, 0.1, 0.5], 'max_depth': [3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='roc_auc', verbose=0)

In [33]:
gs.best_score_

0.8751210267720011

In [34]:
gs.best_params_

{'learning_rate': 0.05, 'max_depth': 5, 'max_features': 1}

# Support Vector Classifiers

In [10]:
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [15]:
clf = SVC(kernel='linear')
scores = cross_val_score(clf, rich_features_final, target, cv=5, scoring='accuracy')
print("Support Vector Classifers CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(scores.min(), scores.mean(), scores.max()))

Gradient Boosted Trees CV scores:
min: 0.753, mean: 0.787, max: 0.804


In [16]:
clf = SVC(kernel='rbf')
scores = cross_val_score(clf, rich_features_final, target, cv=5, scoring='accuracy')
print("Support Vector Classifers CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(scores.min(), scores.mean(), scores.max()))

Support Vector Classifers CV scores:
min: 0.615, mean: 0.686, max: 0.742


In [None]:
clf = SVC(kernel='poly')
scores = cross_val_score(clf, rich_features_final, target, cv=5, scoring='accuracy')
print("Support Vector Classifers CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(scores.min(), scores.mean(), scores.max()))

In [None]:
clf = SVC(kernel='precomputed')
scores = cross_val_score(clf, rich_features_final, target, cv=5, scoring='accuracy')
print("Support Vector Classifers CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(scores.min(), scores.mean(), scores.max()))

In [None]:
clf = SVC(kernel='sigmoid')
scores = cross_val_score(clf, rich_features_final, target, cv=5, scoring='accuracy')
print("Support Vector Classifers CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(scores.min(), scores.mean(), scores.max()))

In [None]:
from sklearn.grid_search import GridSearchCV
svc = SVC()
params = {
    'learning_rate': [0.05, 0.1, 0.5],
    'max_features': [0.5, 1],
    'max_depth': [3, 4, 5],
}
gs = GridSearchCV(gb, params, cv=5, scoring='roc_auc')
gs.fit(rich_features_final, target)

In [None]:
print(classification_report(target_test, target_predicted,
                            target_names=['not survived', 'survived']))