In [2]:
import csv
import numpy as np

with open('titanic-train.csv', 'rt') as f:
    data = list(csv.DictReader(f))
data[:1]

[{'Age': '22',
  'Cabin': '',
  'Embarked': 'S',
  'Fare': '7.25',
  'Name': 'Braund, Mr. Owen Harris',
  'Parch': '0',
  'PassengerId': '1',
  'Pclass': '3',
  'Sex': 'male',
  'SibSp': '1',
  'Survived': '0',
  'Ticket': 'A/5 21171'}]

In [3]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

_all_xs = [{k: v for k, v in row.items() if k != 'Survived'} for row in data]
_all_ys = np.array([int(row['Survived']) for row in data])

all_xs, all_ys = shuffle(_all_xs, _all_ys, random_state=0)
train_xs, valid_xs, train_ys, valid_ys = train_test_split(
    all_xs, all_ys, test_size=0.25, random_state=0)
print('{} items total, {:.1%} true'.format(len(all_xs), np.mean(all_ys)))

891 items total, 38.4% true


In [4]:
for x in all_xs:
    if x['Age']:
        x['Age'] = float(x['Age'])
    else:
        x.pop('Age')
    x['Fare'] = float(x['Fare'])
    x['SibSp'] = int(x['SibSp'])
    x['Parch'] = int(x['Parch'])

In [5]:
import warnings
# xgboost <= 0.6a2 shows a warning when used with scikit-learn 0.18+
warnings.filterwarnings('ignore', category=DeprecationWarning)
from xgboost import XGBClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score


clf = XGBClassifier()
vec = DictVectorizer()
pipeline = make_pipeline(vec, clf)

def evaluate(_clf):
    scores = cross_val_score(_clf, all_xs, all_ys, scoring='accuracy', cv=10)
    print('Accuracy: {:.3f} ± {:.3f}'.format(np.mean(scores), 2 * np.std(scores)))
    _clf.fit(train_xs, train_ys)  # so that parts of the original pipeline are fitted

evaluate(pipeline)

Accuracy: 0.823 ± 0.071


In [6]:
booster = clf.booster()
original_feature_names = booster.feature_names
booster.feature_names = vec.get_feature_names()
print(booster.get_dump()[0])
# recover original feature names
booster.feature_names = original_feature_names

0:[Sex=female<-9.53674e-07] yes=1,no=2,missing=1
	1:[Age<13] yes=3,no=4,missing=4
		3:[SibSp<2] yes=7,no=8,missing=7
			7:leaf=0.145455
			8:leaf=-0.125
		4:[Fare<26.2687] yes=9,no=10,missing=9
			9:leaf=-0.151515
			10:leaf=-0.0727273
	2:[Pclass=3<-9.53674e-07] yes=5,no=6,missing=5
		5:[Fare<12.175] yes=11,no=12,missing=12
			11:leaf=0.05
			12:leaf=0.175194
		6:[Fare<24.8083] yes=13,no=14,missing=14
			13:leaf=0.0365591
			14:leaf=-0.152



In [7]:
from eli5 import show_weights
show_weights(clf, vec=vec)

Weight,Feature
0.4278,Sex=female
0.1949,Pclass=3
0.0665,Embarked=S
0.051,Pclass=2
0.042,SibSp
0.0417,Cabin=
0.0385,Embarked=C
0.0358,Ticket=1601
0.0331,Age
0.0323,Fare


In [8]:
from eli5 import show_prediction
show_prediction(clf, valid_xs[1], vec=vec, show_feature_values=True)

Contribution?,Feature,Value
1.673,Sex=female,1.000
0.479,Embarked=S,Missing
0.07,Fare,7.879
-0.004,Cabin=,1.000
-0.006,Parch,0.000
-0.009,Pclass=2,Missing
-0.009,Ticket=1601,Missing
-0.012,Embarked=C,Missing
-0.071,SibSp,0.000
-0.073,Pclass=1,Missing


In [9]:
type(valid_xs[1])

dict

In [10]:
type(vec)

sklearn.feature_extraction.dict_vectorizer.DictVectorizer

In [11]:
import pandas as pd

data = pd.read_csv('titanic-train.csv')

In [14]:
data.loc[0]

PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                                 22
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object

In [19]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [26]:
columns_list = list(data.columns)
print(columns_list)
columns_list.remove('Survived')
print(columns_list)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [69]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

train_xs, valid_xs, train_ys, valid_ys = train_test_split(data[columns_list], data['Survived'], test_size=0.25, random_state=0)

In [70]:
import warnings
# xgboost <= 0.6a2 shows a warning when used with scikit-learn 0.18+
warnings.filterwarnings('ignore', category=DeprecationWarning)
from xgboost import XGBClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score



clf = XGBClassifier()
#vec = DictVectorizer()
pipeline = make_pipeline(clf)
dummies_train = pd.get_dummies(train_xs[['Sex','Ticket','Cabin','Embarked']])
train_xs.drop(['Name','Sex','Ticket','Cabin','Embarked'],inplace = True, axis = 1)
train_xs = pd.concat([train_xs,dummies_train],axis = 1)

dummies_test = pd.get_dummies(valid_xs[['Sex','Ticket','Cabin','Embarked']])
valid_xs.drop(['Name','Sex','Ticket','Cabin','Embarked'],inplace = True,axis = 1)
valid_xs = pd.concat([valid_xs,dummies_test],axis = 1)

common_columns= list(set(train_xs.columns) & set(valid_xs.columns))

train_xs2 = train_xs[common_columns]
clf.fit(train_xs2,train_ys)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [72]:
valid_xs.columns

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female',
       'Sex_male', 'Ticket_110413', 'Ticket_111361',
       ...
       'Cabin_E33', 'Cabin_E34', 'Cabin_E67', 'Cabin_F G73', 'Cabin_F2',
       'Cabin_F33', 'Cabin_G6', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object', length=263)

In [74]:
train_xs2.columns

Index(['Ticket_230080', 'Ticket_C.A. 33112', 'Ticket_250649', 'Ticket_17421',
       'Ticket_110413', 'Ticket_2627', 'Ticket_PC 17758', 'Ticket_230433',
       'Cabin_C52', 'Ticket_13502', 'Ticket_3101295', 'Ticket_113798',
       'Ticket_371110', 'Cabin_D36', 'Ticket_LINE', 'Cabin_D20', 'PassengerId',
       'Ticket_1601', 'Cabin_C126', 'Ticket_PC 17757', 'Ticket_19928',
       'Ticket_19996', 'Ticket_16966', 'SibSp', 'Ticket_111361',
       'Ticket_C.A. 2673', 'Ticket_STON/O2. 3101279', 'Ticket_19950',
       'Ticket_PC 17604', 'Ticket_C.A. 2315', 'Ticket_S.O.C. 14879',
       'Cabin_B96 B98', 'Ticket_2651', 'Ticket_237736', 'Cabin_F2',
       'Ticket_244252', 'Ticket_113781', 'Ticket_113760', 'Cabin_E67',
       'Ticket_367226', 'Ticket_P/PP 3381', 'Ticket_F.C.C. 13529',
       'Ticket_36928', 'Ticket_347088', 'Ticket_248727', 'Pclass',
       'Ticket_2665', 'Ticket_36947', 'Ticket_113505', 'Ticket_244367',
       'Ticket_CA 2144', 'Cabin_C124', 'Cabin_B58 B60', 'Ticket_35273',
    

In [75]:
order_list = clf.booster().feature_names

In [76]:
clf.predict_proba(valid_xs[order_list])

array([[ 0.84309262,  0.15690739],
       [ 0.94615322,  0.05384678],
       [ 0.91994107,  0.0800589 ],
       [ 0.05155241,  0.94844759],
       [ 0.31986612,  0.68013388],
       [ 0.75204951,  0.24795048],
       [ 0.07347041,  0.92652959],
       [ 0.10314101,  0.89685899],
       [ 0.48974121,  0.51025879],
       [ 0.32275409,  0.67724591],
       [ 0.82817972,  0.1718203 ],
       [ 0.14835775,  0.85164225],
       [ 0.86457872,  0.13542131],
       [ 0.04182297,  0.95817703],
       [ 0.03134012,  0.96865988],
       [ 0.14006585,  0.85993415],
       [ 0.88354975,  0.11645024],
       [ 0.93424553,  0.06575447],
       [ 0.82015395,  0.17984602],
       [ 0.06794041,  0.93205959],
       [ 0.93010831,  0.06989167],
       [ 0.18278867,  0.81721133],
       [ 0.94833612,  0.05166389],
       [ 0.75919342,  0.24080655],
       [ 0.52514112,  0.47485888],
       [ 0.03218102,  0.96781898],
       [ 0.74829042,  0.25170958],
       [ 0.27810788,  0.72189212],
       [ 0.11397916,

In [79]:
eli5.show_prediction(clf,valid_xs[order_list].loc[1])

ValueError: feature_names mismatch: ['Ticket_230080', 'Ticket_C.A. 33112', 'Ticket_250649', 'Ticket_17421', 'Ticket_110413', 'Ticket_2627', 'Ticket_PC 17758', 'Ticket_230433', 'Cabin_C52', 'Ticket_13502', 'Ticket_3101295', 'Ticket_113798', 'Ticket_371110', 'Cabin_D36', 'Ticket_LINE', 'Cabin_D20', 'PassengerId', 'Ticket_1601', 'Cabin_C126', 'Ticket_PC 17757', 'Ticket_19928', 'Ticket_19996', 'Ticket_16966', 'SibSp', 'Ticket_111361', 'Ticket_C.A. 2673', 'Ticket_STON/O2. 3101279', 'Ticket_19950', 'Ticket_PC 17604', 'Ticket_C.A. 2315', 'Ticket_S.O.C. 14879', 'Cabin_B96 B98', 'Ticket_2651', 'Ticket_237736', 'Cabin_F2', 'Ticket_244252', 'Ticket_113781', 'Ticket_113760', 'Cabin_E67', 'Ticket_367226', 'Ticket_P/PP 3381', 'Ticket_F.C.C. 13529', 'Ticket_36928', 'Ticket_347088', 'Ticket_248727', 'Pclass', 'Ticket_2665', 'Ticket_36947', 'Ticket_113505', 'Ticket_244367', 'Ticket_CA 2144', 'Cabin_C124', 'Cabin_B58 B60', 'Ticket_35273', 'Cabin_G6', 'Ticket_367230', 'Ticket_S.C./PARIS 2079', 'Ticket_364516', 'Ticket_31027', 'Ticket_28403', 'Cabin_D17', 'Cabin_E33', 'Ticket_2666', 'Embarked_C', 'Sex_male', 'Ticket_358585', 'Parch', 'Cabin_C125', 'Ticket_PC 17569', 'Embarked_S', 'Ticket_PC 17558', 'Cabin_C65', 'Ticket_11767', 'Ticket_PP 9549', 'Ticket_PC 17572', 'Ticket_239853', 'Ticket_347054', 'Ticket_PC 17582', 'Cabin_F33', 'Cabin_B18', 'Cabin_C23 C25 C27', 'Ticket_2691', 'Ticket_C.A. 37671', 'Embarked_Q', 'Ticket_349909', 'Sex_female', 'Age', 'Ticket_11967', 'Ticket_347742', 'Cabin_B49', 'Ticket_3101278', 'Ticket_382652', 'Ticket_26360', 'Cabin_C78', 'Fare', 'Ticket_347082'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95']
expected Ticket_230080, Ticket_C.A. 33112, Ticket_250649, Ticket_17421, Ticket_110413, Ticket_2627, Ticket_PC 17758, Ticket_230433, Cabin_C52, Ticket_13502, Ticket_3101295, Ticket_113798, Ticket_371110, Cabin_D36, Ticket_LINE, Cabin_D20, PassengerId, Ticket_1601, Cabin_C126, Ticket_PC 17757, Ticket_19928, Ticket_19996, Ticket_16966, SibSp, Ticket_111361, Ticket_C.A. 2673, Ticket_STON/O2. 3101279, Ticket_19950, Ticket_PC 17604, Ticket_C.A. 2315, Ticket_S.O.C. 14879, Cabin_B96 B98, Ticket_2651, Ticket_237736, Cabin_F2, Ticket_244252, Ticket_113781, Ticket_113760, Cabin_E67, Ticket_367226, Ticket_P/PP 3381, Ticket_F.C.C. 13529, Ticket_36928, Ticket_347088, Ticket_248727, Pclass, Ticket_2665, Ticket_36947, Ticket_113505, Ticket_244367, Ticket_CA 2144, Cabin_C124, Cabin_B58 B60, Ticket_35273, Cabin_G6, Ticket_367230, Ticket_S.C./PARIS 2079, Ticket_364516, Ticket_31027, Ticket_28403, Cabin_D17, Cabin_E33, Ticket_2666, Embarked_C, Sex_male, Ticket_358585, Parch, Cabin_C125, Ticket_PC 17569, Embarked_S, Ticket_PC 17558, Cabin_C65, Ticket_11767, Ticket_PP 9549, Ticket_PC 17572, Ticket_239853, Ticket_347054, Ticket_PC 17582, Cabin_F33, Cabin_B18, Cabin_C23 C25 C27, Ticket_2691, Ticket_C.A. 37671, Embarked_Q, Ticket_349909, Sex_female, Age, Ticket_11967, Ticket_347742, Cabin_B49, Ticket_3101278, Ticket_382652, Ticket_26360, Cabin_C78, Fare, Ticket_347082 in input data
training data did not have the following fields: f21, f86, f83, f85, f4, f6, f44, f89, f67, f29, f19, f91, f84, f50, f0, f3, f95, f32, f38, f56, f14, f60, f24, f27, f46, f5, f92, f23, f25, f22, f87, f54, f8, f80, f10, f39, f93, f2, f55, f26, f1, f12, f72, f64, f53, f20, f36, f15, f11, f79, f69, f90, f34, f17, f75, f61, f76, f73, f30, f82, f31, f13, f59, f52, f33, f18, f58, f9, f47, f45, f88, f40, f43, f63, f62, f16, f65, f48, f28, f37, f68, f70, f77, f51, f74, f41, f78, f7, f94, f66, f81, f57, f42, f35, f49, f71