In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn import cross_validation, linear_model, metrics, pipeline, preprocessing
import math
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
survivals = pd.read_csv("real_output.csv")

**Data analysis**

In [3]:
train_data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [4]:
#let's check if we have missing data
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
#let's take a look at data distributions
fig, axes = plt.subplots(3, 3, figsize=(9, 10))

sns.countplot(x='Survived', data=train_data, ax=axes[0,0])
sns.countplot(x='Pclass', data=train_data, ax=axes[0,1])
sns.countplot(x='Sex', data=train_data, ax=axes[0,2])
train_data[["Age", "SibSp", "Parch"]].hist(ax=axes[1,:])
train_data[["Fare"]].hist(ax=axes[2,0])
sns.countplot(x='Embarked', data=train_data, ax=axes[2,1])

<matplotlib.axes._subplots.AxesSubplot at 0xd092b00>

In [6]:
#from first chart we can see that females had higher chances to survive
#also we see that higher cabin class seems to give higher chance to survive too
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(9,4))
sns.countplot(x='Survived', hue="Sex", data=train_data, ax=axis1)
sns.countplot(x='Survived', hue="Pclass", data=train_data, ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0xe0edb70>

In [7]:
select = train_data[["Survived", "Pclass", "Age", "SibSp", "Parch", "Fare"]]

sex_mapping = {'female' : 0, 'male' : 1}
embarked_mapping = {'S' : 0, 'Q' : 1, 'C' : 2}

select = select.assign(Sex=[sex_mapping[item] for item in train_data["Sex"]])
select = select.assign(Embarked=[embarked_mapping[item] for item in train_data["Embarked"].fillna('S')])

In [8]:
#what we see here are 1) already mentioned correlation between Sex/Survived and Class/Survived 
#2) Fare/Class as expected 3) Age/Class (probably more aged people can afford to buy more expensive ones)
sns.heatmap(select.corr().abs(), square=True, annot=True, cmap="OrRd")

<matplotlib.axes._subplots.AxesSubplot at 0xe0edb70>

**Missing data**

In [9]:
#There are only 2 missing point in Embarked column which could be filled with S 
train_data["Embarked"] = train_data["Embarked"].fillna('S')

In [10]:
#There are almost 20% NA data points in Age so things might be little more complicated
train_data["Age"].describe()
#We can fill them with mean according to Sex but let's try out another approach



count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%             NaN
50%             NaN
75%             NaN
max       80.000000
Name: Age, dtype: float64

In [11]:
sns.boxplot(x="Pclass", y="Age", data=train_data);
# According to this chart we might fill NA ages with means according to passenger cabin class
# what would be better then previous  

In [12]:
# But let's try another approach and look at Name column. We can get passenger's Title from it:
train_data["Title"] = train_data["Name"].map(lambda x: re.search(' ([A-Za-z]+)\.', x).group(1))
sns.countplot(y='Title', data=train_data, orient="v")

<matplotlib.axes._subplots.AxesSubplot at 0xe0edb70>

In [13]:
#now let's take a look who have a missing age by Title value
train_data["Title"][pd.isnull(train_data["Age"])].value_counts()

Mr        119
Miss       36
Mrs        17
Master      4
Dr          1
Name: Title, dtype: int64

In [14]:
# we fill them by mean value according to Title and Class
train_data["Age"] = train_data.groupby(['Title', 'Pclass'])['Age'].transform(lambda x: x.replace(np.nan, x.mean()))

In [15]:
# also there are few rows with zero ticket price
train_data[train_data["Fare"] == 0].head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
179,180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0,,S,Mr
263,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,B94,S,Mr
271,272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0,,S,Mr


In [16]:
#let's replace them with mean by class
train_data["Fare"] = train_data.groupby(['Pclass'])['Fare'].transform(lambda x: x.replace(0.0, x.mean()))

In [17]:
sns.boxplot(x="Title", 
            y="Age", 
            order=["Capt","Col","Major","Sir","Lady","Rev",
                   "Dr","Don","Jonkheer","Countess","Mrs", 
                    "Ms","Mr","Mme","Mlle","Miss","Master"], 
            data=train_data);

In [18]:
#Let's keep only 5 groups: Aristocratic, Mr, Mrs, Miss и Master
def change_title(title, fltr, new_title):
    if title in fltr: return new_title
    else: return title

train_data["Title"] = train_data["Title"].map(lambda x: change_title(x, ["Capt", "Col", "Don", 
                                                                         "Dr", "Jonkheer", "Lady", 
                                                                         "Major", "Rev", "Sir", "Countess"], "Aristocratic"))
train_data["Title"] = train_data["Title"].map(lambda x: change_title(x, ["Ms"], "Mrs"))
train_data["Title"] = train_data["Title"].map(lambda x: change_title(x, ["Mlle", "Mme"], "Miss"))

In [19]:
#looks nice!
sns.boxplot(x="Title", 
            y="Age",
            order=["Aristocratic", "Mrs", "Mr", "Miss", "Master"],
            data=train_data);

In [20]:
#here we can see percentage survived according to Title
title_perc = train_data[["Title", "Survived"]].groupby(['Title'],as_index=False).mean()
sns.barplot(x='Title', y='Survived', data=title_perc, palette=sns.color_palette("hls", 8))

<matplotlib.axes._subplots.AxesSubplot at 0xe0edb70>

In [21]:
#So we can prove again that it was more likely to survive for a women of 1 and 2 classes
#or for men of 1 class mostly
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(9,6))
sns.stripplot(x="Pclass", y="Age", hue="Survived", 
              data=train_data[train_data["Sex"] == "female"], jitter=True, ax=axis1)
sns.stripplot(x="Pclass", y="Age", hue="Survived", 
              data=train_data[train_data["Sex"] == "male"], jitter=True, ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x10495630>

In [22]:
#what about relatives onboard? Let's see how survival depends on Family size
train_data["Family"] = train_data["SibSp"] + train_data["Parch"]
sns.countplot(x='Family', hue="Survived", data=train_data[train_data["Sex"] == "male"])

<matplotlib.axes._subplots.AxesSubplot at 0x10495630>

In [23]:
#let's also add new binary column Single if psngr doesn't have family onboard
train_data["Single"] = (train_data["Family"] == 0).astype(int)

In [24]:
#CABIN. We don't have many cabin numbers available (20%) so there are no point in analysing it 
# but what can we get from availability of these numbers?
train_data["Cabin"] = train_data["Cabin"].map(lambda x: 0 if (pd.isnull(x)) else 1)

In [25]:
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(9,6))
sns.countplot(x='Cabin', hue="Survived", data=train_data, ax=axis1)
sns.countplot(x='Cabin', hue="Survived", 
              data=train_data[train_data["Sex"] == "male"], ax=axis2)

#so we see it may be important one specially for men

<matplotlib.axes._subplots.AxesSubplot at 0x10821828>

In [28]:
#so we grab all features and let's look at a correlation between them:
X = train_data[["Survived", "Age", "Family", "Fare", "Single", "Cabin"]]

X["BigFamily"] = (train_data["Family"] > 3).astype(int)
X["SmallFamily"] = (train_data["Family"] < 4).astype(int)

#Sex as Male
X = X.assign(Male=[{'female' : 0, 'male' : 1}[item] for item in train_data["Sex"]])

#Titles
title_dummies_titanic  = pd.get_dummies(train_data['Title'])
title_dummies_titanic.columns = ['Aristocratic','Mr','Master','Miss','Mrs']
X = X.join(title_dummies_titanic)

#Pclass
class_dummies_titanic  = pd.get_dummies(train_data['Pclass'])
class_dummies_titanic.columns = ['Class1','Class2','Class3']
X = X.join(class_dummies_titanic)

#Embarked
embarked_dummies_titanic  = pd.get_dummies(train_data['Embarked'])
embarked_dummies_titanic.columns = ['EmbarkedP','EmbarkedQ','EmbarkedS']
X = X.join(embarked_dummies_titanic)

sns.heatmap(X.corr(), square=True, annot=True, annot_kws={'fontsize' :8}, cmap="OrRd")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


<matplotlib.axes._subplots.AxesSubplot at 0x10821828>

In [131]:
#All columns:
X.columns

Index([u'Survived', u'Age', u'Family', u'Fare', u'Single', u'Cabin',
       u'BigFamily', u'SmallFamily', u'Male', u'Aristocratic', u'Mr',
       u'Master', u'Miss', u'Mrs', u'Class1', u'Class2', u'Class3',
       u'EmbarkedP', u'EmbarkedQ', u'EmbarkedS', u'Mr_Class3'],
      dtype='object')

In [134]:
def run_model(regressor, data, X_features, y_features):    
    X_ = data[X_features]
    y_ = data[y_features]
    
    accuracy_scoring = cross_validation.cross_val_score(regressor, X_, y_, scoring = accuracy_scorer, cv = cv_strategy)
    roc_auc_scoring = cross_validation.cross_val_score(regressor, X_, y_, scoring = roc_auc_scorer, cv = cv_strategy)
    regressor.fit(X_, y_)
    
    print("Accuracy mean:", accuracy_scoring.mean())
    print("ROC AUC mean:", roc_auc_scoring.mean())
    print("Real accuracy:", metrics.accuracy_score(survivals["survived"], regressor.predict(T[X_features])))
    
#    for w in list(zip([ '%.2f' % elem for elem in regressor.coef_[0]], X_.columns)):
#        print(w)
    
#lets create 2 scores to score our models
accuracy_scorer = metrics.make_scorer(metrics.accuracy_score)
roc_auc_scorer = metrics.make_scorer(metrics.roc_auc_score) 

cv_strategy = cross_validation.StratifiedShuffleSplit(X["Survived"], n_iter = 20 , 
                                                      test_size = 0.2, 
                                                      random_state = 2)

lr = linear_model.LogisticRegression(C=0.1)
run_model(lr, X, ['Age', 'Family', 'Fare', 'Single', 'Cabin', 'BigFamily',
           'Male', 'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3', 'EmbarkedQ',
           'EmbarkedS'], "Survived")

('Accuracy mean:', 0.8273743016759777)
('ROC AUC mean:', 0.80944334650856375)
('Real accuracy:', 0.77751196172248804)


In [135]:
#let's remove Fare (not statistical reasonable) and Sex (as Title already contains it) and EmbarkedQ

run_model(lr, X, ['Age', 'Family', 'Single', 'Cabin', 'BigFamily',
           'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
           'EmbarkedS'], "Survived")

('Accuracy mean:', 0.82821229050279332)
('ROC AUC mean:', 0.81161067193675895)
('Real accuracy:', 0.78947368421052633)


In [138]:
#let's remove Single (we have Family) and create separate feature for 3class men

X["Mr_Class3"] = (X["Class3"] * X["Mr"] == 1).astype(int)
T["Mr_Class3"] = (T["Class3"] * T["Mr"] == 1).astype(int)

run_model(lr, X, ['Age', 'Family', 'Cabin', 'Mr_Class3',
           'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
           'EmbarkedS'], "Survived")

('Accuracy mean:', 0.82709497206703908)
('ROC AUC mean:', 0.80840579710144933)
('Real accuracy:', 0.78708133971291872)


In [139]:
#Ok last chance. Lets do feature scalling for Age
X["Age"] = (X["Age"] - X["Age"].mean())/X["Age"].std()
run_model(lr, X, ['Age', 'Family', 'Cabin', 'Mr_Class3',
           'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
           'EmbarkedS'], "Survived")

('Accuracy mean:', 0.82709497206703908)
('ROC AUC mean:', 0.80840579710144933)
('Real accuracy:', 0.78708133971291872)


In [301]:
#Now lets try with Random forest
def run_model(regressor, data, X_features, y_features):    
    X_ = data[X_features]
    y_ = data[y_features]
    
    #accuracy_scoring = cross_validation.cross_val_score(regressor, X_, y_, scoring = accuracy_scorer, cv = cv_strategy)
    #roc_auc_scoring = cross_validation.cross_val_score(regressor, X_, y_, scoring = roc_auc_scorer, cv = cv_strategy)
    regressor.fit(X_, y_)
    
    #print("Accuracy mean:", accuracy_scoring.mean())
    #print("ROC AUC mean:", roc_auc_scoring.mean())
    sc = metrics.accuracy_score(survivals["survived"], regressor.predict(T[X_features]))
    if (sc > 0.79):
        print("Real accuracy:", sc)

rfc = RandomForestClassifier(n_estimators=140, max_features=2, max_depth=4)

run_model(rfc, X, ['Age', 'Family', 'Fare', 'Single', 'Cabin', 'BigFamily', 'Aristocratic',
           'Male', 'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3', 'EmbarkedP', 'EmbarkedQ',
           'EmbarkedS'], "Survived")

def rf_model(n, f, d):
    rfc = RandomForestClassifier(n_estimators=n, max_features=f, max_depth=f,random_state=2)
    run_model(rfc, X, ['Age', 'Family', 'Cabin', 'Mr_Class3',
           'Master', 'Miss', 'Mrs', 'Class2', 'Class3'], "Survived")

#run_model(rfc, X, ['Age', 'Family', 'Single', 'Cabin', 'BigFamily',
#           'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
#           'EmbarkedS'], "Survived")
#
#run_model(rfc, X, ['Age', 'Family', 'Cabin', 'Mr_Class3',
#           'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
#           'EmbarkedS'], "Survived")

In [302]:
n_estimators_= [120, 130, 140, 150]
max_features_= [2, 3, 4]
max_depth_= [1, 2, 3, 4, 5]

for n in n_estimators_:
    for f in max_features_:
        for d in max_depth_:
            print(n, f, d)
            rf_model(n, f, d)

#120, 2, 2, 08038 rs=3

(120, 2, 1)
(120, 2, 2)
(120, 2, 3)
(120, 2, 4)
(120, 2, 5)
(120, 3, 1)
(120, 3, 2)
(120, 3, 3)
(120, 3, 4)
(120, 3, 5)
(120, 4, 1)
(120, 4, 2)
(120, 4, 3)
(120, 4, 4)
(120, 4, 5)
(130, 2, 1)
('Real accuracy:', 0.80143540669856461)
(130, 2, 2)
('Real accuracy:', 0.80143540669856461)
(130, 2, 3)
('Real accuracy:', 0.80143540669856461)
(130, 2, 4)
('Real accuracy:', 0.80143540669856461)
(130, 2, 5)
('Real accuracy:', 0.80143540669856461)
(130, 3, 1)
(130, 3, 2)
(130, 3, 3)
(130, 3, 4)
(130, 3, 5)
(130, 4, 1)
(130, 4, 2)
(130, 4, 3)
(130, 4, 4)
(130, 4, 5)
(140, 2, 1)
(140, 2, 2)
(140, 2, 3)
(140, 2, 4)
(140, 2, 5)
(140, 3, 1)
(140, 3, 2)
(140, 3, 3)
(140, 3, 4)
(140, 3, 5)
(140, 4, 1)
(140, 4, 2)
(140, 4, 3)
(140, 4, 4)
(140, 4, 5)
(150, 2, 1)
('Real accuracy:', 0.79425837320574166)
(150, 2, 2)
('Real accuracy:', 0.79425837320574166)
(150, 2, 3)
('Real accuracy:', 0.79425837320574166)
(150, 2, 4)
('Real accuracy:', 0.79425837320574166)
(150, 2, 5)
('Real accuracy:', 0.79425837320574166)


In [209]:
#we see that we don't increase random forest score anyhow. lets try GradientBoostingClassifier
cls = GradientBoostingClassifier()
run_model(cls, X, ['Age', 'Family', 'Fare', 'Single', 'Cabin', 'BigFamily',
           'Male', 'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3', 'EmbarkedQ',
           'EmbarkedS'], "Survived")
run_model(cls, X, ['Age', 'Family', 'Single', 'Cabin', 'BigFamily',
           'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
           'EmbarkedS'], "Survived")
run_model(cls, X, ['Age', 'Family', 'Cabin', 'Mr_Class3',
           'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
           'EmbarkedS'], "Survived")

('Real accuracy:', 0.76555023923444976)
('Real accuracy:', 0.75598086124401909)
('Real accuracy:', 0.76555023923444976)


In [212]:
#that was little better. let's try out with SVM:
from sklearn.svm import SVC

#SVC is very sensitive to unscalled data
X["Fare"] = (X["Fare"] - X["Fare"].mean())/X["Fare"].std()

lin_svm = SVC(C=0.5)
run_model(lin_svm, X, ['Age', 'Family', 'Fare', 'Single', 'Cabin', 'BigFamily',
           'Male', 'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3', 'EmbarkedQ',
           'EmbarkedS'], "Survived")
run_model(lin_svm, X, ['Age', 'Family', 'Single', 'Cabin', 'BigFamily',
           'Mr', 'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
           'EmbarkedS'], "Survived")
run_model(lin_svm, X, ['Age', 'Family', 'Cabin', 'Mr_Class3',
           'Master', 'Miss', 'Mrs', 'Class2', 'Class3',
           'EmbarkedS'], "Survived")

('Real accuracy:', 0.77511961722488043)
('Real accuracy:', 0.77033492822966509)
('Real accuracy:', 0.77033492822966509)


In [37]:
#So as we see SVC predict better than other methods so let's try to predict survivals for our test data
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [38]:
#let's fill missing data as we did for train data AGE
test_data["Title"] = test_data["Name"].map(lambda x: re.search(' ([A-Za-z]+)\.', x).group(1))
test_data["Title"] = test_data["Title"].map(lambda x: change_title(x, ["Capt", "Col", "Don", "Dona", 
                                                                         "Dr", "Jonkheer", "Lady", 
                                                                         "Major", "Rev", "Sir", "Countess"], "Aristocratic"))
test_data["Title"] = test_data["Title"].map(lambda x: change_title(x, ["Ms"], "Mrs"))
test_data["Title"] = test_data["Title"].map(lambda x: change_title(x, ["Mlle", "Mme"], "Miss"))

test_data["Age"] = test_data.groupby(['Title', 'Pclass'])['Age'].transform(lambda x: x.replace(np.nan, x.mean()))

In [39]:
#missing Fare
test_data["Fare"] = test_data.groupby(['Pclass'])['Fare'].transform(lambda x: x.replace(0.0, x.mean()))
test_data["Fare"] = test_data.groupby(['Pclass'])['Fare'].transform(lambda x: x.replace(np.nan, x.mean()))

In [40]:
#missing cabin
test_data["Cabin"] = test_data["Cabin"].map(lambda x: 0 if (pd.isnull(x)) else 1)

In [41]:
test_data["Family"] = test_data["SibSp"] + test_data["Parch"]
test_data["Single"] = (test_data["Family"] == 0).astype(int)
test_data.drop("Name", axis=1, inplace=True)
test_data.drop("Ticket", axis=1, inplace=True)

In [129]:
T = test_data[["Age", "Fare", "Single", "Family", "Cabin"]]

T["BigFamily"] = (test_data["Family"] > 3).astype(int)
T["SmallFamily"] = (test_data["Family"] < 4).astype(int)

#Sex as Male
T = T.assign(Male=[{'female' : 0, 'male' : 1}[item] for item in test_data["Sex"]])

#Titles
title_dummies_test  = pd.get_dummies(test_data['Title'])
title_dummies_test.columns = ['Aristocratic','Mr','Master','Miss','Mrs']
T = T.join(title_dummies_test)

#Pclass
class_dummies_test  = pd.get_dummies(test_data['Pclass'])
class_dummies_test.columns = ['Class1','Class2','Class3']
T = T.join(class_dummies_test)

#Embarked
embarked_dummies_test  = pd.get_dummies(test_data['Embarked'])
embarked_dummies_test.columns = ['EmbarkedP','EmbarkedQ','EmbarkedS']
T = T.join(embarked_dummies_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [130]:
#also scalling for test data
T["Fare"] = (T["Fare"] - T["Fare"].mean())/T["Fare"].std()
T["Age"] = (T["Age"] - T["Age"].mean())/T["Age"].std()

In [44]:
X.head()

Unnamed: 0,Survived,Age,Family,Fare,Single,Cabin,BigFamily,SmallFamily,Male,Aristocratic,...,Master,Miss,Mrs,Class1,Class2,Class3,EmbarkedP,EmbarkedQ,EmbarkedS,Mr_Class3
0,0,-0.548191,1,-0.515736,0,0,0,1,1,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
1,1,0.633104,1,0.772917,0,1,0,1,0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0
2,1,-0.252867,0,-0.502152,1,0,0,1,0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
3,1,0.411611,1,0.406983,0,1,0,1,0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0
4,0,0.411611,0,-0.499636,1,0,0,1,1,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0


In [45]:
cols = ['Age', 'Fare', 'Single', 'Cabin', 'BigFamily', 'SmallFamily',
           'Male', 'Mr', 'Master', 'Miss', 'Mrs', 'Aristocratic', 'Class1', 'Class2', 'Class3']

In [46]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

clf = ExtraTreesClassifier(n_estimators=200)
clf = clf.fit(X[cols], X["Survived"])

features = pd.DataFrame()
features['feature'] = cols
features['importance'] = clf.feature_importances_
features.sort(['importance'],ascending=False)



Unnamed: 0,feature,importance
1,Fare,0.241879
0,Age,0.232694
9,Miss,0.128112
6,Male,0.117745
14,Class3,0.041673
3,Cabin,0.038381
8,Master,0.037416
10,Mrs,0.035072
5,SmallFamily,0.026213
12,Class1,0.024986


In [232]:
cols = ['Age', 'Fare', 'Single', 'Cabin', 'Family',
           'Male', 'Mr', 'Master', 'Miss', 'Mrs', 'Class3']

In [1]:
#and finally let's predict survivals!!!
rfc = RandomForestClassifier(n_estimators=150, max_depth=3, max_features=4)

rfc.fit(X[cols], X["Survived"])
#print(metrics.accuracy_score(rfc.predict(X[cols]), X["Survived"]))

print(metrics.accuracy_score(rfc.predict(T[cols]), survivals["survived"]))

NameError: name 'RandomForestClassifier' is not defined

In [248]:
svm = SVC(C=1.0)
svm.fit(X[cols], X["Survived"])
print(metrics.accuracy_score(svm.predict(T[cols]), survivals["survived"]))

predictions = svm.predict(T[cols])

0.775119617225


In [None]:
submission = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": predictions
    })
submission.to_csv('titanic.csv', index=False)