In [162]:
%matplotlib inline
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [242]:
from sklearn.model_selection import train_test_split

from  sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [243]:
train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")
data=train.append(test).reset_index(drop=True)

## Convert catagorical data

### Embarked

In [165]:
train["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [166]:
data["Embarked"].fillna(value="S",inplace=True)
new_embarked=pd.get_dummies(data['Embarked'],prefix='Embarked')
data = pd.concat([data,new_embarked],axis=1)

### Sex

In [167]:
data["Sex"]=data["Sex"].replace(to_replace='male',value=0)
data["Sex"]=data["Sex"].replace(to_replace='female',value=1)


## Adding an _accompanied_ feature

In [168]:
def accompanied(df):
    df["Accompanied"]=df["Parch"]+df["SibSp"]
    df['Accompanied'].loc[df['Accompanied'] > 0] = 1
    df['Accompanied'].loc[df['Accompanied'] == 0] = 0
    return df

In [169]:
data=accompanied(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## Adding a _Family Size_ Feature

In [170]:
def FamSize(df):
    df["FamSize"]= df["Parch"] + df["SibSp"] +1 
    return df

In [171]:
data=FamSize(data)

## Dealing with _Name_ feature

In [172]:
data["Title"]=data['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())

In [173]:
def married(title):
    MarriedWomanTitles=["Mrs","Lady","Mme","the Countess"]
    if title in MarriedWomanTitles:
        return 1
    else:
        return 0

In [174]:
data["MarriedWoman"]=data["Title"].apply(married)

In [175]:
def special(title):
    SpecialTitles=['Don','Major','Lady', 'Sir','Col','the Countess','Jonkheer']
    if title in SpecialTitles:
        return 1
    else:
        return 0

In [176]:
data["SpecialTitle"]=data["Title"].apply(special)

In [177]:
data=data.drop("Title",axis=1)

## Dealing with _Cabin_ feature

In [178]:
def cabin(df):
    df["Cabin"]=df["Cabin"].apply(lambda x: str(x)[0])
    new_cabins=pd.get_dummies(df['Cabin'],prefix='Cabin')
    df=pd.concat([df,new_cabins],axis=1)
    #df=df.drop("Cabin",axis=1)
    return df

In [179]:
data=cabin(data)

In [180]:
def cabin2float(cabin):
    if str(cabin)=="A":
        return 0
    elif str(cabin)=="B":
        return 1
    elif str(cabin)=="C":
        return 2
    elif str(cabin)=="D":
        return 3
    elif str(cabin)=="E":
        return 4
    elif str(cabin)=="F":
        return 5
    elif str(cabin)=="G":
        return 6
    elif str(cabin)=="T":
        return 7
    else:
        return 8

In [181]:
data["Cabin"]=data["Cabin"].apply(lambda x: cabin2float(x))

## _Fare_

In [182]:
data["Fare"].isnull().value_counts()

False    1308
True        1
Name: Fare, dtype: int64

In [183]:
data[data["Fare"].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,SpecialTitle,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_n
1043,60.5,8,S,,"Storey, Mr. Thomas",0,1044,3,0,0,...,0,0,0,0,0,0,0,0,0,1


In [184]:
data.loc[((data["Pclass"]==3) & (data["Sex"]==0)),"Fare"].median()

7.8958

In [185]:
data.loc[1043,"Fare"]=7.8958

In [186]:
data["Fare"].isnull().value_counts()

False    1309
Name: Fare, dtype: int64

## _Ages_

In [187]:
old_ages=data["Age"]

In [188]:
data["AgeBand"]=pd.cut(data["Age"],5)
data[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(0.0902, 16.136]",0.55
1,"(16.136, 32.102]",0.369942
2,"(32.102, 48.068]",0.404255
3,"(48.068, 64.034]",0.434783
4,"(64.034, 80.0]",0.090909


In [189]:
age2band={0:[0.092,16.136],1: [16.136, 32.102],2:[32.102, 48.068],3:[48.068, 64.034],4:[64.034, 80.0]}

In [190]:
data.loc[data["Age"]<=16.136,"AgeBin"]=0
data.loc[(data["Age"]>16.136) & (train["Age"]<=32.102),"AgeBin"]=1
data.loc[(data["Age"]>32.102) & (train["Age"]<=48.068),"AgeBin"]=2
data.loc[(data["Age"]>48.068) & (train["Age"]<=64.034),"AgeBin"]=3
data.loc[(data["Age"]>64.034) & (train["Age"]<=80.0),"AgeBin"]=4
data=data.drop("AgeBand",axis=1)

In [191]:
data[['AgeBin', 'Survived']].groupby(['AgeBin'], as_index=False).mean().sort_values(by='AgeBin', ascending=True)

Unnamed: 0,AgeBin,Survived
0,0.0,0.55
1,1.0,0.369942
2,2.0,0.404255
3,3.0,0.434783
4,4.0,0.090909


In [203]:
age_features=['Pclass', 'Sex','SibSp',
       'Parch', 'Fare', 'Cabin', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Accompanied', 'FamSize', 'MarriedWoman',
       'SpecialTitle', 'AgeBin']

In [204]:
#age_features=["Sex","Pclass","Fare",'AgeBin']

In [205]:
X=data[age_features].dropna().drop("AgeBin",axis=1)
y=data[age_features].dropna()["AgeBin"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [206]:
Age_rf = RandomForestClassifier(n_estimators=100)
Age_rf = Age_rf.fit(X_train,y_train)
Age_rf.score(X_train,y_train)

0.87878787878787878

In [207]:
Age_rf.score(X_test,y_test)

0.48128342245989303

In [208]:
Age_rf = RandomForestClassifier(n_estimators=100)
Age_rf = Age_rf.fit(X,y)
Age_rf.score(X,y)

0.85828877005347592

In [209]:
age_pred=Age_rf.predict(X)

In [210]:
missing_age_indexes=data['Age'].index[data['Age'].apply(np.isnan)]


In [211]:
new_age_bands=Age_dt.predict(data.loc[missing_age_indexes,age_features].drop("AgeBin",axis=1))

In [212]:
def random_age_from_band(data):
    random_ages=[]
    for i in data:
        random_ages.append(np.random.randint(low=age2band[i][0],high=age2band[i][1]))
        #print(i,np.random.randint(low=age2band[i][0],high=age2band[i][1]))
    return np.array(random_ages)

In [213]:
new_ages=random_age_from_band(new_age_bands)

In [214]:
data["AgeBin"].loc[missing_age_indexes]=new_age_bands

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [215]:
data["Age"].loc[missing_age_indexes]=new_ages

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


# Model Data

In [238]:
features=['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Cabin', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Accompanied', 'FamSize', 'MarriedWoman', 'SpecialTitle']

In [234]:
features=['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch',
       'Fare', 'Cabin', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Accompanied', 'FamSize', 'MarriedWoman', 'SpecialTitle']

In [239]:
X=data[features].dropna().drop("Survived",axis=1)
y=data[features].dropna()["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(len(X),len(y))
print(y.mean())

891 891
0.383838383838


In [240]:
gb=GradientBoostingClassifier()
gb=gb.fit(X_train,y_train)
gb.score(X_train,y_train)

0.91467065868263475

In [241]:
gb.score(X_test,y_test)

0.79372197309417036

In [232]:
test_pred=gb.predict(data[data["Survived"].isnull()][features].drop("Survived",axis=1))

In [229]:
submit=pd.DataFrame({"PassengerId":test["PassengerId"],"Survived":test_pred})