In [116]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display 
import seaborn as sns; sns.set()
import visuals as vs
from sklearn.preprocessing import MinMaxScaler, Imputer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import pairwise_distances_argmin, silhouette_score, accuracy_score, confusion_matrix
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
%matplotlib inline 

In [117]:
# Load the dataset
try:
    dataset = pd.read_csv("data/train.csv")
    print("Passengers dataset has {} samples with {} features each.".format(*dataset.shape))
except Exception as e:
    print("Dataset could not be loaded. Is the dataset missing? ", str(e))

Passengers dataset has 891 samples with 12 features each.


In [118]:
dataset.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [129]:
#Delete PassengerId
def deleteId(dataset) :
    #dataset.drop('PassengerId',axis=1, inplace=True)
    dataset.drop('Ticket', axis=1, inplace=True)
    return dataset

In [130]:
#Encode Sex and get dummies for Embarked and Pclass
def encodeColumns(dataset) :
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].value_counts().index[0])
    dataset = pd.get_dummies(dataset, columns=['Embarked'])
    dataset['Sex'].replace(['female','male'],[0,1],inplace=True)
    dataset = pd.get_dummies(dataset, columns=['Pclass'])
    return dataset

In [131]:
def processName(dataset) :
    dataset['NameLength'] = dataset.apply(lambda row: len(row.Name), axis = 1) 
    dataset.drop('Name', axis=1, inplace=True)
    return dataset

In [132]:
def fillInAge(dataset) : 
    imputer = Imputer(missing_values='NaN' ,strategy='mean')
    imputer = imputer.fit(dataset[['Age']])
    dataset[['Age']] = imputer.transform(dataset[['Age']])
    return dataset

In [133]:
def processCabin(dataset) :
    dataset.loc[dataset['Cabin'].notnull(), 'Cabin'] = 1
    dataset['Cabin'].fillna(0, inplace=True)
    return dataset

In [134]:
def scalerTransform(dataset) :
    scaled_features = dataset.copy()
    col_names = ['Age', 'Fare','SibSp','Parch']
    features = scaled_features[col_names]
    scaler = StandardScaler().fit(features.values)
    features = scaler.transform(features.values)
    scaled_features[col_names] = features
    return scaled_features

In [135]:
try:
    dataset = pd.read_csv("data/train.csv")
    print("Passengers dataset has {} samples with {} features each.".format(*dataset.shape))
except Exception as e:
    print("Dataset could not be loaded. Is the dataset missing? ", str(e))
dataset = deleteId(dataset)
dataset = encodeColumns(dataset)
dataset = processName(dataset)
dataset = fillInAge(dataset)
dataset = processCabin(dataset)
dataset = scalerTransform(dataset)

Passengers dataset has 891 samples with 12 features each.




In [136]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,NameLength
0,1,0,1,-0.592481,0.432793,-0.473674,-0.502445,0,0,0,1,0,0,1,23
1,2,1,0,0.638789,0.432793,-0.473674,0.786845,1,1,0,0,1,0,0,51
2,3,1,0,-0.284663,-0.474545,-0.473674,-0.488854,0,0,0,1,0,0,1,22
3,4,1,0,0.407926,0.432793,-0.473674,0.42073,1,0,0,1,1,0,0,44
4,5,0,1,0.407926,-0.474545,-0.473674,-0.486337,0,0,0,1,0,0,1,24


In [137]:
survival = dataset['Survived']
features = dataset.drop('Survived', axis = 1)

In [138]:
X_train, X_test, y_train, y_test = train_test_split(features, survival, test_size=0.2, random_state=0)
random_forest = RandomForestClassifier(n_estimators=1000, max_depth=100, random_state=1)
random_forest.fit(X_train, y_train)

Y_prediction = random_forest.predict(X_test)

random_forest.score(X_train, y_train)
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
print(acc_random_forest)
print(random_forest.score(X_test, y_test))

100.0
0.8268156424581006


In [143]:
test_dataset = pd.read_csv("data/test.csv")
dataset = deleteId(test_dataset)
dataset = encodeColumns(dataset)
dataset = processName(dataset)
dataset = fillInAge(dataset)
dataset = processCabin(dataset)
dataset = scalerTransform(dataset)
dataset.head(10)
predictions = random_forest.predict(dataset)



ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [81]:
#define 10-fold cross validation test harness
X = features
Y = survival
kfold = StratifiedKFold(n_splits=10, shuffle=True)
cvscores = []
for train, test in kfold.split(X, Y):
     
    random_forest = RandomForestClassifier(n_estimators=1000)
    random_forest.fit(X.iloc[train], Y[train])
    scores = random_forest.score(X.iloc[test], Y[test])
    print(scores)
    cvscores.append(scores * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

0.8111111111111111
0.8
0.8202247191011236
0.8089887640449438
0.7865168539325843
0.8764044943820225
0.8202247191011236
0.898876404494382
0.7528089887640449
0.8181818181818182
81.93% (+/- 3.95%)
