In [34]:
import pandas as pd
# Locations of the two Titanic CSV splits, relative to the notebook's working directory.
TRAIN_CSV = 'titanic_data/train.csv'
TEST_CSV = 'titanic_data/test.csv'

# Read each split into its own DataFrame.
train_set = pd.read_csv(TRAIN_CSV)
test_set = pd.read_csv(TEST_CSV)

In [35]:
# Preview the first rows of the training split to see which columns are available.
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [36]:
# Promote the passenger id from a regular column to the row index of both splits,
# so it is carried along as a label instead of being treated as a feature.
# NOTE: this rebinds train_set/test_set; once the column has become the index,
# running this cell a second time raises KeyError.
index_column = "PassengerId"
train_set = train_set.set_index(keys=index_column)
test_set = test_set.set_index(keys=index_column)

In [37]:
# Print per-column dtypes and non-null counts to spot columns with missing values.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [38]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
train_set.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699113,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526507,1.102743,0.806057,49.693429
min,0.0,1.0,0.4167,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [39]:
import numpy as np

# Target vector: the "Survived" column of the training split as a NumPy array.
y_train = np.array(train_set["Survived"])

# Text columns that are left out of the feature set for both splits.
unused_columns = ["Name", "Cabin", "Ticket"]

# Feature frames: the training split additionally loses its label column.
x_train = train_set.drop(columns=["Survived", *unused_columns])
x_test = test_set.drop(columns=unused_columns)

In [40]:
# Mean age of the training passengers, printed for males first and then females.
for sex in ("male", "female"):
    is_sex = train_set["Sex"] == sex
    print(train_set.loc[is_sex, "Age"].mean())

30.726637306843266
27.915708812260537


In [41]:
# Frequency of each distinct Parch value in the training split.
parch_values = train_set["Parch"]
parch_values.value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [42]:
# Frequency of each distinct Sex value in the training split.
sex_values = train_set["Sex"]
sex_values.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Numeric preprocessing: fill missing values with the column mean, then rescale
# each column with MinMaxScaler.
num_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [44]:
from sklearn.preprocessing import OneHotEncoder

# Categorical preprocessing: fill missing values with the most frequent category,
# then one-hot encode the result.
cat_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehotencoder", OneHotEncoder()),
]
cat_pipeline = Pipeline(steps=cat_steps)

In [45]:
from sklearn.compose import ColumnTransformer

# Columns handled by the numeric and by the categorical sub-pipeline.
num_columns = ["Age", "Fare", "SibSp", "Parch"]
cat_columns = ["Pclass", "Sex", "Embarked"]

# Route each column group through its own pipeline and concatenate the results.
column_routes = [
    ("num", num_pipeline, num_columns),
    ("cat", cat_pipeline, cat_columns),
]
preprocess = ColumnTransformer(transformers=column_routes)

In [46]:
# Fit the preprocessing on the training features and replace x_train with the
# transformed feature matrix that the model is trained on.
# NOTE(review): x_train is rebound from the feature DataFrame to the transformer
# output, so re-running this cell on its own will likely fail (the transformer
# selects columns by name) — re-run the cell that builds x_train first.
x_train = preprocess.fit_transform(x_train)

In [47]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed so repeated runs build the same trees.
forest_params = {"n_estimators": 100, "random_state": 42}
forest_clf = RandomForestClassifier(**forest_params)

# Train on the preprocessed training features; fit() returns the estimator,
# which the notebook displays as the cell output.
forest_clf.fit(x_train, y_train)

RandomForestClassifier(random_state=42)

In [48]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the forest, scored with the classifier's default scorer.
# NOTE(review): x_train was already transformed by a preprocessor fitted on the whole
# training split, so imputation/scaling statistics are shared across folds.
forest_scores = cross_val_score(
    estimator=forest_clf,
    X=x_train,
    y=y_train,
    cv=10,
)

# Average score over the folds.
forest_scores.mean()

0.8115106117353308

In [49]:
# Display the (still unprocessed) test feature frame to check its columns and
# confirm that it contains missing values the preprocessing has to handle.
x_test

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,male,34.5,0,0,7.8292,Q
893,3,female,47.0,1,0,7.0000,S
894,2,male,62.0,0,0,9.6875,Q
895,3,male,27.0,0,0,8.6625,S
896,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...
1305,3,male,,0,0,8.0500,S
1306,1,female,39.0,0,0,108.9000,C
1307,3,male,38.5,0,0,7.2500,S
1308,3,male,,0,0,8.0500,S


In [50]:
# Apply the preprocessing that was already fitted on the training data.
# Calling fit_transform here would re-learn the imputation values, scaling ranges
# and one-hot categories from the test split, so the resulting features would no
# longer match what forest_clf was trained on.
x_test_prepared = preprocess.transform(x_test)

# Predicted label for every test passenger, in the row order of x_test.
predictions = forest_clf.predict(x_test_prepared)


In [51]:
# Pair each test passenger id (the index of x_test) with its predicted label.
submission_columns = {"PassengerId": x_test.index, "Survived": predictions}
submission = pd.DataFrame(data=submission_columns)

In [52]:
# Make the passenger id the row index so it becomes the leading column of the CSV.
submission = submission.set_index("PassengerId")

In [53]:
# Write the predictions to disk; the index is written as the first column
# (index=True is the pandas default, spelled out here for clarity).
submission.to_csv("submission.csv", index=True)