In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
df = pd.read_csv('./train.csv', header=0)
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare'])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [3]:
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Embarked      2
dtype: int64

In [4]:
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [5]:
df['Embarked'] = df['Embarked'].fillna('S')

In [6]:
sex_map = {
    'male' : 0,
    'female' : 1
}

emb_map = {
    'S': 0,
    'C': 1,
    'Q': 2
}

df['Sex'] = df['Sex'].map(sex_map)
df['Embarked'] = df['Embarked'].map(emb_map)

In [7]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,0,22.0,1,0,0
1,1,1,1,38.0,1,0,1
2,1,3,1,26.0,0,0,0
3,1,1,1,35.0,1,0,0
4,0,3,0,35.0,0,0,0


In [8]:
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Embarked      0
dtype: int64

In [9]:
df_ = df.dropna(axis=0)

In [10]:
na_idx = df[df.isna().any(axis=1)].index
df_na = df.iloc[na_idx]
df_na.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
5,0,3,0,,0,0,2
17,1,2,0,,0,0,0
19,1,3,1,,0,0,1
26,0,3,0,,0,0,1
28,1,3,1,,0,0,2


# Impute Missing Age

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score , mean_squared_error
X = df_.drop(columns=["Survived", "Age"])
y = df_.loc[:, 'Age']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

reg = RandomForestRegressor(n_estimators=100,
                            criterion="squared_error",
                            max_depth=10,
                            min_samples_leaf=1,
                            random_state=7)

reg.fit(X, y)
y_pred = reg.predict(X_test)
print("R2 score : {}".format(r2_score(y_test, y_pred)))
print("MSE : {}".format(mean_squared_error(y_test, y_pred)))

R2 score : 0.33517481237360125
MSE : 141.53854055950507


In [19]:
temp = df_na.drop(columns=["Survived", "Age"])
df_na.loc[:, 'Age'] = reg.predict(temp)
df_na

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na.loc[:, 'Age'] = reg.predict(temp)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
5,0,3,0,36.421952,0,0,2
17,1,2,0,33.581157,0,0,0
19,1,3,1,15.368960,0,0,1
26,0,3,0,28.354059,0,0,1
28,1,3,1,20.015355,0,0,2
...,...,...,...,...,...,...,...
859,0,3,0,28.354059,0,0,1
863,0,3,1,14.282472,8,2,0
868,0,3,0,28.902348,0,0,0
878,0,3,0,28.902348,0,0,0


In [20]:
df = pd.concat([df_, df_na], axis=0)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,0,22.000000,1,0,0
1,1,1,1,38.000000,1,0,1
2,1,3,1,26.000000,0,0,0
3,1,1,1,35.000000,1,0,0
4,0,3,0,35.000000,0,0,0
...,...,...,...,...,...,...,...
859,0,3,0,28.354059,0,0,1
863,0,3,1,14.282472,8,2,0
868,0,3,0,28.902348,0,0,0
878,0,3,0,28.902348,0,0,0


In [21]:
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64

In [26]:
df_test = pd.read_csv('./test.csv', header=0)
test_id = df_test['PassengerId']
df_test = df_test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare'])
df_test['Sex'] = df_test['Sex'].map(sex_map)
df_test['Embarked'] = df_test['Embarked'].map(emb_map)

df_test.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Embarked     0
dtype: int64

In [27]:
df_test_ = df_test.dropna(axis=0)
na_idx = df_test[df_test.isna().any(axis=1)].index
df_na = df_test.iloc[na_idx]
temp = df_na.drop(columns=["Age"])
df_na.Age = reg.predict(temp)
df_test = pd.concat([df_test_, df_na], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na.Age = reg.predict(temp)


In [28]:
df_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,0,34.500000,0,0,2
1,3,1,47.000000,1,0,0
2,2,0,62.000000,0,0,2
3,3,0,27.000000,0,0,0
4,3,1,22.000000,1,1,0
...,...,...,...,...,...,...
408,3,1,20.015355,0,0,2
410,3,1,20.015355,0,0,2
413,3,0,28.902348,0,0,0
416,3,0,28.902348,0,0,0


In [29]:
X_train = df.drop("Survived", axis=1)
y_train = df["Survived"]
X_test = df_test
X_train.shape, y_train.shape, X_test.shape

((891, 6), (891,), (418, 6))

In [30]:
from sklearn.ensemble import RandomForestClassifier

train_acc = []
train_f1 = []

model = RandomForestClassifier(
    n_estimators=1000,
    criterion='gini',
    max_depth=10
)

model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [31]:
acc = round(model.score(X_train, y_train) * 100, 2)
print(acc)

92.48


In [32]:
len(y_test_pred)

418

In [33]:
sub_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
sub_df["PassengerId"] = test_id
sub_df['Survived'] = y_test_pred

In [34]:
sub_df.to_csv("./submission_RF.csv", index=None)