9/18-9/19/23

This is more code for the Kaggle Titanic Competition. Last time, I found that Sex is basically the only trait that matters. This time around, I'll see what sklearn's RandomForestClassifier shows about feature importances, and maybe try making a few feature of my own.

In [1]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import random
from scipy.stats import randint

In [2]:
train_set = pd.read_csv("train.csv")
test_set = pd.read_csv("test.csv")
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
train_set.dropna(subset=["Embarked"], inplace=True)
survived = np.array(pd.DataFrame(train_set["Survived"].copy())).ravel()

ids2 = np.array(test_set["PassengerId"].copy()).ravel() # contest submission requires passenger ids

nec_data = train_set.copy()
nec_data = train_set.drop("Name", axis=1) # string and too individual
nec_data = nec_data.drop("PassengerId", axis=1) # too individual
nec_data = nec_data.drop("Cabin", axis=1) # too many null values
nec_data = nec_data.drop("Ticket", axis=1) # too individual and discordant
nec_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       712 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


The well-known phrase associated with the Titanic evacuation is "Women and children first". My previous feature importance testing found that Age, the original feature, wasn't very correlated with survival. Maybe simplifying it to child or adult will be more pertinent.

In [4]:
impute = SimpleImputer(strategy="median")
ages = np.array(nec_data["Age"]).reshape(-1, 1)
impute.fit(ages)
no_nan_age = impute.transform(ages)
print(no_nan_age)

[[22.  ]
 [38.  ]
 [26.  ]
 [35.  ]
 [35.  ]
 [28.  ]
 [54.  ]
 [ 2.  ]
 [27.  ]
 [14.  ]
 [ 4.  ]
 [58.  ]
 [20.  ]
 [39.  ]
 [14.  ]
 [55.  ]
 [ 2.  ]
 [28.  ]
 [31.  ]
 [28.  ]
 [35.  ]
 [34.  ]
 [15.  ]
 [28.  ]
 [ 8.  ]
 [38.  ]
 [28.  ]
 [19.  ]
 [28.  ]
 [28.  ]
 [40.  ]
 [28.  ]
 [28.  ]
 [66.  ]
 [28.  ]
 [42.  ]
 [28.  ]
 [21.  ]
 [18.  ]
 [14.  ]
 [40.  ]
 [27.  ]
 [28.  ]
 [ 3.  ]
 [19.  ]
 [28.  ]
 [28.  ]
 [28.  ]
 [28.  ]
 [18.  ]
 [ 7.  ]
 [21.  ]
 [49.  ]
 [29.  ]
 [65.  ]
 [28.  ]
 [21.  ]
 [28.5 ]
 [ 5.  ]
 [11.  ]
 [22.  ]
 [45.  ]
 [ 4.  ]
 [28.  ]
 [28.  ]
 [29.  ]
 [19.  ]
 [17.  ]
 [26.  ]
 [32.  ]
 [16.  ]
 [21.  ]
 [26.  ]
 [32.  ]
 [25.  ]
 [28.  ]
 [28.  ]
 [ 0.83]
 [30.  ]
 [22.  ]
 [29.  ]
 [28.  ]
 [28.  ]
 [17.  ]
 [33.  ]
 [16.  ]
 [28.  ]
 [23.  ]
 [24.  ]
 [29.  ]
 [20.  ]
 [46.  ]
 [26.  ]
 [59.  ]
 [28.  ]
 [71.  ]
 [23.  ]
 [34.  ]
 [34.  ]
 [28.  ]
 [28.  ]
 [21.  ]
 [33.  ]
 [37.  ]
 [28.  ]
 [21.  ]
 [28.  ]
 [38.  ]
 [28.  ]
 [47.  ]
 [14.5 ]
 

In [5]:
nec_data["is_child"] = (no_nan_age < 18).astype(float)
nec_data["is_adult"] = (no_nan_age >= 18).astype(float)
print(nec_data["is_child"])
print(nec_data["is_adult"])

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
886    0.0
887    0.0
888    0.0
889    0.0
890    0.0
Name: is_child, Length: 889, dtype: float64
0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
886    1.0
887    1.0
888    1.0
889    1.0
890    1.0
Name: is_adult, Length: 889, dtype: float64


In [6]:
nec_data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,is_child,is_adult
0,0,3,male,22.0,1,0,7.25,S,0.0,1.0
1,1,1,female,38.0,1,0,71.2833,C,0.0,1.0
2,1,3,female,26.0,0,0,7.925,S,0.0,1.0
3,1,1,female,35.0,1,0,53.1,S,0.0,1.0
4,0,3,male,35.0,0,0,8.05,S,0.0,1.0
5,0,3,male,,0,0,8.4583,Q,0.0,1.0
6,0,1,male,54.0,0,0,51.8625,S,0.0,1.0
7,0,3,male,2.0,3,1,21.075,S,1.0,0.0
8,1,3,female,27.0,0,2,11.1333,S,0.0,1.0
9,1,2,female,14.0,1,0,30.0708,C,1.0,0.0


In [7]:
text_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore"))
num_norm_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
nothing = make_pipeline(SimpleImputer(strategy="median"))
preprocess = ColumnTransformer([
    ("text", text_pipeline, make_column_selector(dtype_include=object)),
    ("nothing", nothing, ["Survived", "is_child", "is_adult"])],
    remainder=num_norm_pipeline
)
pre_nec_data = preprocess.fit_transform(nec_data)
pre_nec_data = pd.DataFrame(pre_nec_data)
pre_nec_data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.825209,-0.563674,0.43135,-0.474326,-0.50024
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,-1.572211,0.669217,0.43135,-0.474326,0.788947
2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.825209,-0.255451,-0.475199,-0.474326,-0.48665
3,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,-1.572211,0.43805,0.43135,-0.474326,0.422861
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.825209,0.43805,-0.475199,-0.474326,-0.484133
5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.825209,-0.10134,-0.475199,-0.474326,-0.475913
6,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.572211,1.902108,-0.475199,-0.474326,0.397946
7,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.825209,-2.104788,2.244449,0.765897,-0.2219
8,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.825209,-0.178396,-0.475199,2.006119,-0.422057
9,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-0.373501,-1.18012,0.43135,-0.474326,-0.040787


The above order is is_female, is_male, from_Cherbourg(C), from_Queenstown(Q), from_Southampton(S), Survived, is_child, is_adult, Pclass, Age, SibSp, Parch, Fare.

In [8]:
nec_matrix = pre_nec_data.corr()
nec_titles = ["is_female", "is_male", "from_Cherbourg(C)", "from_Queenstown(Q)", "from_Southampton(S)", "Survived", "is_child", "is_adult", "Pclass", 
                               "Age", "SibSp", "Parch", "Fare"]
for k in range(len(nec_titles)):
    print(f"{nec_titles[k]}, {nec_matrix[5][k]}")

is_female, 0.5415849155511678
is_male, -0.5415849155511681
from_Cherbourg(C), 0.16996596681270024
from_Queenstown(Q), 0.00453572872398569
from_Southampton(S), -0.1517770485943329
Survived, 1.0
is_child, 0.12356875935781529
is_adult, -0.1235687593578151
Pclass, -0.33554885935682505
Age, -0.06982170767891915
SibSp, -0.03403999879674894
Parch, 0.0831507836662021
Fare, 0.2552904613046991


is_child and is_adult are seemingly not highly correlated. There's one more way I can test this, along with feature importances in general: a random tree classifier and its feature_importances variable.

In [9]:
def safe_log(x):
    return np.log(x + 1e-10)
#thanks to ChatGPT 3.5 for this function

log_transformer = FunctionTransformer(func=safe_log, inverse_func=np.exp)
num_tail_pipeline = make_pipeline(SimpleImputer(strategy="median"), log_transformer, StandardScaler())
preprocess2 = ColumnTransformer([
    ("tail", num_tail_pipeline, ["SibSp", "Parch", "Fare"]), # not for Pclass, since it's basically ordinally encoded
    ("nothing", nothing, ["is_child", "is_adult"]),
    ("text", text_pipeline, make_column_selector(dtype_include=object))],
    remainder=num_norm_pipeline
)

In [10]:
nec_data = nec_data.drop("Survived", axis=1)

rdf_clf = Pipeline([("pre", preprocess2), ("clf", RandomForestClassifier(random_state=446))])
rdf_clf.fit(nec_data, survived)
rmse = -cross_val_score(rdf_clf, nec_data, survived, scoring="neg_root_mean_squared_error", cv=10)
print(np.average(rmse))

0.42826843445747337


In [11]:
nec_features = rdf_clf["clf"].feature_importances_
nec_titles = ["SibSp", "Parch", "Fare", "is_child", "is_adult", "is_female", "is_male", "from_Cherbourg(C)", "from_Queenstown(Q)", "from_Southampton(S)", 
              "Pclass", "Age"]
for k in range(len(nec_titles)):
    print(f"{nec_titles[k]}, {nec_features[k]}")

SibSp, 0.05135387326148228
Parch, 0.036776002089142494
Fare, 0.2515163785245415
is_child, 0.010805475678611573
is_adult, 0.010062469890951494
is_female, 0.1318763679221925
is_male, 0.1562505432179034
from_Cherbourg(C), 0.013197921840731716
from_Queenstown(Q), 0.007129719576922866
from_Southampton(S), 0.012221833164927231
Pclass, 0.08755093670517926
Age, 0.23125847812741376


Once again, is_child and is_adult were pretty insignificant. Gender and Fare remained important, and even Age is important here.

In [12]:
# test_use = test_set.copy()
# impute = SimpleImputer(strategy="median")
# ages = np.array(test_use["Age"]).reshape(-1, 1)
# impute.fit(ages)
# no_nan_age = impute.transform(ages)
# test_use["is_child"] = (no_nan_age < 18).astype(float)
# test_use["is_adult"] = (no_nan_age >= 18).astype(float)


# test_pred = rdf_clf.predict(test_use)
# with open('sacreddeer_titanic_new_submission_5.csv', 'w', newline='') as f:
#     # create the csv writer
#     writer = csv.writer(f)
#     writer.writerow(["PassengerId", "Survived"])
#     for k in range(len(ids2)):
#         # write a row to the csv file
#         writer.writerow([ids2[k], test_pred[k]])

In [13]:
imp_data = nec_data[["Fare", "Sex", "Age"]]

preprocess3 = ColumnTransformer([
    ("tail", num_tail_pipeline, ["Fare"]), # not for Pclass, since it's basically ordinally encode
    ("text", text_pipeline, make_column_selector(dtype_include=object))],
    remainder=num_norm_pipeline
)

rdf_clf2 = Pipeline([("pre", preprocess3), ("clf", RandomForestClassifier(random_state=446))])
rdf_clf2.fit(imp_data, survived)
rmse = -cross_val_score(rdf_clf2, imp_data, survived, scoring="neg_root_mean_squared_error", cv=10)
print(np.average(rmse))

0.4655153270478422


In [14]:
imp_data2 = nec_data[["Sex"]]

preprocess4 = ColumnTransformer([
    ("text", text_pipeline, make_column_selector(dtype_include=object))],
    remainder=num_norm_pipeline
)

rdf_clf3 = Pipeline([("pre", preprocess4), ("clf", RandomForestClassifier(random_state=446))])
rdf_clf3.fit(imp_data2, survived)
rmse = -cross_val_score(rdf_clf3, imp_data2, survived, scoring="neg_root_mean_squared_error", cv=10)
print(np.average(rmse))

0.4612560463684951


Both attempts at removing features for generalization seem to fail, as their RMSE is even worse than the original random forest. My last option is hyperparameter tuning via random search.

In [15]:
randdepth = np.array([3, 4, 5, 6, 7, 8, 9])
param_distribs = {"clf__max_depth": randdepth,
                  "clf__min_samples_leaf": randint(low=1, high=10),
                  "clf__n_estimators": randint(low=200, high=230)}
rmd_search = RandomizedSearchCV(
    rdf_clf, param_distributions=param_distribs, 
    n_iter=20, cv=3, scoring="neg_root_mean_squared_error", random_state=446
)
rmd_search.fit(nec_data, survived)
final_rnd_model = rmd_search.best_estimator_
print(final_rnd_model.get_params)

final_rnd_model.fit(nec_data, survived)
rmse2 = -cross_val_score(final_rnd_model, nec_data, survived, scoring="neg_root_mean_squared_error", cv=10)
print(np.average(rmse2))

<bound method Pipeline.get_params of Pipeline(steps=[('pre',
                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                              SimpleImputer(strategy='median')),
                                                             ('standardscaler',
                                                              StandardScaler())]),
                                   transformers=[('tail',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('functiontransformer',
                                                                   FunctionTransformer(func=<function safe_log at 0x0000015ABB894B80>,
                                                                                       inverse_func=<ufun...
                      

In [16]:
# test_use = test_set.copy()
# impute = SimpleImputer(strategy="median")
# ages = np.array(test_use["Age"]).reshape(-1, 1)
# impute.fit(ages)
# no_nan_age = impute.transform(ages)
# test_use["is_child"] = (no_nan_age < 18).astype(float)
# test_use["is_adult"] = (no_nan_age >= 18).astype(float)


# test_pred = final_rnd_model.predict(test_use)
# with open('sacreddeer_titanic_new_submission_6.csv', 'w', newline='') as f:
#     # create the csv writer
#     writer = csv.writer(f)
#     writer.writerow(["PassengerId", "Survived"])
#     for k in range(len(ids2)):
#         # write a row to the csv file
#         writer.writerow([ids2[k], test_pred[k]])

This got me the highest score I've ever had, 0.7790. This boosted me up to 2835th place (as of 9/19/23) out of 14653 teams (although the person in 2305th place also has my score). I'm going to see how AdaBoost does on the data - is_child and is_adult removed - and then move onto a Kaggle housing prices regression competition.

In [17]:
nec_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    889 non-null    int64  
 1   Sex       889 non-null    object 
 2   Age       712 non-null    float64
 3   SibSp     889 non-null    int64  
 4   Parch     889 non-null    int64  
 5   Fare      889 non-null    float64
 6   Embarked  889 non-null    object 
 7   is_child  889 non-null    float64
 8   is_adult  889 non-null    float64
dtypes: float64(4), int64(3), object(2)
memory usage: 69.5+ KB


In [18]:
nec_data = nec_data.drop("is_child", axis=1)
nec_data = nec_data.drop("is_adult", axis=1)

In [23]:
preprocess5 = ColumnTransformer([
    ("tail", num_tail_pipeline, ["SibSp", "Parch", "Fare"]), # not for Pclass, since it's basically ordinally encoded
    ("text", text_pipeline, make_column_selector(dtype_include=object))],
    remainder=num_norm_pipeline
)

ada_clf = Pipeline([("pre", preprocess5), ("clf", AdaBoostClassifier(DecisionTreeClassifier(max_depth=8), random_state=446))])
ada_clf.fit(nec_data, survived)
rmse = -cross_val_score(ada_clf, nec_data, survived, scoring="neg_root_mean_squared_error", cv=10)
print(np.average(rmse))

0.45433170958174757


In [26]:
randlr = np.zeros(10)
for k in range(len(randlr)):
    randlr[k] = random.uniform(0.01, 0.99)
param_distribs = {"clf__n_estimators": randint(low=100, high=500),
                  "clf__learning_rate": randlr,
                  }
rdm_search3 = RandomizedSearchCV(
    ada_clf, param_distributions=param_distribs, n_iter=10, cv=5, scoring="neg_root_mean_squared_error"
)
rdm_search3.fit(nec_data, survived)
best_ada_clf = rdm_search3.best_estimator_
print(best_ada_clf["clf"])

rmse = -cross_val_score(best_ada_clf, nec_data, survived, scoring="neg_root_mean_squared_error", cv=10)
print(np.average(rmse))

AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=8),
                   learning_rate=0.0838948441812057, n_estimators=239,
                   random_state=446)
0.4336358023356578
