In [44]:
import pandas as pd
from colorama import Fore, Style
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score

# Setting to make numbers easier to read on display
pd.options.display.float_format = '{:20.2f}'.format

# Show all columns on output
pd.set_option('display.max_columns', 999)

In [29]:
# training df
train_df = pd.read_csv('train.csv')

# test df
test_df = pd.read_csv('test.csv')

# saving the leght to separete later
train_len = len(train_df)

In [30]:
# merging both df into 1

# adding a temp Survived column to keep the same structure
test_df["Survived"] = None

# merged df
full_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [6]:
full_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
# number of rows and columns
print(Fore.CYAN + "df shape: " + Style.RESET_ALL)
print(f"{full_df.shape}\n")

# column names, data types, non-null values
print(Fore.GREEN + "df info: " + Style.RESET_ALL)
print(f"{full_df.info()}\n") 

# column names, data types, non-null values
print(Fore.GREEN + "df unique: " + Style.RESET_ALL)
print(f"{full_df.nunique()}\n") 

# NaN values
print(Fore.YELLOW + "df isnull sum: " + Style.RESET_ALL)
print(f"{full_df.isnull().sum()}\n")

# count, mean, std, min, max, etc.
print(Fore.MAGENTA + "df describe: " + Style.RESET_ALL)
print(f"{full_df.describe()}\n")

[36mdf shape: [0m
(1309, 12)

[32mdf info: [0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    object 
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(6)
memory usage: 122.8+ KB
None

[32mdf unique: [0m
PassengerId    1309
Survived          2
Pclass            3
Name           1307
Sex               2
Age              98
SibSp             7
Parch             8
Tic

In [None]:
'''
Cleaning

Fix missing values Age and Cabin

Separete Pclass into 3 new columns = 1 = Upper_class, 2 = Middle_class and 3 = Lower_class

Separete Embarked into 3 columns = C = Cherbourg, Q = Queenstown and S = Southampton

Transform Sex into a binary Male and Female column

Drop - Name, Ticket, Suvived, PassengerId

'''

In [31]:
# missing values

# Age

# creating a median by group: Pclass + Sex and filling all the nans
full_df["Age"] = full_df.groupby(["Pclass", "Sex"])["Age"].transform(
    lambda x: x.fillna(x.median())
)

# Cabin

# instead of filling the nans, create a new Has_Cabin column
full_df["Has_Cabin"] = full_df["Cabin"].notna().astype(int)

# results
full_df.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Has_Cabin         0
dtype: int64

In [32]:
# Pclass

# creating dummies
full_df = pd.get_dummies(full_df, columns=["Pclass"])

# renaming
full_df = full_df.rename(columns={
    "Pclass_1": "Upper_class",
    "Pclass_2": "Middle_class",
    "Pclass_3": "Lower_class"
})

full_df[["Upper_class", "Middle_class", "Lower_class"]] = full_df[["Upper_class", "Middle_class", "Lower_class"]].astype(int)

# results
full_df.head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Has_Cabin,Upper_class,Middle_class,Lower_class
0,1,0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C,1,1,0,0
2,3,1,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S,0,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,1,0,0
4,5,0,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,0,1


In [33]:
# Embarked

# creating dummies
full_df = pd.get_dummies(full_df, columns=["Embarked"])

# renaming
full_df = full_df.rename(columns={
    "Embarked_C": "Cherbourg",
    "Embarked_Q": "Queenstown",
    "Embarked_S": "Southampton"
})

full_df[["Cherbourg", "Queenstown", "Southampton"]] = full_df[["Cherbourg", "Queenstown", "Southampton"]].astype(int)

# results
full_df.head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Has_Cabin,Upper_class,Middle_class,Lower_class,Cherbourg,Queenstown,Southampton
0,1,0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,0,0,0,1,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,1,1,0,0,1,0,0
2,3,1,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,0,0,0,1,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,1,1,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,0,0,0,1,0,0,1


In [34]:
# Sex

# creating dummies
full_df = pd.get_dummies(full_df, columns=["Sex"])

# rename
full_df = full_df.rename(columns={
    "Sex_male": "Male",
    "Sex_female": "Female"
})

full_df[["Male", "Female"]] = full_df[["Male", "Female"]].astype(int)

# results
full_df.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Has_Cabin,Upper_class,Middle_class,Lower_class,Cherbourg,Queenstown,Southampton,Female,Male
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,0,0,1,0,0,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.28,C85,1,1,0,0,1,0,0,1,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.92,,0,0,0,1,0,0,1,1,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,1,0,0,0,0,1,1,0
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,0,0,1,0,0,1,0,1


In [35]:
# dropping
full_df = full_df.drop(columns=["Name", "Ticket", "Cabin", "PassengerId"])

# final df
print(full_df.head().to_markdown())

|    |   Survived |   Age |   SibSp |   Parch |    Fare |   Has_Cabin |   Upper_class |   Middle_class |   Lower_class |   Cherbourg |   Queenstown |   Southampton |   Female |   Male |
|---:|-----------:|------:|--------:|--------:|--------:|------------:|--------------:|---------------:|--------------:|------------:|-------------:|--------------:|---------:|-------:|
|  0 |          0 |    22 |       1 |       0 |  7.25   |           0 |             0 |              0 |             1 |           0 |            0 |             1 |        0 |      1 |
|  1 |          1 |    38 |       1 |       0 | 71.2833 |           1 |             1 |              0 |             0 |           1 |            0 |             0 |        1 |      0 |
|  2 |          1 |    26 |       0 |       0 |  7.925  |           0 |             0 |              0 |             1 |           0 |            0 |             1 |        1 |      0 |
|  3 |          1 |    35 |       1 |       0 | 53.1    |           1 

In [48]:
# checking if the only nan values are the temp Survived
full_df.isna().sum()

Survived        418
Age               0
SibSp             0
Parch             0
Fare              1
Has_Cabin         0
Upper_class       0
Middle_class      0
Lower_class       0
Cherbourg         0
Queenstown        0
Southampton       0
Female            0
Male              0
dtype: int64

In [36]:
# replacing nan for 0 since RandomForestClassifier doesnt accept NaN values
full_df = full_df.fillna(0)

In [37]:
# Machine Learning

# separeting df
train_cleaned = full_df.iloc[:train_len].copy()
test_cleaned  = full_df.iloc[train_len:].copy()

# Target
y_train = train_cleaned["Survived"].astype(int)
X_train = train_cleaned.drop(columns=["Survived"])

# Test (dropping temp Survived)
X_test = test_cleaned.drop(columns=["Survived"])

In [38]:
# model RandomForestClassifier

# instantiating the model
rf_model = RandomForestClassifier(random_state=42)

# fitting the train and test data
rf_model.fit(X_train, y_train)

# train df prediction
y_pred_rf = rf_model.predict(X_train)

In [39]:
# model KNeighborsClassifier

# instantiating the model
knn_model = KNeighborsClassifier(n_neighbors=5)

# fitting the train and test data
knn_model.fit(X_train, y_train)

# train df prediction
y_pred_knn = knn_model.predict(X_train)


In [42]:
# accuracy in RandomForestClassifier
print("Random Forest Classifier (training data)")
print("Accuracy:", accuracy_score(y_train, y_pred_rf))

# accuracy in KNeighborsClassifier
print("\nK-Nearest Neighbors (training data)")
print("Accuracy:", accuracy_score(y_train, y_pred_knn))

Random Forest Classifier (training data)
Accuracy: 0.9865319865319865

K-Nearest Neighbors (training data)
Accuracy: 0.8159371492704826


In [45]:
# params grid
param_grid_rf = {
    "n_estimators": [100, 200, 500],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"]
}

# passing params through GridSearchCV
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid_rf.fit(X_train, y_train)

print("Best parameters for RandomForestClassifier:", grid_rf.best_params_)
print("Best score RandomForestClassifier:", grid_rf.best_score_)

Best parameters for RandomForestClassifier: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best score RandomForestClassifier: 0.8271859895800642
Cross-validation mean accuracy (Random Forest): 0.8271859895800642


In [62]:
# creating submission df
submission_rf = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived": rf_predictions})
submission_rf.to_csv("Kaggle_submission.csv", index=False)

submission_rf.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
