# Titanic Dataset with Scikit-Learn

## 1. Imports

In [66]:
# Basic imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

## 2. Utilizing functions created earlier for EDA

In [51]:
# Helper functions for preprocessing
def extract_first_letter(cabin):
	'''
	Extracting the first letter from a given cabin number. If the input is
	different, function returns 'NaN'.
	'''
	if type(cabin) == type('a'):
		return cabin[0]
	return 'X'

def extract_cabin_number(data):
	'''
	Extracting the number of the cabin. If there's none, returns NaN.
	'''
	if data['Cabin_letter'] != 'X':
		return data['Cabin'][1:]
	return 0

def title_extraction(title):
	'''
	Extracting the title from the 'Name' column.
	'''
	temp = title.split()

	for i in range(len(temp)):
		if '.' in temp[i]:
			return temp[i].split('.')[0]

def title_condensation(data):
	'''
	Condensing the amount of titles to smaller value.
	'''
	if data['Title'] in ['Miss', 'Mlle', 'Ms']:
		return 'Miss'
	
	elif data['Title'] in ['Mrs', 'Countess', 'Lady', 'Mme']:
		return 'Mrs'

	elif data['Title'] == 'Dr' and data['Sex'] == 'female':
		return 'Mrs'
		
	else:
		return 'Mr'

def age_filler2(data):
	'''
	Fills up the NaN with proper age mean according to 
	title_condensed value. If age is already there, it's being
	carried unchanged.
	'''
	miss_mean = np.float64(data.groupby('Title_condensed')['Age'].mean()['Miss'])
	mr_mean = np.float64(data.groupby('Title_condensed')['Age'].mean()['Mr'])
	mrs_mean = np.float64(data.groupby('Title_condensed')['Age'].mean()['Mrs'])

	if data['Age_nan_True']:
		if data['Title_condensed'] == 'Miss':
			return miss_mean

		elif data['Title_condensed'] == 'Mr':
			return mr_mean

		elif data['Title_condensed'] == 'Mrs':
			return mrs_mean
	
	return data['Age']

def age_filler(data):
	if data['Age_nan_True']:
		if data['Title_condensed'] == 'Mrs':
			return 35.99

		if data['Title_condensed'] == 'Mr':
			return 30.73

		if data['Title_condensed'] == 'Miss':
			return 21.76
	
	return np.float64(data['Age'])


In [52]:
# Function to preprocess the data engineering part
def preprocessing(titanic):
	'''
	Function that preprocesses the data enginnering/feature engineering
	part of loaded DataFrame.
	'''	

	print("PREPROCESSING...")

	# Dropping top 0,5% of 'Fare' entries
	if len(titanic)>500:
		for i in range(int(len(titanic)/200)):
			titanic.drop(titanic['Fare'].idxmax(), axis=0, inplace=True)

	print("Top 0,5% 'Fare' entries dropped")

	# Extract cabin letter and number
	titanic['Cabin_letter'] = titanic['Cabin'].apply(lambda x: extract_first_letter(x))
	titanic['Cabin_number'] = titanic.apply(extract_cabin_number, axis=1)
	
	# Change type of cabin number column to numeric values
	titanic['Cabin_number'] = pd.to_numeric(titanic['Cabin_number'], errors='ignore')
	titanic['Cabin_number'].fillna(0)
	print('Cabin column done')

	# Create family size value column
	titanic['Family_size'] = titanic['SibSp'] + titanic['Parch'] + 1
	print('Family size created')

	# Create Fare per person column
	titanic['Fare_per_person'] = titanic['Fare'] / titanic['Family_size']
	print('Fare per person created')

	# Extracting the title from the name and making the total number smaller
	titanic['Title'] = titanic['Name'].apply(lambda x: title_extraction(x))
	titanic['Title_condensed'] = titanic.apply(title_condensation, axis=1)
	print('Titles extracted and condensed')

	# Calculating the mean age values per each title
	#miss_mean = titanic.groupby('Title_condensed')['Age'].mean()['Miss']
	#mr_mean = titanic.groupby('Title_condensed')['Age'].mean()['Mr']
	#mrs_mean = titanic.groupby('Title_condensed')['Age'].mean()['Mrs']

	# Filling the missing age values
	titanic['Age_nan_True'] = titanic['Age'].isna()
	titanic['Age_filled'] = titanic.apply(age_filler, axis=1)
	print('Age column filled')

	# Filling the missing fare values
	titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].mean())
	print('Filling the missing fare values')

	# Dropping unnecessary columns
	del_cols = ['Cabin', 'Age', 'Age_nan_True', 'Title', 'Name', 'Ticket', 'PassengerId', 'Cabin_number']
	titanic.drop(del_cols, axis=1, inplace=True)	
	print('Columns deleted')

	print('PREPROCESSING DONE\n')

	# Return the preprocessed DataFrame
	return titanic

In [67]:
# One hot encode the features and normalize the data in the set
def onehot_normalize(data):

	from sklearn.compose import make_column_transformer
	from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
	from sklearn.model_selection import train_test_split

	print('ONE-HOT ENCODING AND NORMALIZATION...')

	# Split the columns into linear and categorical ones
	linear_cols = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Family_size',
					'Fare_per_person', 'Age_filled']
	
	categorical_cols = ['Sex', 'Embarked', 'Cabin_letter', 'Title_condensed']

	# Create a column transformer
	ct = make_column_transformer(
		(MinMaxScaler(), linear_cols),
		(OneHotEncoder(handle_unknown='ignore'), categorical_cols)
	)
	print('Column transformer instentiated')

	# Create X and y sets
	X = data.drop('Survived', axis=1)
	y = data['Survived']

	# Split the data
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
	print('Data split into train and test sets')

	# Fit the column transformer
	ct.fit(X_train)

	# Transform the train and test sets
	X_train_norm = ct.transform(X_train)
	X_test_norm = ct.transform(X_test)
	print('Columns transformed')

	print('ONE-HOT ENCODING AND NORMALIZATION FINISHED')

	return X_train_norm, X_test_norm, y_train, y_test, X, y

In [139]:
# One hot encode the features and normalize the data in the set
def onehot_normalize_2(data):

	from sklearn.compose import make_column_transformer
	from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
	from sklearn.model_selection import train_test_split

	print('ONE-HOT ENCODING AND NORMALIZATION...')

	# Split the columns into linear and categorical ones
	linear_cols = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Family_size',
					'Fare_per_person', 'Age_filled']
	
	categorical_cols = ['Sex', 'Embarked', 'Cabin_letter', 'Title_condensed']
	
	# Create a column transformer
	ct = make_column_transformer(
		(MinMaxScaler(), linear_cols),
		(OneHotEncoder(handle_unknown='ignore'), categorical_cols)
	)
	print('Column transformer instentiated')

	# Create X and y sets
	X = data.drop('Survived', axis=1)
	#X = pd.get_dummies(data=X)
	y = data['Survived']

	# Split the data
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
	print('Data split into train and test sets')

	# Fit the column transformer
	ct.fit(X_train)

	# Transform the train and test sets
	X_train_norm = ct.transform(X_train)
	X_test_norm = ct.transform(X_test)
	print('Columns transformed')

	print('ONE-HOT ENCODING AND NORMALIZATION FINISHED')

	return X_train_norm, X_test_norm, y_train, y_test

In [140]:
# Load in the data
titanic_train = pd.read_csv('train.csv')

# Preprocessing
titanic_train = preprocessing(titanic_train)

# One-hot encoding, normalization, train/test split
X_train, X_test, y_train, y_test = onehot_normalize_2(titanic_train)

PREPROCESSING...
Top 0,5% 'Fare' entries dropped
Cabin column done
Family size created
Fare per person created
Titles extracted and condensed
Age column filled
Filling the missing fare values
Columns deleted
PREPROCESSING DONE

ONE-HOT ENCODING AND NORMALIZATION...
Column transformer instentiated
Data split into train and test sets
Columns transformed
ONE-HOT ENCODING AND NORMALIZATION FINISHED


## 3. Basic RFC model

In [163]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# Creating a RFC model
rfc_basic = RandomForestClassifier(random_state=42, verbose=0)

# Fitting model to the data
rfc_basic.fit(X_train, y_train)

In [184]:
# Function to evaluate the model
def evaluate_model(model, model_name, y_true):

	# Calculate predictions
	pred = model.predict(X_test)

	# Basic metrics
	print(f'EVALUATION METRICS FOR MODEL: {model_name}\n')

	print('Confusion matrix: ')
	print(confusion_matrix(y_true, pred))

	print('\nClassification report: ')
	print(classification_report(y_true, pred))

	acc = accuracy_score(y_true, pred)
	f1 = accuracy_score(y_true, pred)

	print('Scores for basic model')
	print(f'Accuracy score: {acc:.3f}')
	print(f'F1-score: {f1:.3f}')
	"""
	# Feature importance
	feature_imp = pd.DataFrame(model.feature_importances_, 
								index = Xcolumns, 
								columns=['Feature importance score']).sort_values(ascending=False,
																				by=['Feature importance score'])

	sns.barplot(x=feature_imp['Feature importance score'], y=feature_imp.index)
	"""

In [185]:
evaluate_model(model=rfc_basic, model_name='rfc_basic', y_true=y_test)

EVALUATION METRICS FOR MODEL: rfc_basic

Confusion matrix: 
[[87 19]
 [20 52]]

Classification report: 
              precision    recall  f1-score   support

           0       0.81      0.82      0.82       106
           1       0.73      0.72      0.73        72

    accuracy                           0.78       178
   macro avg       0.77      0.77      0.77       178
weighted avg       0.78      0.78      0.78       178

Scores for basic model
Accuracy score: 0.781
F1-score: 0.781


Baseline to beat is 78.1%. Let's try using GridSearchCV to improve our score by trying different hyperparameters.

In [210]:
from sklearn.model_selection import GridSearchCV

param_grid = {
	'n_estimators': [256, 512, 1024],
	'max_features': ['sqrt', 'log2'],
	'max_depth': [5, 6, 7, 8, 9, 10],
	'criterion': ['gini', 'log_loss']
}

rfc_cv = RandomForestClassifier(random_state=42, verbose=0, n_jobs=-1)

cv_rfc = GridSearchCV(estimator=rfc_cv, 
					  param_grid=param_grid)

cv_rfc.fit(X_train, y_train)

print(f'Best params: {cv_rfc.best_params_}')
print(f'Best score: {cv_rfc.best_score_}')

13852.39s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
13852.40s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
13852.41s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
13852.42s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
13852.43s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
13852.44s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
13852.44s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
13852.45s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Best params: {'criterion': 'gini', 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 1024}
Best score: 0.835001498351813


In [213]:
print(cv_rfc.best_score_)
print(cv_rfc.best_estimator_)
print(cv_rfc.best_params_)


0.835001498351813
RandomForestClassifier(max_depth=7, n_estimators=1024, n_jobs=-1,
                       random_state=42)
{'criterion': 'gini', 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 1024}
<function BaseSearchCV._select_best_index at 0x2803c89d0>


In [218]:
# Training the RFC with GridSearchCV best params found
rfc_cv = RandomForestClassifier(random_state=42, 
								verbose=0,
								criterion=cv_rfc.best_params_['criterion'],
								max_depth=cv_rfc.best_params_['max_depth'],
								max_features=cv_rfc.best_params_['max_features'],
								n_estimators=cv_rfc.best_params_['n_estimators'])

rfc_cv.fit(X_train, y_train)

evaluate_model(model=rfc_cv, model_name='rfc_cv', y_true=y_test)

EVALUATION METRICS FOR MODEL: rfc_cv

Confusion matrix: 
[[93 13]
 [28 44]]

Classification report: 
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       106
           1       0.77      0.61      0.68        72

    accuracy                           0.77       178
   macro avg       0.77      0.74      0.75       178
weighted avg       0.77      0.77      0.76       178

Scores for basic model
Accuracy score: 0.770
F1-score: 0.770


Other classifiers to try out:
* CatBoost
* Gradient Boosting Trees
* Decision Tree
* Logistic Regression
* Naive Bayes
* KNN
* Linear SVC
* Stochastic Gradient Descent

In [219]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train, y_train)

gnb.predict(X_test)

evaluate_model(model=gnb, model_name='gnb', y_true=y_test)

EVALUATION METRICS FOR MODEL: gnb

Confusion matrix: 
[[93 13]
 [40 32]]

Classification report: 
              precision    recall  f1-score   support

           0       0.70      0.88      0.78       106
           1       0.71      0.44      0.55        72

    accuracy                           0.70       178
   macro avg       0.71      0.66      0.66       178
weighted avg       0.70      0.70      0.68       178

Scores for basic model
Accuracy score: 0.702
F1-score: 0.702


In [220]:
import pandas as pd

def append_data(df, data):
    """
    Appends data to the bottom of a Pandas DataFrame.
    
    Args:
        df (pd.DataFrame): The DataFrame to which data will be appended.
        data (list or pd.DataFrame): The data to append. If a list is passed, it should be a list of lists 
                                     or a list of dictionaries, where each element represents a row of data.
                                     If a DataFrame is passed, it should have the same columns as the DataFrame
                                     to which data will be appended.
    
    Returns:
        pd.DataFrame: The DataFrame with the appended data.
    """
    if isinstance(data, pd.DataFrame):
        # If data is already a DataFrame, we can simply concatenate it to the original DataFrame
        new_df = pd.concat([df, data], ignore_index=True)
    elif isinstance(data, list):
        # If data is a list, we need to convert it to a DataFrame first
        new_df = pd.concat([df, pd.DataFrame(data, columns=df.columns)], ignore_index=True)
    else:
        raise ValueError("Data must be either a list or a DataFrame.")
    
    return new_df

In [226]:
results = pd.DataFrame(columns=['Model', 'Accuracy'])
rfc = pd.DataFrame({'Model': 'rfc', 'Accuracy': 0.8})
results, rfc
#append_data(results, rfc)

ValueError: If using all scalar values, you must pass an index