# Downloading the dataset from kaggle

## passing token

In [None]:
from google.colab import files
files.upload()

In [4]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

## Downloading the dataset

In [5]:
!kaggle competitions download -c titanic

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 152MB/s]


In [6]:
!unzip titanic.zip -d titanic

Archive:  titanic.zip
  inflating: titanic/gender_submission.csv  
  inflating: titanic/test.csv        
  inflating: titanic/train.csv       


# Exploring the dataset

In [30]:
import pandas as pd
import numpy as np

df=pd.read_csv('titanic/train.csv')

print(len(df['Ticket'].unique()))   # won't use it

df.head()

681


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data Curation

In [31]:
df_length=len(df)
print(df_length)
nan_count=df.isna().sum()  # getting the number of Nan values in each column
nan_count

891


Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


### preprocessing data

In [72]:
"""
use --> pclass(passenger class) ,sex , Age(using mean as value for missing value),family_size=parch+sibsp+1,Fare(use mean since one value is missing at test, Embarked but drop the two rows where it doesn't exist

drop--> cabin (almost 3/4 values are NAN in training and testing so very harsh to estimate), ticket --> 681 distinct values and categorical so transfer it to one hot features will introduce many non important features,Name

categorical features: make sex , Embarked, Pclass one hot features
"""

def preprocess_titanic(data_frame,mode,train_age_mean=0,train_fare_mean=0):
  if mode=='train':
    data_frame=data_frame[data_frame['Embarked'].notna()]
    data_frame=data_frame.drop(columns=['Cabin','Ticket','Name'])
    data_frame['family_size']=data_frame['Parch']+data_frame['SibSp']
    age_mean=data_frame['Age'].mean()
    fare_mean=data_frame['Fare'].mean()
    data_frame['Age']=data_frame['Age'].fillna(age_mean)
    data_frame['Fare']=data_frame['Fare'].fillna(fare_mean)
    data_frame=pd.get_dummies(data_frame,columns=['Sex','Embarked','Pclass'])  # turn them to 1 hot encoded features
    return data_frame,age_mean,fare_mean

  else:
    data_frame=data_frame.drop(columns=['Cabin','Ticket','Name'])
    data_frame['family_size']=data_frame['Parch']+data_frame['SibSp']
    data_frame['Age']=data_frame['Age'].fillna(train_age_mean)
    data_frame['Fare']=data_frame['Fare'].fillna(train_fare_mean)
    data_frame=pd.get_dummies(data_frame,columns=['Sex','Embarked','Pclass'])  # turn them to 1 hot encoded features
    return data_frame







In [73]:
training_df,age_mean,fare_mean=preprocess_titanic(df,'train')
print(f"training :{training_df.head()}")
test_df=pd.read_csv('titanic/test.csv')
test_df=preprocess_titanic(test_df,'test',age_mean,fare_mean)
print(f"testing: {test_df.head()}")

training :   PassengerId  Survived   Age  SibSp  Parch     Fare  family_size  \
0            1         0  22.0      1      0   7.2500            1   
1            2         1  38.0      1      0  71.2833            1   
2            3         1  26.0      0      0   7.9250            0   
3            4         1  35.0      1      0  53.1000            1   
4            5         0  35.0      0      0   8.0500            0   

   Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  Pclass_1  \
0       False      True       False       False        True     False   
1        True     False        True       False       False      True   
2        True     False       False       False        True     False   
3        True     False       False       False        True      True   
4       False      True       False       False        True     False   

   Pclass_2  Pclass_3  
0     False      True  
1     False     False  
2     False      True  
3     False     False  
4     Fals

# data splitting to labels and Features

In [74]:
Y_train=training_df['Survived'].to_numpy()
training_df=training_df.drop(columns=['Survived','PassengerId'])
X_train=training_df.to_numpy()

print(X_train.shape,Y_train.shape)


(889, 13) (889,)


# decision tree implementation

In [65]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
depths=[3,4,5,6,7,8,9,10,11,12,13,14]
classifier=None
best_depth=None
best_score=0
for depth in depths:
  classifier=DecisionTreeClassifier(random_state=42,criterion='entropy',max_depth=depth)
  scores = cross_val_score(classifier, X_train, Y_train, cv=10)  # cross validation training
  print(depth, np.mean(scores))

  if np.mean(scores)>best_score:
    best_score=np.mean(scores)
    best_depth=depth
print(f"best depth: {best_depth}")


3 0.8166624106230849
4 0.8009576098059246
5 0.8133171603677221
6 0.8166624106230846
7 0.820020429009193
8 0.8144279877425944
9 0.8144152196118488
10 0.8155388151174667
11 0.8065500510725231
12 0.8121807967313586
13 0.7953140960163433
14 0.7964504596527069
best depth: 7


# final decision tree model

## fitting the model with best depth

In [36]:
classifier=DecisionTreeClassifier(random_state=42,criterion='gini',max_depth=best_depth)
classifier.fit(X_train,Y_train)

## evaluation on testing data

In [37]:
ids=test_df['PassengerId']
test_df=test_df.drop(columns=['PassengerId'])
X_test=test_df.to_numpy()
predictions=classifier.predict(X_test)

final_result={'PassengerId':ids.astype(int),'Survived':predictions.astype(int)}
final_df=pd.DataFrame(final_result)
final_df.to_csv('submission.csv',index=False)

In [38]:
final_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [None]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Random forest classifier

## training

In [39]:
from sklearn.ensemble import RandomForestClassifier

random_forest_classifier=RandomForestClassifier(n_estimators=100,random_state=42,n_jobs=-1,max_depth=None)
random_forest_classifier.fit(X_train,Y_train)

## creating a submission

In [40]:
predictions = random_forest_classifier.predict(X_test)

final_result = {
    'PassengerId': ids.astype(int),
    'Survived': predictions.astype(int)
}

final_df = pd.DataFrame(final_result)
final_df.to_csv('submission.csv', index=False)

In [41]:
final_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0


In [None]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# logistic Regression classifier

## normalizing features

In [75]:
normalized_training_df=training_df.copy()
normalized_testing_df=test_df.copy()

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

numeric_features = ['Age', 'Fare', 'family_size','Parch','SibSp']
categorical_features = [col for col in normalized_training_df.columns if col not in numeric_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', 'passthrough', categorical_features)
    ])

normalized_X_train = preprocessor.fit_transform(training_df)   # fit with training
normalized_X_test = preprocessor.transform(test_df)            # apply on testing



## training

In [68]:
from sklearn.linear_model import LogisticRegression

logistic_regression_classifier=LogisticRegression(max_iter=1000,random_state=42)

scores = cross_val_score(logistic_regression_classifier, normalized_X_train, Y_train, cv=5)
print("CV scores:", scores)
print("Mean CV score:", np.mean(scores))

CV scores: [0.78089888 0.79775281 0.78651685 0.76966292 0.83615819]
Mean CV score: 0.7941979305529105


In [47]:
logistic_regression_classifier.fit(normalized_X_train,Y_train)

## testing

In [48]:
predictions=logistic_regression_classifier.predict(normalized_X_test)
final_result={'PassengerId':ids.astype(int),'Survived':predictions.astype(int)}
final_df=pd.DataFrame(final_result)
final_df.to_csv('logistic_regression_titanic_submission.csv',index=False)

final_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [49]:
from google.colab import files
files.download('logistic_regression_titanic_submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# neural net classifier

In [53]:
from sklearn.neural_network import MLPClassifier

neural_network_classifier=MLPClassifier(
    hidden_layer_sizes=(50,10),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42

)

neural_network_classifier.fit(normalized_X_train,Y_train)

predictions=neural_network_classifier.predict(normalized_X_test)

final_result={'PassengerId':ids.astype(int),'Survived':predictions.astype(int)}
final_df=pd.DataFrame(final_result)
final_df.to_csv('neural_net_titanic_submission.csv',index=False)

final_df.head()

from google.colab import files
files.download('neural_net_titanic_submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Ensemble

In [80]:
from sklearn.ensemble import VotingClassifier

random_forest_classifier=RandomForestClassifier(n_estimators=100,random_state=42,n_jobs=-1,max_depth=None)


logistic_regression_classifier=LogisticRegression(max_iter=1000,random_state=42)


neural_network_classifier=MLPClassifier(
    hidden_layer_sizes=(50,10),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42

)

ensemble_classifier = VotingClassifier(
    estimators=[
        ('random_forest', random_forest_classifier),
        ('logistic_regression', logistic_regression_classifier),
        ('neural_net', neural_network_classifier)
    ],
    voting='hard',
    n_jobs=-1
)

scores = cross_val_score(ensemble_classifier, normalized_X_train, Y_train, cv=10, scoring='accuracy')
print("CV ensemble accuracy:", np.mean(scores), "±", np.std(scores))


CV ensemble accuracy: 0.8290091930541369 ± 0.0423284929444449


In [81]:
ensemble_classifier.fit(normalized_X_train,Y_train)

In [82]:
predictions=ensemble_classifier.predict(normalized_X_test)

final_result={'PassengerId':ids.astype(int),'Survived':predictions.astype(int)}
final_df=pd.DataFrame(final_result)
final_df.to_csv('ensemble_hard_titanic_submission.csv',index=False)


from google.colab import files
files.download('ensemble_hard_titanic_submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>