In [1]:
import numpy as np
import pandas as pd
import seaborn as sb 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

Loading the data

In [2]:
# Read the training and test CSVs, keyed by passenger id.
X, X_test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ('Data/train.csv', 'Data/test.csv')
)
X.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Dropping columns that are not used as features

In [3]:

# Columns excluded from the feature set. The same list is applied to both
# frames so the train and test inputs keep an identical schema.
dropped_cols = ['Name', 'Ticket', 'Embarked', 'Cabin', 'Parch']

# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError on the second execution.
X = X.drop(columns=dropped_cols, errors='ignore')
X_test = X_test.drop(columns=dropped_cols, errors='ignore')

X

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,3,male,22.0,1,7.2500
2,1,1,female,38.0,1,71.2833
3,1,3,female,26.0,0,7.9250
4,1,1,female,35.0,1,53.1000
5,0,3,male,35.0,0,8.0500
...,...,...,...,...,...,...
887,0,2,male,27.0,0,13.0000
888,1,1,female,19.0,0,30.0000
889,0,3,female,,1,23.4500
890,1,1,male,26.0,0,30.0000


In [4]:
# Number of missing entries in each column.
X.isnull().sum()


Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Fare          0
dtype: int64

In [5]:
# Rows with missing values are kept on purpose; the numeric pipeline further
# down imputes them instead of dropping the rows.
# Detach the label column from the feature frame in a single step.
y = X.pop('Survived')


In [6]:
# Show the dtype of each remaining feature column.
X.dtypes

Pclass      int64
Sex        object
Age       float64
SibSp       int64
Fare      float64
dtype: object

In [7]:
# Display the feature frame now that the target column has been removed.
X

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3,male,22.0,1,7.2500
2,1,female,38.0,1,71.2833
3,3,female,26.0,0,7.9250
4,1,female,35.0,1,53.1000
5,3,male,35.0,0,8.0500
...,...,...,...,...,...
887,2,male,27.0,0,13.0000
888,1,female,19.0,0,30.0000
889,3,female,,1,23.4500
890,1,male,26.0,0,30.0000


In [8]:
# Feature groups: Pclass and Sex are one-hot encoded; every other column is
# handled as a numeric feature.
categorical_cols = ['Pclass', 'Sex']
numerical_cols = X.columns.difference(categorical_cols)

# Numeric branch: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols),
    ],
    remainder='passthrough',
)

# Fit on the training frame only; the test frame reuses the fitted statistics.
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)


## Logistic regression

In [9]:
# 10-fold accuracy search over the regularization strength and penalty type.
grid = {
    'C': [10 ** exponent for exponent in range(4)],
    "penalty": ["l1", "l2"],
}
logreg = LogisticRegression(random_state=0, solver='liblinear')
logreg_cv = GridSearchCV(estimator=logreg, param_grid=grid, scoring='accuracy', cv=10)
logreg_cv.fit(X_processed, y)

print("tuned hpyerparameters :(best parameters) ", logreg_cv.best_params_)
print("accuracy :", logreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 1, 'penalty': 'l2'}
accuracy : 0.7934581772784021


In [10]:
# Refit on the full training set with the best configuration from the search.
logreg_model = LogisticRegression(
    C=1,
    penalty='l2',
    solver='liblinear',
    random_state=0,
)
logreg_model.fit(X_processed, y)

In [11]:
# Logistic-regression predictions for the test passengers, indexed by passenger id.
y_pred_test = pd.DataFrame(
    {'Survived': logreg_model.predict(X_test_processed)},
    index=X_test.index,
)

In [12]:
#y_pred_test.to_csv('submission.csv', index=True)


## Decision tree

In [13]:
from sklearn.tree import DecisionTreeClassifier

# 10-fold accuracy search over feature sampling, pruning strength, depth and
# split criterion.
grid = dict(
    max_features=['sqrt', 'log2'],
    ccp_alpha=[0.1, .01, .001],
    max_depth=list(range(5, 10)),
    criterion=['gini', 'entropy'],
)
dectree = DecisionTreeClassifier(random_state=0)
dectree_cv = GridSearchCV(estimator=dectree, param_grid=grid, scoring='accuracy', cv=10)
dectree_cv.fit(X_processed, y)

print("tuned hpyerparameters :(best parameters) ", dectree_cv.best_params_)
print("accuracy :", dectree_cv.best_score_)


tuned hpyerparameters :(best parameters)  {'ccp_alpha': 0.001, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'log2'}
accuracy : 0.8249563046192259


In [14]:
# Refit on the full training set with the best configuration from the search.
dectree_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=8,
    max_features='log2',
    ccp_alpha=0.001,
    random_state=0,
)
dectree_model.fit(X_processed, y)

In [15]:
# Decision-tree predictions for the test passengers, indexed by passenger id.
y_pred_test = pd.DataFrame(
    {'Survived': dectree_model.predict(X_test_processed)},
    index=X_test.index,
)
# Uncomment to write these predictions as the submission file:
# y_pred_test.to_csv('submission.csv', index=True)


## Random Forest

In [16]:

# 5-fold accuracy search over forest size, leaf/split minimums and split criterion.
rf_classifier = RandomForestClassifier(random_state=0)

param_grid = dict(
    n_estimators=list(range(300, 500, 50)),
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[2, 3],
    criterion=['entropy', 'gini'],
)

grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_processed, y)

print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best score:{grid_search.best_score_}")

Best parameters found: {'criterion': 'gini', 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 400}
Best score:0.8373109032703535


In [17]:
# Refit on the full training set with the configuration the random-forest grid
# search reported as best (criterion='gini', min_samples_leaf=3,
# min_samples_split=2, n_estimators=400). The previous cell used
# n_estimators=350, which does not correspond to the reported CV score.
# random_state=0 matches the estimator used during the search, so the fitted
# forest is reproducible across kernel restarts.
rf_model = RandomForestClassifier(
    criterion='gini',
    min_samples_leaf=3,
    min_samples_split=2,
    n_estimators=400,
    random_state=0,
)
rf_model.fit(X_processed, y)

In [18]:
# Random-forest predictions for the test passengers, indexed by passenger id.
y_pred_test = pd.DataFrame(
    {'Survived': rf_model.predict(X_test_processed)},
    index=X_test.index,
)
# Uncomment to write these predictions as the submission file:
# y_pred_test.to_csv('submission.csv', index=True)


## SVM

In [19]:

# Grid search over the SVM margin penalty, kernel width and kernel type, using
# GridSearchCV's default cross-validation and scoring.
sv_classifier = SVC(random_state=0)

param_grid = dict(
    C=[0.1, 1, 10],
    gamma=[1, 0.1, 0.01],
    kernel=['rbf', 'linear'],
)

grid = GridSearchCV(estimator=sv_classifier, param_grid=param_grid)
grid.fit(X_processed, y)

print(f"Best parameters found: {grid.best_params_}")
print(f"Best score:{grid.best_score_}")

Best parameters found: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Best score:0.8137153976523759


In [20]:
# Refit on the full training set with the best configuration from the search.
svm_model = SVC(
    kernel='rbf',
    C=1,
    gamma=1,
)
svm_model.fit(X_processed, y)

In [21]:
# SVM predictions for the test passengers, indexed by passenger id.
y_pred_test = pd.DataFrame(
    {'Survived': svm_model.predict(X_test_processed)},
    index=X_test.index,
)
# Uncomment to write these predictions as the submission file:
# y_pred_test.to_csv('submission.csv', index=True)


## KNN

In [22]:

# 10-fold accuracy search over the neighbourhood size k = 1..39.
knn = KNeighborsClassifier()
k_range = [k for k in range(1, 40)]
param_grid = {'n_neighbors': k_range}

grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=10)
grid.fit(X_processed, y)

print(f"Best parameters found: {grid.best_params_}")
print(f"Best score:{grid.best_score_}")

Best parameters found: {'n_neighbors': 5}
Best score:0.8137578027465668


In [23]:
# Refit on the full training set with the neighbourhood size the KNN grid
# search reported as best (n_neighbors=5). The previous cell used
# n_neighbors=30, which is not the configuration behind the reported CV score.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_processed, y)

In [24]:
# KNN predictions for the test passengers, indexed by passenger id.
y_pred_test = pd.DataFrame(
    {'Survived': knn_model.predict(X_test_processed)},
    index=X_test.index,
)
# Uncomment to write these predictions as the submission file:
# y_pred_test.to_csv('submission.csv', index=True)


## XGBoost

In [25]:
# 10-fold accuracy search over the main boosting hyperparameters.
xgb_model = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1, 0.3],
)

grid_search = GridSearchCV(xgb_model, param_grid, scoring='accuracy', cv=10, n_jobs=-1)
grid_search.fit(X_processed, y)

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0.3, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 50, 'subsample': 1.0}
Best Accuracy: 0.8485268414481897


In [26]:
# Refit on the full training set with the configuration the XGBoost grid search
# reported as best (colsample_bytree=0.8, gamma=0.3, learning_rate=0.2,
# max_depth=6, n_estimators=50, subsample=1.0). The previous cell used gamma=0,
# learning_rate=0.01 and n_estimators=200, so the submitted model was not the
# one whose cross-validated accuracy is reported.
xg_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    colsample_bytree=0.8,
    gamma=0.3,
    learning_rate=0.2,
    max_depth=6,
    n_estimators=50,
    subsample=1.0,
)
xg_model.fit(X_processed, y)

In [27]:
# XGBoost predictions for the test passengers, indexed by passenger id, written
# out as the submission file.
y_pred_test = pd.DataFrame(
    {'Survived': xg_model.predict(X_test_processed)},
    index=X_test.index,
)
y_pred_test.to_csv('submission.csv', index=True)
