In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


In [16]:
# Load the passenger records from titanic.csv (path is relative to the working directory).
df = pd.read_csv('titanic.csv')

In [17]:
# Preview the first ten rows to check the available columns and sample values.
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [18]:
# Print each column's dtype and non-null count; use this to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [20]:
# (rows, columns) of the frame as loaded.
df.shape

(891, 12)

In [21]:
# Keep the four model features plus the target, drop rows with a missing value
# in any of them, and encode Sex as 0 (male) / 1 (female).
#
# The steps are chained into a fresh frame rather than using dropna(inplace=True)
# followed by a column assignment on the sliced frame, which is the pattern
# pandas flags with SettingWithCopyWarning.
#
# The encoding is idempotent: values that are not one of the known labels are
# passed through unchanged. A plain `.map(dict)` would turn the already-encoded
# 0/1 values into NaN if this cell were run a second time (the cell overwrites
# `df`), and a third run would then drop every row.
SEX_CODES = {'male': 0, 'female': 1}
df = (
    df[['Fare', 'Age', 'Pclass', 'Sex', 'Survived']]
    .dropna()
    .assign(Sex=lambda frame: frame['Sex'].map(lambda label: SEX_CODES.get(label, label)))
)


In [22]:
# Separate the target column from the feature columns.
Y = df['Survived']
X = df.drop(columns='Survived')

In [23]:
from sklearn.preprocessing import MinMaxScaler

# Split BEFORE scaling. Fitting the scaler on the full dataset lets the test
# rows influence the per-feature min/max used to transform the training rows
# (data leakage), which makes the reported test accuracy optimistic.
# The same test_size / random_state on the same number of rows selects the same
# train/test rows as splitting the pre-scaled matrix did.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Learn the min/max from the training rows only, then apply that same
# transformation to the held-out rows. Test values may fall slightly outside
# [0, 1]; that is expected.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so that any cell still referring to X_scaled continues to work; it is
# the full feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)


***Logistic Regression***

In [26]:
# Logistic regression baseline; max_iter is raised to 1000 iterations.
# fit() returns the estimator itself, so construction and training are chained.
logreg = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
Y_pred_log = logreg.predict(X_test)

***Random Forest***

In [31]:
# Random forest with 100 trees; the seed is fixed so the fitted forest is reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, Y_train)
Y_pred_rf = rf.predict(X_test)

***CatBoost***

In [34]:
# CatBoost with default hyperparameters; verbose=0 silences the per-iteration training log.
cat = CatBoostClassifier(verbose=0, random_state=42).fit(X_train, Y_train)
Y_pred_cat = cat.predict(X_test)

***SVM***

In [36]:
# Support vector classifier with default kernel settings.
# probability=True is kept so predict_proba stays available on the fitted model.
svm = SVC(probability=True, random_state=42).fit(X_train, Y_train)
Y_pred_svm = svm.predict(X_test)

***KNN***

In [41]:
# k-nearest neighbours classifier voting over the 5 closest training rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)

***XGBoost***

In [43]:
# Gradient-boosted trees via XGBoost, evaluated with log loss during training.
# The former `use_label_encoder=False` argument is dropped: the installed
# XGBoost no longer recognises it and only emits a
# 'Parameters: { "use_label_encoder" } are not used.' warning.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, Y_train)
Y_pred_xgb = xgb.predict(X_test)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


***Decision Tree Classifier***

In [50]:
# Single decision tree with default settings; the seed is fixed for reproducibility.
dec = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train)
Y_pred_dec = dec.predict(X_test)

***Naive Bayes***

In [46]:
# Gaussian naive Bayes with default settings.
nb = GaussianNB().fit(X_train, Y_train)
Y_pred_nb = nb.predict(X_test)

In [None]:
def print_metrics(name, Y_true, Y_pred):
    """Print the accuracy of one model's predictions, followed by a divider line.

    Args:
        name: Label shown in the "Metrics for ..." heading.
        Y_true: Ground-truth labels.
        Y_pred: Predicted labels, aligned with ``Y_true``.
    """
    accuracy = round(accuracy_score(Y_true, Y_pred), 4)
    divider = "-" * 30

    print(f"Metrics for {name}")
    print("Accuracy:", accuracy)
    print(divider)


In [51]:
# Report test-set accuracy for every fitted model. The mapping preserves
# insertion order, so the reports are printed in the order listed here.
predictions_by_model = {
    "Logistic Regression": Y_pred_log,
    "Random Forest": Y_pred_rf,
    "CatBoost": Y_pred_cat,
    "SVM": Y_pred_svm,
    "KNN": Y_pred_knn,
    "XGBoost": Y_pred_xgb,
    "Decision Tree": Y_pred_dec,
    "Naive Bayes": Y_pred_nb,
}
for model_name, model_pred in predictions_by_model.items():
    print_metrics(model_name, Y_test, model_pred)

Metrics for Logistic Regression
Accuracy: 0.7552
------------------------------
Metrics for Random Forest
Accuracy: 0.7622
------------------------------
Metrics for CatBoost
Accuracy: 0.7762
------------------------------
Metrics for SVM
Accuracy: 0.7343
------------------------------
Metrics for KNN
Accuracy: 0.7622
------------------------------
Metrics for XGBoost
Accuracy: 0.7483
------------------------------
Metrics for Decision Tree
Accuracy: 0.7063
------------------------------
Metrics for Naive Bayes
Accuracy: 0.7413
------------------------------
