In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [10]:
df = pd.read_csv("Titanic.csv")


In [11]:
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])


In [12]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [13]:
sex = pd.get_dummies(df['Sex'], drop_first=True)
embark = pd.get_dummies(df['Embarked'], drop_first=True)

df = pd.concat([df, sex, embark], axis=1)

df = df.drop(['Sex', 'Embarked'], axis=1)


In [14]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,0,3,22.0,1,0,7.25,True,False,True
1,1,1,38.0,1,0,71.2833,False,False,False
2,1,3,26.0,0,0,7.925,False,False,True
3,1,1,35.0,1,0,53.1,False,False,True
4,0,3,35.0,0,0,8.05,True,False,True


In [15]:
def predict_survivors(model_type):
    X = df.drop('Survived', axis=1)
    y = df['Survived']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    if model_type == 'Logistic Regression':
        model = LogisticRegression(random_state=42)
    elif model_type == 'Decision Tree':
        model = DecisionTreeClassifier(random_state=42)
    else:
        print('Model type not recognized.')
        return
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")
    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")


In [16]:
predict_survivors('Logistic Regression')


Confusion matrix:
[[107  13]
 [ 34  60]]
Accuracy score: 0.780373831775701


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
predict_survivors('Decision Tree')


Confusion matrix:
[[101  19]
 [ 32  62]]
Accuracy score: 0.7616822429906542
