### Logistic Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
train = pd.read_csv("titanic_train.csv")
train.head(10)

### EDA

In [None]:
train.isnull().sum()

In [None]:
sns.heatmap(train.isnull(), cbar=False, yticklabels=False, cmap='viridis')

In [None]:
sns.countplot(x='Survived', data=train)

In [None]:
sns.countplot(x='Survived', data=train, hue='Sex')

In [None]:
sns.countplot(x='Survived', data=train, hue='Pclass')

In [None]:
df_p3 = train[train['Pclass'] == 1]
df_p3

In [None]:
sns.countplot(x='Sex', data=df_p3, hue='Survived')

In [None]:
sns.distplot(train['Age'].dropna(), bins=30)

In [None]:
sns.countplot(x='SibSp', data=train)

In [None]:
train['Fare'].hist(color='g', bins=50)

In [None]:
%pip install cufflinks

In [None]:
import cufflinks as cf
cf.go_offline()

In [None]:
train['Fare'].iplot(kind='hist', bins=30, color='green')

### Data Cleaning

In [None]:
plt.figure(figsize=(12, 7))
sns.boxplot(x='Pclass', y='Age', data=train)

In [None]:
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        
        if Pclass == 1:
            return 38
        elif Pclass == 2:
            return 29
        else:
            return 25
        
    else:
        return Age
    

In [None]:
train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis=1)

In [None]:
sns.heatmap(train.isnull(), cbar=False, yticklabels=False, cmap='viridis')

In [None]:
train.drop('Cabin', axis=1, inplace=True)

In [None]:
train.head()

In [None]:
sns.heatmap(train.isnull(), cbar=False, yticklabels=False, cmap='viridis')

### Convert Categorical Features

In [None]:
sex = pd.get_dummies(train['Sex'])

In [None]:
sex

In [None]:
sex = pd.get_dummies(train['Sex'], drop_first=True)
embarked = pd.get_dummies(train['Embarked'], drop_first=True)

In [None]:
sex.head()

In [None]:
train.drop(['Sex', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)

In [None]:
train.head()


In [None]:
sex.head()

In [None]:
embarked.head()

In [None]:
train = pd.concat([train, sex, embarked], axis=1)

In [None]:
train.head()

In [None]:
train.drop('PassengerId', axis=1, inplace=True)


In [24]:
train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


### Build a Logistic Regression Model

In [25]:
from sklearn.model_selection import train_test_split


In [27]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('Survived', axis=1), train['Survived'], test_size=0.33, random_state=42)

In [28]:
X_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male,Q,S
6,1,54.0,0,0,51.8625,1,0,1
718,3,25.0,0,0,15.5000,1,1,0
685,2,25.0,1,2,41.5792,1,0,0
73,3,26.0,1,0,14.4542,1,0,0
882,3,22.0,0,0,10.5167,0,0,1
...,...,...,...,...,...,...,...,...
106,3,21.0,0,0,7.6500,0,0,1
270,1,38.0,0,0,31.0000,1,0,1
860,3,41.0,2,0,14.1083,1,0,1
435,1,14.0,1,2,120.0000,0,0,1


### Training and Predicting

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
logmodel = LogisticRegression()

In [32]:
logmodel.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [33]:
predictions = logmodel.predict(X_test)

### Classification Report

In [36]:
from sklearn.metrics import classification_report

In [38]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       175
           1       0.80      0.74      0.77       120

    accuracy                           0.82       295
   macro avg       0.82      0.81      0.81       295
weighted avg       0.82      0.82      0.82       295



In [39]:
from sklearn.metrics import confusion_matrix

In [42]:
print(confusion_matrix(y_test, predictions))

[[153  22]
 [ 31  89]]
