# Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Loading data
We will load the Titanic dataset, obtained from Kaggle, using the pandas library.

In [None]:
# Read the training CSV into a DataFrame that the rest of the notebook works on.
data = pd.read_csv(filepath_or_buffer="train.csv")

Let us take a look at the first five rows to see the available features and their entries.

In [None]:
# Preview the first five rows.
data.head(n=5)

# Evaluating Missing Data

In [None]:
# Count the missing entries per column; the raw boolean mask of every cell
# is too large to read and is visualised by the heatmap in the next cell.
data.isnull().sum()

In [None]:
# Visualise where values are missing: every cell of the null mask is coloured
# by whether it is NaN, with row labels and the colour bar hidden.
sns.heatmap(data.isna(), cmap='viridis', cbar=False, yticklabels=False)

It looks like the Age column has a few NaN values, which can be imputed using statistical methods. The Cabin column, however, has a very high proportion of NaN values, so it should be removed.

In [None]:
# Bar chart of how many passengers fall into each Survived value.
sns.set_style(style='whitegrid')
sns.countplot(data=data, x='Survived')

In [None]:
# Survived counts, split into one bar per Sex value.
sns.set_style(style='whitegrid')
sns.countplot(data=data, x='Survived', hue='Sex', palette='RdBu')

Unfortunately for the men, most of the passengers who survived were women.

In [None]:
# Survived counts, split into one bar per passenger class.
sns.set_style(style='whitegrid')
sns.countplot(data=data, x='Survived', hue='Pclass', palette='rainbow')

We can see that most of the poorer (lower-class) passengers died, while a larger number of the richer (higher-class) passengers survived.

In [None]:
# distplot is deprecated in seaborn; histplot is its replacement.
# stat='density' keeps the histogram normalised so it shares a scale with the KDE curve.
sns.histplot(data['Age'].dropna(), kde=True, stat='density', color='darkred', bins=30)

We can see that the largest number of passengers on board were aged between 20 and 30.

In [None]:
# Number of passengers for each SibSp value.
sns.set_style(style='whitegrid')
sns.countplot(data=data, x='SibSp')

So we can see that most passengers actually travelled alone (with no sibling or spouse, as counted by SibSp) on this wonderful ship.

In [None]:
# distplot is deprecated in seaborn; histplot draws the same count histogram.
sns.histplot(data['Fare'].dropna(), kde=False, color='darkred', bins=40)

Therefore, it is evident that most passengers paid a low fare.

In [None]:
# Age distribution per passenger class, drawn on a wider canvas.
plt.figure(figsize=(12, 7))
sns.boxplot(data=data, x='Pclass', y='Age', palette='winter')

We can see that the typical age (the line inside each box is the median) differs between passenger classes. So let us replace the NaN values of Age with a representative, roughly average, age for the passenger's class.

In [None]:
def impute_age(col):
    """Return a passenger's age, filling a missing value from the passenger class.

    Parameters
    ----------
    col : pandas.Series or sequence
        Two values in positional order: the age first, the passenger class
        second (e.g. one row of ``data[['Age', 'Pclass']]``).

    Returns
    -------
    The age unchanged when it is present; otherwise 38 for class 1, 29 for
    class 2 and 24 for any other class value.
    """
    # Read the two fields by position explicitly. Plain ``col[0]`` on a Series
    # whose index holds string labels relies on a positional fallback that
    # pandas has deprecated, so ``.iloc`` is used for Series input.
    if isinstance(col, pd.Series):
        age, pclass = col.iloc[0], col.iloc[1]
    else:
        age, pclass = col[0], col[1]

    if not pd.isnull(age):
        return age

    # Fallback ages per class; anything that is not class 1 or 2 gets 24.
    fallback_by_class = {1: 38, 2: 29}
    return fallback_by_class.get(pclass, 24)

In [None]:
# Apply impute_age to each (Age, Pclass) row and write the result back to Age.
data['Age'] = data[['Age', 'Pclass']].apply(impute_age, axis='columns')

In [None]:
# Re-draw the missing-value map to check the Age column after imputation.
sns.heatmap(data.isna(), cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Rebind instead of dropping in place, and tolerate an already-removed column,
# so that re-running this cell does not raise a KeyError.
data = data.drop(columns='Cabin', errors='ignore')

In [None]:
# Re-draw the missing-value map now that the Cabin column has been dropped.
sns.heatmap(data.isna(), cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Preview the first five rows of the cleaned frame.
data.head(n=5)

Here Sex and Embarked are useful categorical variables. We cannot use them unless they are converted to numerical data, so let us first convert them into numerical (dummy) columns.

In [None]:
# Indicator columns for Embarked; the first category is dropped.
embark = pd.get_dummies(data=data['Embarked'], drop_first=True)

In [None]:
# Indicator columns for Sex; the first category is dropped.
sex = pd.get_dummies(data=data['Sex'], drop_first=True)

Now let us add those new dataframes to our dataset.

In [None]:
# Append the indicator columns side by side with the existing features.
data = pd.concat(objs=[data, sex, embark], axis='columns')

In [None]:
# Preview the first five rows with the new indicator columns attached.
data.head(n=5)

Now let us remove the unnecessary variables that do not provide much value to us.

In [None]:
# Drop the identifier/free-text columns and the categorical columns that the
# indicator columns replace, then preview the result.
data = data.drop(columns=['PassengerId', 'Name', 'Sex', 'Ticket', 'Embarked'])
data.head(n=5)

In [None]:
# Summarise column dtypes and non-null counts instead of displaying the whole
# frame, whose first rows were already shown by the previous cell.
data.info()

Now the data is ready for the modelling phase. Let us use logistic regression to create a baseline model.

In [None]:
# Final look at the missing-value map of the frame that goes into the model.
sns.heatmap(data.isna(), cmap='viridis', cbar=False, yticklabels=False)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='Survived'),
    data['Survived'],
    test_size=0.3,
    random_state=24,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# The features are not scaled, so the solver may need more than the default
# 100 iterations to converge; give it a larger budget.
model = LogisticRegression(max_iter=1000)

In [None]:
# Fit the baseline classifier on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the Survived label for every held-out row.
predicted = model.predict(X=X_test)

In [None]:
# Show only the first few predictions rather than printing the whole array.
predicted[:10]

In [None]:
pip uninstall sklearn

In [None]:
pip uninstall scikit-learn

In [None]:
pip install sklearn