In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

Step 1: 
Cleaning the data:
1. Replacing Nans in Cabin data with unknown.
2. Extracting titles from names and compressing them into four categories:Mr,Mrs,Ms,Master
3. Getting Deck data from cabin
4. Dummy variables for Deck, titles, embarked columns
5. Replacing Sex with a binary variables : 1 for Men and 0 for Women.
6. Dropped rows with Nans - Age and Embarked

Step 2:
Feature Selection:
1. Created a new variable Family_Count as the sum of Sibsp and Parch.
2. Selected features that are only present in both train and test data.

Step 3:
Modelling 
1. Constructing a model using logicticregression function from sklearn on the train dataset.
2. Implementing the model on the test dataset.


In [2]:
 train = pd.read_csv('train.csv')

def substrings_in_string(big_string, substrings):
    if " " in big_string:
        big_string = big_string.split(" ")
        
    for substring in substrings:
        if substring in big_string:
            return substring
    return np.nan

def replace_titles(x):
    title=x['Titles']
    if title in ['Dona.', 'Don.', 'Major.', 'Capt.', 'Jonkheer.', 'Rev.', 'Col.', 'Sir.']:
        return 'Mr.'
    elif title in ['the Countess.', 'Mme.']:
        return 'Mrs.'
    elif title in ['Mlle.', 'Ms.', 'Lady.']:
        return 'Miss.'
    elif title =='Dr.':
        if x['Sex']=='Male':
            return 'Mr.'
        else:
            return 'Mrs.'
    else:
        return title

def get_decks(df):
    decks=[]
    for val in df:
        val_str = str(val)
        decks.append(val_str[0])
    unique_decks = list(set(decks))
    return pd.Series(decks)

def get_titles(df):
    new_arr=[]
    for row in df:
        x = row.split(", ")
        y = x[1].split(". ")[0]
        z = y+'.'
        new_arr.append(z)
    return list(set(new_arr))

train['Cabin']=train['Cabin'].fillna("Unknown")

train['Deck']=get_decks(train['Cabin'])
train['Deck'].value_counts()

titles = get_titles(train['Name'])

train['Titles']=train['Name'].map(lambda x: substrings_in_string(x, titles))
train['Titles']=train.apply(replace_titles, axis=1)
train.loc[train['Titles'].isna(),'Titles'] = 'the Countess.'

title_dummies = pd.get_dummies(train['Titles'])
embarked_dummies = pd.get_dummies(train['Embarked'],prefix='Embarked')
deck_dummies = pd.get_dummies(train['Deck'],prefix="Deck")
train = pd.concat([train,title_dummies,embarked_dummies,deck_dummies],axis=1)

train['Age']=train['Age'].fillna(train['Age'].mean())
train.isnull().sum()
train.shape[0]

891

In [3]:
test = pd.read_csv('test.csv')
test.head()
test['Cabin']=test['Cabin'].fillna("Unknown")

test['Deck'] = get_decks(test['Cabin'])
titles_test = get_titles(test['Name'])

test['Titles']=test['Name'].map(lambda x: substrings_in_string(x, titles_test))
test['Titles']=test.apply(replace_titles, axis=1)

title_dummies_test = pd.get_dummies(test['Titles'])
embarked_dummies_test = pd.get_dummies(test['Embarked'],prefix='Embarked')
deck_dummies_test = pd.get_dummies(test['Deck'],prefix='Deck')
test = pd.concat([test,title_dummies_test,embarked_dummies_test,deck_dummies_test],axis=1)

test.isna().sum()

test['Age']=test['Age'].fillna(test['Age'].mean())
test['Fare']=test['Fare'].fillna(test['Fare'].mean())


In [4]:
train.loc[train['Sex'] == 'male','Sex'] = 1
train.loc[train['Sex'] == 'female','Sex'] = 0
test.loc[test['Sex'] == 'male','Sex'] = 1
test.loc[test['Sex'] == 'female','Sex'] = 0

train['Family_Count'] = train['SibSp']+train['Parch']
test['Family_Count'] = test['SibSp']+test['Parch']

to_category = ['Survived','Pclass']
for col in to_category:
    train[col] = train[col].astype('category')

test['Pclass'] = test['Pclass'].astype('category')

train = train.reset_index(drop=True)
test = test.reset_index(drop=True)


In [5]:
features = ['Pclass', 'Sex', 'Age','Fare', 'Master.','Miss.', 'Mr.', 'Mrs.', 'Embarked_C',
            'Embarked_Q', 'Embarked_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E',
            'Deck_F', 'Deck_G', 'Deck_U','Family_Count']

target = 'Survived'
model = LogisticRegression()
model.fit(train[features],train[target])
predicted_probs_train = model.predict_proba(train[features])
predicted_probs_test = model.predict_proba(test[features])

survival_pred_train = pd.Series(np.asarray(np.round(predicted_probs_train[:,1]),dtype=int))
survival_pred_test = pd.Series(np.asarray(np.round(predicted_probs_test[:,1]),dtype=int))

accuracy_train = sum(survival_pred_train == train['Survived'])/len(survival_pred_train)
print(sum(survival_pred_test)/len(survival_pred_test))
print(sum(survival_pred_train)/len(survival_pred_train))
print(accuracy_train)

0.39712918660287083
0.36924803591470257
0.835016835016835


In [15]:
test_survival = pd.DataFrame([test['PassengerId'],survival_pred_test],index=['PassengerId','Survived'])
test_survival = test_survival.transpose()
test_survival = test_survival.set_index('PassengerId')
test_survival.to_csv('test_survival.csv')