In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import re
import seaborn as sns

In [2]:
train = pd.read_csv('training_titanic_x_y_train.csv')
test = pd.read_csv('test_titanic_x_test.csv')

In [3]:
train.columns


Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked', 'Survived'],
      dtype='object')

In [4]:
def Ticket(ticket):
    if ticket.split(' ')[-1][0]=='L':
        return 8
    else:
        return int(ticket.split(' ')[-1][0])

In [5]:
def title(name):
    tit = re.search('([A-Za-z]+)\.',name)
    if tit:
        return tit.group(1)
    return ''
        

In [6]:
def clean_data(train):
    train.drop('Cabin',inplace = True, axis = 1)
    #train.drop('Ticket',inplace = True, axis = 1)
    
    
    train['Sex'].replace('female',1,inplace =True)
    train['Sex'].replace('male',0,inplace =True)

    train['Embarked'].fillna(train.Embarked.mode()[0],inplace = True)
        
    train['Embarked'].replace('S',0,inplace = True)
    train['Embarked'].replace('C',1,inplace = True)
    train['Embarked'].replace('Q',2,inplace = True)
    
    #train['Age'].fillna(train.Age.median(),inplace = True)
    train['Age'].fillna(-1,inplace = True)
    
    
    train['Fare_bin'] = pd.cut(train['Fare'], bins = [0,20,40,100,513], labels = ['Low','Median','Average','High'])
    
    train['FamilySize'] = train['SibSp'] + train['Parch']
    
    train['Title'] = train['Name'].apply(title)
    train['Ticket'] = train['Ticket'].apply(Ticket)
    
    train['Title'].replace('Mlle','Miss',inplace = True)
    train['Title'].replace('Ms','Miss',inplace = True)
    train['Title'].replace('Mme','Mr',inplace = True)
    
    train['Title'].replace(['Rev','Dr','Col','Countess','Lady','Sir','Major','Capt','Don','Jonkheer'],'Rare',inplace = True)
    
    
    train.loc[(train['Age'] == -1) &(train['Title'] == 'Master'), 'Age'] = 4.57
    train.loc[(train['Age'] == -1) &(train['Title'] == 'Miss'), 'Age'] = 21.84
    train.loc[(train['Age'] == -1) &(train['Title'] == 'Mr'), 'Age'] = 32.36
    train.loc[(train['Age'] == -1) &(train['Title'] == 'Mrs'), 'Age'] = 35.78
    train.loc[(train['Age'] == -1) &(train['Title'] == 'Rare'), 'Age'] = 45.54
    train['Age'] = train['Age'].astype(int)  
    
    train.loc[ train['Age'] <= 11, 'Age'] = 0
    train.loc[(train['Age'] > 11) & (train['Age'] <= 18), 'Age'] = 1
    train.loc[(train['Age'] > 18) & (train['Age'] <= 22), 'Age'] = 2
    train.loc[(train['Age'] > 22) & (train['Age'] <= 27), 'Age'] = 3
    train.loc[(train['Age'] > 27) & (train['Age'] <= 33), 'Age'] = 4
    train.loc[(train['Age'] > 33) & (train['Age'] <= 40), 'Age'] = 5
    train.loc[(train['Age'] > 40) & (train['Age'] <= 66), 'Age'] = 6
    train.loc[ train['Age'] > 66, 'Age'] = 7
    
   
    
    train.drop('Fare',inplace = True, axis = 1)
    train.drop('Name',inplace = True, axis = 1)
    train.drop('SibSp',inplace = True, axis = 1)
    train.drop('Parch',inplace = True, axis = 1)
    
    
    train = pd.get_dummies(train,columns = ['Pclass','Sex','Age','Fare_bin','Title','Embarked','Ticket'], prefix =  ['Pclass','Sex','Age','Fare_type','Title','Embarked_from','Ticket'])
    
    return train
   

    
    

In [7]:
train = clean_data(train)


In [8]:
test = clean_data(test)

In [9]:
train.shape


(668, 36)

In [10]:
test.shape

(223, 35)

In [11]:
x = train.drop('Survived', axis = 1).values
y = train['Survived']

In [12]:
clf = LogisticRegression(solver='liblinear',penalty = 'l2' , )

In [13]:
clf.fit(x,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
clf.score(x,y)

0.8368263473053892

In [15]:
y_pred =  clf.predict(test)

In [16]:
y_pred

array([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0], dtype=int64)

np.savetxt('y_pred.csv',y_pred,delimiter=  ',')

In [17]:
y_train_pred = clf.predict(x)

In [18]:
from sklearn.metrics import confusion_matrix

In [19]:
confusion_matrix(y,y_train_pred)

array([[358,  41],
       [ 68, 201]], dtype=int64)