In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import precision_recall_curve
# GridSearchCV and train_test_split live in sklearn.model_selection; the old
# sklearn.grid_search and sklearn.cross_validation modules no longer exist in
# current scikit-learn releases, so importing from them fails.
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
# Load the train and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Preview the last rows of the training frame
train.tail()

In [None]:
# Preview the last rows of the test frame
test.tail()

In [None]:
# Column summary of the training frame (dtypes and non-null counts)
train.info()

In [None]:
# Column summary of the test frame (dtypes and non-null counts)
test.info()

In [None]:
# (rows, columns) of the training frame
train.shape

In [None]:
# (rows, columns) of the test frame
test.shape

Data Exploration

In [None]:
# Age: compare the age distribution of survivors and non-survivors
fig=plt.figure(figsize=(10,5))

# Plot the ages themselves (missing values dropped). Calling value_counts()
# here would estimate the density of "passengers per distinct age" instead
# of the density of ages.
age1=train.loc[train['Survived']==1,'Age'].dropna()
age2=train.loc[train['Survived']==0,'Age'].dropna()

sns.set_style('white')
# kdeplot draws the same KDE curve as the deprecated distplot(hist=False)
sns.kdeplot(age1,label='Survived')
sns.kdeplot(age2,label='Died')
plt.xlabel('Age')
plt.legend();

In [None]:
# Sex: passenger counts per sex, split by survival outcome
sns.countplot(data=train, x='Sex', hue='Survived')

In [None]:
# Pclass: Survived per passenger class, split by sex
sns.barplot(data=train, x='Pclass', y='Survived', hue='Sex')

In [None]:
# SibSp & Parch: Survived by the combined SibSp + Parch count
fig = plt.figure(figsize=(10, 5))

# Exploration-only helper column; it is dropped again in the Data Cleaning section
train["Family"] = train['SibSp'].add(train['Parch'])
sns.barplot(data=train, x='Family', y='Survived')

In [None]:
# Embarked: passenger counts per embarkation value, split by survival outcome
sns.countplot(data=train, x='Embarked', hue='Survived')

Data Cleaning

In [None]:
# Remove the exploration-only Family column, then count missing values per column
train.drop(columns=['Family'], inplace=True)
train.isna().sum()

In [None]:
# Missing values per column in the test frame
test.isna().sum()

In [None]:
# Embarked: replace missing values with "S"
embarked_col = train["Embarked"]
train['Embarked'] = embarked_col.where(embarked_col.notna(), "S")

In [None]:
# Fare: fill missing test fares with the mean test fare.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection can modify a temporary object and leave `test` unchanged.
test['Fare']=test['Fare'].fillna(test["Fare"].mean())

In [None]:
# Drop the columns that are not used as features.
# PassengerId is kept in `test` because the submission step reads it.
train.drop(columns=['PassengerId','Name','Ticket','Cabin'], inplace=True)
test.drop(columns=['Name','Ticket','Cabin'], inplace=True)

In [None]:
# Preview the training frame after cleaning
train.head()

Feature Engineering

In [None]:
# Create feature Family = SibSp + Parch, then drop the two source columns.
# inplace must be a real bool: pandas validates the argument and raises a
# ValueError for the integer 1.
train["Family"]=train['SibSp']+train['Parch']
train.drop(['SibSp','Parch'],axis=1,inplace=True)

test["Family"]=test['SibSp']+test['Parch']
test.drop(['SibSp','Parch'],axis=1,inplace=True)

In [None]:
# Encode Sex as an integer: female -> 0, male -> 1.
# map() rewrites the whole column in one assignment. The previous chained form
# `frame['Sex'].loc[mask] = value` can write to a temporary copy
# (SettingWithCopyWarning) and leaves the column with object dtype.
# Any value other than 'female'/'male' becomes NaN, so run this cell once per load.
sex_codes={'female':0,'male':1}
train['Sex']=train['Sex'].map(sex_codes)
test['Sex']=test['Sex'].map(sex_codes)

In [None]:
# One-hot encode the Pclass and Embarked columns in both frames
dummy_cols = ["Pclass", "Embarked"]
train = pd.get_dummies(train, columns=dummy_cols)
test = pd.get_dummies(test, columns=dummy_cols)

In [None]:
# Age: impute missing ages with random integers drawn from
# [mean - std, mean + std), using each frame's own observed ages.
# A dedicated, seeded generator keeps the imputation reproducible on Restart & Run All.
age_rng=np.random.RandomState(42)

mean1 = train["Age"].mean()
std1= train["Age"].std()
count1 = int(train["Age"].isnull().sum())

mean2= test["Age"].mean()
std2= test["Age"].std()
count2 = int(test["Age"].isnull().sum())

# randint expects integer bounds, so the float statistics are truncated explicitly
rand1=age_rng.randint(int(mean1-std1),int(mean1+std1),size=count1)
rand2=age_rng.randint(int(mean2-std2),int(mean2+std2),size=count2)

# Assign through .loc on the frame itself; the chained form
# `frame['Age'][mask] = values` may write to a temporary copy.
train.loc[train['Age'].isnull(),'Age']=rand1
test.loc[test['Age'].isnull(),'Age']=rand2

In [None]:
# Standardization: scale the numeric features with a scaler fitted on train only
num_cols = ['Age', 'Fare', 'Family']
sd_train = train[num_cols]
sd_test = test[num_cols]

# fit() returns the scaler itself, so construction and fitting can be chained
sds = StandardScaler().fit(sd_train)

sds_xtrain1 = sds.transform(sd_train)
sds_xtest1 = sds.transform(sd_test)
train[num_cols] = sds_xtrain1
test[num_cols] = sds_xtest1

In [None]:
# Preview the training frame after feature engineering and scaling
train.head()

In [None]:
# Separate the features from the Survived target and hold out 20% for validation.
# random_state is fixed so the split, and every score computed on it, is
# reproducible across kernel restarts.
xdata=train.drop("Survived",axis=1)
ydata=train["Survived"]
xtrain,xtest,ytrain,ytest = train_test_split(xdata,ydata,test_size = 0.2,random_state=42)

Logistic Regression

In [None]:
# Grid-search C and max_iter for logistic regression with 5-fold cross-validation
lr = LogisticRegression()
param = {
    "C": [0.001, 0.01, 0.1, 1],
    'max_iter': [10, 50, 100, 200],
}
gs = GridSearchCV(estimator=lr, param_grid=param, cv=5)
gs.fit(xtrain, ytrain)

In [None]:
# Keep the best logistic regression found by the grid search
lr=gs.best_estimator_

In [None]:
# Fit the selected model on the training split
# NOTE(review): GridSearchCV presumably already refit best_estimator_ on xtrain (refit default) — confirm
lr.fit(xtrain,ytrain)

In [None]:
# Score the logistic regression on the held-out split
lr.score(xtest,ytest)

Random Forest

In [None]:
# Grid-search the number of trees (5..49) for a random forest with 5-fold CV.
# random_state is fixed so differences between candidates come from
# n_estimators rather than from run-to-run randomness in tree construction.
rfc=RandomForestClassifier(random_state=42)
param1={"n_estimators":list(range(5,50))}
gs1=GridSearchCV(rfc,param1,cv=5)
gs1.fit(xtrain,ytrain)

In [None]:
# Keep the best random forest found by the grid search
rfc=gs1.best_estimator_

In [None]:
# Fit the selected forest on the training split
# NOTE(review): GridSearchCV presumably already refit best_estimator_ on xtrain (refit default) — confirm
rfc.fit(xtrain,ytrain)

In [None]:
# Score the random forest on the held-out split
rfc.score(xtest,ytest)

SVM

In [None]:
# Grid-search the kernel and C for an SVM classifier with 5-fold cross-validation
clf = svm.SVC(probability=True)
param2 = {
    "kernel": ("linear", "rbf"),
    "C": [0.001, 0.01, 0.1, 1],
}
gs2 = GridSearchCV(estimator=clf, param_grid=param2, cv=5)
gs2.fit(xtrain, ytrain)

In [None]:
# Keep the best SVM found by the grid search
clf=gs2.best_estimator_

In [None]:
# Fit the selected SVM on the training split
# NOTE(review): GridSearchCV presumably already refit best_estimator_ on xtrain (refit default) — confirm
clf.fit(xtrain,ytrain)

In [None]:
# Score the SVM on the held-out split
clf.score(xtest,ytest)

Make submission

In [None]:
# Keep the passenger ids for the submission and build the feature matrix without them.
# Note: this rebinds `xtest`, which until now held the validation features.
ID = test['PassengerId']
xtest = test.drop(columns=['PassengerId'])

In [None]:
# Predict with the tuned SVM (clf) on the submission features.
# Note: this rebinds `ytest`, which until now held the validation labels.
ytest=clf.predict(xtest)

In [None]:
# Assemble the submission: one predicted Survived value per PassengerId
submission=pd.DataFrame({'PassengerId':ID,'Survived':ytest})

In [None]:
# Write the submission to titanic.csv without the index column
submission.to_csv("titanic.csv",index=False)