In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn import metrics 
import matplotlib as plt
import numpy as np

In [2]:
# Read the training and test tables from CSV files in the working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Preview the first ten rows of the training table.
df_train.head(n=10)

In [None]:
# Summary statistics for the training table.
df_train.describe()

In [None]:
# Number of rows in the training and test tables, as a (train, test) pair.
len(df_train), len(df_test)

In [None]:
# Count the missing values in each training column.
df_train.isna().sum()

In [None]:
def hist_age(data, bins=20):
    """Plot a histogram of the ``Age`` column.

    Parameters
    ----------
    data : object exposing an ``Age`` attribute with a ``hist`` method
        In this notebook, a pandas DataFrame that has an ``Age`` column.
    bins : int, default 20
        Number of histogram buckets. The default matches the value that was
        previously hard-coded.

    Returns
    -------
    Whatever ``data.Age.hist`` returns.
    """
    return data.Age.hist(bins=bins)

In [None]:
def sex_values(data):
    """Draw a horizontal bar chart of how often each ``Sex`` value occurs.

    Returns whatever the ``plot`` call on the value counts returns.
    """
    counts_by_sex = data.Sex.value_counts()
    return counts_by_sex.plot(kind='barh')

In [None]:
def p_class(data):
    """Draw a horizontal bar chart of how often each ``Pclass`` value occurs.

    Returns whatever the ``plot`` call on the value counts returns.
    """
    counts_by_class = data['Pclass'].value_counts()
    return counts_by_class.plot(kind='barh')

In [None]:
def survived(data):
    """Plot the mean of ``Survived`` for each ``Sex`` group as horizontal bars.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with ``Sex`` and ``Survived`` columns. The plot is built from
        this argument; the function no longer reads the global ``df_train``.

    Returns
    -------
    Whatever ``set_xlabel`` returns for the labelled axis.
    """
    mean_by_sex = data.groupby('Sex').Survived.mean()
    return mean_by_sex.plot(kind='barh').set_xlabel('% survive')

In [None]:
# Bar chart of the Sex value counts in the training table.
sex_values(df_train)

In [None]:
# Bar chart of the Pclass value counts in the training table.
p_class(df_train)

In [None]:
# Bar chart of the mean Survived value per Sex group.
survived(df_train)

In [None]:
# Mean of Survived per title. The title is parsed from Name inside this cell,
# so the cell does not depend on a 'Title' column being added to df_train by
# some other cell (presumably the raw CSV has none -- TODO confirm).
df_train.Survived.groupby(
    df_train.Name.str.extract(r' ([A-Za-z]+)\.', expand=False).rename('Title')
).mean()

In [None]:
# Show the title parsed from each Name (the word followed by a period).
# Parsed here rather than read from a 'Title' column so the cell runs on a
# freshly loaded df_train.
df_train.Name.str.extract(r' ([A-Za-z]+)\.', expand=False).rename('Title')

In [None]:
# Encode the parsed titles as integers; titles missing from the mapping
# become NaN. The mapping is defined in this cell and the column is rebuilt
# from Name on every run, so re-running does not re-map already-mapped codes.
dict_last_name = {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 4, 'Other': 5}
df_train['Title'] = (
    df_train.Name.str.extract(r' ([A-Za-z]+)\.', expand=False).map(dict_last_name)
)

In [None]:
# Count the rows whose 'Title' entry is missing (requires df_train to
# already have a 'Title' column).
df_train['Title'].isna().sum(axis=0)

In [14]:
# Summary statistics for the Fare column.
df_train['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [4]:
# Mean Fare within each Survived group.
df_train.groupby('Survived')['Fare'].mean()

Survived
0    22.117887
1    48.395408
Name: Fare, dtype: float64

In [None]:
def clean_data(data):
    """Return a numeric, model-ready copy of a passenger table.

    The input frame is not modified, so the function can be called again on
    the same raw frame and give the same result.

    Steps applied to the copy:

    * ``Sex`` becomes 1 for ``'male'`` and 0 for anything else.
    * ``Embarked`` is encoded as S=0, C=1, Q=2; other or missing values end
      up as 0 through the final fill.
    * ``Title`` is the first run of letters in ``Name`` that is preceded by a
      space and followed by a period. ``Mlle``, ``Mme`` and ``Ms`` are
      treated as ``Miss``. ``Mr``, ``Mrs``, ``Miss`` and ``Master`` keep
      their own codes; every other title, and any name with no title, is
      coded as ``Other``.
    * Missing ``Age`` values are replaced by the mean ``Age`` of this frame.
    * ``Ticket``, ``Cabin`` and ``Name`` are dropped.
    * Any remaining missing values are replaced by 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with at least the columns ``Sex``, ``Embarked``, ``Name``,
        ``Age``, ``Ticket`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.
    """
    # Code 3 is unused; the existing codes are kept so results stay comparable.
    title_codes = {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 4, 'Other': 5}
    # NOTE(review): 'Mme' is mapped to 'Miss' as in the original code --
    # confirm this is the intended grouping.
    title_aliases = {'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss'}
    embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

    # Work on a copy so the caller's frame keeps its raw values.
    data = data.copy()

    data['Sex'] = (data['Sex'] == 'male').astype(int)
    data['Embarked'] = data['Embarked'].map(embarked_codes)

    titles = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    titles = titles.replace(title_aliases)
    # Titles outside the mapping would become NaN and then be filled with 0
    # (the code for 'Mr'), so send them to 'Other' explicitly.
    data['Title'] = (
        titles.map(title_codes).fillna(title_codes['Other']).astype(int)
    )

    data['Age'] = data['Age'].fillna(data['Age'].mean())
    data = data.drop(['Ticket', 'Cabin', 'Name'], axis=1)
    return data.fillna(0)

In [None]:
# Build the cleaned training table.
df_train_clean = clean_data(df_train)

In [None]:
# Build the cleaned test table with the same cleaning function.
df_test_clean = clean_data(df_test)

In [None]:
# List the columns that remain in the cleaned test table.
df_test_clean.columns

In [None]:
# Shape of the cleaned training table restricted to a hand-picked column list.
selected_feature = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
    'Survived',
]
df_train_clean.loc[:, selected_feature].shape

In [None]:
# Preview the first rows of the cleaned training table.
df_train_clean.head()

In [None]:
# Separate the target column from the feature columns.
# NOTE(review): every other cleaned column stays in X, including
# PassengerId -- confirm that it is meant to be a model feature.
y = df_train_clean['Survived']
X = df_train_clean.drop(columns='Survived')

In [None]:
# Distinct title codes present in the feature table.
X['Title'].unique()

In [None]:
from sklearn.preprocessing import MinMaxScaler

def scaled(data, scaler=None, fit=True):
    """Scale ``data`` and return the result as a new DataFrame.

    With the defaults the behaviour is unchanged: a fresh ``StandardScaler``
    is fitted on ``data`` and used to transform it. Passing a ``scaler`` lets
    the caller keep the fitted object and reuse it, so that held-out data can
    be transformed with the statistics learned from the training data
    (``fit=False``) instead of with its own.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Values to scale.
    scaler : object with ``fit_transform`` / ``transform``, optional
        Scaler to use. A new ``StandardScaler`` is created when omitted.
    fit : bool, default True
        When True the scaler is fitted on ``data`` before transforming it.
        When False the scaler is only applied, so it must already be fitted.

    Returns
    -------
    pandas.DataFrame
        Scaled values with default integer row and column labels.

    Raises
    ------
    ValueError
        If ``fit`` is False and no ``scaler`` is supplied.
    """
    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted scaler")
        scaler = StandardScaler()
    values = scaler.fit_transform(data) if fit else scaler.transform(data)
    return pd.DataFrame(values)

In [None]:
# Standardize the training feature matrix.
X_scaled = scaled(X)

In [None]:
# Hold out 20% of the rows for validation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.20,
    random_state=42,
    shuffle=True,
)

In [None]:
def model(X_train, y_train, X_test, y_test, estimator=None):
    """Fit a classifier and return its macro-averaged F1 score.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit the classifier.
    X_test, y_test
        Features and labels used to score it.
    estimator : object with ``fit`` / ``predict``, optional
        Classifier to evaluate. A default ``LogisticRegression`` is used when
        omitted, which is the previous behaviour. Passing another classifier
        lets the same fit / predict / score steps be reused for it.

    Returns
    -------
    The value returned by ``metrics.f1_score(..., average='macro')``.
    """
    if estimator is None:
        estimator = LogisticRegression()
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    return metrics.f1_score(y_test, y_pred, average='macro')

In [None]:
# Fit on the training split and report macro F1 on the held-out split.
model(X_train,y_train,X_test,y_test)

In [None]:
def prediction(df_test, X_train, y_train, output_path='evaluation_4.csv'):
    """Fit a logistic regression and write predictions for ``df_test`` to CSV.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Cleaned table to predict for. It must contain a ``PassengerId``
        column, and its columns must line up with those used to build
        ``X_train``.
    X_train, y_train
        Scaled training features and their labels.
    output_path : str, default 'evaluation_4.csv'
        Destination of the CSV file with ``PassengerId`` and ``Survived``
        columns.

    Returns
    -------
    None
    """
    # Build the classifier here: no module-level ``lr`` exists to rely on.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)

    # NOTE(review): scaled() fits a new scaler on df_test by default, so these
    # features are not standardized with the training statistics.
    features = scaled(df_test)
    predicted = lr.predict(features)

    # Pair ids and predictions by position so a non-default index on df_test
    # cannot misalign them.
    submission = pd.DataFrame({
        'PassengerId': df_test['PassengerId'].to_numpy(),
        'Survived': predicted,
    })
    submission.to_csv(output_path, header=True, index=False)

In [None]:
# Train on the training split and write the test-set predictions to a CSV file.
prediction(df_test_clean,X_train,y_train)

In [None]:
from sklearn.svm import SVC, LinearSVC

In [None]:
# Fit a support-vector classifier with default settings and score it on the
# held-out split with macro-averaged F1.
svc = SVC().fit(X_train, y_train)
Y_pred = svc.predict(X_test)
f1_score = metrics.f1_score(y_true=y_test, y_pred=Y_pred, average='macro')
f1_score

In [None]:
# Scale the cleaned test table and predict with the fitted SVC.
# Y_pred is overwritten here: it now holds test-set predictions rather than
# the held-out-split predictions from the previous cell.
X_eval = scaled(df_test_clean)
Y_pred = svc.predict(X_eval)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a 100-tree random forest and score it on the held-out split with
# macro-averaged F1. The forest is seeded (same seed as the train/test split)
# so the score and the predictions are the same on every run.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)
f1_score = metrics.f1_score(y_test, Y_pred, average='macro')
f1_score

In [None]:
# Scale the cleaned test table and predict with the fitted random forest.
# Despite its name, y_eval holds scaled features, not labels.
# NOTE(review): assigning to `prediction` rebinds the name of the
# prediction() function defined earlier, so that function cannot be called
# again after this cell runs.
y_eval = scaled(df_test_clean)
prediction = random_forest.predict(y_eval)

In [None]:
# Pair each PassengerId with its predicted label and write the table to CSV.
passenger_ids = pd.DataFrame(df_test_clean.PassengerId)
df = passenger_ids.join(pd.DataFrame(prediction)).rename(columns={0: 'Survived'})
# to_csv returns None when given a path, so `evaluation` is None.
evaluation = df.to_csv('prediction_2.csv', header=True, index=False)

In [None]:
# Display the table that was written to prediction_2.csv.
df