In [351]:
import pandas as pd
import numpy as np

In [398]:
df_train = pd.read_csv('./dataset/Train.csv')
df_test = pd.read_csv('./dataset/Test.csv')
df_sample = pd.read_csv('./dataset/sample_submission.csv')

In [399]:
print(df_train.shape)
print(df_test.shape)

(1009, 14)
(300, 13)


### seperating the target from training data and cleaning the train and test data

In [400]:
y = df_train.survived
df_train = df_train.drop('survived', axis = 1)

In [401]:
print(df_train.shape)

(1009, 13)


In [402]:
# combine test and training data
all_df = df_train.append(df_test)

In [403]:
print(all_df.shape)

(1309, 13)


#### seperating numerical from categorical data

In [404]:
all_features = all_df.columns
numerical = all_df.select_dtypes(include=['number']).copy()
categorical = all_df.select_dtypes(include=['object']).copy()

In [405]:
numerical.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,body
0,3.0,,0.0,0.0,7.75,
1,2.0,39.0,0.0,0.0,26.0,
2,2.0,40.0,0.0,0.0,13.0,
3,3.0,31.0,1.0,1.0,20.525,
4,3.0,,2.0,0.0,23.25,


In [406]:
categorical.head()

Unnamed: 0,name,sex,ticket,cabin,embarked,boat,home.dest
0,"O'Donoghue, Ms. Bridget",female,364856,,Q,,
1,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,250655,,S,,
2,"Smith, Miss. Marion Elsie",female,31418,,S,9,
3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,363291,,S,C D,"Strood, Kent, England Detroit, MI"
4,"McCoy, Miss. Agnes",female,367226,,Q,16,


In [407]:
print(numerical.shape)
print(categorical.shape)

(1309, 6)
(1309, 7)


#### impute the numerical data ..means fill the col where value is null

In [408]:
from sklearn.preprocessing import Imputer 
numerical_values = numerical.values
# Impute missing
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp = imp.fit(numerical_values)
numerical_values = imp.transform(numerical_values)
numerical_values.shape


(1309, 6)

In [409]:
numerical_values[:5]

array([[  3.        ,  29.88113451,   0.        ,   0.        ,
          7.75      , 160.80991736],
       [  2.        ,  39.        ,   0.        ,   0.        ,
         26.        , 160.80991736],
       [  2.        ,  40.        ,   0.        ,   0.        ,
         13.        , 160.80991736],
       [  3.        ,  31.        ,   1.        ,   1.        ,
         20.525     , 160.80991736],
       [  3.        ,  29.88113451,   2.        ,   0.        ,
         23.25      , 160.80991736]])

In [410]:
import sklearn.preprocessing as pp
scaler = pp.StandardScaler()
scaler.fit(numerical_values)
X = scaler.transform(numerical_values)
X[0, :]

array([ 8.41916418e-01,  2.75868709e-16, -4.79086761e-01, -4.44999502e-01,
       -4.93927255e-01,  0.00000000e+00])

In [411]:
X.shape

(1309, 6)

#### handling categorical data

In [412]:
categorical.head(3)

Unnamed: 0,name,sex,ticket,cabin,embarked,boat,home.dest
0,"O'Donoghue, Ms. Bridget",female,364856,,Q,,
1,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,250655,,S,,
2,"Smith, Miss. Marion Elsie",female,31418,,S,9.0,


In [413]:
useless = ['name',  'ticket',  'home.dest']

categorical = categorical.drop(useless, axis = 1)

In [414]:
categorical = pd.get_dummies(categorical)

In [415]:
categorical.shape

(1309, 218)

In [416]:
categorical_values = categorical.values
# Impute missing
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp = imp.fit(categorical_values)
categorical_values = imp.transform(categorical_values)
categorical_values.shape

(1309, 218)

In [417]:
# check if null values still exist
# print(pd.DataFrame(numerical_values).isnull().sum())
# print(categorical.isnull().sum())

In [418]:
X = np.hstack((numerical_values, categorical_values))

In [419]:
X.shape

(1309, 224)

### Decomposition

In [420]:
X_train = X[:df_train.shape[0], :]
Y_train = y
X_test = X[df_train.shape[0]:, :]


In [421]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)

(1009, 224)
(1009,)
(300, 224)


In [422]:
from sklearn.linear_model import LogisticRegression

In [423]:
lr = LogisticRegression()

In [424]:
lr.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [425]:
lr.score(X_train, Y_train)

0.9643211100099108

In [426]:
from sklearn.tree import DecisionTreeClassifier

In [427]:
dtc = DecisionTreeClassifier()


In [428]:
dtc.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [429]:
dtc.score(X_train, Y_train)

0.998017839444995

In [430]:
from sklearn.tree import DecisionTreeRegressor

In [431]:
dtr = DecisionTreeRegressor()

In [432]:
dtr.fit(X_train, Y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [433]:
dtr.score(X_train, Y_train)

0.9927782330648134

In [434]:
df_sample.tail()

Unnamed: 0,Id,survived
295,295,1.0
296,296,1.0
297,297,1.0
298,298,1.0
299,299,1.0


In [435]:
predictions = dtc.predict(X_test)

In [436]:
predictions = np.reshape(predictions, (-1, 1))

In [437]:
Id = np.arange(300)

In [438]:
Id = np.reshape(Id, (-1, 1))

In [439]:
outcome = np.hstack((Id, predictions))

In [440]:
outcome.shape

(300, 2)

In [441]:
pd.DataFrame(outcome, columns=["Id", 'survived']).to_csv("outcome.csv", index=None)