In [1]:
import pandas 
import sklearn 
from sklearn import svm 
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
import seaborn as sb 
import matplotlib.pyplot as plt
from random import randint 

In [2]:
data_frame_train = pandas.read_csv('all/train.csv')
data_frame_test = pandas.read_csv('all/test.csv')

### Initial Test 

In [3]:
rows, col = data_frame_train.shape
tr_rows = int(rows * .80)
val_rows = rows - tr_rows 
train_df = data_frame_train[:tr_rows]
val_df = data_frame_train[tr_rows+1:]
val_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
713,714,0,3,"Larsson, Mr. August Viktor",male,29.0,0,0,7545,9.4833,,S
714,715,0,2,"Greenberg, Mr. Samuel",male,52.0,0,0,250647,13.0,,S
715,716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19.0,0,0,348124,7.65,F G73,S
716,717,1,1,"Endres, Miss. Caroline Louise",female,38.0,0,0,PC 17757,227.525,C45,C
717,718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27.0,0,0,34218,10.5,E101,S


In [4]:
df_train_ground_truth = train_df['Survived']
df_val_ground_truth = val_df['Survived']

In [5]:
train_df=train_df.drop(['Survived'],axis=1)
train_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
val_df = val_df.drop(['Survived'],axis=1)
val_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
713,714,3,"Larsson, Mr. August Viktor",male,29.0,0,0,7545,9.4833,,S
714,715,2,"Greenberg, Mr. Samuel",male,52.0,0,0,250647,13.0,,S
715,716,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19.0,0,0,348124,7.65,F G73,S
716,717,1,"Endres, Miss. Caroline Louise",female,38.0,0,0,PC 17757,227.525,C45,C
717,718,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27.0,0,0,34218,10.5,E101,S


In [7]:
train_df['Sex'] = train_df['Sex'].map({'female': 1, 'male': 0})
val_df['Sex'] = val_df['Sex'].map({'female':1, 'male':0})

In [8]:
train_df=train_df.drop(['Name'],axis=1)
train_df=train_df.drop(['Cabin'],axis=1)
train_df=train_df.drop(['Embarked'],axis=1)
val_df=val_df.drop(['Name'],axis=1)
val_df=val_df.drop(['Cabin'],axis=1)
val_df=val_df.drop(['Embarked'],axis=1)
val_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare
713,714,3,0,29.0,0,0,7545,9.4833
714,715,2,0,52.0,0,0,250647,13.0
715,716,3,0,19.0,0,0,348124,7.65
716,717,1,1,38.0,0,0,PC 17757,227.525
717,718,2,1,27.0,0,0,34218,10.5


In [9]:
train_df=train_df.drop(['Ticket'],axis=1)
val_df=val_df.drop(['Ticket'],axis=1)


In [10]:
train_df.fillna(0, inplace=True)
val_df.fillna(0, inplace=True)

In [11]:
clf = svm.SVC()
clf.fit(train_df.values, df_train_ground_truth.values)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
y_pred = clf.predict(val_df.values)

In [13]:
mean_absolute_error(df_val_ground_truth.values, y_pred)

0.3539325842696629

### Check for missing data 

In [14]:
data_frame_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [15]:
df_train_copy = data_frame_train.copy()
df_train_copy.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


My first data cleaning will be to drop the name, ticket, and cabin. I will turn embarked, and sex into categorical data. The age has missing data. I will attempt to augment this data with ages that are within the standdard deviation of the average age. Tomorrow, I plan to turn male/female and the embarked data into catergorical data 

In [16]:
df_train_copy = df_train_copy.drop(['Name'], axis=1)
df_train_copy = df_train_copy.drop(['PassengerId'], axis=1)
df_train_copy = df_train_copy.drop(['Ticket'], axis=1)
df_train_copy = df_train_copy.drop(['Cabin'], axis=1)
df_train_copy['Sex'] = df_train_copy['Sex'].map({'female': 1, 'male': 0})
df_train_copy['Embarked'] = df_train_copy['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
df_train_copy.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,0.0
1,1,1,1,38.0,1,0,71.2833,1.0
2,1,3,1,26.0,0,0,7.925,0.0
3,1,1,1,35.0,1,0,53.1,0.0
4,0,3,0,35.0,0,0,8.05,0.0


In [17]:
std = int(df_train_copy['Age'].std(skipna=True))
avg = int(df_train_copy['Age'].mean())
print(avg)
print(std)

29
14


In [18]:
df_train_copy['Age'].fillna(avg, inplace=True)
df_train_copy['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
5      29.0
6      54.0
7       2.0
8      27.0
9      14.0
10      4.0
11     58.0
12     20.0
13     39.0
14     14.0
15     55.0
16      2.0
17     29.0
18     31.0
19     29.0
20     35.0
21     34.0
22     15.0
23     28.0
24      8.0
25     38.0
26     29.0
27     19.0
28     29.0
29     29.0
       ... 
861    21.0
862    48.0
863    29.0
864    24.0
865    42.0
866    27.0
867    31.0
868    29.0
869     4.0
870    26.0
871    47.0
872    33.0
873    47.0
874    28.0
875    15.0
876    20.0
877    19.0
878    29.0
879    56.0
880    25.0
881    33.0
882    22.0
883    28.0
884    25.0
885    39.0
886    27.0
887    19.0
888    29.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [19]:
df_train_copy['Embarked']=df_train_copy['Embarked'].fillna(0)
df_train_copy.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64