In [227]:
import pandas as pd

# import train and test files
csv_train = pd.read_csv('train.csv', sep=',')
csv_test = pd.read_csv('test.csv', sep=',')
csv_train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [228]:
csv_train.shape

(891, 12)

In [229]:
csv_test.shape

(418, 11)

In [230]:
# start building X_data. Filter from the csv_train, only the OK fields
X_data_train = csv_train.filter(items=["Age", "SibSp", "Parch", "Fare"])
X_data_test = csv_test.filter(items=["Age", "SibSp", "Parch", "Fare"])

In [231]:
# let's also create our y_data_train for fitting and scoring purposes
y_data_train = csv_train.filter(items=["Survived"]).values
y_data_train = y_data_train.reshape(-1,1)

In [232]:
# concatenate train and test together. I HOPE THE INDICES DON'T CHANGE
X_data = pd.concat([X_data_train, X_data_test], axis=0)
print("X_data length :" + str(X_data.shape[0]))

X_data length :1309


In [233]:
# now we want to apply regex to separate the title from the the rest of the name with the help of regex
import re

# first, we get store the column values from each of original dataframes
passenger_titles_train = csv_train['Name'].values
passenger_titles_test = csv_test['Name'].values

# now we filter the title of each passenger with the help of RegEx
regex = r', (.*)\. '
for name_index in range(0, len(passenger_titles_train)):
    passenger_titles_train[name_index] = re.search(regex, passenger_titles_train[name_index]).group(1)

for name_index in range(0, len(passenger_titles_test)):
    passenger_titles_test[name_index] = re.search(regex, passenger_titles_test[name_index]).group(1)

# for some reason, the dataframe itself was changed, so the passenger names are already up to date.
# let's test this hypothesis
csv_train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Mr,male,22.0,1,0,A/5 21171,7.25,,S


In [234]:
# now we just need to concatenate both dfs to one single column and add them to the X_data
temp_df_titles = pd.concat([csv_train['Name'], csv_test['Name']], axis=0)
# X_data = pd.concat([temp_df_titles, X_data], axis=1)
print(X_data.shape)
X_data.head(1)


(1309, 4)


Unnamed: 0,Age,SibSp,Parch,Fare
0,22.0,1,0,7.25


In [235]:
# now, we need to transform all the categorical features into numerical
from sklearn.preprocessing import LabelEncoder

# we'll initiate Label Encoder and create useful variables for this process
le = LabelEncoder()

# list of columns we want to tranform, concatenate both csv files
cols_to_label = ['Sex', 'Name']
train_test_concat = pd.concat([csv_train, csv_test], axis=0)

# iterate through columns selected above and fit_transform each of them
for col in cols_to_label:
    le.fit(train_test_concat[col].values)
    train_test_concat[col] = le.transform(train_test_concat[col])

#let's confirm that Sex and Name are now categorical values
train_test_concat.head(2)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,12,0,1,3,1,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,13,0,2,1,0,1,1.0,PC 17599


In [236]:
# transform all the numerical/categorical features into a column for each category of its own
from sklearn.preprocessing import OneHotEncoder

# now we use onehotencoder on Sex, Pclass and Name to create column for the category of each feature
enc = OneHotEncoder(sparse=False)
cols_to_ohe = ['Sex', 'Pclass', 'Name']
for col in cols_to_ohe:
    data = train_test_concat[[col]]
    enc.fit(data)
    temp = enc.transform(data[[col]])
    temp = pd.DataFrame(temp,columns=[(col+"_"+str(i)) for i in data[col].value_counts().index])
    temp = temp.set_index(train_test_concat.index.values)
    X_data = pd.concat([X_data, temp], axis=1)

# let's print the head of X_data
X_data.head(5)

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_1,Sex_0,Pclass_3,Pclass_1,Pclass_2,Name_12,...,Name_7,Name_5,Name_2,Name_3,Name_18,Name_6,Name_17,Name_11,Name_14,Name_0
0,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35.0,1,0,53.1,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [237]:
# imputer on columns with NaN

In [238]:
# feature scalling

In [239]:
# is there something else we can try?

In [240]:
# we need to separate the known that from the unseen data from X_data.

# store the lenght of both train and test for later separation
train_length = X_data_train.shape[0]
test_length = X_data_test.shape[0]
print("train length: " + str(train_length))
print("test length: " + str(test_length))

# for this, we need the initial length of the known data 
X_data_train = X_data[:train_length]
X_data_train.shape
X_data_test = X_data[train_length:]
X_data_test.shape

train length: 891
test length: 418


(418, 28)

In [241]:
# create a "local" train and test with the csv_train data. we'll call the test portion, a "cross validation cv"
from sklearn.model_selection import train_test_split

# define the random state
randstate = 2

# now we use the model_selection to split our data in trainning and cross validation
X_train, X_cv, y_train, y_cv = train_test_split(X_data_train, y_data_train, test_size=0.3, random_state=randstate)

# transform X_train, X_cv, X_data_train into Numpy
X_train

In [242]:
c_values = [0.01, 0.1, 1, 10]
kernel_types = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

2

In [243]:
from sklearn.svm import SVC

print(type(X_train))
print(type(y_train))

# start a loop to itirate over all the parameters we are trying out
for c in c_values:
    for kern in kernel_types:
        clf = SVC(C=c, kernel=kern, random_state=randstate)
        clf.fit(X_train.values, y_train)
        print()

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').