***Titanic Competition - Support Vector Machine***

***Import Libraries***

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

***Get Data***

In [69]:
data = pd.read_csv("./train.csv")
test= pd.read_csv("./test.csv")

***PassengerId holders***

In [70]:
test_pid = np.array(test["PassengerId"])

***Data Cleaning***

In [71]:
def fill_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        elif Pclass == 3:
            return 27
    return Age
data["Age"] = data[["Age","Pclass"]].apply(fill_age,axis=1)
test["Age"] = test[["Age","Pclass"]].apply(fill_age,axis=1)
data["Fare"].fillna(data["Fare"].mean(),inplace=True)
test["Fare"].fillna(test["Fare"].mean(),inplace=True)

In [72]:
def data_cleaning(data):
    df = data.drop(["PassengerId","Name","Ticket","Cabin"])
    return df

data = data.apply(data_cleaning,axis=1)

In [73]:
def test_cleaning(test):
    df_test = test.drop(["PassengerId","Name","Ticket","Cabin"])
    return df_test

test = test.apply(test_cleaning,axis=1)


***String values to Numeric***

In [74]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cols = ["Sex","Embarked"]
for col in cols:
    data[col] = le.fit_transform(data[col])
    test[col] = le.transform(test[col])
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


***Train Test Split***

In [75]:
from sklearn.model_selection import train_test_split
X = data.drop("Survived",axis=1)
y = data["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

***Train the model with SVM Method***

In [76]:
from sklearn.svm import SVC
svc_model = SVC(C=100,gamma=0.001)
svc_model.fit(X_train,y_train)

***Prediction***

In [77]:
predictions = svc_model.predict(X_test)

***Evaluation***

In [78]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.81      0.86      0.83       175
           1       0.78      0.70      0.74       120

    accuracy                           0.80       295
   macro avg       0.79      0.78      0.79       295
weighted avg       0.80      0.80      0.79       295



***Gridsearch Practice to find the best values for C and gamma***

In [79]:
from sklearn.model_selection import GridSearchCV
params = {"C" : [0.1,1,10,100], "gamma":[1,0.1,0.001,0.001]}
gs = GridSearchCV(SVC(),params,verbose=3)

In [80]:
gs.fit(X_train,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ....................C=0.1, gamma=1;, score=0.625 total time=   0.0s
[CV 2/5] END ....................C=0.1, gamma=1;, score=0.630 total time=   0.0s
[CV 3/5] END ....................C=0.1, gamma=1;, score=0.630 total time=   0.0s
[CV 4/5] END ....................C=0.1, gamma=1;, score=0.630 total time=   0.0s
[CV 5/5] END ....................C=0.1, gamma=1;, score=0.622 total time=   0.0s
[CV 1/5] END ..................C=0.1, gamma=0.1;, score=0.625 total time=   0.0s
[CV 2/5] END ..................C=0.1, gamma=0.1;, score=0.630 total time=   0.0s
[CV 3/5] END ..................C=0.1, gamma=0.1;, score=0.630 total time=   0.0s
[CV 4/5] END ..................C=0.1, gamma=0.1;, score=0.630 total time=   0.0s
[CV 5/5] END ..................C=0.1, gamma=0.1;, score=0.622 total time=   0.0s
[CV 1/5] END ................C=0.1, gamma=0.001;, score=0.642 total time=   0.0s
[CV 2/5] END ................C=0.1, gamma=0.001;

***Prediction of test dataset***

In [81]:
grid_predictions = gs.predict(test)

***Build a DataFrame***

In [82]:
df = pd.DataFrame({"PassengerId" : test_pid  ,"Survived":grid_predictions})
print(df)

     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


***Export the result***

In [83]:
df.to_csv("submission_svm.csv",index=False)