In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler      # In SVC we Scale to balance out feature Columns as which Column has highest value will dominate so it scale them so they are balanced and dont predict wrongly based on values
from sklearn.svm import SVC                                         #Support Vector Machine
from sklearn.metrics import accuracy_score                          # we use accuracy_score from metrics in CLASSIFIER MODELS whereas we use builtin function Model.score() in Linear Models

In [4]:
df = pd.read_csv("Datasets/Titanic.csv")

In [5]:
df.head(60)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [24]:
df['Age'].fillna(df['Age'].median(),inplace=True)                    # Median - filling average age in NaN values 
df['Embarked'].fillna(df['Embarked'].mode()[0])                      # Mode - filling with most common used Category like there are : S, Q, C so if S is used mostly it will fill S with NaN values
df['Fare'].fillna(df['Fare'].median(),inplace=True)                  # Median - Same for fare

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(),inplace=True)                    # Median - filling average age in NaN values
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(),inplace=True)                  # Mode - filling with most common used Category l

In [25]:
df['Age'].isnull().sum()

np.int64(0)

In [26]:
Features = df[['Pclass','Sex','Age','SibSp','Parch','Fare', 'Embarked']]
Features.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [27]:
Target = df['Survived']
Target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [28]:
LE = LabelEncoder()
Features['Sex'] = LE.fit_transform(Features['Sex'])
Features['Embarked'] = LE.fit_transform(Features['Embarked'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Features['Sex'] = LE.fit_transform(Features['Sex'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Features['Embarked'] = LE.fit_transform(Features['Embarked'])


In [29]:
Features.head() # (891, 7) : (891 rows, 7 columns)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [37]:
Features_train, Features_test, Target_train, Target_test = train_test_split(Features, Target, test_size=0.2, random_state=42)

In [31]:
print(pd.DataFrame(Features_train).isnull().sum())
print(pd.DataFrame(Features_test).isnull().sum())

# TO CHECK if there is still any NULL values in Features Column

# as we had NULL values (anything above 0 here is null SUM count)

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [32]:
# # Handling missing values first
# Features_train = Features_train.fillna(Features_train.median())
# Features_test = Features_test.fillna(Features_test.median())

In [40]:
# Here is where we go different from DECISON TREE MODEL
# We are SCALING Features to balance out values so no Unfair Value based Predictions
Scaler = StandardScaler()
Features_train_Scaled = Scaler.fit_transform(Features_train)       # it scales columns fit_transform
Features_test_Scaled = Scaler.transform(Features_test)             # but here we apply same scaling we did above, not TO DO IT AGAIN as data is already scaled it will make null values in it

In [34]:
Features_train_Scaled

array([[-1.61413602,  0.7243102 ,  1.25364106, ..., -0.47934164,
        -0.07868358,  0.55744438],
       [-0.40055118,  0.7243102 , -0.47728355, ..., -0.47934164,
        -0.37714494,  0.55744438],
       [ 0.81303367,  0.7243102 ,  0.21508629, ..., -0.47934164,
        -0.47486697,  0.55744438],
       ...,
       [ 0.81303367,  0.7243102 ,  0.90745614, ..., -0.47934164,
        -0.35580399,  0.55744438],
       [-1.61413602, -1.38062393, -1.1696534 , ...,  2.04874166,
         1.68320121,  0.55744438],
       [-1.61413602,  0.7243102 , -0.63114352, ...,  0.78470001,
         0.86074761,  0.55744438]], shape=(712, 7))

In [41]:
SVM = SVC(kernel='rbf',C=1 , gamma='scale')
SVM.fit(Features_train_Scaled, Target_train)
Predictions = SVM.predict(Features_test_Scaled)

In [42]:
print("Support Vector Machine Model Score: {:.2f}%".format(accuracy_score(Target_test,Predictions)*100))

Support Vector Machine Model Score: 81.56%
