In [1]:
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline

# to scale the data using z-score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# algorithm
from sklearn.svm import SVC

# Metrics to evaluate the model
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve,recall_score

# For tuning the model
from sklearn.model_selection import GridSearchCV

In [2]:
# Directory that holds the hackathon CSV files. It is declared once so that
# running the notebook on another machine means editing a single line instead
# of every read below.
DATA_DIR = '/Users/andrescervantes/Library/CloudStorage/OneDrive-Personal/03_MIT_IDSS/09_hackathon'

# Travel data for the training set.
path = f'{DATA_DIR}/Traveldata_train.csv'
travel = pd.read_csv(path)

# Survey data for the training set.
path = f'{DATA_DIR}/surveydata_train.csv'
survey = pd.read_csv(path)

In [3]:
# Combine the travel and survey tables on the shared `ID` key.
# An outer join keeps IDs that appear in only one of the two tables, and
# `indicator=True` adds a `_merge` column recording where each row came from.
df = pd.merge(travel, survey, on='ID', how='outer', indicator=True)
df.head()

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Overall_Experience,...,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding,_merge
0,98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0,0,...,Needs Improvement,Acceptable,Needs Improvement,Needs Improvement,Acceptable,Needs Improvement,Good,Needs Improvement,Poor,both
1,98800002,Male,Loyal Customer,48.0,Personal Travel,Eco,2200,9.0,0.0,0,...,Poor,Good,Good,Excellent,Needs Improvement,Poor,Needs Improvement,Good,Good,both
2,98800003,Female,Loyal Customer,43.0,Business Travel,Business,1061,77.0,119.0,1,...,Good,Excellent,Excellent,Excellent,Excellent,Excellent,Good,Excellent,Excellent,both
3,98800004,Female,Loyal Customer,44.0,Business Travel,Business,780,13.0,18.0,0,...,Needs Improvement,Acceptable,Acceptable,Acceptable,Acceptable,Acceptable,Good,Acceptable,Acceptable,both
4,98800005,Female,Loyal Customer,50.0,Business Travel,Business,1981,0.0,0.0,1,...,Good,Excellent,Good,Good,Good,Good,Good,Good,Good,both


In [4]:
# Column names grouped by kind: `describe()` summarises the numeric columns by
# default, and `include='object'` restricts the summary to object-dtype columns.
num_cols = list(df.describe().columns)
cat_cols = list(df.describe(include='object').columns)

# filling NAs

In [5]:
# Work on an independent (deep) copy so the merged frame `df` keeps its
# original missing values.
nona_df = df.copy(deep=True)

def fill_nas_with_mode(df, col):
    """Replace missing values in ``df[col]`` with the column's mode.

    The column is reassigned on ``df`` itself, so the caller's frame is
    updated; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    col : str
        Name of the column to impute.

    Notes
    -----
    When several values tie, the first entry returned by ``Series.mode()`` is
    used. A column with no non-missing values has no mode and is left
    unchanged.
    """
    modes = df[col].mode()
    if modes.empty:
        # Entirely missing column: nothing to impute with.
        return
    # Assign back instead of calling fillna(inplace=True) on `df[col]`: the
    # in-place form acts on a temporary Series and does not reliably write
    # through to `df` (it is a no-op under pandas Copy-on-Write).
    df[col] = df[col].fillna(modes.iloc[0])

def fill_nas_with_median(df, col):
    """Replace missing values in ``df[col]`` with the column's median.

    The column is reassigned on ``df`` itself, so the caller's frame is
    updated; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    col : str
        Name of the numeric column to impute.

    Notes
    -----
    If the column has no non-missing values its median is NaN, so the column
    stays entirely missing.
    """
    median = df[col].median()
    # Assign back instead of calling fillna(inplace=True) on `df[col]`: the
    # in-place form acts on a temporary Series and does not reliably write
    # through to `df` (it is a no-op under pandas Copy-on-Write).
    df[col] = df[col].fillna(median)

# Impute every column of `nona_df`: categorical columns first (with their
# mode), then numeric columns (with their median).
for imputer, column_group in (
    (fill_nas_with_mode, cat_cols),
    (fill_nas_with_median, num_cols),
):
    for col in column_group:
        imputer(nona_df, col)

# dummy variables

In [6]:
# Drop the identifier and the merge-indicator column, then one-hot encode what
# remains. `drop_first=True` omits the first level of each encoded column.
dumm_df = pd.get_dummies(nona_df.drop(columns=['ID', '_merge']), drop_first=True)

# Tidy the generated column names: trim surrounding whitespace and replace
# inner spaces with underscores.
dumm_df.columns = dumm_df.columns.str.strip().str.replace(' ', '_')

# scaling the data

In [7]:
# Standardise every column of the encoded frame (z-score) and wrap the result
# back into a DataFrame that carries the original column names.
# NOTE(review): the scaler is fitted on the full data set, before any
# train/test split, and the `Overall_Experience` column is scaled too —
# confirm this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(dumm_df)
scaled_df = pd.DataFrame(scaled_values, columns=dumm_df.columns)
scaled_df.head()

Unnamed: 0,Age,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Overall_Experience,Gender_Male,Customer_Type_Loyal_Customer,Type_Travel_Personal_Travel,Travel_Class_Eco,Seat_Comfort_Excellent,...,Cleanliness_Excellent,Cleanliness_Extremely_Poor,Cleanliness_Good,Cleanliness_Needs_Improvement,Cleanliness_Poor,Online_Boarding_Excellent,Online_Boarding_Extremely_Poor,Online_Boarding_Good,Online_Boarding_Needs_Improvement,Online_Boarding_Poor
0,0.832356,-1.660469,-0.383916,-0.259227,-1.098107,-0.985222,0.445082,-0.625411,-1.046703,-0.399166,...,-0.617498,-0.007279,-0.775313,2.93677,-0.251939,-0.547105,-0.011277,-0.60909,-0.407688,2.730732
1,0.567699,0.215099,-0.147874,-0.389511,-1.098107,1.014999,0.445082,1.59895,0.95538,-0.399166,...,-0.617498,-0.007279,1.289802,-0.34051,-0.251939,-0.547105,-0.011277,1.641793,-0.407688,-0.366202
2,0.236878,-0.892926,1.635557,2.711265,0.910658,-0.985222,0.445082,-0.625411,-1.046703,-0.399166,...,1.619437,-0.007279,-0.775313,-0.34051,-0.251939,1.827802,-0.011277,-0.60909,-0.407688,-0.366202
3,0.303042,-1.166284,-0.042966,0.079514,-1.098107,-0.985222,0.445082,-0.625411,-1.046703,-0.399166,...,-0.617498,-0.007279,-0.775313,-0.34051,-0.251939,-0.547105,-0.011277,-0.60909,-0.407688,-0.366202
4,0.700027,0.002054,-0.383916,-0.389511,0.910658,-0.985222,0.445082,-0.625411,-1.046703,-0.399166,...,-0.617498,-0.007279,1.289802,-0.34051,-0.251939,-0.547105,-0.011277,1.641793,-0.407688,-0.366202


# SVM

In [8]:
# Features: every scaled column except the target.
X = scaled_df.drop('Overall_Experience', axis=1)
# Target: taken from the merged frame `df`, i.e. the unscaled, un-imputed labels.
y = df.loc[:, 'Overall_Experience']

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [12]:
# Share of each target class in the training split (relative frequencies).
y_train.value_counts(normalize=True)

1    0.545917
0    0.454083
Name: Overall_Experience, dtype: float64

In [13]:
# Share of each target class in the test split (relative frequencies).
y_test.value_counts(normalize=True)

1    0.548386
0    0.451614
Name: Overall_Experience, dtype: float64

In [14]:
# Fit a support vector classifier with a linear kernel (linear decision
# boundary) on the training split.
svm = SVC(kernel='linear')
# `fit` returns the fitted estimator, so `model` and `svm` are the same object.
model = svm.fit(X_train, y_train)

In [None]:
# Predict on the training split. The model was fitted on `X_train`, so the
# same frame is used here; the previous `X_train_scaled` is not defined in any
# visible cell and raised a NameError.
y_pred_train_svm = model.predict(X_train)

# NOTE(review): `metrics_score` is not defined in the visible cells either —
# presumably it lived in a since-removed cell (execution counts skip 10-11).
# Confirm and restore its definition so the notebook survives Restart & Run All.
metrics_score(y_train, y_pred_train_svm)