In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Load the post-EDA dataset and discard the leftover 'Unnamed: 0' column
# (presumably an index column written by an earlier export — TODO confirm).
df = pd.read_excel('./data/file_after_eda.xlsx').drop(columns='Unnamed: 0')

In [3]:
# Percentage of missing values per column, rounded to two decimals.
# The mean of the boolean null mask is the null fraction of each column.
df.isna().mean().mul(100).round(2)

airline             0.00
overall             2.95
author              0.00
review_date         0.00
customer_review     0.00
aircraft           69.07
traveller_type     37.18
cabin               1.77
route              37.23
date_flown         37.36
seat_comfort        5.94
cabin_service       5.89
food_bev           19.11
entertainment      32.68
ground_service     37.80
value_for_money     0.73
recommended         0.00
dtype: float64

In [4]:
# Drop sparsely populated columns. In the null-percentage output every column
# listed here is roughly 32-69% missing; food_bev (~19% missing) is kept and
# imputed later, so the effective cut-off is above 19%, not at it.
col_with_high_null = ['aircraft', 'traveller_type', 'route', 'date_flown', 'entertainment', 'ground_service']

# Identifier / free-text columns that are removed before the feature matrix is built.
unwanted_cols = ['airline', 'author', 'review_date', 'customer_review']

# Single non-mutating drop: rebinding `df` avoids `inplace=True` and removes
# both groups of columns in one pass.
df = df.drop(columns=col_with_high_null + unwanted_cols)

In [5]:
# Fill missing values: column mean for the numeric rating columns,
# most frequent value ('top' from describe()) for the cabin column.
# NOTE(review): these fill statistics are computed on the full frame, i.e.
# before the train/test split further down — confirm this is acceptable.
mean_imputed_cols = ['overall', 'seat_comfort', 'cabin_service', 'food_bev', 'value_for_money']
for col in mean_imputed_cols:
    col_mean = df[col].mean()
    df[col] = df[col].fillna(col_mean)

most_common_cabin = df['cabin'].describe()['top']
df['cabin'] = df['cabin'].fillna(most_common_cabin)

In [6]:
# One-hot encode the cabin column into int64 indicator columns.
df = pd.get_dummies(df, columns=['cabin'], dtype='int64')

# Binary target: 1 where the value is exactly 'yes', 0 for anything else.
df['recommended'] = df['recommended'].eq('yes').astype('int64')

# Separate the target from the feature matrix.
y = df['recommended']
X = df.drop(columns='recommended')

In [7]:
# Train/test split: hold out 5% of the rows for testing, with a fixed seed
# so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=.05)

# Report the shape of every partition. The last line previously printed
# X_test a second time, so the y_test shape was never shown.
print('X_train : ', X_train.shape)
print('y_train : ', y_train.shape)
print('X_test : ', X_test.shape)
print('y_test : ', y_test.shape)

X_train :  (56772, 9)
y_train :  (56772,)
X_test :  (2989, 9)
X_test :  (2989, 9)


In [8]:
from sklearn.svm import SVC

In [9]:
# Linear-kernel support vector classifier with a small C (strong
# regularisation) and a fixed seed.
svc = SVC(
    kernel='linear',
    C=0.001,
    random_state=0,
)

In [10]:
# Fit the classifier on the training partition; the fitted estimator is the
# cell's last expression, so its repr is displayed as the cell output.
svc.fit(X_train, y_train)

In [11]:
# Predict labels for both partitions so train and test scores can be compared.
train_pred, test_pred = (svc.predict(features) for features in (X_train, X_test))

In [12]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score

In [13]:
# Precision of the fitted classifier on the training partition.
precision_score(y_true=y_train, y_pred=train_pred)

0.960836969900414

In [14]:
# Precision of the fitted classifier on the held-out test partition.
precision_score(y_true=y_test, y_pred=test_pred)

0.9593908629441624