In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, roc_auc_score, recall_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA

In [2]:
# Load the post-EDA dataset and discard the 'Unnamed: 0' column
# (presumably a leftover index column from an earlier export — confirm).
df = pd.read_excel('./data/file_after_eda.xlsx').drop(columns=['Unnamed: 0'])

#### Null handling

In [3]:
# Share of missing values per column, as a percentage rounded to two decimals
df.isna().sum().div(len(df)).mul(100).round(2)

airline             0.00
overall             2.95
author              0.00
review_date         0.00
customer_review     0.00
aircraft           69.07
traveller_type     37.18
cabin               1.77
route              37.23
date_flown         37.36
seat_comfort        5.94
cabin_service       5.89
food_bev           19.11
entertainment      32.68
ground_service     37.80
value_for_money     0.73
recommended         0.00
dtype: float64

In [4]:
# Drop sparsely populated columns (each is missing in more than 30% of rows;
# `food_bev`, at about 19% missing, is deliberately kept) together with the
# identifier / free-text columns that are not used as model features.
col_with_high_null = ['aircraft', 'traveller_type', 'route', 'date_flown', 'entertainment', 'ground_service']
unwanted_cols = ['airline', 'author', 'review_date', 'customer_review']
# One non-inplace drop; errors='ignore' keeps the cell safe to re-run after the
# columns are already gone instead of raising KeyError.
df = df.drop(columns=col_with_high_null + unwanted_cols, errors='ignore')

In [5]:
# Fill gaps: each score column gets its own mean, `cabin` gets its most frequent value.
mean_imputed_cols = ['overall', 'seat_comfort', 'cabin_service', 'food_bev', 'value_for_money']
for col in mean_imputed_cols:
    df[col] = df[col].fillna(df[col].mean())
df['cabin'] = df['cabin'].fillna(df['cabin'].describe()['top'])

In [6]:
# One-hot encode the cabin class into integer indicator columns.
df = pd.get_dummies(df, columns=['cabin'], dtype='int64')
# Binary target: 1 where the value is exactly 'yes', 0 for anything else.
df['recommended'] = df['recommended'].eq('yes').astype('int64')
# Separate the target from the feature matrix.
y = df['recommended']
X = df.drop(columns='recommended')

#### Dimensionality reduction

In [7]:
# Project the features onto 4 principal components.
# The input is rebuilt from `df` instead of being read from `X`, because this
# cell overwrites `X`: re-running it would otherwise refit the PCA on its own
# 4-column output and break later `pca.transform` calls on raw 9-feature rows.
# NOTE(review): the PCA is fitted on all rows before the train/test split and on
# unscaled features — consider fitting on the training split only, after scaling.
pca = PCA(n_components=4)
X = pca.fit_transform(df.drop('recommended',axis=1))
print('variance explaine by 4 components : ',sum(pca.explained_variance_ratio_))

variance explaine by 4 components :  0.9598858555860601


#### Preprocessing

In [8]:
# Hold out 5% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.05, random_state=0)
# Report the shape of every part of the split.
for label, part in (('X_train', X_train), ('y_train', y_train), ('X_test', X_test), ('y_test', y_test)):
    print(f'{label} : ', part.shape)

X_train :  (56772, 4)
y_train :  (56772,)
X_test :  (2989, 4)
y_test :  (2989,)


### Model building

#### Logistic Regression

In [9]:
# Baseline: logistic regression with scikit-learn's default settings.
model_logistic = LogisticRegression().fit(X_train, y_train)

# Predictions on both splits, in (train, test) order.
train_pred, test_pred = (model_logistic.predict(split) for split in (X_train, X_test))

In [10]:
# Per-class precision / recall / F1 for the training and the test split.
for title, y_true, y_hat in (('Train :', y_train, train_pred), ('Test :', y_test, test_pred)):
    print(title)
    print(classification_report(y_true, y_hat))

Train :
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     29672
           1       0.96      0.95      0.95     27100

    accuracy                           0.96     56772
   macro avg       0.96      0.96      0.96     56772
weighted avg       0.96      0.96      0.96     56772

Test :
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1596
           1       0.96      0.95      0.96      1393

    accuracy                           0.96      2989
   macro avg       0.96      0.96      0.96      2989
weighted avg       0.96      0.96      0.96      2989



##### Hyper-parameter tuning

In [11]:
# Exhaustive search over regularisation strength (C) and stopping tolerance (tol),
# using GridSearchCV's default cross-validation and scoring.
params = {
    'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10],
    'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10],
}
gscv = GridSearchCV(model_logistic, param_grid=params)
gscv.fit(X_train, y_train)

print('best parameters :')
print(gscv.best_params_)

# Score both splits with the best estimator found by the search.
estimator = gscv.best_estimator_
train_pred, test_pred = estimator.predict(X_train), estimator.predict(X_test)

best parameters :
{'C': 1e-05, 'tol': 1e-05}


In [12]:
# Same train / test reports, now for the grid-searched estimator's predictions.
for title, y_true, y_hat in (('Train :', y_train, train_pred), ('Test :', y_test, test_pred)):
    print(title)
    print(classification_report(y_true, y_hat))

Train :
              precision    recall  f1-score   support

           0       0.95      0.96      0.96     29672
           1       0.96      0.95      0.95     27100

    accuracy                           0.96     56772
   macro avg       0.96      0.96      0.96     56772
weighted avg       0.96      0.96      0.96     56772

Test :
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1596
           1       0.96      0.95      0.96      1393

    accuracy                           0.96      2989
   macro avg       0.96      0.96      0.96      2989
weighted avg       0.96      0.96      0.96      2989



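In this case, hyper-parameter tuning with a cross-validated grid search did not noticeably improve on the default model: the train and test scores are essentially unchanged.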

#### SVM Model

In [13]:
# SVM classifier with scikit-learn's default hyper-parameters (nothing is overridden here).
svc = SVC().fit(X_train, y_train)

# Predictions on both splits, in (train, test) order.
train_pred, test_pred = svc.predict(X_train), svc.predict(X_test)

In [14]:
# Per-class precision / recall / F1 of the SVM on the training and the test split.
for title, y_true, y_hat in (('Train :', y_train, train_pred), ('Test :', y_test, test_pred)):
    print(title)
    print(classification_report(y_true, y_hat))

Train :
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     29672
           1       0.96      0.95      0.96     27100

    accuracy                           0.96     56772
   macro avg       0.96      0.96      0.96     56772
weighted avg       0.96      0.96      0.96     56772

Test :
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1596
           1       0.96      0.95      0.95      1393

    accuracy                           0.96      2989
   macro avg       0.96      0.96      0.96      2989
weighted avg       0.96      0.96      0.96      2989



##### Hyper-parameter tuning

In [15]:
# Display the SVC's current parameter settings as a reference for the tuning grid.
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [16]:
# Small grid for a linear-kernel SVM, evaluated with 3-fold cross-validation;
# verbose=2 logs the fit time of every candidate / fold.
params = dict(
    C=[0.0001, 0.001],
    kernel=['linear'],
    random_state=[0],
    tol=[0.01, .1],
)
gscv = GridSearchCV(svc, param_grid=params, cv=3, verbose=2)
gscv.fit(X_train, y_train)

print('best parameters :')
print(gscv.best_params_)

# Score both splits with the best estimator found by the search.
estimator = gscv.best_estimator_
train_pred, test_pred = (estimator.predict(part) for part in (X_train, X_test))

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END ..C=0.0001, kernel=linear, random_state=0, tol=0.01; total time=  12.3s
[CV] END ..C=0.0001, kernel=linear, random_state=0, tol=0.01; total time=  13.8s
[CV] END ..C=0.0001, kernel=linear, random_state=0, tol=0.01; total time=  15.6s
[CV] END ...C=0.0001, kernel=linear, random_state=0, tol=0.1; total time=  14.9s
[CV] END ...C=0.0001, kernel=linear, random_state=0, tol=0.1; total time=  15.2s
[CV] END ...C=0.0001, kernel=linear, random_state=0, tol=0.1; total time=  13.8s
[CV] END ...C=0.001, kernel=linear, random_state=0, tol=0.01; total time=   8.5s
[CV] END ...C=0.001, kernel=linear, random_state=0, tol=0.01; total time=   8.7s
[CV] END ...C=0.001, kernel=linear, random_state=0, tol=0.01; total time=   9.3s
[CV] END ....C=0.001, kernel=linear, random_state=0, tol=0.1; total time=  10.4s
[CV] END ....C=0.001, kernel=linear, random_state=0, tol=0.1; total time=  10.8s
[CV] END ....C=0.001, kernel=linear, random_state

In [17]:
# Train / test reports for the grid-searched SVM's predictions.
for title, y_true, y_hat in (('Train :', y_train, train_pred), ('Test :', y_test, test_pred)):
    print(title)
    print(classification_report(y_true, y_hat))

Train :
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     29672
           1       0.96      0.95      0.95     27100

    accuracy                           0.96     56772
   macro avg       0.96      0.96      0.96     56772
weighted avg       0.96      0.96      0.96     56772

Test :
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1596
           1       0.96      0.95      0.95      1393

    accuracy                           0.96      2989
   macro avg       0.96      0.96      0.96      2989
weighted avg       0.96      0.96      0.96      2989



#### Prediction by user's input

In [23]:
# Ask for one value per model feature, in the same column order used for training.
feature_names = list(df.drop('recommended',axis=1).columns)
print(f"Enter values separating by commas : \n{feature_names}")
# input and preprocessing
inp = input()
x = [float(i) for i in inp.split(',')]
# Fail early with a readable message instead of a shape error raised inside the PCA.
if len(x) != len(feature_names):
    raise ValueError(f'Expected {len(feature_names)} comma-separated values, got {len(x)}')
# Wrap the single row in a DataFrame carrying the training column names, so the
# PCA receives the input in the same form it was fitted on.
x = pd.DataFrame([x], columns=feature_names)
x = pca.transform(x)
# prediction (uses the default logistic-regression model, not the grid-searched one)
res = model_logistic.predict(x)
# predict returns a one-element array; test its single entry explicitly
if res[0]==0:
    print('Not recommended')
else:
    print('Recommended')

Enter values separating by commas : 
['overall', 'seat_comfort', 'cabin_service', 'food_bev', 'value_for_money', 'cabin_Business Class', 'cabin_Economy Class', 'cabin_First Class', 'cabin_Premium Economy']


 3,0,4,10,0,0,0,0,0


Not recommended




In [26]:
import os
import pickle

# Persist the fitted logistic-regression model to models/model_logistic.pkl.
# os.makedirs(..., exist_ok=True) replaces the shell `mkdir`, which is
# platform-dependent and complains when the directory already exists.
os.makedirs('models', exist_ok=True)
# The context manager closes the file handle even if pickling fails.
with open('models/model_logistic.pkl', 'wb') as model_file:
    pickle.dump(model_logistic, model_file)
# NOTE(review): the model was trained on PCA-transformed features, so the fitted
# `pca` object must also be available wherever this pickle is loaded.

A subdirectory or file models already exists.
