In [1]:
import pandas as pd

# Location of the British Airways review dump inside the Kaggle input mount.
REVIEW = '/kaggle/input/british-airline-review-dataset/airline_review.csv'

# First CSV column is the row index; both date columns are parsed to datetimes.
df = pd.read_csv(REVIEW, index_col=[0], parse_dates=['date', 'date_flown'])

# Year of travel as a numeric feature. Rows without a flight date yield NaN here,
# which is then replaced by the mean year of the known rows (may be fractional).
flown_year = df['date_flown'].dt.year
df['flown_year'] = flown_year.fillna(value=flown_year.mean())
df.head()

Unnamed: 0,rating,header,author,date,place,content,aircraft,traveller_type,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_beverages,ground_service,value_for_money,recommended,entertainment,trip_verified,flown_year
0,2,service was mediocre at best,Gary Storer,2023-10-03,United Kingdom,"Just returned from Chicago, flew out 10 days ...",A380,Couple Leisure,Economy Class,Chicago to Manchester via Heathrow,2023-10-01,2,3,1,2,2,no,-1,Not Verified,2023.0
1,2,BA standards continue to decline,A Jensen,2023-10-02,United Kingdom,BA standards continue to decline every time ...,A320,Business,Business Class,London Heathrow to Munich,2023-09-01,2,1,2,1,1,no,-1,Verified,2023.0
2,2,"won the race to the bottom""",John Rockett,2023-10-02,United Kingdom,Awful. Business class check in queue just as...,A320,Couple Leisure,Business Class,Heathrow to Istanbul,2023-09-01,2,3,2,1,1,no,-1,Not Verified,2023.0
3,3,Not a reliable airline,Tatiana Bobrovskaya,2023-10-02,United Kingdom,Not a reliable airline. You cannot trust the...,A320,Business,Economy Class,London to Geneva,2023-10-01,4,4,2,1,1,no,-1,Verified,2023.0
4,1,It is a national disgrace,A Dawson,2023-09-30,United Kingdom,I take comfort in reading the last ten or so...,,Couple Leisure,Business Class,Athens to London,2023-09-01,1,1,1,1,1,no,1,Verified,2023.0


In [2]:
# Summarise each column's dtype and non-null count to see which fields have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3616 entries, 0 to 3653
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   rating               3616 non-null   int64         
 1   header               3616 non-null   object        
 2   author               3616 non-null   object        
 3   date                 3616 non-null   datetime64[ns]
 4   place                3616 non-null   object        
 5   content              3616 non-null   object        
 6   aircraft             1902 non-null   object        
 7   traveller_type       2895 non-null   object        
 8   seat_type            3614 non-null   object        
 9   route                2891 non-null   object        
 10  date_flown           2888 non-null   datetime64[ns]
 11  seat_comfort         3616 non-null   int64         
 12  cabin_staff_service  3616 non-null   int64         
 13  food_beverages       3616 non-null   i

In [3]:
from plotly.express import histogram
# One histogram per review attribute, coloured by the `recommended` label, to see
# which attributes separate recommenders from non-recommenders.
# Each column is listed exactly once so no figure is rendered twice.
for x in ['rating', 'date', 'traveller_type', 'seat_type', 'seat_comfort', 'cabin_staff_service',
          'food_beverages', 'ground_service', 'value_for_money', 'entertainment', 'trip_verified']:
    # The title lets each figure stand alone when scrolling through the long output.
    histogram(data_frame=df, x=x, color='recommended', title='{} by recommendation'.format(x)).show()

In [4]:
from plotly.express import scatter
from plotly.graph_objects import Figure
from sklearn.manifold import TSNE
import numpy as np

def render_tsne(input_df: pd.DataFrame, columns: list, target: list) -> Figure:
    """Embed the selected columns in 2-D with t-SNE and return a scatter plot coloured by ``target``.

    Args:
        input_df: Frame that holds the feature columns.
        columns: Names of the columns of ``input_df`` to feed to t-SNE.
        target: One label per row of ``input_df``, in row order. Any iterable
            (list, array, Series) is accepted; labels are matched to rows by
            position, never by index label.

    Returns:
        A plotly figure with the embedding on the ``tx``/``ty`` axes and one
        colour per distinct target value.
    """
    tsne = TSNE(n_components=2, init='pca', verbose=1, random_state=2024)
    embedding = tsne.fit_transform(X=input_df[columns])
    tsne_df = pd.DataFrame(data=embedding, columns=['tx', 'ty'])
    # tsne_df carries a fresh 0..n-1 index. Materialising the labels as a list
    # forces positional assignment, so a Series whose index differs (for example
    # a shuffled test split) cannot be silently re-aligned into NaNs or wrong rows.
    tsne_df['target'] = list(target)
    return scatter(data_frame=tsne_df, x='tx', y='ty', color='target')

# Embed every review using the numeric features and colour each point by whether
# the reviewer recommended the airline.
render_tsne(
    input_df=df,
    columns=[
        'rating',
        'seat_comfort',
        'cabin_staff_service',
        'food_beverages',
        'ground_service',
        'value_for_money',
        'entertainment',
        'flown_year',
    ],
    target=df['recommended'].values.tolist(),
).show()


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 3616 samples in 0.004s...
[t-SNE] Computed neighbors for 3616 samples in 0.212s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3616
[t-SNE] Computed conditional probabilities for sample 2000 / 3616
[t-SNE] Computed conditional probabilities for sample 3000 / 3616
[t-SNE] Computed conditional probabilities for sample 3616 / 3616
[t-SNE] Mean sigma: 1.091833
[t-SNE] KL divergence after 250 iterations with early exaggeration: 71.152779
[t-SNE] KL divergence after 1000 iterations: 1.130881


This is pretty encouraging; our dimension reduction suggests our numerical data contains at least some information that will help us predict our target variable. Let's build a simple model and see what it says.

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Numeric features used as model inputs.
columns = [
    'rating',
    'seat_comfort',
    'cabin_staff_service',
    'food_beverages',
    'ground_service',
    'value_for_money',
    'entertainment',
    'flown_year',
]

# Hold out 25% of the reviews; the yes/no label becomes a boolean target.
X_train, X_test, y_train, y_test = train_test_split(
    df[columns],
    df['recommended'].map({'no': False, 'yes': True}),
    test_size=0.25,
    random_state=2024,
)

# Generous iteration cap so the solver can converge on the unscaled features.
model = LogisticRegression(max_iter=100000).fit(X_train, y_train)

# Accuracy on the held-out split, shown as a percentage with one decimal place.
print('accuracy: {} pct'.format(round(1000 * accuracy_score(y_test, model.predict(X_test)))/10))

accuracy: 93.7 pct


This is not bad; let's see how our errors break down.

In [6]:
from plotly.express import bar
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=model.predict(X_test)))

# One bar per feature showing its fitted logistic-regression coefficient.
# A bar chart (rather than a histogram with a summed y) keeps the y axis labelled
# as the coefficient itself instead of "sum of y".
# NOTE(review): the features are not standardised, so coefficient magnitudes are
# not directly comparable across features with different ranges (e.g. flown_year).
bar(x=columns, y=model.coef_[0], labels={'x': 'feature', 'y': 'coefficient'},
    title='Logistic regression coefficients')

              precision    recall  f1-score   support

       False       0.95      0.94      0.95       535
        True       0.92      0.93      0.92       369

    accuracy                           0.94       904
   macro avg       0.93      0.94      0.93       904
weighted avg       0.94      0.94      0.94       904



Not surprisingly, all of our numerical features contribute more signal than noise to our model accuracy, with the possible exception of the year flown.

In [7]:
# Label every held-out review as "<actual>-<predicted>" (e.g. "True-False" is a
# missed recommendation) so correct and incorrect predictions get distinct colours.
y_pred = model.predict(X_test).tolist()
# zip walks both label sequences once; converting y_test to a list inside the
# comprehension would rebuild the whole list for every element.
y_target = ['{}-{}'.format(actual, predicted) for actual, predicted in zip(y_test.tolist(), y_pred)]
render_tsne(input_df=X_test, columns=columns, target=y_target)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 904 samples in 0.001s...
[t-SNE] Computed neighbors for 904 samples in 0.026s...
[t-SNE] Computed conditional probabilities for sample 904 / 904
[t-SNE] Mean sigma: 1.566737
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.437759
[t-SNE] KL divergence after 1000 iterations: 0.706005


Not surprisingly, our errors are, for the most part, around the perimeter of the various t-SNE clusters.