In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from textblob import TextBlob  # Import TextBlob for sentiment analysis

In [7]:
df = pd.read_csv("BA_AirlineReviews.csv")

In [8]:
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
numeric_df = df[numeric_columns]

In [9]:
missing_values = numeric_df.isnull().sum()
print("\nMissing Values in Numeric Columns:")
print(missing_values[missing_values > 0])


Missing Values in Numeric Columns:
OverallRating               5
SeatComfort               116
CabinStaffService         127
GroundService             846
ValueForMoney               1
Food&Beverages            386
InflightEntertainment    1150
Wifi&Connectivity        3092
dtype: int64


In [10]:
df.dropna(inplace=True)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df['ReviewBody'], df['Recommended'], test_size=0.2, random_state=42)

In [11]:
def analyze_sentiment(text):
    analysis = TextBlob(text)
    # Assign sentiment labels
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

In [12]:
def sentiment_analysis_model(X_train, y_train, X_test):
    vectorizer = TfidfVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    model = SVC(C=1.0, kernel='linear', gamma='auto')
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)

    return pd.Series(y_pred, index=X_test.index)

In [13]:
def naive_bayes_model(X_train, y_train, X_test):
    vectorizer = TfidfVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    model = MultinomialNB()
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)

    return pd.Series(y_pred, index=X_test.index)

In [14]:
def logistic_regression_model(X_train, y_train, X_test):
    vectorizer = TfidfVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    model = LogisticRegression()
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)

    return pd.Series(y_pred, index=X_test.index)

In [16]:
df['Sentiment'] = df['ReviewBody'].apply(analyze_sentiment)
df.head()

Unnamed: 0.1,Unnamed: 0,OverallRating,ReviewHeader,Name,Datetime,VerifiedReview,ReviewBody,TypeOfTraveller,SeatType,Route,...,SeatComfort,CabinStaffService,GroundService,ValueForMoney,Recommended,Aircraft,Food&Beverages,InflightEntertainment,Wifi&Connectivity,Sentiment
1,1,3.0,"""do not upgrade members based on status""",Austin Jones,19th November 2023,True,I recently had a delay on British Airways from...,Business,Economy Class,Brussels to London,...,2.0,3.0,1.0,2.0,no,A320,1.0,2.0,2.0,positive
8,8,2.0,"""Angry, disappointed, and unsatisfied""",Massimo Tricca,5th November 2023,False,"Angry, disappointed, and unsatisfied. My route...",Family Leisure,Economy Class,London Heatrow to Atlanta,...,4.0,5.0,3.0,5.0,yes,Boeing 777,4.0,4.0,3.0,positive
25,25,5.0,"""Club Europe is simply a joke""",M Dale,14th October 2023,True,I am a frequent flyer with BA and have been fo...,Business,Business Class,London to Istanbul,...,3.0,4.0,3.0,2.0,no,A320,1.0,1.0,1.0,positive
33,33,10.0,"""Excellent service levels""",Peter Costello,7th October 2023,True,"Excellent service levels, proactive crew and s...",Solo Leisure,First Class,London to New York JFK,...,5.0,5.0,5.0,5.0,yes,Boeing 777,5.0,4.0,5.0,positive
34,34,1.0,"""British Airways was absolutely shocking""",Kane Kelly,5th October 2023,False,Booked a very special holiday for me and my pa...,Couple Leisure,Business Class,Heathrow to Marseille,...,1.0,1.0,1.0,1.0,no,BA366,1.0,1.0,1.0,positive


In [18]:
# Perform Sentiment Analysis using SVM
y_pred_sentiment = sentiment_analysis_model(X_train, y_train, X_test)
accuracy_sentiment = accuracy_score(y_test, y_pred_sentiment)
print(f'SVM Accuracy: {accuracy_sentiment}')
print('SVM Classification Report:\n', classification_report(y_test, y_pred_sentiment))
print('SVM Confusion Matrix:\n', confusion_matrix(y_test, y_pred_sentiment))

SVM Accuracy: 0.9242424242424242
SVM Classification Report:
               precision    recall  f1-score   support

          no       0.88      1.00      0.94        38
         yes       1.00      0.82      0.90        28

    accuracy                           0.92        66
   macro avg       0.94      0.91      0.92        66
weighted avg       0.93      0.92      0.92        66

SVM Confusion Matrix:
 [[38  0]
 [ 5 23]]


In [23]:
# Perform Naive Bayes analysis
y_pred_nb = naive_bayes_model(X_train, y_train, X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f'Naive Bayes Accuracy: {accuracy_nb}')
print('Naive Bayes Classification Report:\n', classification_report(y_test, y_pred_nb))
#print('Naive Bayes Confusion Matrix:\n', confusion_matrix(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.5757575757575758
Naive Bayes Classification Report:
               precision    recall  f1-score   support

          no       0.58      1.00      0.73        38
         yes       0.00      0.00      0.00        28

    accuracy                           0.58        66
   macro avg       0.29      0.50      0.37        66
weighted avg       0.33      0.58      0.42        66



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Perform Logistic Regression analysis
y_pred_lr = logistic_regression_model(X_train, y_train, X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Logistic Regression Accuracy: {accuracy_lr}')
print('Logistic Regression Classification Report:\n', classification_report(y_test, y_pred_lr))
#print('Logistic Regression Confusion Matrix:\n', confusion_matrix(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.7575757575757576
Logistic Regression Classification Report:
               precision    recall  f1-score   support

          no       0.70      1.00      0.83        38
         yes       1.00      0.43      0.60        28

    accuracy                           0.76        66
   macro avg       0.85      0.71      0.71        66
weighted avg       0.83      0.76      0.73        66



In [25]:
df.shape

(327, 21)