In [17]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

In [18]:
df = pd.read_csv('../dataclean/qatar_reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Rating,Title,Author,Country,Date,Review Body,Type Of Traveller,Seat Type,Aircraft,Verified,Source,Destination,Via,Date Flown Month,Date Flown Year
0,0,1,marred inconvenience,mary le,united kingdom,2024-03-02,delay flight haneda doha caused bit chaos upon...,Solo Leisure,economy class,,False,TYO,LHR,1,3,2024.0
1,1,1,unknown,brian english,unknown,2024-02-29,convinced needed pay 1500 add middle name flig...,Couple Leisure,business class,,True,DOH,YUL,0,2,2024.0
2,2,1,attempt address complaint,wayne burgess,australia,2024-02-29,sent 5 email received 2 automatic response ema...,Business,economy class,A380,True,DOH,PER,0,0,
3,3,10,hope update a380,alwaleed althani,qatar,2024-02-25,unknown,Couple Leisure,first class,A380-800,Unknown,DOH,LHR,0,2,2024.0
4,4,7,unknown,w warnock,united kingdom,2024-02-22,unknown,Couple Leisure,business class,,Unknown,AKL,DOH,0,2,2024.0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2773 entries, 0 to 2772
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         2773 non-null   int64  
 1   Rating             2773 non-null   int64  
 2   Title              2773 non-null   object 
 3   Author             2773 non-null   object 
 4   Country            2773 non-null   object 
 5   Date               2655 non-null   object 
 6   Review Body        2773 non-null   object 
 7   Type Of Traveller  1860 non-null   object 
 8   Seat Type          2218 non-null   object 
 9   Aircraft           1421 non-null   object 
 10  Verified           2773 non-null   object 
 11  Source             2773 non-null   object 
 12  Destination        2773 non-null   object 
 13  Via                2773 non-null   int64  
 14  Date Flown Month   2773 non-null   int64  
 15  Date Flown Year    1878 non-null   float64
dtypes: float64(1), int64(4),

### labelling data

In [20]:
df['Review Body'].sample(10)

2309    singapore miami boeing 787 seat appear good ch...
1520    verified review frankfurt bangkok via doha fir...
1907                                              unknown
970     singapore milan via doha missing connection fl...
993     stockholm chennai via doha good value money bi...
446     qr favorite airline step travel pleasant due a...
624                                               unknown
1965    ive used qatar airway mix class 10 time past y...
2227    manchester riyadh via doha food good seat expe...
1258    brisbane moscow via adelaide doha qatar airway...
Name: Review Body, dtype: object

In [21]:
# Download stopwords dan wordnet jika belum
nltk.download('stopwords')
nltk.download('wordnet')

# Inisialisasi stopwords dan lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nadia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nadia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [71]:
def clean_text(text):
    if isinstance(text, str):  # Periksa apakah input adalah string
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
        text = re.sub(r'@\w+', '', text)  # Remove mentions
        text = re.sub(r'#', '', text)  # Remove hash symbols
        text = re.sub(r'\d+', '', text)  # Remove numbers
        text = re.sub(r'[\U0001F600-\U0001F64F]', '', text)  # Remove emoticons
        text = re.sub(r'[\U0001F300-\U0001F5FF]', '', text)  # Remove symbols & pictographs
        text = re.sub(r'[\U0001F680-\U0001F6FF]', '', text)  # Remove transport & map symbols
        text = re.sub(r'[\U0001F700-\U0001F77F]', '', text)  # Remove alchemical symbols
        text = re.sub(r'[\U0001F780-\U0001F7FF]', '', text)  # Remove Geometric Shapes Extended
        text = re.sub(r'[\U0001F800-\U0001F8FF]', '', text)  # Remove Supplemental Arrows-C
        text = re.sub(r'[\U0001F900-\U0001F9FF]', '', text)  # Remove Supplemental Symbols and Pictographs
        text = re.sub(r'[\U0001FA00-\U0001FA6F]', '', text)  # Remove Chess Symbols
        text = re.sub(r'[\U0001FA70-\U0001FAFF]', '', text)  # Remove Symbols and Pictographs Extended-A
        text = re.sub(r'[\U00002702-\U000027B0]', '', text)  # Remove Dingbats
        text = re.sub(r'\W', ' ', text)  # Remove special characters and punctuation
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
        words = text.split()
        words = [word for word in words if word not in stop_words]  # Remove stopwords
        words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize words
        return ' '.join(words)
    else:
        return ''  # Kembalikan string kosong jika bukan teks

In [23]:
df['Review Body'] = df['Review Body'].apply(clean_text)

In [24]:
# membuat fungsi negasi
# Fungsi untuk mendeteksi negasi
def detect_negation(text):
    # Daftar kata negasi umum dalam bahasa Inggris
    negation_words = {"not", "no", "never", "none", "nobody", "nothing", "neither", "nowhere", "hardly", "scarcely", "barely", "without", "cannot"}
    
    words = text.lower().split()
    negated_text = []
    negate = False
    
    for word in words:
        if word in negation_words:
            negate = True  # Aktifkan penanda negasi
        elif negate:
            # Tambahkan prefiks NOT_ ke kata setelah kata negasi
            negated_text.append("NOT_" + word)
            negate = False  # Reset negasi setelah satu kata
        else:
            negated_text.append(word)
    
    return ' '.join(negated_text)

In [25]:
# Fungsi untuk menganalisis sentimen dengan TextBlob
def analyze_sentiment_with_negation(text):
    # Deteksi negasi pada teks
    negated_text = detect_negation(text)
    
    # Lakukan analisis sentimen menggunakan TextBlob
    blob = TextBlob(negated_text)
    sentiment = blob.sentiment.polarity  # Nilai polaritas (-1 hingga 1)

    # Tentukan sentimen (positif, negatif, netral)
    if sentiment > 0:
        sentiment_label = "Positive"
    elif sentiment < 0:
        sentiment_label = "Negative"
    else:
        sentiment_label = "Neutral"

    return sentiment_label, sentiment

In [26]:
df[['Sentiment Label', 'Polarity']] = df['Review Body'].apply(lambda x: pd.Series(analyze_sentiment_with_negation(x)))

In [45]:
df[['Rating', 'Review Body', 'Sentiment Label']].sample(10)

Unnamed: 0,Rating,Review Body,Sentiment Label
1862,8,verified review travelled twice qatar airway r...,Positive
2588,9,jnbdohbcn return flight time early jnb return ...,Positive
278,4,flight service good usual high standard servic...,Negative
1861,8,verified review travelled twice qatar airway r...,Positive
1117,7,checked munich airport special request seat me...,Positive
1483,1,verified review lagos washington via doha boar...,Negative
2365,10,maddel via doha return way excellent trip over...,Positive
1710,10,verified review frankfurt cape town via doha n...,Positive
2185,10,qatar airway chicago delhi via doha fantastic ...,Positive
161,1,first time flying qatar airway definitely last...,Positive


In [37]:
# Hapus baris yang mengandung kata 'unknown' pada kolom 'Review Body'
df = df[~df['Review Body'].str.contains('unknown', case=False, na=False)]


In [49]:
df['Sentiment Label'].value_counts()

Sentiment Label
Positive    1973
Negative     231
Neutral       15
Name: count, dtype: int64

### Modelling sentiment analysis tanpa rating 

#### SVM 

In [56]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [50]:
df['Sentiment Label'] = df['Sentiment Label'].map({'Positive': 1, 'Negative': -1, 'Neutral': 0})

In [51]:
df['Sentiment Label'].value_counts()

Sentiment Label
 1    1973
-1     231
 0      15
Name: count, dtype: int64

In [52]:
X = df['Review Body']
y = df['Sentiment Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
# Pipeline dengan TF-IDF dan SVM
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),         # Konversi teks ke TF-IDF
    ('svm', SVC())                        # Model SVM
])

In [54]:
# Daftar parameter yang akan diuji dalam GridSearchCV
param_grid = {
    'tfidf__max_features': [1000, 3000, 5000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Uji unigram dan bigram
    'svm__C': [0.1, 1, 10],                  # Nilai C untuk SVM
    'svm__kernel': ['linear', 'rbf'],        # Kernel linear dan RBF
    'svm__gamma': ['scale', 'auto']          # Nilai gamma
}

In [55]:
# GridSearchCV untuk menemukan parameter terbaik
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Mencetak parameter terbaik
print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Parameters: {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'linear', 'tfidf__max_features': 3000, 'tfidf__ngram_range': (1, 1)}


In [57]:
# Menggunakan model terbaik untuk prediksi
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [58]:
# Evaluasi model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive']))

Accuracy: 0.9031531531531531
Classification Report:
               precision    recall  f1-score   support

    Negative       0.54      0.44      0.49        43
     Neutral       1.00      0.20      0.33         5
    Positive       0.93      0.96      0.95       396

    accuracy                           0.90       444
   macro avg       0.83      0.53      0.59       444
weighted avg       0.90      0.90      0.90       444



In [59]:
# mencoba prediksi pada dataset airline reviews
df_airline = pd.read_csv('../dataclean/Airline_Reviews (4).csv')
df_airline.head()

Unnamed: 0.1,Unnamed: 0,Passanger_Name,Flying_month,Route,Rating,Verified,Review_title,Review_content,Traveller_type,Class,Aircraft,Recommend
0,0,Paige Boet,June 2023,New Orleans to London,1.0,Trip Verified,The airline lost my luggage,The airline lost my luggage and was absolutely...,Solo Leisure,Economy Class,,
1,1,,March 2023,London to Amman,1.0,Trip Verified,fully refunded by our travel insurance,"We booked on the BA website, round trip flight...",Couple Leisure,,,
2,2,E Lanewoski,,Heathrow to Bodrum,2.0,Trip Verified,no boarding drinks provided,"First time flying with BA business class, neve...",,Business Class,A321 neo,
3,3,Joel Burman,June 2023,Amman to London,4.0,Not Verified,,,Solo Leisure,Economy Class,,
4,4,,,London City to Ibiza,7.0,Trip Verified,,This is a two-for-one review covering economy ...,Family Leisure,Business Class,Embraer 190,


In [72]:
# Menghapus baris dengan nilai NaN di kolom Review_content
df_airline = df_airline.dropna(subset=['Review_content'])
df_airline['Review_content'].apply(clean_text)

0       airline lost luggage absolutely awful througho...
1       booked ba website round trip flight seattle am...
2       first time flying ba business class never boar...
4       two one review covering economy business class...
5       absolutely horrible airline communication terr...
                              ...                        
3575    lhr hkg boeing much written tired old fleet go...
3576    got back bridgetown barbados flying british ai...
3577    lhr jfk lax lhr check ok apart snapped early c...
3578    hkg lhr new club world boeing bought ticket wt...
3579    yyz lhr july flew overnight premium economy ch...
Name: Review_content, Length: 2864, dtype: object

In [73]:
# Gunakan model terbaik dari GridSearchCV untuk memprediksi sentimen pada data baru
df_airline['predicted_sentiment'] = best_model.predict(df_airline['Review_content'])

In [74]:
# Dictionary mapping
label_mapping = {1: 'positive', 0: 'neutral', -1: 'negative'}

# Menggunakan map untuk mengonversi angka ke label
df_airline['predicted_sentiment'] = df_airline['predicted_sentiment'].map(label_mapping)

In [75]:
df_airline[['predicted_sentiment', 'Review_content', 'Rating']].sample(10)

Unnamed: 0,predicted_sentiment,Review_content,Rating
331,positive,Having just booked BA for a return flight - be...,
402,positive,No check in staff for Economy. Never experienc...,3.0
2833,negative,Sadly flying British Airways is no longer abou...,1.0
1396,positive,Verified Review London to Chicago. I am a BA...,3.0
2844,negative,Gatwick to Florida return Flight out was quite...,
99,negative,At 7.54 am on the day of travel whilst driving...,
84,positive,I tried to check in on line and was informed t...,
3256,negative,Travelled to Cyprus from Gatwick in economy as...,2.0
1367,positive,Verified Review On a flight from Bangkok to ...,
648,negative,Hong Kong to London. An email on the 25th Augu...,1.0


In [84]:
negative_reviews = df_airline[df_airline['predicted_sentiment'] == 'negative']

print(negative_reviews[['Review_content', 'predicted_sentiment']].sample(10))

                                         Review_content predicted_sentiment
1782  Verified Review   Manchester to Los Angeles vi...            negative
614   Terrible lack of any leg and body room in econ...            negative
239   BA cancelled my flight, I never got on it, cou...            negative
275   I booked a flight to London and paid, months i...            negative
2045  Verified Review   Mexico to Amsterdam via Lond...            negative
3482  Glasgow to LHR on a completely full flight. Th...            negative
5     Absolutely horrible airline. Communication is ...            negative
1237  London to Rome. It's been a while since I last...            negative
3401  BA206 from Miami to LHR on upper deck dinner t...            negative
58    I wouldn't recommend British Airways at all. I...            negative


In [92]:
positive_reviews = df_airline[df_airline['predicted_sentiment'] == 'positive']

print(positive_reviews[['Review_content', 'predicted_sentiment', 'Rating']].sample(10))

                                         Review_content predicted_sentiment  \
2396  British Airways flight from Punta Cana to Lond...            positive   
504   London Heathrow to Sofia. A good experience on...            positive   
3421  LHR-FRA-LHR in spotlessly clean airbuses both ...            positive   
1425  Verified Review   Krakow to London. Got us the...            positive   
893   London to Berlin scheduled for 21st December. ...            positive   
2113  Verified Review   I would have been very disap...            positive   
733   London to Gothenburg. Unique procedures that a...            positive   
2610  British Airways flight from Moscow to London o...            positive   
3047  Delhi to London in seat 2A 747 aircraft. Very ...            positive   
736   I had some difficulty from the start, after my...            positive   

      Rating  
2396     3.0  
504      8.0  
3421     8.0  
1425     NaN  
893      1.0  
2113     3.0  
733      3.0  
2610     7