In [24]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from tqdm import tqdm

In [25]:
nltk.download("punkt_tab")
nltk.download("stopwords")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
file_path = "/home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/interim/explored_reviews.csv"
df = pd.read_csv(file_path)

In [27]:
print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns")
df.head()

Dataset contains 2560 rows and 13 columns


Unnamed: 0,rating,title,text,publishedDate,sentiment,review_length_text,review_length_title,polarity_text,polarity_title,subjectivity_text,subjectivity_title,sentiment_text,sentiment_title
0,1,WORST AIRLINE,I travel a lot - and I travel often. Last week...,2024-11-13,Negative,408,2,0.018448,-1.0,0.512241,1.0,positive,negative
1,1,Terrible experience with Airfrance,"This review is regarding flight AF185, we book...",2024-11-13,Negative,157,4,-0.060897,-1.0,0.592949,1.0,negative,negative
2,1,Extremely Disappointing Experience with Air Fr...,I recently flew with Air France on flight #185...,2024-11-12,Negative,259,11,-0.094163,-0.6,0.488287,0.7,negative,negative
3,1,Horrible,Wow!!! What a horrible experience!! I've alway...,2024-11-11,Negative,274,1,-0.109373,-1.0,0.494012,1.0,negative,negative
4,1,The Worst Flight Experience I’ve Ever Had,I spent a fantastic 10-day vacation in Hong Ko...,2024-11-11,Negative,311,7,-0.126476,-1.0,0.485192,1.0,negative,negative


In [None]:
# Define stopword list at the top
stop_words = set(stopwords.words("english"))  # Load NLTK stopwords

import re
import pandas as pd


def clean(text):
    if pd.isna(text):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove all numbers (standalone or within words)
    text = re.sub(r"\b\d+\b", "", text)  # Removes standalone numbers
    text = re.sub(r"\w*\d\w*", "", text)  # Removes words containing numbers

    # Remove punctuation (except alphanumeric words)
    text = re.sub(r"[^\w\s]", "", text)

    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize (split into words)
    words = text.split()

    # Stopword removal & short word filtering
    words = [w for w in words if w not in stop_words and len(w) > 2]

    return " ".join(words)  # Return cleaned text


# Apply cleaning to text columns
df["clean_text"] = df["text"].apply(clean)
df["clean_title"] = df["title"].apply(clean)

# Preview the cleaned results
df[["text", "clean_text", "title", "clean_title"]].head()

Unnamed: 0,text,clean_text,title,clean_title
0,I travel a lot - and I travel often. Last week...,travel lot travel often last week flew texas u...,WORST AIRLINE,worst airline
1,"This review is regarding flight AF185, we book...",review regarding flight booked economy flex fl...,Terrible experience with Airfrance,terrible experience airfrance
2,I recently flew with Air France on flight #185...,recently flew air france flight hong kong pari...,Extremely Disappointing Experience with Air Fr...,extremely disappointing experience air france ...
3,Wow!!! What a horrible experience!! I've alway...,wow horrible experience ive always flown porte...,Horrible,horrible
4,I spent a fantastic 10-day vacation in Hong Ko...,spent fantastic day vacation hong kong family ...,The Worst Flight Experience I’ve Ever Had,worst flight experience ive ever


In [29]:
def tokenize(text):
    return [w for w in word_tokenize(text) if len(w) > 2]  # Remove single letters


df["tokens_text"] = df["clean_text"].apply(tokenize)
df["tokens_title"] = df["clean_title"].apply(tokenize)

df[["clean_text", "tokens_text", "clean_title", "tokens_title"]].head()

Unnamed: 0,clean_text,tokens_text,clean_title,tokens_title
0,travel lot travel often last week flew texas u...,"[travel, lot, travel, often, last, week, flew,...",worst airline,"[worst, airline]"
1,review regarding flight booked economy flex fl...,"[review, regarding, flight, booked, economy, f...",terrible experience airfrance,"[terrible, experience, airfrance]"
2,recently flew air france flight hong kong pari...,"[recently, flew, air, france, flight, hong, ko...",extremely disappointing experience air france ...,"[extremely, disappointing, experience, air, fr..."
3,wow horrible experience ive always flown porte...,"[wow, horrible, experience, ive, always, flown...",horrible,[horrible]
4,spent fantastic day vacation hong kong family ...,"[spent, fantastic, day, vacation, hong, kong, ...",worst flight experience ive ever,"[worst, flight, experience, ive, ever]"


In [30]:
stop_words = set(stopwords.words("english"))


def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]


df["tokens_text_nostop"] = df["tokens_text"].apply(remove_stopwords)
df["tokens_title_nostop"] = df["tokens_title"].apply(remove_stopwords)

df[["tokens_text", "tokens_text_nostop", "tokens_title", "tokens_title_nostop"]].head()

Unnamed: 0,tokens_text,tokens_text_nostop,tokens_title,tokens_title_nostop
0,"[travel, lot, travel, often, last, week, flew,...","[travel, lot, travel, often, last, week, flew,...","[worst, airline]","[worst, airline]"
1,"[review, regarding, flight, booked, economy, f...","[review, regarding, flight, booked, economy, f...","[terrible, experience, airfrance]","[terrible, experience, airfrance]"
2,"[recently, flew, air, france, flight, hong, ko...","[recently, flew, air, france, flight, hong, ko...","[extremely, disappointing, experience, air, fr...","[extremely, disappointing, experience, air, fr..."
3,"[wow, horrible, experience, ive, always, flown...","[wow, horrible, experience, ive, always, flown...",[horrible],[horrible]
4,"[spent, fantastic, day, vacation, hong, kong, ...","[spent, fantastic, day, vacation, hong, kong, ...","[worst, flight, experience, ive, ever]","[worst, flight, experience, ive, ever]"


In [31]:
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")


def stem_tokens(tokens, stemmer):
    return [stemmer.stem(word) for word in tokens]


df["stemmed_text_porter"] = df["tokens_text_nostop"].apply(
    lambda x: stem_tokens(x, porter_stemmer)
)
df["stemmed_title_porter"] = df["tokens_title_nostop"].apply(
    lambda x: stem_tokens(x, porter_stemmer)
)

df["stemmed_text_snowball"] = df["tokens_text_nostop"].apply(
    lambda x: stem_tokens(x, snowball_stemmer)
)
df["stemmed_title_snowball"] = df["tokens_title_nostop"].apply(
    lambda x: stem_tokens(x, snowball_stemmer)
)

df[
    [
        "tokens_text_nostop",
        "stemmed_text_porter",
        "stemmed_text_snowball",
        "tokens_title_nostop",
        "stemmed_title_porter",
        "stemmed_title_snowball",
    ]
].head()

Unnamed: 0,tokens_text_nostop,stemmed_text_porter,stemmed_text_snowball,tokens_title_nostop,stemmed_title_porter,stemmed_title_snowball
0,"[travel, lot, travel, often, last, week, flew,...","[travel, lot, travel, often, last, week, flew,...","[travel, lot, travel, often, last, week, flew,...","[worst, airline]","[worst, airlin]","[worst, airlin]"
1,"[review, regarding, flight, booked, economy, f...","[review, regard, flight, book, economi, flex, ...","[review, regard, flight, book, economi, flex, ...","[terrible, experience, airfrance]","[terribl, experi, airfranc]","[terribl, experi, airfranc]"
2,"[recently, flew, air, france, flight, hong, ko...","[recent, flew, air, franc, flight, hong, kong,...","[recent, flew, air, franc, flight, hong, kong,...","[extremely, disappointing, experience, air, fr...","[extrem, disappoint, experi, air, franc, fligh...","[extrem, disappoint, experi, air, franc, fligh..."
3,"[wow, horrible, experience, ive, always, flown...","[wow, horribl, experi, ive, alway, flown, port...","[wow, horribl, experi, ive, alway, flown, port...",[horrible],[horribl],[horribl]
4,"[spent, fantastic, day, vacation, hong, kong, ...","[spent, fantast, day, vacat, hong, kong, famil...","[spent, fantast, day, vacat, hong, kong, famil...","[worst, flight, experience, ive, ever]","[worst, flight, experi, ive, ever]","[worst, flight, experi, ive, ever]"


In [32]:
nlp = spacy.load("en_core_web_sm")


def lemmatize_tokens(text):
    doc = nlp(" ".join(text))
    return [token.lemma_ for token in doc]


df["lemmatized_text"] = df["tokens_text_nostop"].apply(lemmatize_tokens)
df["lemmatized_title"] = df["tokens_title_nostop"].apply(lemmatize_tokens)

df[
    ["tokens_text_nostop", "lemmatized_text", "tokens_title_nostop", "lemmatized_title"]
].head()

Unnamed: 0,tokens_text_nostop,lemmatized_text,tokens_title_nostop,lemmatized_title
0,"[travel, lot, travel, often, last, week, flew,...","[travel, lot, travel, often, last, week, fly, ...","[worst, airline]","[bad, airline]"
1,"[review, regarding, flight, booked, economy, f...","[review, regard, flight, book, economy, flex, ...","[terrible, experience, airfrance]","[terrible, experience, airfrance]"
2,"[recently, flew, air, france, flight, hong, ko...","[recently, fly, air, france, flight, hong, kon...","[extremely, disappointing, experience, air, fr...","[extremely, disappointing, experience, air, fr..."
3,"[wow, horrible, experience, ive, always, flown...","[wow, horrible, experience, I, ve, always, fly...",[horrible],[horrible]
4,"[spent, fantastic, day, vacation, hong, kong, ...","[spend, fantastic, day, vacation, hong, kong, ...","[worst, flight, experience, ive, ever]","[bad, flight, experience, I, ve, ever]"


In [33]:
df["publishedDate"] = pd.to_datetime(df["publishedDate"], errors="coerce")

df["day_of_week"] = df["publishedDate"].dt.day_name()
df["month"] = df["publishedDate"].dt.month
df["year"] = df["publishedDate"].dt.year

df.drop(columns=["publishedDate"], inplace=True)


In [34]:
final_columns = [
    "rating",  # Numerical sentiment score
    "review_length_text",
    "review_length_title",  # Text length analysis
    "polarity_text",
    "polarity_title",  # Sentiment polarity scores
    "subjectivity_text",
    "subjectivity_title",  # Opinion-based scores
    "lemmatized_text",
    "lemmatized_title",  # Clean processed text for vectorization
    "day_of_week",
    "month",
    "year",  # Time-based trend analysis
]

df_final = df[final_columns]
df_final.head()

Unnamed: 0,rating,review_length_text,review_length_title,polarity_text,polarity_title,subjectivity_text,subjectivity_title,lemmatized_text,lemmatized_title,day_of_week,month,year
0,1,408,2,0.018448,-1.0,0.512241,1.0,"[travel, lot, travel, often, last, week, fly, ...","[bad, airline]",Wednesday,11,2024
1,1,157,4,-0.060897,-1.0,0.592949,1.0,"[review, regard, flight, book, economy, flex, ...","[terrible, experience, airfrance]",Wednesday,11,2024
2,1,259,11,-0.094163,-0.6,0.488287,0.7,"[recently, fly, air, france, flight, hong, kon...","[extremely, disappointing, experience, air, fr...",Tuesday,11,2024
3,1,274,1,-0.109373,-1.0,0.494012,1.0,"[wow, horrible, experience, I, ve, always, fly...",[horrible],Monday,11,2024
4,1,311,7,-0.126476,-1.0,0.485192,1.0,"[spend, fantastic, day, vacation, hong, kong, ...","[bad, flight, experience, I, ve, ever]",Monday,11,2024


In [35]:
from collections import Counter

# Flatten all words and count occurrences
word_counts = Counter(word for text in df_final["lemmatized_text"] for word in text)

# Convert to a sorted list (by frequency descending)
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Display first 50 words for preview
for word, count in sorted_word_counts[:10000]:  # Adjust number to see more
    print(f"{word}: {count}")

flight: 4455
air: 2618
france: 2505
seat: 1725
service: 1461
fly: 1369
not: 1341
time: 1340
get: 1305
good: 1263
paris: 1163
food: 1105
airline: 1022
hour: 942
would: 926
do: 837
one: 833
plane: 774
make: 740
staff: 724
airport: 722
well: 714
take: 703
check: 702
luggage: 677
class: 675
could: 630
day: 620
back: 595
experience: 595
economy: 579
bag: 577
business: 574
tell: 570
even: 565
say: 554
travel: 547
crew: 512
arrive: 510
book: 483
great: 474
first: 465
bad: 452
long: 449
pay: 446
go: 446
like: 441
use: 435
also: 427
never: 427
give: 423
ask: 417
way: 414
passenger: 412
delay: 406
meal: 406
customer: 394
cdg: 383
ticket: 380
find: 379
trip: 375
comfortable: 373
wait: 368
nice: 363
call: 361
next: 356
really: 355
boarding: 349
need: 348
attendant: 346
board: 340
return: 337
people: 332
minute: 330
much: 326
two: 325
leave: 317
cabin: 316
drink: 312
try: 309
leg: 309
know: 306
entertainment: 306
gate: 301
another: 297
help: 290
baggage: 285
offer: 282
serve: 280
lose: 276
come: 27

In [None]:
# Define a mapping of similar words to a single common term

word_mapping = {
    # Food & Beverages
    "meal": "food",
    "meals": "food",
    "dinner": "food",
    "breakfast": "food",
    "lunch": "food",
    "snack": "food",
    "beverage": "drink",
    "drink": "beverage",
    # Luggage & Baggage
    "baggage": "luggage",
    "carryon": "luggage",
    "suitcase": "luggage",
    "cases": "luggage",
    "bags": "luggage",
    "bag": "luggage",
    # Seating
    "seat": "seating",
    "seats": "seating",
    "sitting": "seating",
    "row": "seating",
    # Delays & Cancellations
    "delay": "delayed",
    "delays": "delayed",
    "delayed": "delayed",
    "cancel": "canceled",
    "cancellation": "canceled",
    "canceled": "canceled",
    # Ticketing & Booking
    "ticket": "booking",
    "tickets": "booking",
    "reservation": "booking",
    "book": "booking",
    "reservations": "booking",
    # Staff & Crew
    "crew": "staff",
    "attendant": "staff",
    "attendants": "staff",
    "steward": "staff",
    "stewards": "staff",
    "employees": "staff",
    "personnel": "staff",
    # Passengers & Customers
    "customer": "passenger",
    "customers": "passenger",
    "flyer": "passenger",
    "flyers": "passenger",
    "passengers": "passenger",
    # Class & Seating Categories
    # "economy": "class", "business": "class", "premium": "class",
    # Travel & Journeys
    "trip": "travel",
    "journey": "travel",
    "vacation": "travel",
    "itinerary": "travel",
    "tour": "travel",
    # Pricing & Costs
    "price": "cost",
    "charge": "cost",
    "fare": "cost",
    "fee": "cost",
    # Refunds & Compensation
    "refund": "compensation",
    "vouchers": "compensation",
    "voucher": "compensation",
    # Politeness & Rudeness
    "rude": "impolite",
    "polite": "friendly",
    "unfriendly": "impolite",
    # Security & Queues
    "security": "check",
    "checkin": "check",
    "line": "queue",
    "queue": "queue",
    # Connectivity & Transfers
    "connect": "transfer",
    "transfer": "transfer",
    "layover": "transfer",
    # Entertainment
    "entertainment": "movie",
    "screen": "movie",
    "inflight entertainment": "movie",
    # Sentiments
    "happy": "pleasant",
    "unhappy": "unpleasant",
    "pleased": "pleasant",
    "frustrated": "unpleasant",
    "displeased": "unpleasant",
}


def normalize_words(text_list, word_map):
    """Replace words based on a mapping dictionary."""
    return [[word_map.get(word, word) for word in text] for text in text_list]


df_final["lemmatized_text"] = normalize_words(df_final["lemmatized_text"], word_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["lemmatized_text"] = normalize_words(df_final["lemmatized_text"], word_mapping)


In [37]:
from collections import Counter

# Flatten all words and count occurrences
word_counts = Counter(word for text in df_final["lemmatized_text"] for word in text)

# Convert to a sorted list (by frequency descending)
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Display first 50 words for preview
for word, count in sorted_word_counts[:10000]:  # Adjust number to see more
    print(f"{word}: {count}")

flight: 4455
air: 2618
france: 2505
seating: 1944
food: 1916
staff: 1691
luggage: 1680
service: 1461
fly: 1369
not: 1341
time: 1340
get: 1305
good: 1263
paris: 1163
travel: 1117
check: 1042
airline: 1022
booking: 1011
hour: 942
would: 926
do: 837
passenger: 835
one: 833
plane: 774
make: 740
airport: 722
well: 714
take: 703
class: 675
could: 630
day: 620
movie: 609
back: 595
experience: 595
economy: 579
business: 574
tell: 570
even: 565
say: 554
cost: 516
arrive: 510
great: 474
first: 465
bad: 452
long: 449
pay: 446
go: 446
like: 441
use: 435
also: 427
never: 427
give: 423
ask: 417
way: 414
transfer: 414
delayed: 406
cdg: 383
find: 379
comfortable: 373
wait: 368
nice: 363
call: 361
next: 356
really: 355
boarding: 349
need: 348
queue: 348
board: 340
return: 337
people: 332
minute: 330
friendly: 328
much: 326
two: 325
leave: 317
cabin: 316
beverage: 312
try: 309
leg: 309
know: 306
gate: 301
another: 297
help: 290
offer: 282
serve: 280
lose: 276
come: 275
delta: 275
change: 270
still: 269


In [38]:
# from collections import Counter

# # Flatten list of words and count occurrences
# word_counts = Counter([word for sublist in df_final["lemmatized_text"] for word in sublist])

# # Determine cutoff thresholds
# high_freq_cutoff = int(len(df_final) * 0.30)  # Words appearing in >15% of reviews
# low_freq_cutoff = 5  # Words appearing fewer than 5 times

# # Identify words to remove
# high_freq_words = {word for word, count in word_counts.items() if count > high_freq_cutoff}
# low_freq_words = {word for word, count in word_counts.items() if count < low_freq_cutoff}

# # Combine lists of words to remove
# words_to_remove = high_freq_words.union(low_freq_words)

# def remove_unwanted_words(text_list, stopwords):
#     """Remove words that are too common or too rare."""
#     return [[word for word in text if word not in stopwords] for text in text_list]

# df_final["lemmatized_text"] = remove_unwanted_words(df_final["lemmatized_text"], words_to_remove)



In [None]:
from collections import Counter

# Flatten List of words and count occurrences
word_counts = Counter(
    [word for sublist in df_final["lemmatized_text"] for word in sublist]
)

# Determine cutoff threshold for low-frequency words
low_freq_cutoff = 20  # Words appearing fewer than 5 times

# Identify words to remove (only low-frequency words)
low_freq_words = {
    word for word, count in word_counts.items() if count < low_freq_cutoff
}


# Function to remove unwanted words
def remove_unwanted_words(text_list, stopwords):
    """Remove words that are too rare."""
    return [[word for word in text if word not in stopwords] for text in text_list]


# Apply filtering to remove only low-frequency words
df_final["lemmatized_text"] = remove_unwanted_words(
    df_final["lemmatized_text"], low_freq_words
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["lemmatized_text"] = remove_unwanted_words(df_final["lemmatized_text"], low_freq_words)


In [40]:
from collections import Counter

# Flatten all words and count occurrences
word_counts = Counter(word for text in df_final["lemmatized_text"] for word in text)

# Convert to a sorted list (by frequency descending)
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Display first 50 words for preview
for word, count in sorted_word_counts[:10000]:  # Adjust number to see more
    print(f"{word}: {count}")


flight: 4455
air: 2618
france: 2505
seating: 1944
food: 1916
staff: 1691
luggage: 1680
service: 1461
fly: 1369
not: 1341
time: 1340
get: 1305
good: 1263
paris: 1163
travel: 1117
check: 1042
airline: 1022
booking: 1011
hour: 942
would: 926
do: 837
passenger: 835
one: 833
plane: 774
make: 740
airport: 722
well: 714
take: 703
class: 675
could: 630
day: 620
movie: 609
back: 595
experience: 595
economy: 579
business: 574
tell: 570
even: 565
say: 554
cost: 516
arrive: 510
great: 474
first: 465
bad: 452
long: 449
pay: 446
go: 446
like: 441
use: 435
also: 427
never: 427
give: 423
ask: 417
way: 414
transfer: 414
delayed: 406
cdg: 383
find: 379
comfortable: 373
wait: 368
nice: 363
call: 361
next: 356
really: 355
boarding: 349
need: 348
queue: 348
board: 340
return: 337
people: 332
minute: 330
friendly: 328
much: 326
two: 325
leave: 317
cabin: 316
beverage: 312
try: 309
leg: 309
know: 306
gate: 301
another: 297
help: 290
offer: 282
serve: 280
lose: 276
come: 275
delta: 275
change: 270
still: 269


In [None]:
custom_stopwords = set(
    [
        # ✈️ Domain-Specific (High-frequency airline words that dilute topics)
        "flight",
        "flights",
        "air",
        "france",
        "af",
        "airfrance",
        "airline",
        "airlines",
        "plane",
        "aircraft",
        "airport",
        "departure",
        "arrival",
        "terminal",
        "gate",
        "connection",
        "layover",
        "transit",
        "passport",
        "passenger",
        "passengers",
        "paris",
        "cdg",
        "charles",
        "de",
        "gaulle",
        "service",
        "staff",
        "booking",
        "day",
        "travel",
        "delta",
        "seating",
        # 🏢 Generic Words (Not topic-specific)
        "people",
        "minute",
        "extra",
        "last",
        "ever",
        "however",
        "serve",
        "since",
        "finally",
        "enough",
        "end",
        "pass",
        "home",
        "around",
        "system",
        "choice",
        "seem",
        "ve",
        "row",
        "sure",
        "can",
        "although",
        "mean",
        "close",
        "show",
        "walk",
        "counter",
        "second",
        "treat",
        "right",
        "process",
        "course",
        "bus",
        "flat",
        "standard",
        "possible",
        "plan",
        "may",
        "turn",
        "efficient",
        "upon",
        "route",
        "wonderful",
        "rather",
        "instead",
        "partner",
        "top",
        "away",
        "despite",
        "refuse",
        "love",
        "thank",
        "review",
        "side",
        "item",
        "talk",
        "suppose",
        "fact",
        "name",
        "lie",
        "deal",
        "purchase",
        "queue",
        "stop",
        "run",
        "lady",
        "anyone",
        "employee",
        "either",
        "morning",
        "husband",
        "schedule",
        "completely",
        "mention",
        "touch",
        "friend",
        "sorry",
        "expensive",
        "ahead",
        "hot",
        "real",
        "usually",
        "twice",
        "response",
        "disappoint",
        "disappointing",
        "attentive",
        "eventually",
        "actually",
        "middle",
        # 🚀 Common Verbs (Unnecessary for topic modeling)
        "get",
        "go",
        "come",
        "take",
        "make",
        "find",
        "give",
        "put",
        "see",
        "know",
        "want",
        "would",
        "could",
        "should",
        "must",
        "did",
        "does",
        "do",
        "say",
        "let",
        "tell",
        "call",
        "ask",
        "try",
        "need",
        "think",
        "use",
        "work",
        "wait",
        "expect",
        "offer",
        "look",
        "pay",
        "charge",
        "buy",
        "book",
        "do",
        # ❌ Auxiliary Words & Negations (Filler words)
        "not",
        "never",
        "always",
        "still",
        "even",
        "much",
        "very",
        "more",
        "less",
        "like",
        "without",
        "thing",
        "another",
        "many",
        "every",
        "way",
        "back",
        "time",
        "now",
        "soon",
        "later",
        "then",
        "before",
        "after",
        "ago",
        "today",
        "yesterday",
        "tomorrow",
        "early",
        "late",
        "long",
        "short",
        "already",
        "yet",
        "just",
        # 📌 Pronouns & Function Words (Non-informative)
        "i",
        "me",
        "my",
        "mine",
        "you",
        "your",
        "yours",
        "we",
        "our",
        "ours",
        "they",
        "them",
        "their",
        "theirs",
        "he",
        "him",
        "his",
        "she",
        "her",
        "hers",
        "it",
        "its",
        "this",
        "that",
        "these",
        "those",
        "who",
        "whom",
        "whose",
        "which",
        "what",
        "where",
        "when",
        "why",
        "how",
        "there",
        "here",
        "some",
        "any",
        "few",
        "several",
        "many",
        "others",
        # 🤷‍♂️ Opinion & Adjective Words (Subjective, not topic-defining)
        "good",
        "bad",
        "best",
        "worst",
        "nice",
        "great",
        "terrible",
        "awful",
        "horrible",
        "amazing",
        "fantastic",
        "excellent",
        "perfect",
        "fine",
        "poor",
        "better",
        "worse",
        "new",
        "old",
        "big",
        "small",
        "high",
        "low",
        "fast",
        "slow",
        "easy",
        "hard",
        # 🔢 Other Common Words (Too generic for topic modeling)
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
        "hundred",
        "thousand",
        "million",
        "someone",
        "everyone",
        "thing",
        "something",
        "everything",
        "nothing",
        "yes",
        "no",
        "ok",
        "okay",
        "probably",
        "definitely",
        "bit",
        "lot",
        "kind",
        "sort",
        "part",
        "piece",
        "level",
        "place",
        "point",
        "case",
        "situation",
        "matter",
        "reason",
        "result",
        "problem",
        "solution",
        # 🚫 Noise Words Identified from -1 Cluster
        "really",
        "also",
        "leg",
        "return",
        "next",
        "year"
        # 🛬 Airline-Specific but Not Topic-Differentiating
        "seating",
        "food",
        "luggage",
        "fly",
        "check",
        "hour",
        "class",
        "movie",
        "experience",
        "economy",
        "business",
        "cost",
        "arrive",
        "first",
        "transfer",
        "delayed",
        "comfortable",
        "boarding",
        "board",
        "friendly",
        "leave",
        "cabin",
        "beverage",
        "help",
        "lose",
        "change",
        "canceled",
        "premium",
        "compensation",
        "room",
        "miss",
        "pleasant",
        "company",
        "french",
        "due",
        "lounge",
        "provide",
        "receive",
        "though",
        "helpful",
        "sit",
        "recommend",
        "issue",
        "little",
        "sleep",
        "full",
        "impolite",
        "feel",
        "hotel",
        "money",
        "space",
        "wine",
        "keep",
        "phone",
        "care",
        "agent",
        "able",
        "different",
        "klm",
        "clean",
        "quality",
        "claim",
        "night",
        "option",
        "speak",
        "available",
        "email",
        "start",
        "almost",
        "land",
        "overall",
        "quite",
        "front",
        "allow",
        "send",
        "number",
        "free",
        "via",
        "month",
        "wife",
        "selection",
        "upgrade",
        "include",
        "legroom",
        "person",
        "choose",
        "move",
        "enjoy",
        "anything",
        "spend",
        "week",
        "inflight",
        "happen",
        "online",
        "ground",
        "water",
        "avoid",
        "desk",
        "recline",
        "extremely",
        "complaint",
        "carry",
        "deliver",
        "understand",
        "least",
        "hand",
        "member",
        "average",
        "smooth",
        "contact",
        "request",
        "pretty",
        "family",
        "far",
        "half",
        "strike",
        "priority",
        "information",
        "uncomfortable",
        "comfort",
        "professional",
        "destination",
        "international",
        "airplane",
        "eat",
        "rest",
        "together",
        "whole",
        "etc",
        "stay",
        "area",
        "especially",
        "worth",
        "min",
        "english",
        "large",
        "follow",
        "order",
        "window",
        "haul",
        "else",
        "toilet",
        "pick",
        "sandwich",
        "drink",
        "inform",
        "champagne",
        "hold",
        "add",
        "plus",
        "surprise",
        "plenty",
        "carrier",
        "cause",
        "lack",
        "behind",
        "direct",
        "bring",
        "open",
        "decide",
        "couple",
        "manage",
        "special",
        "hear",
        "depart",
        "amsterdam",
        "stand",
        "control",
        "cheap",
        "please",
        "holiday",
        "landing",
        "future",
        "absolutely",
        "watch",
        "airbus",
        "handle",
        "wrong",
        "consider",
        "cold",
        "onboard",
        "reach",
        "answer",
        "believe",
        "quick",
        "within",
        "catch",
        "card",
        "complain",
        "assistance",
        "lax",
        "chair",
        "smile",
        "total",
        "fault",
        "euro",
        "access",
        "menu",
        "bed",
        "compare",
        "other",
        "group",
        "hope",
        "job",
        "date",
        "website",
        "child",
        "break",
        "print",
        "entire",
        "aisle",
        "son",
        "explain",
        "difference",
        "decent",
        "unpleasant",
        "daughter",
        "jfk",
        "message",
        "past",
        "might",
        "confirm",
        "form",
        "country",
        "light",
        "unfortunately",
        "life",
        "sky",
        "except",
        "original",
        "pilot",
        "mile",
        "attitude",
        "woman",
        "state",
        "section",
        "kid",
        "exit",
        "additional",
        "amount",
        "simply",
        "empty",
        "guy",
        "have",
        "begin",
        "fully",
        "tray",
        "europe",
        # 🔹 High-Frequency Airline-Specific Words (Not Topic-Differentiating)
        "seating",
        "food",
        "luggage",
        "fly",
        "check",
        "hour",
        "class",
        "movie",
        "experience",
        "economy",
        "business",
        "cost",
        "arrive",
        "first",
        "transfer",
        "delayed",
        "comfortable",
        "boarding",
        "board",
        "friendly",
        "leave",
        "cabin",
        "beverage",
        "help",
        "lose",
        "change",
        "canceled",
        "premium",
        "compensation",
        "room",
        "miss",
        "pleasant",
        "company",
        "lounge",
        "provide",
        "receive",
        "though",
        "helpful",
        "sit",
        "recommend",
        "issue",
        "little",
        "sleep",
        "full",
        "impolite",
        "feel",
        "hotel",
        "money",
        "space",
        "wine",
        "keep",
        "phone",
        "care",
        "agent",
        "able",
        "different",
        "klm",
        "clean",
        "quality",
        "claim",
        "night",
        "option",
        "speak",
        "available",
        "email",
        "start",
        "almost",
        "land",
        "overall",
        "quite",
        "front",
        "allow",
        "send",
        "number",
        "free",
        "via",
        "month",
        "wife",
        "selection",
        "upgrade",
        "include",
        "legroom",
        "person",
        "choose",
        "move",
        "enjoy",
        "anything",
        "spend",
        "week",
        "inflight",
        "happen",
        "online",
        "ground",
        "water",
        "avoid",
        "desk",
        "recline",
        "extremely",
        "complaint",
        "carry",
        "deliver",
        "understand",
        "least",
        "hand",
        "member",
        "average",
        "smooth",
        "contact",
        "request",
        "pretty",
        "family",
        "far",
        "half",
        "strike",
        "priority",
        "information",
        "uncomfortable",
        "comfort",
        "professional",
        "destination",
        "international",
        "airplane",
        "eat",
        "rest",
        "together",
        "whole",
        "stay",
        "area",
        "especially",
        "worth",
        "min",
        "english",
        "large",
        "follow",
        "order",
        "window",
        "haul",
        "else",
        "toilet",
        "pick",
        "sandwich",
        "drink",
        "inform",
        "champagne",
        "hold",
        "add",
        "plus",
        "surprise",
        "plenty",
        "carrier",
        "cause",
        "lack",
        "behind",
        "direct",
        "bring",
        "open",
        "decide",
        "couple",
        "manage",
        "special",
        "hear",
        "depart",
        "amsterdam",
        "stand",
        "control",
        "cheap",
        "please",
        "holiday",
        "landing",
        "future",
        "absolutely",
        "watch",
        "airbus",
        "handle",
        "wrong",
        "consider",
        "cold",
        "onboard",
        "reach",
        "answer",
        "believe",
        "quick",
        "within",
        "catch",
        "card",
        "complain",
        "assistance",
        "lax",
        "chair",
        "smile",
        "total",
        "fault",
        "euro",
        "access",
        "menu",
        "bed",
        "compare",
        "other",
        "group",
        "hope",
        "job",
        "date",
        "website",
        "child",
        "break",
        "print",
        "entire",
        "aisle",
        "son",
        "explain",
        "difference",
        "decent",
        "unpleasant",
        "daughter",
        "jfk",
        "message",
        "past",
        "might",
        "confirm",
        "form",
        "country",
        "light",
        "unfortunately",
        "life",
        "sky",
        "except",
        "original",
        "pilot",
        "mile",
        "attitude",
        "woman",
        "state",
        "section",
        "kid",
        "exit",
        "additional",
        "amount",
        "simply",
        "empty",
        "guy",
        "have",
        "begin",
        "fully",
        "tray",
        "europe",
        # 🔹 Location Names (Common in Airlines but Not Useful for Topics)
        "london",
        "new york",
        "los angeles",
        "san francisco",
        "dubai",
        "paris",
        "madrid",
        "johannesburg",
        "mumbai",
        "toronto",
        "montreal",
        "boston",
        "houston",
        "singapore",
        "atlanta",
        "africa",
        "canada",
        "italy",
        "india",
        "vancouver",
        "barcelona",
        "bangalore",
        "dublin",
        "york",
        "birmingham",
        "florence",
        "rome",
        "usa",
        # 🔹 Generic/Weak Meaning Words (Do Not Define Topics)
        "tight",
        "multiple",
        "nightmare",
        "manager",
        "welcome",
        "wish",
        "question",
        "meet",
        "advance",
        "regular",
        "force",
        "highly",
        "forward",
        "normal",
        "main",
        "round",
        "file",
        "value",
        "impossible",
        "manchester",
        "film",
        "weight",
        "american",
        "maybe",
        "write",
        "warm",
        "blanket",
        "wheelchair",
        "overnight",
        "eye",
        "throughout",
        "often",
        "personal",
        "bathroom",
        "foot",
        "world",
        "across",
        "save",
        "separate",
        "notice",
        "super",
        "read",
        "usual",
        "clothe",
        "single",
        "stewardess",
        "status",
        "delicious",
        "blue",
        "assign",
        "near",
        "select",
        "operate",
        "cramp",
        "report",
        "nearly",
        "unable",
        "car",
        "man",
        "update",
        "train",
        "traveler",
        "huge",
        "waiting",
        "courteous",
        "head",
        "damage",
        "support",
        "dirty",
        "announcement",
        "totally",
        "along",
        "pillow",
        "coffee",
        "confirmation",
        "will",
        "tag",
        "rate",
        "non",
        "quickly",
        "certainly",
        "team",
        "improve",
        "complete",
        "inside",
        "cheese",
        "accommodate",
        "pack",
        "become",
        "star",
        "communication",
        "bother",
        "concern",
        "note",
        "supervisor",
        "tasty",
        "orly",
        "payment",
        "clear",
        "lucky",
        "per",
        "rebook",
        "waste",
        "frequent",
        "drop",
        "appreciate",
        "advise",
        "athens",
        "immigration",
        "recent",
        "step",
        "appear",
        "enjoyable",
        "surprised",
        "limited",
        "bottle",
        "tall",
        "guess",
        "stuff",
        "wide",
        "quiet",
        "stroller",
        "fix",
        "recently",
        "share",
        "unhelpful",
        "fill",
        "word",
        "clearly",
        "fail",
        "prefer",
        "office",
        "lovely",
        "safety",
        "require",
        "deck",
        "jet",
        "weigh",
        "door",
        "prior",
        "responsibility",
        "barely",
        "obviously",
        "otherwise",
        "outbound",
        "shame",
        "realize",
        "ignore",
        "adequate",
        "wifi",
        "major",
        "slightly",
        "detail",
        "custom",
        "weather",
        "storage",
        "difficult",
        "stress",
        "domestic",
        "attempt",
        "reply",
        "ill",
        "representative",
        "apparently",
        "alternative",
        "credit",
        "locate",
        "glass",
        "learn",
        "regard",
        "negative",
        "bar",
        "language",
        "chicken",
        "configuration",
        "video",
        "roll",
        "base",
        "load",
        "enter",
        "assist",
        "previous",
        "european",
        "unless",
        "music",
        "explanation",
        "unacceptable",
        "nobody",
        "visit",
        "dog",
        "machine",
        "baby",
        "evening",
        "spacious",
        "type",
        "attention",
        "mistake",
        "onto",
        "continue",
        "overhead",
        "rush",
        "none",
        "reasonable",
        "immediately",
        "bread",
        "chance",
        "hrs",
        "apart",
        "suggest",
        "thru",
        "general",
        "narrow",
        "story",
        "condition",
        "perhaps",
        "effort",
        "third",
        "address",
        "limit",
        "taste",
        "fairly",
        "final",
        "luckily",
        "apology",
        "tired",
        "arrogant",
        "face",
        "therefore",
        "directly",
        "idea",
        "ice",
        "boeing",
        "receipt",
        "mind",
        "vegetarian",
        "sign",
        "moment",
        "amenity",
    ]
)


def remove_custom_stopwords(text_list, stopwords):
    """Remove custom stopwords from tokenized text"""
    return [[word for word in text if word not in stopwords] for text in text_list]


# Apply stopword removal to lemmatized text
df_final["lemmatized_text"] = remove_custom_stopwords(
    df_final["lemmatized_text"], custom_stopwords
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["lemmatized_text"] = remove_custom_stopwords(


In [42]:
from collections import Counter

# Flatten all words and count occurrences
word_counts = Counter(word for text in df_final["lemmatized_text"] for word in text)

# Convert to a sorted list (by frequency descending)
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Display first 50 words for preview
for word, count in sorted_word_counts[:10000]:  # Adjust number to see more
    print(f"{word}: {count}")


well: 714
year: 263
be: 189
I: 113
s: 105
tight: 49
multiple: 49
nightmare: 49
manager: 49
welcome: 49
wish: 49
question: 49
meet: 49
advance: 49
regular: 49
force: 48
highly: 48
forward: 48
normal: 48
main: 48
round: 48
file: 47
value: 47
impossible: 47
manchester: 47
film: 47
re: 46
weight: 46
american: 46
maybe: 46
world: 46
write: 46
warm: 46
blanket: 46
wheelchair: 46
overnight: 46
notice: 46
eye: 46
london: 46
throughout: 46
often: 45
personal: 45
bathroom: 45
foot: 45
across: 45
save: 45
separate: 45
super: 45
read: 45
usual: 45
clothe: 44
single: 44
stewardess: 44
status: 44
delicious: 44
blue: 44
assign: 44
near: 43
select: 43
operate: 43
cramp: 43
report: 43
nearly: 43
unable: 43
dirty: 43
rome: 43
car: 43
man: 43
update: 43
train: 43
traveler: 43
huge: 43
waiting: 43
courteous: 43
head: 43
damage: 42
support: 42
announcement: 42
totally: 42
along: 42
pillow: 42
coffee: 42
confirmation: 41
will: 41
tag: 41
rate: 41
non: 41
quickly: 41
atlanta: 41
certainly: 40
team: 40
city: 

In [None]:
from fuzzywuzzy import process, fuzz
import gensim.downloader as api
from collections import Counter

# ✅ Load Pre-trained Word Embeddings
word_vectors = api.load(
    "glove-wiki-gigaword-50"
)  # Can use 'glove-twitter-100' for social media text

# ✅ Get Most Common Words in Dataset
word_counts = Counter(
    [word for tokens in df_final["lemmatized_text"] for word in tokens]
)
common_words = [
    word for word, count in word_counts.items() if count > 10
]  # Only words appearing 10+ times


# ✅ Function to Merge Similar Words
def merge_words(tokens):
    merged_tokens = []
    for word in tokens:
        # ✅ Step 1: Check if it's a frequent word (keep it)
        if word in common_words:
            merged_tokens.append(word)
            continue

        # ✅ Step 2: Fuzzy Matching for Typos
        similar_word, similarity = process.extractOne(
            word, common_words, scorer=fuzz.ratio
        )
        if similarity > 85:  # Adjust threshold as needed
            merged_tokens.append(similar_word)
            continue

        # ✅ Step 3: Word Embedding Similarity (if word exists in model)
        if word in word_vectors:
            similar_embeddings = word_vectors.most_similar(word, topn=5)
            for similar_word, score in similar_embeddings:
                if score > 0.8 and similar_word in common_words:
                    merged_tokens.append(similar_word)
                    break
            else:
                merged_tokens.append(word)
        else:
            merged_tokens.append(word)

    return merged_tokens


# ✅ Apply the Function to Your Dataset
df_final["lemmatized_text"] = df_final["lemmatized_text"].apply(merge_words)

# ✅ Preview the cleaned data
df_final.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["lemmatized_text"] = df_final["lemmatized_text"].apply(merge_words)


Unnamed: 0,rating,review_length_text,review_length_title,polarity_text,polarity_title,subjectivity_text,subjectivity_title,lemmatized_text,lemmatized_title,day_of_week,month,year
0,1,408,2,0.018448,-1.0,0.512241,1.0,"[often, usa, be, confirmation, confirmation, p...","[bad, airline]",Wednesday,11,2024
1,1,157,4,-0.060897,-1.0,0.592949,1.0,"[regard, regard, be, wear, negative, sense]","[terrible, experience, airfrance]",Wednesday,11,2024
2,1,259,11,-0.094163,-0.6,0.488287,0.7,"[recently, hong, kong, disappointed, hong, kon...","[extremely, disappointing, experience, air, fr...",Tuesday,11,2024
3,1,274,1,-0.109373,-1.0,0.494012,1.0,"[I, shock, computer, computer, busy, re, tight...",[horrible],Monday,11,2024
4,1,311,7,-0.126476,-1.0,0.485192,1.0,"[hong, kong, ruin, assure, accept, discover, s...","[bad, flight, experience, I, ve, ever]",Monday,11,2024


In [None]:
import spacy

# Load spaCy model with NER
nlp = spacy.load("en_core_web_sm")


# Function to remove locations and airports
def remove_places(text_list):
    """Remove words that are classified as locations or airports."""
    cleaned_texts = []
    for text in text_list:
        doc = nlp(" ".join(text))  # Convert list back to text for processing
        filtered_words = [
            token.text for token in doc if token.ent_type_ not in ["GPE", "LOC", "FAC"]
        ]
        cleaned_texts.append(filtered_words)  # Keep only non-location words
    return cleaned_texts


# Apply location filtering to the lemmatized_text column
df_final["lemmatized_text"] = remove_places(df_final["lemmatized_text"])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["lemmatized_text"] = remove_places(df_final["lemmatized_text"])


In [45]:
final_path = "/home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/processed/processed_reviews.csv"
df_final.to_csv(final_path, index=False, encoding="utf-8")

print(f"Successfully saved processed dataset at: {final_path}")

Successfully saved processed dataset at: /home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/processed/processed_reviews.csv


In [46]:
from collections import Counter

# Flatten all words and count occurrences
word_counts = Counter(word for text in df_final["lemmatized_text"] for word in text)

# Convert to a sorted list (by frequency descending)
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Display first 50 words for preview
for word, count in sorted_word_counts[:10000]:  # Adjust number to see more
    print(f"{word}: {count}")

well: 714
year: 261
be: 189
I: 113
s: 105
tight: 49
multiple: 49
nightmare: 49
manager: 49
welcome: 49
wish: 49
question: 49
meet: 49
advance: 49
regular: 49
force: 48
highly: 48
forward: 48
normal: 48
main: 48
round: 48
file: 47
value: 47
impossible: 47
manchester: 47
film: 47
re: 46
weight: 46
american: 46
maybe: 46
write: 46
warm: 46
blanket: 46
wheelchair: 46
overnight: 46
eye: 46
throughout: 46
often: 45
personal: 45
bathroom: 45
foot: 45
world: 45
across: 45
save: 45
separate: 45
notice: 45
super: 45
read: 45
usual: 45
clothe: 44
single: 44
stewardess: 44
status: 44
delicious: 44
blue: 44
assign: 44
near: 43
select: 43
operate: 43
cramp: 43
report: 43
nearly: 43
unable: 43
car: 43
man: 43
update: 43
train: 43
traveler: 43
huge: 43
waiting: 43
courteous: 43
head: 43
damage: 42
support: 42
dirty: 42
announcement: 42
totally: 42
along: 42
pillow: 42
coffee: 42
confirmation: 41
will: 41
tag: 41
rate: 41
non: 41
quickly: 41
certainly: 40
team: 40
improve: 40
complete: 40
inside: 40
ch