In [30]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from tqdm import tqdm

In [31]:
nltk.download("punkt_tab")
nltk.download("stopwords")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
file_path = "/home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/interim/explored_reviews.csv"
df = pd.read_csv(file_path)

In [33]:
print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns")
df.head()

Dataset contains 2560 rows and 13 columns


Unnamed: 0,rating,title,text,publishedDate,sentiment,review_length_text,review_length_title,polarity_text,polarity_title,subjectivity_text,subjectivity_title,sentiment_text,sentiment_title
0,1,WORST AIRLINE,I travel a lot - and I travel often. Last week...,2024-11-13,Negative,408,2,0.018448,-1.0,0.512241,1.0,positive,negative
1,1,Terrible experience with Airfrance,"This review is regarding flight AF185, we book...",2024-11-13,Negative,157,4,-0.060897,-1.0,0.592949,1.0,negative,negative
2,1,Extremely Disappointing Experience with Air Fr...,I recently flew with Air France on flight #185...,2024-11-12,Negative,259,11,-0.094163,-0.6,0.488287,0.7,negative,negative
3,1,Horrible,Wow!!! What a horrible experience!! I've alway...,2024-11-11,Negative,274,1,-0.109373,-1.0,0.494012,1.0,negative,negative
4,1,The Worst Flight Experience I’ve Ever Had,I spent a fantastic 10-day vacation in Hong Ko...,2024-11-11,Negative,311,7,-0.126476,-1.0,0.485192,1.0,negative,negative


In [34]:
def clean(text):
    if pd.isna(text):
        return ""

    text = text.lower()
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text.strip()


df["clean_text"] = df["text"].apply(clean)
df["clean_title"] = df["title"].apply(clean)

df[["text", "clean_text", "title", "clean_title"]].head()


Unnamed: 0,text,clean_text,title,clean_title
0,I travel a lot - and I travel often. Last week...,i travel a lot and i travel often last week i...,WORST AIRLINE,worst airline
1,"This review is regarding flight AF185, we book...",this review is regarding flight af we booked a...,Terrible experience with Airfrance,terrible experience with airfrance
2,I recently flew with Air France on flight #185...,i recently flew with air france on flight fro...,Extremely Disappointing Experience with Air Fr...,extremely disappointing experience with air fr...
3,Wow!!! What a horrible experience!! I've alway...,wow what a horrible experience ive always flow...,Horrible,horrible
4,I spent a fantastic 10-day vacation in Hong Ko...,i spent a fantastic day vacation in hong kong ...,The Worst Flight Experience I’ve Ever Had,the worst flight experience ive ever had


In [35]:
def tokenize(text):
    return word_tokenize(text)


df["tokens_text"] = df["clean_text"].apply(tokenize)
df["tokens_title"] = df["clean_title"].apply(tokenize)

df[["clean_text", "tokens_text", "clean_title", "tokens_title"]].head()

Unnamed: 0,clean_text,tokens_text,clean_title,tokens_title
0,i travel a lot and i travel often last week i...,"[i, travel, a, lot, and, i, travel, often, las...",worst airline,"[worst, airline]"
1,this review is regarding flight af we booked a...,"[this, review, is, regarding, flight, af, we, ...",terrible experience with airfrance,"[terrible, experience, with, airfrance]"
2,i recently flew with air france on flight fro...,"[i, recently, flew, with, air, france, on, fli...",extremely disappointing experience with air fr...,"[extremely, disappointing, experience, with, a..."
3,wow what a horrible experience ive always flow...,"[wow, what, a, horrible, experience, ive, alwa...",horrible,[horrible]
4,i spent a fantastic day vacation in hong kong ...,"[i, spent, a, fantastic, day, vacation, in, ho...",the worst flight experience ive ever had,"[the, worst, flight, experience, ive, ever, had]"


In [36]:
stop_words = set(stopwords.words("english"))


def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]


df["tokens_text_nostop"] = df["tokens_text"].apply(remove_stopwords)
df["tokens_title_nostop"] = df["tokens_title"].apply(remove_stopwords)

df[["tokens_text", "tokens_text_nostop", "tokens_title", "tokens_title_nostop"]].head()

Unnamed: 0,tokens_text,tokens_text_nostop,tokens_title,tokens_title_nostop
0,"[i, travel, a, lot, and, i, travel, often, las...","[travel, lot, travel, often, last, week, flew,...","[worst, airline]","[worst, airline]"
1,"[this, review, is, regarding, flight, af, we, ...","[review, regarding, flight, af, booked, econom...","[terrible, experience, with, airfrance]","[terrible, experience, airfrance]"
2,"[i, recently, flew, with, air, france, on, fli...","[recently, flew, air, france, flight, hong, ko...","[extremely, disappointing, experience, with, a...","[extremely, disappointing, experience, air, fr..."
3,"[wow, what, a, horrible, experience, ive, alwa...","[wow, horrible, experience, ive, always, flown...",[horrible],[horrible]
4,"[i, spent, a, fantastic, day, vacation, in, ho...","[spent, fantastic, day, vacation, hong, kong, ...","[the, worst, flight, experience, ive, ever, had]","[worst, flight, experience, ive, ever]"


In [37]:
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")


def stem_tokens(tokens, stemmer):
    return [stemmer.stem(word) for word in tokens]


df["stemmed_text_porter"] = df["tokens_text_nostop"].apply(
    lambda x: stem_tokens(x, porter_stemmer)
)
df["stemmed_title_porter"] = df["tokens_title_nostop"].apply(
    lambda x: stem_tokens(x, porter_stemmer)
)

df["stemmed_text_snowball"] = df["tokens_text_nostop"].apply(
    lambda x: stem_tokens(x, snowball_stemmer)
)
df["stemmed_title_snowball"] = df["tokens_title_nostop"].apply(
    lambda x: stem_tokens(x, snowball_stemmer)
)

df[
    [
        "tokens_text_nostop",
        "stemmed_text_porter",
        "stemmed_text_snowball",
        "tokens_title_nostop",
        "stemmed_title_porter",
        "stemmed_title_snowball",
    ]
].head()

Unnamed: 0,tokens_text_nostop,stemmed_text_porter,stemmed_text_snowball,tokens_title_nostop,stemmed_title_porter,stemmed_title_snowball
0,"[travel, lot, travel, often, last, week, flew,...","[travel, lot, travel, often, last, week, flew,...","[travel, lot, travel, often, last, week, flew,...","[worst, airline]","[worst, airlin]","[worst, airlin]"
1,"[review, regarding, flight, af, booked, econom...","[review, regard, flight, af, book, economi, fl...","[review, regard, flight, af, book, economi, fl...","[terrible, experience, airfrance]","[terribl, experi, airfranc]","[terribl, experi, airfranc]"
2,"[recently, flew, air, france, flight, hong, ko...","[recent, flew, air, franc, flight, hong, kong,...","[recent, flew, air, franc, flight, hong, kong,...","[extremely, disappointing, experience, air, fr...","[extrem, disappoint, experi, air, franc, fligh...","[extrem, disappoint, experi, air, franc, fligh..."
3,"[wow, horrible, experience, ive, always, flown...","[wow, horribl, experi, ive, alway, flown, port...","[wow, horribl, experi, ive, alway, flown, port...",[horrible],[horribl],[horribl]
4,"[spent, fantastic, day, vacation, hong, kong, ...","[spent, fantast, day, vacat, hong, kong, famil...","[spent, fantast, day, vacat, hong, kong, famil...","[worst, flight, experience, ive, ever]","[worst, flight, experi, ive, ever]","[worst, flight, experi, ive, ever]"


In [38]:
nlp = spacy.load("en_core_web_sm")


def lemmatize_tokens(text):
    doc = nlp(" ".join(text))
    return [token.lemma_ for token in doc]


df["lemmatized_text"] = df["tokens_text_nostop"].apply(lemmatize_tokens)
df["lemmatized_title"] = df["tokens_title_nostop"].apply(lemmatize_tokens)

df[
    ["tokens_text_nostop", "lemmatized_text", "tokens_title_nostop", "lemmatized_title"]
].head()

Unnamed: 0,tokens_text_nostop,lemmatized_text,tokens_title_nostop,lemmatized_title
0,"[travel, lot, travel, often, last, week, flew,...","[travel, lot, travel, often, last, week, fly, ...","[worst, airline]","[bad, airline]"
1,"[review, regarding, flight, af, booked, econom...","[review, regard, flight, af, book, economy, fl...","[terrible, experience, airfrance]","[terrible, experience, airfrance]"
2,"[recently, flew, air, france, flight, hong, ko...","[recently, fly, air, france, flight, hong, kon...","[extremely, disappointing, experience, air, fr...","[extremely, disappointing, experience, air, fr..."
3,"[wow, horrible, experience, ive, always, flown...","[wow, horrible, experience, I, ve, always, fly...",[horrible],[horrible]
4,"[spent, fantastic, day, vacation, hong, kong, ...","[spend, fantastic, day, vacation, hong, kong, ...","[worst, flight, experience, ive, ever]","[bad, flight, experience, I, ve, ever]"


In [39]:
df["publishedDate"] = pd.to_datetime(df["publishedDate"], errors="coerce")

df["day_of_week"] = df["publishedDate"].dt.day_name()
df["month"] = df["publishedDate"].dt.month
df["year"] = df["publishedDate"].dt.year

df.drop(columns=["publishedDate"], inplace=True)


In [40]:
final_columns = [
    "rating",  # Numerical sentiment score
    "review_length_text",
    "review_length_title",  # Text length analysis
    "polarity_text",
    "polarity_title",  # Sentiment polarity scores
    "subjectivity_text",
    "subjectivity_title",  # Opinion-based scores
    "lemmatized_text",
    "lemmatized_title",  # Clean processed text for vectorization
    "day_of_week",
    "month",
    "year",  # Time-based trend analysis
]

df_final = df[final_columns]
df_final.head()

Unnamed: 0,rating,review_length_text,review_length_title,polarity_text,polarity_title,subjectivity_text,subjectivity_title,lemmatized_text,lemmatized_title,day_of_week,month,year
0,1,408,2,0.018448,-1.0,0.512241,1.0,"[travel, lot, travel, often, last, week, fly, ...","[bad, airline]",Wednesday,11,2024
1,1,157,4,-0.060897,-1.0,0.592949,1.0,"[review, regard, flight, af, book, economy, fl...","[terrible, experience, airfrance]",Wednesday,11,2024
2,1,259,11,-0.094163,-0.6,0.488287,0.7,"[recently, fly, air, france, flight, hong, kon...","[extremely, disappointing, experience, air, fr...",Tuesday,11,2024
3,1,274,1,-0.109373,-1.0,0.494012,1.0,"[wow, horrible, experience, I, ve, always, fly...",[horrible],Monday,11,2024
4,1,311,7,-0.126476,-1.0,0.485192,1.0,"[spend, fantastic, day, vacation, hong, kong, ...","[bad, flight, experience, I, ve, ever]",Monday,11,2024


In [None]:
# Define a refined custom stopword list
custom_stopwords = set(
    [
        # Domain-Specific (Removing high-frequency airline words that dilute topics)
        "flight",
        "flights",
        "air",
        "france",
        "af",
        "airfrance",
        "airline",
        "airlines",
        "plane",
        "aircraft",
        "aircrafts",
        "airplanes",
        "airplane",
        "airport",
        "airports",
        "departure",
        "arrival",
        "terminal",
        "gate",
        "connection",
        "layover",
        "transit",
        "check",
        "boarding",
        "security",
        "passport",
        "customs",
        "crew",
        "attendant",
        "passenger",
        "passengers",
        "service",
        "staff",
        "paris",
        "well",
        # Keeping topic-relevant words:
        # ✅ "baggage" (lost baggage could be a topic)
        # ✅ "business", "economy", "premium" (could be topic-relevant)
        # ✅ "delay", "cancel", "reschedule" (important for service complaints)
        # ✅ "meal", "food", "drink" (potential topic)
        # ❌ Removing "seat", "class", "window", "row" (appear across all topics)
        # Common Verbs (Removing generic action words)
        "get",
        "go",
        "come",
        "take",
        "make",
        "find",
        "give",
        "put",
        "see",
        "know",
        "want",
        "would",
        "could",
        "should",
        "must",
        "did",
        "does",
        "do",
        "say",
        "let",
        "tell",
        "call",
        "ask",
        "try",
        "need",
        "think",
        "use",
        "work",
        "wait",
        "expect",
        "offer",
        "look",
        "pay",
        "charge",
        "buy",
        "book",
        # Auxiliary Words & Negations (Common filler words)
        "not",
        "never",
        "always",
        "still",
        "even",
        "much",
        "very",
        "more",
        "less",
        "like",
        "without",
        "thing",
        "another",
        "many",
        "every",
        "way",
        "back",
        "time",
        "now",
        "soon",
        "later",
        "then",
        "before",
        "after",
        "ago",
        "today",
        "yesterday",
        "tomorrow",
        "early",
        "late",
        "long",
        "short",
        "already",
        "yet",
        "just",
        # Pronouns & Function Words (Non-informative words)
        "i",
        "me",
        "my",
        "mine",
        "you",
        "your",
        "yours",
        "we",
        "our",
        "ours",
        "they",
        "them",
        "their",
        "theirs",
        "he",
        "him",
        "his",
        "she",
        "her",
        "hers",
        "it",
        "its",
        "this",
        "that",
        "these",
        "those",
        "who",
        "whom",
        "whose",
        "which",
        "what",
        "where",
        "when",
        "why",
        "how",
        "there",
        "here",
        "some",
        "any",
        "few",
        "several",
        "many",
        "others",
        # Adjectives & Opinion Words (Subjective words that don’t define topics well)
        "good",
        "bad",
        "best",
        "worst",
        "nice",
        "great",
        "terrible",
        "awful",
        "horrible",
        "amazing",
        "fantastic",
        "excellent",
        "perfect",
        "fine",
        "poor",
        "better",
        "worse",
        "new",
        "old",
        "big",
        "small",
        "high",
        "low",
        "fast",
        "slow",
        "easy",
        "hard",
        # Other Commonly Overused Words
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
        "hundred",
        "thousand",
        "million",
        "someone",
        "everyone",
        "thing",
        "something",
        "everything",
        "nothing",
        "yes",
        "no",
        "ok",
        "okay",
        "probably",
        "definitely",
        "bit",
        "lot",
        "kind",
        "sort",
        "part",
        "piece",
        "level",
        "place",
        "point",
        "case",
        "situation",
        "matter",
        "reason",
        "result",
        "problem",
        "solution",
    ]
)


def remove_custom_stopwords(text_list, stopwords):
    """Remove custom stopwords from tokenized text"""
    return [[word for word in text if word not in stopwords] for text in text_list]


# Apply stopword removal to lemmatized text
df_final["lemmatized_text"] = remove_custom_stopwords(
    df_final["lemmatized_text"], custom_stopwords
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["lemmatized_text"] = remove_custom_stopwords(


In [42]:
final_path = "/home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/processed/processed_reviews.csv"
df_final.to_csv(final_path, index=False, encoding="utf-8")

print(f"Successfully saved processed dataset at: {final_path}")

Successfully saved processed dataset at: /home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/processed/processed_reviews.csv


In [43]:
df_final.head()

Unnamed: 0,rating,review_length_text,review_length_title,polarity_text,polarity_title,subjectivity_text,subjectivity_title,lemmatized_text,lemmatized_title,day_of_week,month,year
0,1,408,2,0.018448,-1.0,0.512241,1.0,"[travel, travel, often, last, week, fly, texas...","[bad, airline]",Wednesday,11,2024
1,1,157,4,-0.060897,-1.0,0.592949,1.0,"[review, regard, economy, flex, choose, seat, ...","[terrible, experience, airfrance]",Wednesday,11,2024
2,1,259,11,-0.094163,-0.6,0.488287,0.7,"[recently, fly, hong, kong, nov, th, deeply, d...","[extremely, disappointing, experience, air, fr...",Tuesday,11,2024
3,1,274,1,-0.109373,-1.0,0.494012,1.0,"[wow, experience, I, ve, fly, porter, shock, c...",[horrible],Monday,11,2024
4,1,311,7,-0.126476,-1.0,0.485192,1.0,"[spend, day, vacation, hong, kong, family, ret...","[bad, flight, experience, I, ve, ever]",Monday,11,2024
