In [17]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from bertopic import BERTopic
from hdbscan import HDBSCAN


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv("all_airlines_clean.csv")

df.head(3)
texts = df['combined_text'].astype(str).tolist()


In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
airline_names = [
    "easyjet", "ryanair", "turkish", "wizz", "air", "airways", "british",
    "lufthansa", "klm", "delta", "emirates", "qatar", "etihad", "united", 
    "american", "alitalia", "airfrance", "aeroflot"  # add more as needed
]
def preprocess(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words and word not in airline_names]  # Remove stopwords 
    return ' '.join(tokens) 
clean_texts = [preprocess(t) for t in texts]
df['cleaned_text'] = df['combined_text'].astype(str).apply(preprocess)



In [5]:
df['cleaned_text'][0]

'new flights abz long overdue nonstop flights restarting aberdeen paris time operated service begins october absolutely delighted handy flight would prefer weekend option thats great long weekend'

In [6]:
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=500)
X = vectorizer.fit_transform(df['cleaned_text'])


In [7]:
X.shape

(940, 500)

In [8]:
n_topics = 8
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)

# Show top words per topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):#lda.components_ is a matrix with shape (n_topics, n_words)
    top_words = [feature_names[i] for i in topic.argsort()[-10:][::-1]] #top 10 most relevant words in the current topic, ordered by their importance.
    print(f"Topic #{topic_idx + 1}: {' '.join(top_words)}")


Topic #1: cases airport flights airlines istanbul services new operated service morning
Topic #2: flight airlines flights get plane airport one would like time
Topic #3: please questions moderators automatically bot performed action concerns carrier contact
Topic #4: list check per card boarding book bank transfer credit alternative
Topic #5: die boeing landing ist airlines airport cheapest im city name
Topic #6: de la jet fine passengers care ground nice used thats
Topic #7: airlines class boeing month expedia last india first times deal
Topic #8: bag seat kg check luggage bags backpack fit im cabin


BERTopic

In [None]:
docs = df['cleaned_text'].tolist()

# Create and fit model
hdbscan_model = HDBSCAN(min_cluster_size=30, min_samples=10)
topic_model = BERTopic(hdbscan_model=hdbscan_model)
topics, probs = topic_model.fit_transform(docs)

# Assign topics back to your dataframe
df['topic'] = topics

# See topic names
topic_info = topic_model.get_topic_info()

In [13]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,286,-1_flight_airport_would_time,"[flight, airport, would, time, get, airlines, ...",[euryanair worst company next steps hi qeustio...
1,0,134,0_flight_compensation_eu_carrier,"[flight, compensation, eu, carrier, uk, claim,...",[flight cancelled hey flight got cancelled sta...
2,1,93,1_bag_kg_luggage_bags,"[bag, kg, luggage, bags, backpack, fit, cm, im...",[check backpack im flying tbilisi internationa...
3,2,89,2_card_points_flights_get,"[card, points, flights, get, miles, credit, wo...",[reward flights hey guys never booked reward f...
4,3,51,3_pilots_flight_turbulence_crash,"[pilots, flight, turbulence, crash, plane, fly...",[overcame flight anxiety back context always s...
5,4,41,4_flight_plane_people_said,"[flight, plane, people, said, woman, fuck, pas...",[among airlines fined abusive baggage fees att...
6,5,40,5_hotel_istanbul_layover_stopover,"[hotel, istanbul, layover, stopover, airlines,...",[march airlines stopover program wanted share ...
7,6,32,6_armenia_source_india_indigo,"[armenia, source, india, indigo, armenian, aze...",[daily news report date reading time minutes w...
8,7,23,7_food_airlines_vegan_gluten,"[food, airlines, vegan, gluten, economy, servi...",[airlines food flew many airlines far two flig...
9,8,22,8_de_la_die_der,"[de, la, die, der, und, nu, cu, ca, sa, pentru]",[compensatie zbor anulat salutare nu stiu daca...


In [15]:
topic_model.visualize_topics().show()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
topic_distributions = lda.transform(X)
df['topic'] = topic_distributions.argmax(axis=1)

topic_names = {
    1: "Ticket Booking & Travel Agencies",
    2: "Flight Experience (Ryanair/Turkish)",
    3: "Airport & Landing Procedures",
    4: "Safety & Regional Travel",
    5: "Turkish Airlines Experiences",
    6: "Budget Airline Complaints",
    7: "British Airways & Airport Services",
    8: "Baggage & Cabin Luggage Issues",
}
df['topic_name'] = df['topic'].map(topic_names)


# Save to a new CSV with topic assigned
df.to_csv("reddit_posts_with_topics.csv", index=False)


In [28]:
df.head(2)

Unnamed: 0,airline,id,title,text,comments,combined_text,category,score,url,created_utc,topic,cleaned_text,topic_name
0,EasyJet,1ldgpnj,New flights from ABZ,"Long overdue, but non-stop flights are restart...",Absolutely delighted about this - was such a h...,"New flights from ABZ Long overdue, but non-sto...",Other,38,https://www.reddit.com/r/Aberdeen/comments/1ld...,2025/06,1,new flights abz long overdue nonstop flights r...,Flight Experience (Ryanair/Turkish)
1,EasyJet,1ld3wdh,EU citizen traveling back to UK soon - ETA nee...,Hi all! Writing on some advice on ETA. I am a ...,Check https://www.gov.uk/eta/when-not-need-eta...,EU citizen traveling back to UK soon - ETA nee...,Other,0,https://www.reddit.com/r/ukvisa/comments/1ld3w...,2025/06,1,eu citizen traveling back uk soon eta needed h...,Flight Experience (Ryanair/Turkish)


In [30]:
df['text'][0]

'Long overdue, but non-stop flights are restarting between Aberdeen and Paris, this time operated by Easyjet.  The service begins in October.\n\n\n\nhttps://preview.redd.it/abrsqal8tf7f1.png?width=1289&format=png&auto=webp&s=1d9b73fd13a870533f3c3b37a26005ddbbb4a777\n\n'

In [15]:
topic_summary = df.groupby(['airline', 'topic']).size().reset_index(name='count')
topic_summary


Unnamed: 0,airline,topic,count
0,EasyJet,0,1
1,EasyJet,2,1
2,EasyJet,4,2
3,EasyJet,6,102
4,EasyJet,7,33
5,EasyJet,8,1
6,Ryanair,0,1
7,Ryanair,1,1
8,Ryanair,2,4
9,Ryanair,5,4
