In [24]:
import pandas as pd
import numpy as np
import ast
import pickle

from gensim import corpora
from gensim.models import Phrases
from gensim.models import LdaModel, CoherenceModel, LdaMulticore


In [33]:
#import cleaned data

def list_converter(text):
    #to revert list->str conversion from pd.read_csv
    return ast.literal_eval(text)


data = pd.read_csv('../Data/training_data.csv', converters ={'tokens':list_converter})
data = data.drop(columns = ['index'])
data.head()

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text,tokens,word_count
0,comment,gtfou07,2021-04-05 13:13:23,2021,I am single and I have not traveled to any cun...,single travel past,"[single, travel, past]",3
1,comment,gtfrgpe,2021-04-05 13:56:09,2021,What happens when you shop at dragon mart...,shop dragon mart,"[shop, dragon, mart]",3
2,comment,gthiiwi,2021-04-05 23:18:56,2021,"That’s just absolutely hilarious, is this in t...",hilarious spring souk,"[hilarious, spring, souk]",3
3,comment,gtgfl4c,2021-04-05 18:21:42,2021,Is reel cinema and roxy part of emaar?,reel cinema roxy emaar,"[reel, cinema, roxy, emaar]",4
4,comment,gth5wdv,2021-04-05 21:42:41,2021,An innocent redditor here...can someone pls ex...,innocent pls explain everyday,"[innocent, pls, explain, everyday]",4


In [49]:
#extract submissions

#submissions = data[data.text_type == 'submission']

In [48]:
#submissions ID
#sub_id = submissions['ID'].tolist()

In [16]:
#save sub_ids

with open ('sub_ids', 'wb') as file:
    pickle.dump(sub_id, file)

## **LOAD MODEL AND CORPUS**

In [31]:
model = LdaModel.load("../topic_modelling/lda_model_1")

In [35]:
#convert df['tokens'] to list of strings for bag-of-words model
docs = data['tokens'].tolist()

#from gensim bag of words documentation page

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

# Create a dictionary representation of the documents.
dictionary = corpora.Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 5000
Number of documents: 65987


In [36]:
model.print_topics()

[(0,
  '0.026*"police" + 0.015*"building" + 0.014*"family" + 0.012*"parent" + 0.012*"wife" + 0.011*"area" + 0.011*"parking" + 0.011*"metro" + 0.011*"child" + 0.010*"station"'),
 (1,
  '0.039*"visa" + 0.021*"card" + 0.014*"apply" + 0.013*"bank" + 0.013*"visit" + 0.013*"travel" + 0.012*"website" + 0.010*"book" + 0.010*"plan" + 0.010*"cancel"'),
 (2,
  '0.017*"price" + 0.017*"buy" + 0.016*"salary" + 0.015*"cost" + 0.014*"rent" + 0.012*"property" + 0.012*"market" + 0.010*"aed" + 0.010*"sell" + 0.009*"offer"'),
 (3,
  '0.020*"law" + 0.018*"government" + 0.016*"local" + 0.014*"muslim" + 0.011*"culture" + 0.011*"arab" + 0.010*"rule" + 0.009*"community" + 0.009*"expat" + 0.008*"citizenship"'),
 (4,
  '0.036*"covid" + 0.030*"test" + 0.024*"fine" + 0.013*"vaccine" + 0.012*"worker" + 0.012*"stop" + 0.012*"mask" + 0.010*"medical" + 0.010*"hospital" + 0.010*"pandemic"'),
 (5,
  '0.036*"food" + 0.027*"order" + 0.024*"buy" + 0.022*"restaurant" + 0.022*"tip" + 0.022*"service" + 0.021*"delivery" + 0.01

In [67]:
#topic label

topic_label ={
    0: "infrastructure",
    1: "travel",
    2: "accomodation and rental",
    3: "locals and culture",
    4: "covid",
    5: "food and dining",
    6: "driving and road safety",
    7: "weather and outdoors",
    8: "education and educational facilities",
    9: "entertainment and recreation"
    
}

In [68]:
#dataset of topcis and topic representation
num_topics = model.num_topics

topics_words = []

for topic in range(num_topics):
    topic_words = model.show_topic(topic, topn = 10)
    words = [word[0] for word in topic_words]
    topics_words.append({"topic": topic, "words": words, "label":topic_label[topic]})
    

#create a dataframe
topics_df = pd.DataFrame(topics_words)

topics_df
    


Unnamed: 0,topic,words,label
0,0,"[police, building, family, parent, wife, area,...",infrastructure
1,1,"[visa, card, apply, bank, visit, travel, websi...",travel
2,2,"[price, buy, salary, cost, rent, property, mar...",accomodation and rental
3,3,"[law, government, local, muslim, culture, arab...",locals and culture
4,4,"[covid, test, fine, vaccine, worker, stop, mas...",covid
5,5,"[food, order, buy, restaurant, tip, service, d...",food and dining
6,6,"[drive, road, lane, fast, speed, pass, light, ...",driving and road safety
7,7,"[water, beach, movie, summer, cat, ban, hot, t...",weather and outdoors
8,8,"[indian, arabic, middle, passport, school, uni...",education and educational facilities
9,9,"[night, mall, walk, hotel, room, area, drink, ...",entertainment and recreation


In [45]:
#include column for most probable topic for each entry

top_topic_per_document = []

for doc in corpus:
    topics = model.get_document_topics(doc)
    top_topic = sorted(topics, key=lambda x: x[1], reverse = True)[0][0]
    top_topic_per_document.append(top_topic)
    
#add column to data dataframe for the selected topic
data['top_topic'] = top_topic_per_document    

In [None]:
#merge topic label data dataframes

full_df = data.merge(topics_df, merge)

In [52]:
#topics assigned to posts
submissions = data[data.text_type == 'submission']
submissions

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text,tokens,word_count,top_topic
62813,submission,iw964e,2020-09-20 11:10:32,2020,Here’s a time-lapse I did of the golden sunset...,lapse golden sunset pass burj khalifa,"[lapse, golden, sunset, pass, burj, khalifa, b...",6,5
62814,submission,r38z4m,2021-11-27 12:14:59,2021,Helped find a tourist their lost wedding ring ...,helped tourist lost wedding ring jbr beach met...,"[helped, tourist, lost, wedding, ring, jbr, be...",12,7
62815,submission,sl3qco,2022-02-05 14:15:16,2022,My colleagues and I drove from Dubai this morn...,colleague drive morning pick garbage beach ajm...,"[colleague, drive, morning, pick, garbage, bea...",11,7
62816,submission,12hxnxe,2023-04-11 01:53:28,2023,Update on Karen case I posted a year ago about...,karen horrible karen community court instance ...,"[karen, horrible, karen, community, court, ins...",40,0
62817,submission,khzit1,2020-12-22 10:05:58,2020,Fresh supply just entered from Oman. We safe.,fresh supply oman safe,"[fresh, supply, oman, safe]",4,7
...,...,...,...,...,...,...,...,...,...
65982,submission,14f49ta,2023-06-21 14:45:45,2023,Legal advice needed. Would highly appreciate i...,legal highly legal highly private involved par...,"[legal, highly, legal, highly, private, involv...",10,2
65983,submission,14f46ji,2023-06-21 14:40:54,2023,"Best beauty saloons in Dubai? Hello fellas, I ...",beauty saloon fella wife real saloon beauty sa...,"[beauty, saloon, fella, wife, real, saloon, be...",17,8
65984,submission,14f4ri3,2023-06-21 15:10:25,2023,Scam ? Healthy.line My sister has a CBD debit ...,scam healthy line sister cbd debit card april ...,"[scam, healthy, line, sister, cbd, debit, card...",35,1
65985,submission,14f4k3r,2023-06-21 15:00:34,2023,Thoughts on Expo City properties? Anyone else ...,expo property expo sale pleasant price locate ...,"[expo, property, expo, sale, pleasant, price, ...",9,2
