In [70]:
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
import numpy as np
import pandas as pd
import re

In [23]:
# Read the data
df_comments1 = pd.read_pickle('./pickle_dataframes/comments1.pkl')
df_comments2 = pd.read_pickle('./pickle_dataframes/comments2.pkl')
df_comments = pd.concat([df_comments1,df_comments2])
df_comments.reset_index(drop=True, inplace=True)

df_posts1 = pd.read_pickle('./pickle_dataframes/posts1.pkl')
df_posts2 = pd.read_pickle('./pickle_dataframes/posts2.pkl')
df_posts3 = pd.read_pickle('./pickle_dataframes/posts3.pkl')
df_posts = pd.concat([df_posts1, df_posts2, df_posts3])
df_posts.reset_index(drop=True, inplace=True)

df_postlinks = pd.read_pickle('./pickle_dataframes/posts_links.pkl')
df_tags = pd.read_pickle('./pickle_dataframes/tags.pkl')
df_users = pd.read_pickle('./pickle_dataframes/users.pkl')

### Take a look at our DFs

In [24]:
df_comments.head()

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId
0,1,1,9,Is it fair to inquire about the disadvantages ...,2012-12-04 22:00:00.933,28
1,3,1,3,"I could have reformulated the question, but at...",2012-12-04 22:02:37.737,18
2,7,2,2,Source on this? I don't see how it could possi...,2012-12-04 22:10:10.070,45
3,13,2,1,@Nick122 In a parliamentary system like the No...,2012-12-04 22:14:33.463,43
4,15,2,0,"Yes, but you will give a negative vote by voti...",2012-12-04 22:16:29.437,45


In [25]:
df_posts.head()

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount
0,1,1,-1,5,2012-12-04 21:40:29.743,42,8309,<p>We all know the situation could arise in th...,18,2019-06-29 09:18:38.430,What are the disadvantages of first-past-the-p...,<election><voting-systems><first-past-the-post>,3,3
1,2,1,-1,19,2012-12-04 21:53:18.800,26,7832,<p>I've heard that mathematically it can be sh...,21,2017-05-03 13:53:26.063,Why can't voting be fair if there are more tha...,<voting><political-theory><voting-systems>,4,3
2,4,2,1,-1,2012-12-04 21:58:11.187,7,-1,<p>First-past-the-post voting tends to result ...,26,2012-12-04 21:58:11.187,Comment: N/A,Comment: N/A,-1,1
3,5,2,1,-1,2012-12-04 21:58:39.037,47,-1,<p>Simple plurality voting has very little in ...,8,2012-12-04 22:04:42.767,Comment: N/A,Comment: N/A,-1,1
4,6,1,-1,28,2012-12-04 21:58:47.500,46,68096,<p>Living in a country where mandatory voting ...,18,2019-02-03 17:38:05.237,What are the advantages/disadvantages of a man...,<voting><voting-systems>,8,5


### Filtering Posts

In [26]:
questions_df = df_posts[df_posts['PostTypeId'] == 1]
# questions_sample_df = questions_df.sample(frac=0.25)
#questions_df = questions_df.sample(frac=0.01)

### Text preprocessing
- Adjust StopWords?
- Stemming
- Lemmatization

In [27]:
def preprocess_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stopwords and Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    return ' '.join([lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')])

In [28]:
# Applying preprocessing to Title, Body, and Tags
questions_df['Title'] = questions_df['Title'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions_df['Title'] = questions_df['Title'].apply(preprocess_text)


In [29]:
questions_df['Body'] = questions_df['Body'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions_df['Body'] = questions_df['Body'].apply(preprocess_text)


In [30]:
questions_df['Tags'] = questions_df['Tags'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions_df['Tags'] = questions_df['Tags'].apply(preprocess_text)


In [31]:
questions_df.to_pickle('questions_cleaned_text.pkl')

### Topic Modelling
- TF-IDF
    - Adjust `max_features`: Limiting or expanding the number of features (words) included in the TF-IDF matrix can impact topic quality.
    - Change `ngram_range`: Including bi-grams or tri-grams (e.g., ngram_range=(1,2)) can sometimes help the model capture more meaningful phrases.

- **Clustering to find the optimal number of Topics?**

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine Title, Body, and Tags with more weight on Tags
questions_df['CombinedText'] = questions_df['Title'] + ' ' + questions_df['Body'] + ' ' + questions_df['Tags'] * 2

# Applying TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000) # You can adjust the number of features
tfidf_matrix = tfidf_vectorizer.fit_transform(questions_df['CombinedText'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions_df['CombinedText'] = questions_df['Title'] + ' ' + questions_df['Body'] + ' ' + questions_df['Tags'] * 2


### LDA

In [33]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics
n_topics = 10  # You can change this based on your requirements

# Create and fit the LDA model
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda_topics = lda.fit_transform(tfidf_matrix)

# Displaying the top words in each topic
feature_names = tfidf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic #{topic_idx}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))

Topic #0:
tax government money pay budget income would federal cost state
Topic #1:
court president supreme state law power justice executive congress member
Topic #2:
vote election voting candidate party voter ballot state electoral system
Topic #3:
law state amendment constitution bill citizen right country citizenship act
Topic #4:
russia russian ukraine war nato sanction country iran military turkey
Topic #5:
china country covid eu debt bank 19 government chinese trade
Topic #6:
uk eu brexit parliament minister prime party deal mp referendum
Topic #7:
political party people democracy right country would government question like
Topic #8:
trump president republican senate election house vote democrat candidate party
Topic #9:
india pakistan country israel korea nuclear north china police war


### NMF

In [34]:
from sklearn.decomposition import NMF

# Create and fit the NMF model
nmf = NMF(n_components=n_topics, random_state=0)
nmf_topics = nmf.fit_transform(tfidf_matrix)

# Displaying the top words in each topic
for topic_idx, topic in enumerate(nmf.components_):
    print(f"Topic #{topic_idx}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))

Topic #0:
people would political question right like one law make seems
Topic #1:
vote election candidate voting voter ballot presidential electoral result win
Topic #2:
president trump vice office donald impeachment presidential biden power would
Topic #3:
eu uk brexit european deal parliament member union trade agreement
Topic #4:
russia ukraine russian war nato putin military sanction ukrainian nuclear
Topic #5:
party political republican seat democratic democrat conservative labour leader system
Topic #6:
senate house bill congress senator representative vote majority republican member
Topic #7:
state united law court federal constitution amendment supreme shall legal
Topic #8:
country china india chinese pakistan korea world international nation usa
Topic #9:
government minister tax federal debt parliament prime power money budget
