**Twitter Data**

Dependencies

In [None]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, MiniBatchNMF, LatentDirichletAllocation
# Fetch the NLTK 'stopwords' resource that nltk.corpus.stopwords reads from.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Colab-specific: mount Google Drive at /content/drive so files stored there
# can be opened by path from this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the workbook from the mounted Drive folder into a DataFrame.
# Later cells read its 'Content' and 'Pred' columns.
df = pd.read_excel("/content/drive/MyDrive/A3/PostPRediction.xlsx")

Helper functions

In [None]:
def remove_by_regex(texts, regexp):
    """Delete every match of a compiled pattern from a string.

    Parameters
    ----------
    texts : str
        The string to clean.
    regexp : re.Pattern
        Compiled regular expression; each non-overlapping match is
        replaced by the empty string.

    Returns
    -------
    str
        ``texts`` with all matches removed.
    """
    return regexp.sub("", texts)

def remove_urls(input_text):
    """Strip http/https URLs from a string.

    A URL is the scheme ``http://`` or ``https://`` followed by a run of
    non-whitespace characters; at most one whitespace character directly
    after the URL is removed together with it.

    Parameters
    ----------
    input_text : str
        Text that may contain URLs.

    Returns
    -------
    str
        ``input_text`` with every URL removed.
    """
    # `s?` instead of the former `.?`: the dot accepted ANY character after
    # "http" (e.g. "httpx://" or "http ://"), not just the optional "s".
    url_pattern = re.compile(r"https?://\S+\s?")
    return url_pattern.sub("", input_text)

def remove_hashtags_and_mentions(input_text):
    """Remove #hashtags and @mentions from a string.

    Hashtags are stripped first, then mentions. Both patterns require the
    tag to start with an ASCII letter and to be at least two characters
    long after the ``#`` / ``@`` sigil. A mention that is directly preceded
    by ``RT`` plus one whitespace character is left in place (negative
    lookbehind in the mention pattern).

    Parameters
    ----------
    input_text : str
        Text to clean.

    Returns
    -------
    str
        The text without the matched hashtags and mentions.
    """
    hashtag_pattern = re.compile(r"(#[A-Za-z]+[A-Za-z0-9-_]+)")
    mention_pattern = re.compile(r"(?<!RT\s)(@[A-Za-z]+[A-Za-z0-9-_]+)")

    without_hashtags = remove_by_regex(input_text, hashtag_pattern)
    return remove_by_regex(without_hashtags, mention_pattern)

def preprocesstextForLDA(incorpus):
    """Clean raw texts into lowercase, stopword-free strings for topic modelling.

    For each document: URLs are removed, then hashtags and @mentions, every
    character that is not an ASCII letter is replaced by a space, the text
    is lowercased, and English stopwords (plus ``'https'`` and ``'amp'``)
    are dropped. The surviving words are re-joined with single spaces.

    Parameters
    ----------
    incorpus : iterable of str
        Raw documents, e.g. a list or a pandas Series. Items are visited in
        iteration order, so a Series with a non-default index works too.

    Returns
    -------
    list of str
        One cleaned string per input item, in the same order. Items that
        are not strings (e.g. NaN from an empty spreadsheet cell) yield an
        empty string so the output stays aligned with the input.
    """
    # Build the stopword set once; it is identical for every document and
    # set membership is O(1) per word.
    all_stopwords = set(stopwords.words('english'))
    all_stopwords.update(('https', 'amp'))

    outcorpus = []
    for raw_text in incorpus:
        if not isinstance(raw_text, str):
            # Missing / non-text cell: keep a placeholder instead of crashing
            # inside the regex helpers.
            outcorpus.append('')
            continue
        corpuselement = remove_urls(raw_text)
        # Feed the URL-stripped text forward. Passing the raw text here again
        # would throw away the URL removal done on the previous line.
        corpuselement = remove_hashtags_and_mentions(corpuselement)
        corpuselement = re.sub('[^a-zA-Z]', ' ', corpuselement).lower()
        kept_words = [
            word for word in corpuselement.split() if word not in all_stopwords
        ]
        outcorpus.append(' '.join(kept_words))
    return outcorpus

def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic showing its highest-weighted terms.

    Panels are laid out five per row, with as many rows as the number of
    topics requires, so any number of topics fits and a model with five
    topics or fewer no longer leaves an empty second row. Grid cells in a
    partially filled last row are switched off.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``components_``; each row is treated as one topic's
        weight per feature.
    feature_names : sequence
        Feature labels, indexable by the integer positions returned from
        ``argsort`` on a ``components_`` row.
    n_top_words : int
        Number of top-weighted features to show per topic.
    title : str
        Figure-level title.

    Returns
    -------
    None
        The figure is created through pyplot and left for the notebook
        backend to display.
    """
    n_cols = 5
    n_topics = len(model.components_)
    # Ceiling division; at least one row so subplots() gets a valid grid.
    n_rows = max(1, -(-n_topics // n_cols))

    # 7.5 in per row reproduces the former 30x15 figure for a two-row grid.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False
    )
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first.
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()  # strongest term at the top of each panel
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in "top right left".split():
            ax.spines[side].set_visible(False)

    # Hide grid cells without a topic. With sharex=True only the bottom row
    # shows x tick labels, so re-enable them on the panel directly above a
    # hidden cell.
    for unused_idx in range(n_topics, len(axes)):
        axes[unused_idx].axis("off")
        above_idx = unused_idx - n_cols
        if 0 <= above_idx < n_topics:
            axes[above_idx].tick_params(axis="x", labelbottom=True)

    # Set once, after all panels are drawn (it was re-set on every iteration).
    fig.suptitle(title, fontsize=40)


In [None]:
# Add a cleaned copy of the 'Content' text as a new 'LDAContent' column;
# the vectorizers below are fitted on this column.
df['LDAContent'] = preprocesstextForLDA(df['Content'])

Subset to the relevant observations

In [None]:
# Keep only the rows whose 'Pred' value compares equal to True.
dfLDA=df[df['Pred']==True]


In [None]:
# Quick sanity check: summary statistics of the filtered subset.
dfLDA.describe()

LDA

In [None]:
# LDA is fitted on raw term counts, so a CountVectorizer is used here.
# max_df / min_df bound the document frequency of kept terms, and
# max_features caps the vocabulary at 1000 terms.
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=1000, stop_words="english"
)
# Document-term count matrix for the filtered subset.
tf = tf_vectorizer.fit_transform(dfLDA['LDAContent'])

In [None]:
# Fit a 5-topic LDA model on the count matrix using the online learning
# method; random_state is fixed so reruns use the same seed.
lda = LatentDirichletAllocation(
    n_components=5,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
lda.fit(tf)
# Vocabulary terms in column order, used to label the bars in the plot.
tf_feature_names = tf_vectorizer.get_feature_names_out()
# Show the 5 highest-weighted terms of each topic.
plot_top_words(lda, tf_feature_names, 5, "Topics in LDA model")

NMF

In [None]:
# TF-IDF features for NMF: same document-frequency bounds as the count
# vectorizer above, but with the vocabulary capped at 300 terms.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=300, stop_words="english"
)
# Document-term TF-IDF matrix for the filtered subset.
tfidf = tfidf_vectorizer.fit_transform(dfLDA['LDAContent'])

In [None]:
# Fit a 5-component NMF model on the TF-IDF matrix (Frobenius loss,
# "nndsvda" initialisation, small regularisation on W and H with
# l1_ratio=1); random_state is fixed so reruns use the same seed.
nmf = NMF(
    n_components=5,
    random_state=1,
    init="nndsvda",
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=1,
).fit(tfidf)

# Vocabulary terms in column order, used to label the bars in the plot.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
# Show the 5 highest-weighted terms of each component.
plot_top_words(
    nmf, tfidf_feature_names, 5, "NMF model Topics"
)

In [None]:
# Ten most frequent 'Content' values in the full frame, with how many
# rows share each one (most repeated first).
(
    df.groupby(['Content'])
    .size()
    .reset_index(name='counts')
    .sort_values('counts', ascending=False)
    .head(10)
)