<a href="https://colab.research.google.com/github/nikolajvester/7th_semester/blob/main/notebooks/M2-pol_tweets_workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tweet-preprocessor -q

# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis

[K     |████████████████████████████████| 24.1 MB 5.3 MB/s 
[K     |████████████████████████████████| 1.7 MB 6.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone


In [None]:
# explainability (why did the model say it's hate speech)
!pip install eli5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 6.9 MB/s 
Collecting jinja2>=3.0.0
  Downloading Jinja2-3.1.2-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 59.7 MB/s 
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25l[?25hdone
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107748 sha256=c31b9a40a895e284a27c56567775361f5e62f9ae4dd8e39fb6c23a4353903d3f
  Stored in directory: /root/.cache/pip/wheels/cc/3c/96/3ead31a8e6c20fc0f1a707fde2e05d49a80b1b4b30096573be
Successfully built eli5
Installing collected packages: jinja2, eli5
  Attempting uninstall: jinja2
    Found existing installation: Jinja2 2.11.3
    Uninstalling Jinja2-2.11.3:
      Successfully uninstalled Jinja2-2.11.3
[31mERROR: pip's dependency resolver does not currently ta

In [None]:
import pandas as pd
import numpy as np
import preprocessor as prepro # twitter prepro
import tqdm #progress bar

import spacy #spacy for quick language prepro
nlp = spacy.load('en_core_web_sm') #instantiating English module

# sampling, splitting
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


# loading ML libraries
from sklearn.pipeline import make_pipeline #pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer #transforms text to sparse matrix
from sklearn.linear_model import LogisticRegression #Logit model
from sklearn.metrics import classification_report #that's self explanatory
from sklearn.decomposition import TruncatedSVD #dimensionality reduction
from xgboost import XGBClassifier

import altair as alt #viz

#explainability
import eli5
from eli5.lime import TextExplainer

# topic modeling

from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

%matplotlib inline
pyLDAvis.enable_notebook()

  from collections import Iterable
  from collections import Mapping


In [None]:
# prepro settings: the token types handed to the tweet preprocessor
# (hashtags are not listed here)
prepro.set_options(
    prepro.OPT.URL,
    prepro.OPT.NUMBER,
    prepro.OPT.RESERVED,
    prepro.OPT.MENTION,
    prepro.OPT.SMILEY,
)

In [None]:
# Load the labelled tweets (gzipped JSON) directly from GitHub
pol_tweets = pd.read_json('https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz')

In [None]:
# Peek at the last row: raw text and its label
last_tweet = pol_tweets.iloc[-1]
print(last_tweet['text'])
print(last_tweet['labels'])

In [None]:
# write everything into one function that can be re-used later
def text_prepro(texts):
  """
  Clean and normalize raw tweet texts.

  Each text is passed through `prepro.clean` (which acts on the options set
  via `prepro.set_options`), '#' characters are dropped, and the result is
  run through the spaCy pipeline `nlp`. Only alphabetic tokens that are
  neither stop words nor punctuation are kept, as lowercased lemmas.

  Parameters
  ----------
  texts : pandas.Series
      One text column of a DataFrame.

  Returns
  -------
  list of str
      One space-joined string of lemmas per input text, in input order.
  """
  texts_clean = texts.map(prepro.clean)
  # regex=False: '#' is meant literally, and the default of `regex` differs
  # between pandas versions
  texts_clean = texts_clean.str.replace('#', '', regex=False)

  # NOTE(review): with "tagger" disabled, newer spaCy pipelines may not be able
  # to lemmatize properly (lemma_ can fall back to the plain token) — confirm
  # against the installed spaCy version
  docs = nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"])

  clean_container = []

  # Wrapping the iterator lets tqdm advance and close the bar by itself,
  # so no progress bar is left open after the loop
  for doc in tqdm.tqdm(docs, total=len(texts_clean), position=0, leave=True):

    lemmas = [token.lemma_.lower() for token in doc
              if token.is_alpha
              and not token.is_stop
              and not token.is_punct]

    clean_container.append(" ".join(lemmas))

  return clean_container

In [None]:
# apply all prepro-pipeline to texts; the cleaned strings are stored in a
# new `text_clean` column next to the raw `text`
pol_tweets['text_clean'] = text_prepro(pol_tweets['text'])

In [None]:
# How many tweets are there per label?
pol_tweets['labels'].value_counts().reset_index()

In [None]:
# fixing sample imbalance: randomly under-sample the rows so the label classes
# become equally frequent (seeded so the sample is reproducible)
rus = RandomUnderSampler(random_state=42)
data_df_res, y_res = rus.fit_resample(pol_tweets, pol_tweets['labels'])

In [None]:
# Splitting the dataset into the Training set and Test set.
# The split is done on the under-sampled (balanced) data produced by the
# RandomUnderSampler, not on the original imbalanced frame; stratifying on
# the resampled labels keeps both splits balanced as well.
X_train, X_test, y_train, y_test = train_test_split(
    data_df_res['text_clean'],
    y_res,
    test_size=0.4,
    stratify=y_res,
    random_state=42,
)

In [None]:
#instantiate models and "bundle up as pipeline"
# The TF-IDF vectorizer turns text into a sparse matrix, which the logistic
# regression then classifies; the pipeline chains both steps.

tfidf = TfidfVectorizer()
cls = LogisticRegression()

pipe = make_pipeline(tfidf, cls)

In [None]:
# Fits the vectorizer and the classifier together on the training split
pipe.fit(X_train,y_train) # fit model

In [None]:
# evaluate model performance on training set
# (in-sample scores are optimistic; compare with the test-set report)

y_eval = pipe.predict(X_train)
report = classification_report(y_train, y_eval)
print(report)

In [None]:
# overall weights (works only for linear models)
# NOTE(review): target_names assumes the classifier's class order is
# Rep. first, Dem. second — confirm against pipe.classes_
eli5.show_weights(pipe, top=20, target_names=['Rep.','Dem.'])

In [None]:
# Out-of-sample evaluation on the held-out test split
y_pred = pipe.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Load the presidential-debate tweets (gzipped JSON) directly from GitHub
debate_tweets = pd.read_json('https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pres_debate_2020.gz')

In [None]:
# Preview the debate tweets
debate_tweets

In [None]:
# apply all prepro-pipeline to texts (same cleaning as for the training
# tweets; here the raw text lives in the `tweet` column)
debate_tweets['text_clean'] = text_prepro(debate_tweets['tweet'])

In [None]:
# Predicted label for every debate tweet, using the fitted pipeline
debate_tweets['class_pred'] = pipe.predict(debate_tweets['text_clean'])

In [None]:
# Probability of the second class in pipe.classes_ (column index 1)
# NOTE(review): presumably that is the Democrat label — confirm via pipe.classes_
debate_tweets['dem_proba'] = pipe.predict_proba(debate_tweets['text_clean'])[:,1]

In [None]:
# The ten tweets with the lowest dem_proba
debate_tweets.sort_values('dem_proba')['tweet'].head(10)

In [None]:
# Whole table ordered from highest to lowest dem_proba
debate_tweets.sort_values(['dem_proba'], ascending=False)

## Topic modelling - what do dem/rep tweets say?

You can also try out the tweetopic library: https://centre-for-humanities-computing.github.io/tweetopic/using_tweetopic.pipeline.html. It is rather new (developed by people from Aarhus University), and I'm not sure about it so far.

In [None]:
# Distribution of the predicted probabilities
debate_tweets['dem_proba'].hist()

In [None]:
# Keep only tweets with dem_proba <= 0.2. The explicit .copy() makes
# rep_tweets an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning.
rep_tweets = debate_tweets[debate_tweets['dem_proba']<=0.2].copy()

In [None]:
# preprocess texts (we need tokens): per tweet, keep the lowercased lemmas of
# nouns, proper nouns, adjectives and adverbs that are neither stop words
# nor punctuation
tokens = [
    [tok.lemma_.lower() for tok in doc
     if tok.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV']
     and not tok.is_stop
     and not tok.is_punct]
    for doc in nlp.pipe(rep_tweets['text_clean'], disable=["ner"])
]

In [None]:
# Attach the token lists to the frame as a new column
rep_tweets['tokens'] = tokens

In [None]:
# Create a Dictionary (token <-> id mapping) from the tweets' token lists
dictionary = Dictionary(rep_tweets['tokens'])
# filter out low-frequency / high-frequency stuff, also limit the vocabulary to max 1000 words
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
# construct corpus using this dictionary (one bag-of-words per tweet)
corpus = [dictionary.doc2bow(doc) for doc in rep_tweets['tokens']]

In [None]:
# Training the model: 5 topics, 10 passes over the corpus, 4 worker processes.
# LDA is stochastic, so the seed is fixed (same value as used elsewhere in this
# notebook) to keep the topics as stable as possible between runs.
lda_model = LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=5,
    workers=4,
    passes=10,
    random_state=42,
)

In [None]:
# Prepare the LDA visualization data (via the `gensimvis` alias imported above)
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

In [None]:
 # Render the prepared pyLDAvis visualization in the notebook
pyLDAvis.display(lda_display)

**Tuning the number of topics (N)**

We can evaluate topic coherence with gensim.
Other measures are available in the OCTIS library: https://github.com/mind-Lab/octis

Also consider:

https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0


In [None]:
from gensim.models import CoherenceModel

In [None]:
# Compute Coherence Score of the LDA model, using the measure selected by
# coherence='u_mass'
coherence_model_lda = CoherenceModel(model=lda_model, texts=rep_tweets['tokens'], dictionary=dictionary, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### other models

In [None]:
from gensim.models import LsiModel, TfidfModel
from gensim.matutils import corpus2dense

In [None]:
tfidf = TfidfModel(corpus)

In [None]:
corpus_tfidf = tfidf[corpus]

In [None]:
# Latent semantic indexing on the TF-IDF-weighted corpus, 10 topics
lsi = LsiModel(corpus_tfidf, num_topics=10, id2word=dictionary)

In [None]:
# Inspect the terms that make up each LSI topic
lsi.print_topics()

In [None]:
# Apply the LSI model to the TF-IDF-weighted corpus
corpus_lsi = lsi[corpus_tfidf]

In [None]:
# Dense matrix of the LSI representation. The number of rows is taken from the
# model instead of repeating the literal 10, so it stays in sync with the
# num_topics the model was built with.
lsi_matrix = corpus2dense(corpus_lsi, num_terms=lsi.num_topics)

In [None]:
# Row 1 of the transposed dense matrix
# NOTE(review): presumably the LSI vector of the second document — confirm the orientation returned by corpus2dense
lsi_matrix.T[1]

In [None]:
# Element 1 of the transformed corpus, for comparison with the dense row
corpus_lsi[1]