# Visualize Topic Models with pyLDAvis

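This notebook is adapted from the pyLDAvis scikit-learn example notebook: https://github.com/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb. It fits a separate LDA topic model to the articles of each political-bias class (left, center, right) and saves a pyLDAvis visualization of each model as an HTML file.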

### Install Required Modules

In [None]:
!pip install pyldavis

### Load Cleaned Political News Data

In [None]:
# import pandas
import pandas as pd

# Read the cleaned corpus; the file is tab-separated despite its .csv extension.
# NOTE(review): `df` is not referenced by any later cell shown in this notebook,
# and read_data() below loads the same file again -- confirm whether this load
# is still needed.
df = pd.read_csv('news-corpus-df-clean.csv', sep='\t', encoding='utf-8')

In [None]:
def read_data(filename):
    """Load the tab-separated corpus file and keep only its `bias` and `text` columns.

    Parameters
    ----------
    filename : str
        Path to a UTF-8 encoded, tab-separated file containing at least the
        columns `bias` and `text`.

    Returns
    -------
    pandas.DataFrame
        A frame with exactly two columns, `bias` and `text`, in that order.
    """
    wanted_columns = ['bias', 'text']
    corpus = pd.read_csv(filename, sep='\t', encoding='utf-8')
    # Everything other than the label and the article text is discarded.
    return corpus[wanted_columns]

In [None]:
# Load the corpus, keeping only the `bias` and `text` columns (see read_data above).
new = read_data('news-corpus-df-clean.csv')
# Preview the first rows as a quick check that the file loaded as expected.
new.head()

### Define Each Bias Class as a Separate Dataframe

In [None]:
# Split the corpus into one dataframe per bias label
# (1 -> left, 2 -> center, 3 -> right).
left = new[new['bias'].eq(1)]
center = new[new['bias'].eq(2)]
right = new[new['bias'].eq(3)]

In [None]:
# Reduce each per-bias dataframe to the array of its article texts.
center_data, right_data, left_data = (
    frame['text'].values for frame in (center, right, left)
)

### Set Up pyLDAvis and Fit LDA Models with scikit-learn

In [None]:
from __future__ import print_function

In [None]:
# pyLDAvis produces the topic-model visualizations; its `sklearn` submodule is
# the entry point used below for scikit-learn LDA models.
# NOTE(review): newer pyLDAvis releases (3.4+, to my knowledge) renamed this
# submodule to `pyLDAvis.lda_model` -- confirm against the installed version.
import  pyLDAvis
import pyLDAvis.sklearn
# Presumably enables inline display of visualizations in the notebook; note the
# cells below only write the visualizations to HTML files -- TODO confirm this
# call is still needed.
pyLDAvis.enable_notebook()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# define term frequency model
def LDA_tf_model(data_file, n_topics=20, random_state=0):
    """Fit an LDA topic model on raw term counts.

    Parameters
    ----------
    data_file : iterable of str
        The documents to model, one string per document. Despite the name,
        this is the text itself, not a file path: it is passed straight to
        ``CountVectorizer.fit_transform``.
    n_topics : int, default 20
        Number of topics to fit.
    random_state : int, default 0
        Seed handed to ``LatentDirichletAllocation``.

    Returns
    -------
    tuple
        ``(lda_tf, dtm_tf, tf_vectorizer)``: the fitted LDA model, the
        document-term count matrix it was fitted on, and the fitted
        ``CountVectorizer``.
    """
    # max_df / min_df prune the vocabulary: per the scikit-learn docs, a float
    # is a proportion of documents and an int is an absolute document count.
    tf_vectorizer = CountVectorizer(strip_accents='unicode', max_df=0.5, min_df=10)
    dtm_tf = tf_vectorizer.fit_transform(data_file)

    # `n_components` is the current name of the topic-count argument; the old
    # `n_topics` keyword is rejected by current scikit-learn releases.
    lda_tf = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda_tf.fit(dtm_tf)
    return lda_tf, dtm_tf, tf_vectorizer

# define TF-IDF model
def LDA_tfidf_model(data_file, n_topics=20, random_state=0):
    """Fit an LDA topic model on TF-IDF weighted term counts.

    Parameters
    ----------
    data_file : iterable of str
        The documents to model, one string per document. Despite the name,
        this is the text itself, not a file path: it is passed straight to
        ``TfidfVectorizer.fit_transform``.
    n_topics : int, default 20
        Number of topics to fit.
    random_state : int, default 0
        Seed handed to ``LatentDirichletAllocation``.

    Returns
    -------
    tuple
        ``(lda_tfidf, dtm_tfidf, tfidf_vectorizer)``: the fitted LDA model,
        the TF-IDF document-term matrix it was fitted on, and the fitted
        ``TfidfVectorizer``.
    """
    # Same vocabulary settings as the count-based model in LDA_tf_model. They
    # are spelled out here because that function's vectorizer is local to it
    # and cannot be read from this scope.
    tfidf_vectorizer = TfidfVectorizer(strip_accents='unicode', max_df=0.5, min_df=10)
    dtm_tfidf = tfidf_vectorizer.fit_transform(data_file)

    # `n_components` is the current name of the topic-count argument; the old
    # `n_topics` keyword is rejected by current scikit-learn releases.
    lda_tfidf = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda_tfidf.fit(dtm_tfidf)
    return lda_tfidf, dtm_tfidf, tfidf_vectorizer

In [None]:
# Fit one term-frequency LDA model per bias class. Each call returns the fitted
# model, the document-term matrix, and the vectorizer; all three are needed by
# the pyLDAvis prepare() calls further down.
center_lda_tf, center_dtm_tf, center_tf_vectorizer = LDA_tf_model(center_data)
right_lda_tf, right_dtm_tf, right_tf_vectorizer = LDA_tf_model(right_data)
left_lda_tf, left_dtm_tf, left_tf_vectorizer = LDA_tf_model(left_data)



### Visualize Topic Models for Each Bias Set

In [None]:
# Build the pyLDAvis visualization for the center-bias model and write it to
# center_viz.html.
center_viz = pyLDAvis.sklearn.prepare(
    center_lda_tf,
    center_dtm_tf,
    center_tf_vectorizer,
)
pyLDAvis.save_html(center_viz, 'center_viz.html')

In [None]:
# Build the pyLDAvis visualization for the right-bias model and write it to
# right_viz.html.
right_viz = pyLDAvis.sklearn.prepare(
    right_lda_tf,
    right_dtm_tf,
    right_tf_vectorizer,
)
pyLDAvis.save_html(right_viz, 'right_viz.html')

In [None]:
# Build the pyLDAvis visualization for the left-bias model and write it to
# left_viz.html.
left_viz = pyLDAvis.sklearn.prepare(
    left_lda_tf,
    left_dtm_tf,
    left_tf_vectorizer,
)
pyLDAvis.save_html(left_viz, 'left_viz.html')