# Text  Visualization

In [1]:
import pandas as pd
import numpy as np
import gensim, spacy
from gensim.utils import simple_preprocess
import nltk

import altair as alt



In [2]:
df = pd.read_csv('VoxData.csv', header=0).dropna()

In [3]:
print(df.shape)

(6903, 8)


In [4]:
df.head()

Unnamed: 0,title,author,category,published_date,updated_on,slug,blurb,body
0,Every year of a prison term makes a couple 32 ...,Dara Lind,Criminal Justice,2014-05-29 12:30:05,2014-05-29 12:30:07,http://www.vox.com/2014/5/29/5756646/every-yea...,But even a short jail stay can strain a marria...,A new study by criminologists Sonja Siennick a...
1,Making sense of Donald Trump,John Patty,Mischiefs of Faction,2016-01-12 19:50:08,2016-01-12 19:50:09,http://www.vox.com/mischiefs-of-faction/2016/1...,Social science predicted that it can't predict...,The current fight for the GOP presidential nom...
2,Acting white: the most insidious myth about bl...,JenÃ©e Desmond-Harris,Race in America,2015-03-04 13:40:02,2015-05-04 02:51:51,http://www.vox.com/2015/3/4/8138739/acting-whi...,This popular theory about how African-American...,You've probably heard it before: too many blac...
3,Hillary Clintonâ€™s pitch: Tim Kaine will be t...,Dylan Matthews,Hillary Clinton,2016-07-23 21:23:13,2016-07-25 15:56:38,http://www.vox.com/2016/7/23/12263516/tim-kain...,He's not Tom Perez or Cory Booker. But...,"To many on the left, Tim Kaine’s selection as ..."
4,"Democratic debate 2015: start time, schedule, ...",Andrew Prokop,Debates,2015-11-13 16:20:02,2015-11-14 23:47:28,http://www.vox.com/2015/11/13/9728432/democrat...,The three remaining candidates will debate in ...,The horrific attacks in Paris will loom large ...


# Text with metadata

In [5]:
''' distribution of authors'''
author_count = df.groupby('author').size().reset_index(name='count')

alt.Chart(author_count).mark_bar().encode(
    x = alt.X('author:N',  sort='-y'),
    y = alt.Y('count:Q'),
)


In [6]:
''' distribution of categories '''
cat_count = df.groupby('category').size().reset_index(name='count')

alt.Chart(cat_count).mark_bar().encode(
    x = alt.X('category:N',  sort='-y'),
    y = alt.Y('count:Q'),
)


In [7]:
df[df['category']=='Politics & Policy']['body'].iloc[0]

' It was February 24, 2009. Bobby Jindal, then in his first term as governor of Louisiana and a rising star in the Republican Party, was tapped to deliver the GOP response to President Barack Obama’s joint address to Congress. Obama’s speech focused on economic recovery and health-care reform and, despite its grim focus, was greeted with applause and standing ovations. Jindal’s response, televised immediately after, was supposed to be his big political "coming out."\xa0If the speech went well, many thought it would provide a jumping-off point for a possible presidential run in 2012. Instead, Jindal became the object of bipartisan ridicule. Jon Stewart skewered his Mr. Rogers–esque delivery — Jindal\'s over-enunciation and earnest stare seemed more appropriate for addressing 5-year-olds than adults. Republican strategist David Johnson characterized the speech simply as a "flop." Even Fox News criticized Jindal’s delivery as "amateurish." To be fair, such responses are often derided. In 

# Processing for Filtering and Machine Learning Models

## Tokenization

In [8]:
''' we focus on the category of culture here '''
processed = df[df['category']=='Politics & Policy'].reset_index(drop=True)

In [9]:
processed

Unnamed: 0,title,author,category,published_date,updated_on,slug,blurb,body
0,How one speech changed the course of Republica...,Tez Clark,Politics & Policy,2015-06-22 17:45:02,2016-02-15 13:12:50,http://www.vox.com/2015/6/22/8824553/bobby-jin...,Louisiana governor Bobby Jindal's disastrous s...,"It was February 24, 2009. Bobby Jindal, then ..."
1,Ted Cruz is sorry he ruined his colleagues' we...,Dara Lind,Politics & Policy,2014-12-16 21:10:02,2014-12-16 21:10:02,http://www.vox.com/2014/12/16/7404457/cruz-unp...,Especially after they sunk his vote.,"Last weekend, Ted Cruz forced the Senate to st..."
2,Did Loretta Lynch just set a time bomb for Oba...,Dara Lind,Politics & Policy,2015-01-28 20:23:07,2015-01-28 20:32:06,http://www.vox.com/2015/1/28/7929095/lynch-imm...,"Her legal standard for ""prosecutorial discreti...","At her confirmation hearing Wednesday, Attorne..."
3,Internet mob justice is random and severe. So ...,German Lopez,Politics & Policy,2015-07-31 15:40:02,2015-08-06 05:03:11,http://www.vox.com/2015/7/31/9078777/criminal-...,It's like a random lightning bolt from the sky.,Today's criminal justice system shares a big p...
4,Tax credits to pay for child care is the next ...,Matthew Yglesias,Politics & Policy,2015-09-08 11:00:01,2015-09-08 11:00:02,http://www.vox.com/2015/9/8/9262901/high-quali...,A top progressive think tank proposes Obamacar...,Parents of young children often struggle with ...
...,...,...,...,...,...,...,...,...
410,Clinton seizes the center on race and guns in ...,Jonathan Allen,Politics & Policy,2015-06-20 18:20:02,2015-06-22 16:43:42,http://www.vox.com/2015/6/20/8818611/clinton-C...,Hillary Clinton focused on racism and guns in ...,Hillary Clinton is not seen as honest and trus...
411,Why the federal government is forgiving millio...,Libby Nelson,Politics & Policy,2015-06-08 21:50:02,2015-06-08 22:21:10,http://www.vox.com/2015/6/8/8748535/corinthian...,Students at now-bankrupt Corinthian Colleges c...,Some students from the for-profit Corinthian ...
412,2 school shootings in a day is shocking. But t...,German Lopez,Politics & Policy,2015-10-09 21:30:52,2015-10-10 03:52:54,http://www.vox.com/2015/10/9/9489599/school-sh...,"America's gun problem goes much, much further ...",America has been horrified by what seems like ...
413,Obamaâ€™s dialogue with Marilynne Robinson is ...,Ezra Klein,Politics & Policy,2015-10-15 17:40:02,2016-02-15 13:25:47,http://www.vox.com/2015/10/15/9542015/obama-ma...,"""We had this idea that why donâ€™t I just have...","According to a September 2015 CNN/ORC poll, 29..."


In [10]:
''' first trial of tokenization using simple_preprocess '''
data_words = gensim.utils.simple_preprocess(processed['body'][0])

In [11]:
data_words

['it',
 'was',
 'february',
 'bobby',
 'jindal',
 'then',
 'in',
 'his',
 'first',
 'term',
 'as',
 'governor',
 'of',
 'louisiana',
 'and',
 'rising',
 'star',
 'in',
 'the',
 'republican',
 'party',
 'was',
 'tapped',
 'to',
 'deliver',
 'the',
 'gop',
 'response',
 'to',
 'president',
 'barack',
 'obama',
 'joint',
 'address',
 'to',
 'congress',
 'obama',
 'speech',
 'focused',
 'on',
 'economic',
 'recovery',
 'and',
 'health',
 'care',
 'reform',
 'and',
 'despite',
 'its',
 'grim',
 'focus',
 'was',
 'greeted',
 'with',
 'applause',
 'and',
 'standing',
 'ovations',
 'jindal',
 'response',
 'televised',
 'immediately',
 'after',
 'was',
 'supposed',
 'to',
 'be',
 'his',
 'big',
 'political',
 'coming',
 'out',
 'if',
 'the',
 'speech',
 'went',
 'well',
 'many',
 'thought',
 'it',
 'would',
 'provide',
 'jumping',
 'off',
 'point',
 'for',
 'possible',
 'presidential',
 'run',
 'in',
 'instead',
 'jindal',
 'became',
 'the',
 'object',
 'of',
 'bipartisan',
 'ridicule',
 'jon',

In [12]:
data_word_list = [simple_preprocess(sentence) for sentence in processed['body']]

In [13]:
print("length of data_word_list: " , len(data_word_list))
print("length of data_word_list[0]: " , len(data_word_list[0]))

length of data_word_list:  415
length of data_word_list[0]:  283


In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/junyuan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
#NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['com', 'from', 'subject', 're', 'edu', 'use', 'not', 'would', 
                   'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 
                   'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 
                   'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 
                   'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])


In [16]:
data_words = [[word for word in doc if word not in stop_words] for doc in data_word_list]

In [17]:
print("length of data_words: " , len(data_words))
print("length of data_words[0]: " , len(data_words[0]))

length of data_words:  415
length of data_words[0]:  176


## Stemming

In [18]:
#Stemming using porter Stemming Algorithm
from gensim.parsing.porter import PorterStemmer
p = PorterStemmer()

data_ready = []

for text in data_words:
    data_stemmed = p.stem_documents(text)
    data_ready.append(data_stemmed)
# data_ready

In [19]:
len(data_ready)

415

## Lemmatization

In [20]:
'''
too slow, do not run here
'''



# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en', disable=['parser', 'ner'])
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
for sent in data_words:
    # Parse the sentence using the loaded 'en' model object `nlp`. Extract the lemma for each token and join
    doc = nlp(" ".join(sent)) 
    data_ready.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
# remove stopwords once more after lemmatization
data_ready = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in data_ready]


## Construct TF-IDF

In [21]:
import gensim.corpora as corpora
from gensim.sklearn_api import TfIdfTransformer

In [22]:
# Create Dictionary
id2word = corpora.Dictionary(data_ready)

model = TfIdfTransformer(dictionary=id2word)

In [23]:
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

num_docs = id2word.num_docs
num_terms = len(id2word.keys())

In [24]:
for doc in corpus[:1]:
    print([[id, id2word[id], freq] for id, freq in doc])

[[0, 'address', 3], [1, 'adult', 1], [2, 'amateurish', 1], [3, 'anyth', 1], [4, 'applaus', 1], [5, 'appropri', 1], [6, 'arkansa', 1], [7, 'awai', 1], [8, 'bad', 1], [9, 'barack', 1], [10, 'becam', 1], [11, 'big', 1], [12, 'bill', 2], [13, 'bipartisan', 1], [14, 'bobbi', 1], [15, 'budget', 1], [16, 'care', 1], [17, 'career', 1], [18, 'character', 1], [19, 'clinch', 1], [20, 'clinton', 3], [21, 'collaps', 1], [22, 'congress', 1], [23, 'correct', 2], [24, 'critic', 1], [25, 'david', 1], [26, 'debat', 1], [27, 'deliv', 1], [28, 'deliveri', 3], [29, 'democrat', 3], [30, 'derid', 1], [31, 'despit', 1], [32, 'earlier', 1], [33, 'earnest', 1], [34, 'econom', 1], [35, 'embarrass', 1], [36, 'emce', 1], [37, 'enter', 1], [38, 'enunci', 1], [39, 'esqu', 1], [40, 'expect', 1], [41, 'fair', 1], [42, 'featur', 1], [43, 'februari', 1], [44, 'first', 1], [45, 'flip', 1], [46, 'flop', 2], [47, 'focu', 1], [48, 'focus', 1], [49, 'fortun', 1], [50, 'fox', 1], [51, 'frequent', 1], [52, 'full', 1], [53, 'go

In [25]:
tfidf_corpus = model.fit_transform(corpus)

In [26]:
tfidf_corpus[0]

[(0, 0.12735170398523443),
 (1, 0.0552752298872277),
 (2, 0.11518815299657885),
 (3, 0.04645082547288752),
 (4, 0.09125039829609259),
 (5, 0.06545604180041066),
 (6, 0.07025814985131087),
 (7, 0.044463793355628944),
 (8, 0.02519860066414722),
 (9, 0.0416196558681122),
 (10, 0.04866872880566115),
 (11, 0.018391876384623546),
 (12, 0.0454019158210248),
 (13, 0.057945796608287685),
 (14, 0.07668745057803678),
 (15, 0.04543149218120035),
 (16, 0.024111201894351834),
 (17, 0.05892590723552288),
 (18, 0.06936924688168661),
 (19, 0.10194351885840192),
 (20, 0.10243505152587626),
 (21, 0.07119043074646463),
 (22, 0.02519860066414722),
 (23, 0.09675399687380981),
 (24, 0.03632700127862169),
 (25, 0.038790661973341294),
 (26, 0.03844323480232415),
 (27, 0.0599590219688385),
 (28, 0.2930390970684555),
 (29, 0.06787674363960551),
 (30, 0.10744053868997408),
 (31, 0.05943548324710586),
 (32, 0.043768881574957015),
 (33, 0.10194351885840192),
 (34, 0.0416196558681122),
 (35, 0.07545425058204805),
 (

In [27]:
# construct an array of tf-idf vectors
from gensim.matutils import corpus2dense, corpus2csc

corpus_tfidf_dense = corpus2dense(tfidf_corpus, num_terms, num_docs)

In [28]:
corpus_tfidf_dense.shape

(16445, 830)

In [29]:
X = corpus_tfidf_dense[corpus_tfidf_dense.max(axis=1) > 0.1]
X.shape

(4667, 830)

## Clustering & Projection

In [30]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans


In [31]:
# result_pca = PCA(n_components=2).fit_transform(X.T)
result_tsne = TSNE(n_components=2, perplexity=10).fit_transform(X.T)

tsne_df = pd.DataFrame(data=result_tsne, columns=['x','y'])

alt.Chart(tsne_df).mark_circle().encode(
    x='x:Q',
    y='y:Q'
)

In [32]:
kmeans = KMeans(n_clusters=10, random_state=0).fit(tsne_df[['x','y']])
tsne_df['label'] = kmeans.labels_

alt.Chart(tsne_df).mark_circle(opacity = .7).encode(
    x='x:Q',
    y='y:Q',
    color='label:N'
)

In [33]:
words = []
for id in id2word.keys():
    words.append(id2word[id])

In [34]:
mat = pd.DataFrame(data=corpus_tfidf_dense, index=words)
mat = mat[mat.max(axis=1) > 0.1]

wordtfidf = pd.DataFrame(data=mat.values.T, columns=mat.index)
wordtfidf.head()

Unnamed: 0,address,adult,amateurish,bill,bipartisan,bobbi,budget,care,character,clinch,...,victimization,rigamarole,exploitation,exploitative,overstay,hannity,nightly,trustworthy,meditation,sinister
0,0.127352,0.055275,0.115188,0.045402,0.057946,0.076687,0.045431,0.024111,0.069369,0.101944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.01325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.225235,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
mat.max(axis=1).shape

(4667,)

### Updated: update the line 8 below.
In the old version, the sorted values are not assigned to the original list, so the result keyword lists for different groups are not distinctive. Now the sorted values are applied as shown in line 8.

All the code below 

In [36]:
''' top 10 words for each cluster '''
group_key_words = []
for label in range(10):
    # get the sum tf-idf for each word, do sum() across rows for each column
    group_df = wordtfidf[kmeans.labels_ == label].mean(axis=0)
    # sort the tf-idf values
    to_sort = [{'freq': group_df[x], 'word': x} for x in group_df.index]
    to_sort = sorted(to_sort, key=lambda d: d['freq'], reverse=True)
    # add the words to the list
    for i in range(10):
        group_key_words.append([label, to_sort[i]['word'], to_sort[i]['freq']])
        
keyword_df = pd.DataFrame(data=group_key_words, columns=['label', 'keyword', 'tfidf'])

In [37]:
''' try to plot bar chart for one cluster '''
alt.Chart(keyword_df[keyword_df['label']==2]).mark_bar().encode(
    x=alt.X('tfidf:Q',  scale=alt.Scale(domain=[0, 0.05]), title='label'),
    y=alt.Y('keyword:N', sort='-x')
).properties(
    width = 50
)

In [44]:
chart = alt.hconcat()
for label in range(10):
    chart |= alt.Chart(keyword_df[keyword_df['label']==label]).mark_bar().encode(
    x=alt.X('tfidf:Q', scale=alt.Scale(domain=[0,0.2]),title='cluster'+str(label)),
    y=alt.Y('keyword:N', sort='-x')
).properties(
    width = 50
)
    
chart

## Trend

In [45]:
''' distribution of authors'''
author_count = processed.groupby('author').size().reset_index(name='count')

alt.Chart(author_count).mark_bar().encode(
    x = alt.X('author:N',  sort='-y'),
    y = alt.Y('count:Q'),
)

### update the cell below
I added `drop=True` so that `author_count['author'][:10]` gives me the authors in the descending order based on the number of their articles.

In the old version, `author_count['author'][:10]` gives me the authors in the order of the orignal index.

In [46]:
author_count = author_count.sort_values(by='count', ascending=False).reset_index(drop=True)

In [47]:
author_count

Unnamed: 0,author,count
0,Andrew Prokop,64
1,German Lopez,56
2,Matthew Yglesias,53
3,Sarah Kliff,39
4,Dara Lind,32
5,Dylan Matthews,32
6,Libby Nelson,28
7,Timothy B. Lee,21
8,Ezra Klein,19
9,Jonathan Allen,17


## Old version: keep it here for reference.
So in our example here, `processed` is the DataFrame we get by filtering the category of politices. So basically all the authors are talking about the candidates and some other political things. But different authors are focusing on different aspects. If we want to check what are the unique parts in the articles from different authors, we need to extract keywords from the documents of specific authors. That is, we think the documents/articles of an author is a whole document. 

In [48]:
wordtfidf['author'] = processed['author']

In [55]:
author_key_words = []

''' key words for top 10 authors'''
for author in author_count['author'][:10]:
    # get the mean tf-idf for each word, do sum() across rows for each column
    group_df = wordtfidf[wordtfidf['author'] == author].mean(axis=0)
    # sort the tf-idf values
    to_sort = [{'freq': group_df[x], 'word': x} for x in group_df.index]
    to_sort = sorted(to_sort, key=lambda d: d['freq'], reverse=True)
    # add the words to the list
    for i in range(10):
        author_key_words.append([author, to_sort[i]['word'], to_sort[i]['freq']])
        
author_keyword_df = pd.DataFrame(data=author_key_words, columns=['author', 'keyword', 'tfidf'])

In [56]:
author_keyword_df

Unnamed: 0,author,keyword,tfidf
0,Andrew Prokop,lessig,0.358944
1,Andrew Prokop,scalis,0.187756
2,Andrew Prokop,bonica,0.165911
3,Andrew Prokop,kitzhab,0.150110
4,Andrew Prokop,santorum,0.150110
...,...,...,...
95,Jonathan Allen,folei,0.168684
96,Jonathan Allen,ja,0.168684
97,Jonathan Allen,pp,0.168684
98,Jonathan Allen,scout,0.126513


In [58]:
chart = alt.hconcat()
for author in author_count['author'][:10]:
    chart |= alt.Chart(author_keyword_df[author_keyword_df['author']==author]).mark_bar().encode(
        x=alt.X('tfidf:Q', scale=alt.Scale(domain=[0,1]), title=author),
        y=alt.Y('keyword:N', sort='-x'),
    ).properties(
        width = 50
    )

chart


## Updated version below:
Here we first construct documents each consists of the articles from a specific author. 

Then we construct tf-idf vectors for all the author based documents.

Then we extract keywords with highest tf-idf values.

In [52]:
def get_tfidf(input_data):
    # Create Dictionary
    id2word = corpora.Dictionary(input_data)

    model = TfIdfTransformer(dictionary=id2word)

    # Create Corpus: Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in input_data]

    num_docs = id2word.num_docs
    num_terms = len(id2word.keys())
    
    # Get tfidf matrix
    tfidf_corpus = model.fit_transform(corpus)
    corpus_tfidf_dense = corpus2dense(tfidf_corpus, num_terms, num_docs)
    # Map index back to word
    words = []
    for id in id2word.keys():
        words.append(id2word[id])
    mat = pd.DataFrame(data=corpus_tfidf_dense, index=words)
    wordtfidf = pd.DataFrame(data=mat.values.T, columns=mat.index)
    return wordtfidf

In [53]:
''' keywords for different authors '''
author_key_words = []

''' construct documents for top 10 authors'''
author_data = []
for author in author_count['author'][:10]:
    print(author)
    article_idx = processed[processed['author']==author].index.values.astype(int)
    document = []
    for i in article_idx:
        document.extend(data_ready[i])
    author_data.append(document)
    
wordtfidf = get_tfidf(author_data)
wordtfidf['author'] = author_count['author'][:10]
    
for author in author_count['author'][:10]:
    # get the tf-idf for each word, do mean() across rows for each column
    group_df = wordtfidf[wordtfidf['author'] == author].mean(axis=0)
    # sort the tf-idf values
    to_sort = [{'freq': group_df[x], 'word': x} for x in group_df.index]
    to_sort = sorted(to_sort, key=lambda d: d['freq'], reverse=True)
    # add the words to the list
    for i in range(10):
        author_key_words.append([author, to_sort[i]['word'], to_sort[i]['freq']])
        
author_keyword_df = pd.DataFrame(data=author_key_words, columns=['author', 'keyword', 'tfidf'])


Andrew Prokop
German Lopez
Matthew Yglesias
Sarah Kliff
Dara Lind
Dylan Matthews
Libby Nelson
Timothy B. Lee
Ezra Klein
Jonathan Allen


In [54]:
chart = alt.hconcat()
for author in author_count['author'][:10]:
    chart |= alt.Chart(author_keyword_df[author_keyword_df['author']==author]).mark_bar().encode(
        x=alt.X('tfidf:Q', scale=alt.Scale(domain=[0,1]), title=author),
        y=alt.Y('keyword:N', sort='-x'),
    ).properties(
        width = 50
    )

chart