In [17]:
from app.models import Session, Article, Agency, Headline
import pandas as pd
import wordcloud
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import numpy as np

In [2]:
# Instantiate the app's Session (imported from app.models); the query cell
# below issues its query through this object.
s = Session()

In [5]:
# Fetch one row per headline, joined through Headline.article and
# Article.agency so each row also carries the article URL and agency fields.
data = (
    s.query(
        Headline.last_accessed,
        Headline.title,
        Headline.comp,
        Article.url,
        Agency.name,
        Agency._bias,
        Agency._credibility,
    )
    .join(Headline.article)
    .join(Article.agency)
    .all()
)

column_names = ['date', 'title', 'comp', 'url', 'name', 'bias', 'credibility']
df = pd.DataFrame(data, columns=column_names).assign(
    # Parse the last-accessed values into pandas datetimes.
    date=lambda frame: pd.to_datetime(frame['date'])
)
df.head()

In [70]:
import string
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

# Extra tokens to filter out on top of NLTK's English list. The pipeline
# lower-cases each headline before tokenising, so an entry can only ever match
# if it is itself lower-case -- hence 'ap' rather than 'AP'.
include_stopwords = {
    'dear', 'ap', "'s", "``", "''", "—", "–", "“", "”", "‘", "’",
    # NOTE(review): a multi-word phrase can never equal a single token, so
    # these two entries filter nothing as written. Kept so the intent is not
    # lost; they need a phrase-level filter (or splitting) to take effect.
    'New York Times', 'Getty Images',
}
# Every ASCII punctuation character is treated as a stop word as well.
include_stopwords.update(string.punctuation)

# Words that must survive filtering even when NLTK lists them as stop words
# (mostly negations).
exclude_stopwords = {
    'not', 'no', 'nor', 'none', 'neither', 'never', 'nothing', 'nowhere',
    'nobody', 'noone', 'nought', 'nay', 'nix', 'nil', 'negatory', 'nope',
    'nah', 'naw', 'no way', 'ago', 'said', 'go',
}

stopwords |= include_stopwords
stopwords -= exclude_stopwords
def remove_stop(text):
    """Return the tokens of ``text``, in order, that are not in the module-level ``stopwords`` set."""
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

# Part-of-speech tags kept by default: the four noun tags
# (singular/plural common nouns and singular/plural proper nouns).
POS = ['NN', 'NNS', 'NNP', 'NNPS']


def pos_filter(text, pos=POS):
    """Keep only the tokens whose tag from ``nltk.pos_tag`` is listed in ``pos``.

    ``text`` is a sequence of tokens; the surviving tokens are returned as a
    list in their original order.
    """
    kept = []
    for token, tag in nltk.pos_tag(text):
        if tag in pos:
            kept.append(token)
    return kept

# Transformations applied to a headline, in order: lower-case the raw string,
# split it into tokens, keep the tokens pos_filter allows, drop stop words.
pipeline = [
    str.lower,
    nltk.word_tokenize,
    pos_filter,
    remove_stop,
]
def prepare(text, pipeline):
    """Feed ``text`` through each callable in ``pipeline`` in turn.

    The output of one step is the input of the next; the value produced by the
    last step is returned. With an empty pipeline ``text`` is returned as-is.
    """
    result = text
    for step in pipeline:
        result = step(result)
    return result

In [46]:
# Run every headline through the preprocessing pipeline and store the
# resulting token list next to it.
df['tokens'] = [prepare(title, pipeline) for title in df['title']]
df.head()

In [47]:
from collections import Counter
def compute_idf(df, column='tokens', preprocess=None, min_df=2):
    """Compute inverse document frequencies for the tokens in ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document.
    column : str
        Column holding each document's tokens (or the raw value handed to
        ``preprocess``).
    preprocess : callable, optional
        Called on each cell to obtain its tokens; cells are used unchanged
        when this is None.
    min_df : int
        Tokens that occur in fewer than ``min_df`` documents are dropped.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``token`` with columns ``df`` (number of documents that
        contain the token) and ``idf`` (``log(len(df) / df) + 0.1``).
    """
    counter = Counter()
    for doc in df[column]:
        tokens = doc if preprocess is None else preprocess(doc)
        # set(): a token counts once per document no matter how often it repeats.
        counter.update(set(tokens))

    idf_df = pd.DataFrame.from_dict(counter, orient='index', columns=['df'])
    # Filter with a mask and take an explicit copy so the column assignment
    # below writes to an independent frame instead of a derived slice
    # (which raises SettingWithCopyWarning on classic pandas).
    idf_df = idf_df[idf_df['df'] >= min_df].copy()
    # The +0.1 offset keeps a token present in every document (log(1) == 0)
    # from ending up with a zero weight.
    idf_df['idf'] = np.log(len(df) / idf_df['df']) + 0.1
    idf_df.index.name = 'token'
    return idf_df

In [48]:
# Document frequencies and IDF weights for the headline tokens.
idf_df = compute_idf(df, column='tokens')
idf_df.head()

In [49]:
def count_words(df, column='tokens', preprocess=None, min_freq=None):
    """Count how often each token occurs across all rows of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document.
    column : str
        Column holding each document's tokens (or the raw value handed to
        ``preprocess``).
    preprocess : callable, optional
        Called on each cell to obtain its tokens; cells are used unchanged
        when this is None.
    min_freq : int, optional
        When given, tokens with a total count below ``min_freq`` are dropped.
        The default (None) keeps every token. The previous implementation
        compared each count against the minimum count, which never removed
        anything, so the default behaviour is unchanged.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``token`` with a single ``freq`` column, sorted by
        descending frequency.
    """
    counter = Counter()
    for doc in df[column]:
        tokens = doc if preprocess is None else preprocess(doc)
        counter.update(tokens)

    freq_df = pd.DataFrame.from_dict(counter, orient='index', columns=['freq'])
    if min_freq is not None:
        freq_df = freq_df[freq_df['freq'] >= min_freq]
    freq_df.index.name = 'token'
    return freq_df.sort_values('freq', ascending=False)

In [50]:
# Total occurrences of every token over all headlines, most frequent first.
freq_df = count_words(df, column='tokens')
freq_df.head()

In [51]:
# Weight each raw count by the token's IDF. The product aligns on the token
# index, so a token that is missing from idf_df ends up with NaN here.
freq_df['tfidf'] = freq_df['freq'].mul(idf_df['idf'])


In [52]:
# Tokens that are absent from idf_df have a NaN tf-idf after index alignment.
# NaN weights make the frequency ranking and font scaling inside
# generate_from_frequencies unreliable, so drop them first.
tfidf_weights = freq_df['tfidf'].dropna()
wc = wordcloud.WordCloud(
    width=800,
    height=400,
    max_words=100,
    background_color='white',
).generate_from_frequencies(tfidf_weights)
plt.imshow(wc, interpolation='bilinear')
# Pixel axes carry no meaning for a word cloud.
plt.axis('off');

In [65]:
from textacy.extract import kwic
# Join all headlines into one space-separated string and print every
# keyword-in-context hit for ' go ' that textacy yields.
all_titles = ' '.join(df['title'])
matches = kwic.keyword_in_context(all_titles, ' go ', window_width=35)
for hit in matches:
    print(hit)
    

In [69]:
# Find headlines containing the phrase 'go to item' (case-insensitive; missing
# titles count as non-matches) and show each one with its agency name.
go_to_mask = df['title'].str.contains('go to item', case=False, na=False)
df.loc[go_to_mask, ['name', 'title']]