In [None]:
### jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Import packages
import pandas as pd
import numpy as np
import glob
import re

# Tokenization
import gensim

# Lemmatization
import spacy
from __future__ import unicode_literals

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

# NMF model
from sklearn.decomposition import NMF

In [None]:
# Open csv files (raw data)
# A wildcard is required here: a bare '.csv' pattern only matches a file that is
# literally named '.csv'. The result is sorted because glob returns paths in
# arbitrary order, and the load order decides which duplicate row is kept later.
news_files = sorted(glob.glob('*.csv'))

In [None]:
# Files into dataframe
# Raw CSV column -> column name used by the rest of the notebook.
column_map = {'Timestamp': 'timestamp',
              'Source': 'source',
              'Text': 'text',
              'Keywords': 'keywords'}

# Collect one frame per file and concatenate once; growing a DataFrame with
# pd.concat inside the loop re-copies every previously loaded row each time.
frames = []
for f in news_files:
    infile = pd.read_csv(f)
    frames.append(infile[list(column_map)].rename(columns=column_map))

if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    # No input files: keep the expected columns so the following cells still run.
    df = pd.DataFrame(columns=list(column_map.values()))

# drop_duplicates/dropna return new frames, so the result has to be assigned
# back; otherwise `df` still holds the duplicated / incomplete rows.
df = (df.drop_duplicates(subset='text', keep='first')
        .dropna()
        .reset_index(drop=True))

df.head()

In [None]:
# Organize dataframe
# The timestamp is split on '_' once; fields 0-2 become newspaper, date and hour.
stamp_fields = df['timestamp'].map(lambda stamp: stamp.split('_'))

for position, column in enumerate(['newspaper', 'date', 'hour']):
    df[column] = stamp_fields.map(lambda fields, position=position: fields[position])

# Field 3 is the minute; anything from the first '.' onwards is discarded.
df['minute'] = stamp_fields.map(lambda fields: fields[3].split('.')[0])

In [None]:
# Pull the article texts out of the dataframe as a plain Python list
data = df['text'].tolist()

In [None]:
# Text clean-up. Every document goes through these substitutions, in order:
#   1. remove single quotes
#   2. remove links starting with "http"
#   3. remove links starting with "www"
#   4. collapse whitespace runs (new lines included) into a single space
# All patterns are raw strings so the backslash classes reach the regex engine
# unchanged instead of relying on Python tolerating an unknown string escape.
cleanup_steps = [
    (re.compile(r"'"), ""),
    (re.compile(r"http\S+"), ""),
    (re.compile(r"www\S+"), ""),
    (re.compile(r"\s+"), " "),
]


def clean_text(raw):
    """Return ``raw`` as a string with quotes, links and extra whitespace removed.

    ``raw`` is converted with ``str()`` first, so non-string values (for
    example a missing text stored as NaN) are cleaned as their string form.
    """
    text = str(raw)
    for pattern, replacement in cleanup_steps:
        text = pattern.sub(replacement, text)
    return text


data = [clean_text(sent) for sent in data]

In [None]:
# Tokenize text and Clean-up
def sent_to_words(sentences):
    """Yield one list of tokens per input document.

    Each item of ``sentences`` is converted with ``str()`` and tokenized by
    gensim's ``simple_preprocess``. Results are produced lazily, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for document in sentences:
        # Per the gensim docs, deacc=True strips accent marks from the tokens.
        tokens = tokenize(str(document), deacc=True)
        yield tokens

data_words = [tokens for tokens in sent_to_words(data)]

In [None]:
# Lemmatize text

# Create lemmatization function
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), pipeline=None):
    """Lemmatize tokenized documents, keeping only the requested parts of speech.

    Tag inventory: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
        One token list per document; the tokens are joined with single spaces
        before being handed to the pipeline.
    allowed_postags : collection of str
        Coarse part-of-speech tags (``token.pos_``) to keep.
    pipeline : object, optional
        Anything exposing ``pipe(texts)`` that yields iterables of tokens with
        ``lemma_`` and ``pos_`` attributes. When omitted, the notebook-level
        ``nlp`` object is used, so ``nlp`` must be loaded before the call.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input order.
        A token whose lemma is the ``'-PRON-'`` placeholder contributes an
        empty string instead of the placeholder.
    """
    if pipeline is None:
        pipeline = nlp  # fall back to the spaCy pipeline loaded in this notebook

    joined_docs = (" ".join(sent) for sent in texts)

    texts_out = []
    # Language.pipe processes the documents as a batched stream and yields the
    # parsed documents in input order, avoiding one full pipeline call per text.
    for doc in pipeline.pipe(joined_docs):
        lemmas = ['' if token.lemma_ == '-PRON-' else token.lemma_
                  for token in doc
                  if token.pos_ in allowed_postags]
        texts_out.append(" ".join(lemmas))
    return texts_out

# Load spaCy's 'pt' model with the parser and NER components switched off
disabled_components = ['parser', 'ner']
nlp = spacy.load('pt', disable=disabled_components)

# Lemmatize every tokenized document, keeping nouns, adjectives, verbs and adverbs
kept_pos = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = lemmatization(data_words, allowed_postags=kept_pos)

In [None]:
# Stopwords list (portuguese)
# Each line of the file becomes one entry, with surrounding whitespace removed.
# The context manager closes the file handle as soon as the list is built.
# NOTE(review): no explicit encoding is passed, so the platform default is used
# -- confirm it matches how stopwords_pt.txt was saved.
with open("stopwords_pt.txt", "r") as stoplist:
    portuguese = [i.strip() for i in stoplist]

In [None]:
# Vectorize TF-IDF
# NOTE(review): token_pattern only matches ASCII letters and digits, so any
# lemma that still carries an accented character is cut at that character --
# confirm this is intended for Portuguese text.
vectorizer = TfidfVectorizer(analyzer='word',
                             min_df=5,                         # ignore terms found in fewer than 5 documents
                             max_df=0.95,                      # ignore terms found in more than 95% of documents
                             stop_words=portuguese,            # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # tokens of 3 or more ASCII alphanumeric chars
                             max_features=50000,               # keep at most 50000 features
                             use_idf=True,                     # enable idf reweighting
                             ngram_range=(1, 2)                # unigrams and bigrams
                             )

data_vectorized = vectorizer.fit_transform(data_lemmatized)

# Newer scikit-learn releases only provide get_feature_names_out(); older ones
# only provide get_feature_names(). Use whichever the installed version offers.
_get_names = getattr(vectorizer, 'get_feature_names_out', None)
if _get_names is None:
    _get_names = vectorizer.get_feature_names
feature_names = list(_get_names())

In [None]:
# Build NMF Model
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H` -- confirm against the installed version.
nmf_settings = dict(n_components=100,
                    random_state=1,
                    alpha=.1,
                    l1_ratio=.5,
                    init='nndsvd')

# Fit the factorization on the TF-IDF matrix; fit() returns the fitted model.
nmf = NMF(**nmf_settings).fit(data_vectorized)

In [None]:
# Create Document - Topic Matrix
nmf_output = nmf.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(nmf.n_components)]

# index names -- sized from the matrix itself so the labels always match its rows
docnames = ["Doc" + str(i) for i in range(nmf_output.shape[0])]

# Make the pandas dataframe (weights rounded to 2 decimals for display)
df_document_topic = pd.DataFrame(np.round(nmf_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document.
# The argmax is taken over the unrounded weights: after rounding, small weights
# collapse to the same value and argmax would return the lowest-numbered topic
# among the ties rather than the strongest one.
dominant_topic = np.argmax(nmf_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    """Return a CSS color rule: green when ``val`` exceeds 0.1, black otherwise."""
    if val > .1:
        chosen = 'green'
    else:
        chosen = 'black'
    return 'color: {col}'.format(col=chosen)

def make_bold(val):
    """Return a CSS font-weight rule: 700 when ``val`` exceeds 0.1, 400 otherwise."""
    if val > .1:
        chosen = 700
    else:
        chosen = 400
    return 'font-weight: {weight}'.format(weight=chosen)

# Apply Style
# Only the topic-weight columns are styled. 'dominant_topic' holds a topic
# index, not a weight, so comparing it with the 0.1 threshold would highlight
# every document whose dominant topic is not topic 0.
topic_columns = [col for col in df_document_topic.columns if col != 'dominant_topic']

df_document_topics2 = (
    df_document_topic.head(1241)
    .style
    .applymap(color_green, subset=topic_columns)
    .applymap(make_bold, subset=topic_columns)
)
df_document_topics2

In [None]:
# Count how many documents have each topic as their dominant topic
topic_counts = df_document_topic['dominant_topic'].value_counts()

df_topic_distribution = topic_counts.reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=nmf, n_words=20):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Supplies the vocabulary through ``get_feature_names_out()`` or, when
        that method does not exist, ``get_feature_names()``.
    lda_model : fitted decomposition model
        Any model exposing ``components_`` with one row of term weights per
        topic (the name is historical; the notebook passes the NMF model).
    n_words : int
        Number of top terms to return per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per topic holding its ``n_words`` heaviest terms, ordered
        from highest to lowest weight.
    """
    # Newer scikit-learn releases only provide get_feature_names_out(); older
    # ones only provide get_feature_names(). Use whichever is available.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    keywords = np.asarray(get_names())

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes argsort list the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Top 15 keywords of every topic in the fitted NMF model
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=nmf, n_words=15)

# Topic - Keywords Dataframe: one row per topic, one column per keyword rank
df_topic_keywords = pd.DataFrame(topic_keywords)
n_topics, n_keywords = df_topic_keywords.shape
df_topic_keywords.columns = ['Word ' + str(rank) for rank in range(n_keywords)]
df_topic_keywords.index = ['Topic ' + str(topic) for topic in range(n_topics)]
df_topic_keywords

In [None]:
# Save output: write both result tables as CSV files in the working directory
outputs = (
    # top keywords per topic, derived from the model's components_
    (df_topic_keywords, "topic_keywords.csv"),
    # per-document topic weights from nmf.transform, plus the dominant topic
    (df_document_topic, "document_topic.csv"),
)

for frame, path in outputs:
    frame.to_csv(path)