# Install dependencies

In [None]:
# Install newspaper3k, which provides the `newspaper.Article` class used
# further down to download and parse the page behind a URL.
# NOTE(review): `!pip` runs in a shell and may target a different Python
# environment than the running kernel — confirm the package is importable.
!pip install newspaper3k

In [None]:
import nltk
# Fetch the NLTK stop-word lists and the Punkt tokenizer data.
# NOTE(review): nothing visible in this notebook uses these resources directly;
# presumably they are needed by newspaper's text processing — confirm.
nltk.download('stopwords')
nltk.download('punkt')

# Spark setup & load model

In [1]:
from pyspark import SparkContext,SQLContext,SparkConf

# Local Spark configuration: master 'local[4]' with generous memory limits,
# since the topic matrix and model results are pulled back to the driver.
conf = SparkConf()
conf.setMaster('local[4]')
conf.set('spark.executor.memory', '8g')
conf.set('spark.driver.memory', '14g')
conf.set('spark.driver.maxResultSize', '14g')
#conf.set('spark.yarn.executor.memoryOverhead','800m')

# getOrCreate() keeps this cell re-runnable: constructing SparkContext(conf=conf)
# a second time in the same kernel raises ValueError because only one context
# may be active. When a context already exists it is reused as-is (the settings
# above only take effect when the context is first created).
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

In [12]:
from pyspark.ml import Pipeline,PipelineModel
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, StopWordsRemover
from pyspark.ml.clustering import LocalLDAModel

# Load the saved feature pipeline and the saved LDA topic model.
# NOTE(review): both paths are absolute local-filesystem URIs under /wikipedia;
# adjust them to wherever the models were written on this machine.
# NOTE(review): presumably the pipeline is tokenizer -> stop-word remover ->
# count vectorizer (the classes imported above) — confirm; later code reads
# the vocabulary from stage index 2.
textModel = PipelineModel.load('file:///wikipedia/text_model')
ldaModel = LocalLDAModel.load('file:///wikipedia/lda_model')

In [31]:
import numpy as np
import pandas as pd

# Term-by-topic weight matrix of the LDA model: column i holds the term
# weights of topic i, and row indices line up with `vocab` below.
X = ldaModel.topicsMatrix().toArray()

# Vocabulary of the pipeline's third stage, as an array so it can be indexed
# with the integer positions returned by argsort.
vocab = np.array(textModel.stages[2].vocabulary)

# One human-readable label per topic: its 5 highest-weighted terms joined by
# spaces. The number of topics is taken from the matrix itself rather than
# hard-coded, so the labels always line up with the model's topic distribution.
topicLabels = [
    ' '.join(vocab[np.argsort(X[:, i])[::-1][:5]])
    for i in range(X.shape[1])
]

def score_topics(text):
    """Score a single piece of text against every LDA topic.

    The text is wrapped in a one-row DataFrame with a ``text`` column, passed
    through ``textModel`` and then ``ldaModel``, and the resulting
    ``topicDistribution`` vector is returned as a pandas Series keyed by the
    entries of ``topicLabels``.

    Parameters
    ----------
    text : str
        Raw document text to score.

    Returns
    -------
    pandas.Series
        Topic label -> topic weight for ``text``.
    """
    single_doc = pd.DataFrame({'text': [text]})
    features = textModel.transform(sqlContext.createDataFrame(single_doc))
    distribution = ldaModel.transform(features).select('topicDistribution')
    # Exactly one document went in, so the first collected row is the result.
    first_row = distribution.collect()[0]
    weights = first_row.topicDistribution.toArray()
    return pd.Series(dict(zip(topicLabels, weights)))

# Score URL topics

In [None]:
%pylab inline

In [100]:
from newspaper import Article
from ipywidgets import Text,Button,HBox,VBox, Output
from IPython.display import clear_output

# Output area shown under the URL box; plot_topics clears it and draws the
# topic chart into it on every button click.
o = Output()

def plot_topics(e):
    """Button callback: chart the top topics of the article at the entered URL.

    Reads the URL from the ``text`` widget, downloads and parses the article,
    scores its text with ``score_topics`` and draws the five highest-scoring
    topics as a horizontal bar chart inside the ``o`` output area.

    All work happens inside the output context so that messages and any
    download/parse error are shown in the output area instead of being lost
    in the widget callback.

    Parameters
    ----------
    e : ipywidgets.Button
        The clicked button (passed by ``on_click``; unused).
    """
    # Strip stray whitespace that commonly comes along with a pasted URL.
    url = text.value.strip()
    with o:
        # Drop the previous chart or message before producing new output.
        clear_output()
        if not url:
            print('Enter an article URL first.')
            return
        article = Article(url)
        article.download()
        article.parse()
        # head() keeps the 5 largest scores; reversing puts the largest bar
        # at the top of the horizontal bar chart.
        score_topics(article.text).sort_values(ascending=False).head()[::-1].plot(kind='barh')
        plt.show()
    
# URL entry box. 'URL' is shown as a placeholder hint so the field starts
# empty; passing it positionally would make the literal string "URL" the
# field's value, which would then be submitted as the address.
text = Text(placeholder='URL')
button = Button(description='Score Topics')
button.on_click(plot_topics)

# Input row on top, chart output underneath; as the last expression of the
# cell this renders the widget.
VBox([HBox([text, button]), o])

A Jupyter Widget