In [1]:
!pip install pandas numpy boto3 textblob



In [2]:
import sys
sys.path.append('../utils/')

In [3]:
from utils import *

In [4]:
from textblob import TextBlob

In [5]:
from collections import Counter

In [6]:
import pandas as pd
import numpy as np
from collections import OrderedDict
import boto3

In [7]:
import pyspark as ps
import warnings
from pyspark.sql import SQLContext

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import *

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/hadoop/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
try:
    # create SparkContext on all CPUs available: in my case I have 1 CPUs on my laptop
    # https://towardsdatascience.com/sentiment-analysis-with-pyspark-bc8e83f80c35
    conf = ps.SparkConf().setAppName("App")
    conf = (conf.setMaster('local[*]')
        .set('spark.executor.memory', '40g')
        .set('spark.driver.memory', '40g')
        .set('spark.driver.maxResultSize', '40g'))
    sc = ps.SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")

Just created a SparkContext


In [10]:
s3 = boto3.resource('s3',  region_name='us-east-1')
# obj = s3.Object(bucket_name='st1800newsdataset', key='dataset/news.csv')
obj = s3.Object(bucket_name='newsaws', key='news.csv')
data_frame = pd.read_csv(obj.get()['Body'])

In [11]:
data_frame.astype(str).head(20)

Unnamed: 0,id,id_news,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."
5,5,17288,"Sick With a Cold, Queen Elizabeth Misses New Y...",New York Times,Sewell Chan,2017-01-02,2017.0,1.0,,"LONDON — Queen Elizabeth II, who has been b..."
6,6,17289,Taiwan’s President Accuses China of Renewed In...,New York Times,Javier C. Hernández,2017-01-02,2017.0,1.0,,BEIJING — President Tsai of Taiwan sharpl...
7,7,17290,"After ‘The Biggest Loser,’ Their Bodies Fought...",New York Times,Gina Kolata,2017-02-08,2017.0,2.0,,"Danny Cahill stood, slightly dazed, in a blizz..."
8,8,17291,"First, a Mixtape. Then a Romance. - The New Yo...",New York Times,Katherine Rosman,2016-12-31,2016.0,12.0,,"Just how is Hillary Kerr, the founder of ..."
9,9,17292,Calling on Angels While Enduring the Trials of...,New York Times,Andy Newman,2016-12-31,2016.0,12.0,,Angels are everywhere in the Muñiz family’s ap...


In [12]:
data_frame.count()

id             142570
id_news        142570
title          142568
publication    142570
author         126694
date           139929
year           139929
month          139929
url             85559
content        142570
dtype: int64

In [13]:
data_frame.dropna(axis=0, inplace=True)
data_frame.count()

id             75288
id_news        75288
title          75288
publication    75288
author         75288
date           75288
year           75288
month          75288
url            75288
content        75288
dtype: int64

In [14]:
df = sqlContext.createDataFrame(data_frame.astype(str))

In [15]:
df.show(100)

+-----+-------+--------------------+-----------+--------------------+----------+------+-----+--------------------+--------------------+
|   id|id_news|               title|publication|              author|      date|  year|month|                 url|             content|
+-----+-------+--------------------+-----------+--------------------+----------+------+-----+--------------------+--------------------+
|60330|  82596|Donald Trump blas...|   Fox News|  Christopher Snyder|2015-12-30|2015.0| 12.0|https://web.archi...| Donald Trump lau...|
|60334|  82600|Drop in oil price...|   Fox News|      Brooke Singman|2016-01-01|2016.0|  1.0|https://web.archi...| The plunge in oi...|
|60338|  82605|Open carry comes ...|   Fox News|         John R Lott|2015-12-30|2015.0| 12.0|https://web.archi...| With about 900, ...|
|60352|  82621|GOP field rips Ob...|   Fox News|        Joseph Weber|2016-01-03|2016.0|  1.0|https://web.archi...| Republican presi...|
|60360|  82630|President Obama w...|   Fox News|

In [16]:
df.count()

75288

In [17]:
df = df.dropna()
df.count()

75288

In [18]:
news_list = df.toPandas()['content'].to_list()

In [19]:
polarity, _ = getSentimentAnalysis(news_list)

In [20]:
sentiment_labels = [getSentimentLabels(score) for score in polarity]

In [21]:
sentiment_labels

['NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'NEUTRAL',
 'POSITIVE',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEUTRAL',
 'POSITIVE',
 'NEUTRAL',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'NEUTRAL',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'POSITIVE',
 'NEUTRAL',
 'POSITIVE',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'POSITIVE',
 'POSIT

In [26]:
num_labels = Counter(sentiment_labels)

In [27]:
num_labels

Counter({'NEUTRAL': 46139, 'POSITIVE': 28155, 'NEGATIVE': 994})

In [28]:
from pyspark.sql.types import StringType

In [37]:
labels_df = ps.createDataFrame(sentiment_labels, IntegerType()).show()

NameError: name 'spark' is not defined

In [31]:
(train_set, val_set, test_set) = df.randomSplit([0.6, 0.2, 0.2], seed = 2000)

TypeError: randomSplit() got an unexpected keyword argument 'test_size'

In [None]:
train_set.show(100)

In [None]:
tokenizer = Tokenizer(inputCol="content", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

In [None]:
pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.show(5)

In [None]:
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_set)
predictions = lrModel.transform(val_set)