In [1]:
import numpy as np
import pandas as pd
import os
import csv
import seaborn as sns
import json
from decimal import Decimal
import nltk
#nltk.download()
from nltk.corpus import stopwords
import string
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from afinn import Afinn
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# analyser = SentimentIntensityAnalyzer()
afinn=Afinn()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

In [2]:
def text_to_sentiment_score(text):
    """Return the AFINN sentiment score for ``text``.

    Delegates to the ``score`` method of the notebook-level ``afinn`` scorer
    and returns its result unchanged.
    """
    sentiment_value = afinn.score(text)
    return sentiment_value

def sentiment_score_to_category(score):
    """Map a numeric sentiment score to a category label.

    Args:
        score: Numeric sentiment score, or ``None`` when the value is missing.

    Returns:
        ``'positive'`` for scores above zero, ``'negative'`` for scores below
        zero, ``'neutral'`` for exactly zero, and ``None`` when the score is
        ``None`` or NaN (no comparison holds for those).
    """
    # A missing score must not be compared: Python 3 raises TypeError on
    # ``None > 0`` and Python 2 orders None below every number, which would
    # silently label missing rows as 'negative'.
    if score is None:
        return None
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    if score == 0:
        return 'neutral'
    # NaN fails every comparison above; report it explicitly as missing.
    return None
      

def clean_text(normalized_text):
    """Strip punctuation and English stop words from ``normalized_text``.

    Args:
        normalized_text: Text to clean; iterated character by character.

    Returns:
        list: The whitespace-separated words (original casing kept) whose
        lower-cased form is not in NLTK's English stop-word list.
    """
    # Build both lookup sets once per call; the stop-word list was previously
    # re-fetched and scanned linearly for every single word.
    punctuation = set(string.punctuation)
    english_stopwords = set(stopwords.words('english'))
    nopunc = ''.join(char for char in normalized_text if char not in punctuation)
    return [word for word in nopunc.split() if word.lower() not in english_stopwords]
      
# Spark UDF wrappers around the helper functions defined in this cell.
maxUdf = udf(text_to_sentiment_score, FloatType())
maxUdf1 = udf(sentiment_score_to_category, StringType())
# clean_text returns a list of words, so the declared return type has to be
# an array of strings rather than a plain string.
maxUdf2 = udf(clean_text, ArrayType(StringType()))


In [3]:
# Directory prefix for the uploaded tables (not referenced by the load below).
inputPath = "/FileStore/tables/"

# Read the episode CSV with a header row, letting Spark infer column types.
episode_reader = (
    sqlContext.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
)
df_episode = episode_reader.load('/FileStore/tables/Episode24.csv')

In [4]:
# Show the loaded episode DataFrame in the notebook output.
display(df_episode)

In [5]:
# Load the episode CSV (header row, inferred column types).
csv_reader = sqlContext.read.format("csv").options(header="true", inferSchema="true")
data = csv_reader.load("/FileStore/tables/Episode24.csv")

# Mark the freshly loaded frame for caching, then keep only rows without
# missing values.
data.cache()
data = data.dropna()

# Register the cleaned frame as "data_tbl" so it is reachable through the
# SQL context (API noted in the original as being for Apache Spark 2.0).
data.createOrReplaceTempView("data_tbl")

In [6]:
# Score each row's text, then bucket that score into a sentiment label.
data = (
    data
    .withColumn("sentiment_score", maxUdf('normalized_text'))
    .withColumn("sentiment", maxUdf1('sentiment_score'))
)

In [7]:
# Re-register the view so "data_tbl" points at the DataFrame currently bound
# to `data`, then show that DataFrame.
data.createOrReplaceTempView("data_tbl")
display(data)

In [8]:
# Keep only the three columns used from here on, then rebuild the DataFrame
# from the underlying RDD.
model_columns = data.select("episode_id", "normalized_text", "sentiment")
data = model_columns.rdd.toDF()

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Columns treated as categorical features.
categoricalColumns = ["normalized_text"]
# Ordered list of pipeline stages; stages are only collected here and are
# run later, all at once.
stages = []
for categoricalCol in categoricalColumns:
    indexed_name = categoricalCol + "Index"
    # Category indexing with StringIndexer.
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=indexed_name)
    # One-hot encode the index column into the "<column>classVec" column.
    encoder = OneHotEncoder(inputCol=indexed_name, outputCol=categoricalCol + "classVec")
    stages.extend([stringIndexer, encoder])

In [10]:
# Index the "sentiment" strings into the numeric "label" column and queue
# the indexer as the next pipeline stage.
label_stringIdx = StringIndexer(inputCol="sentiment", outputCol="label")
stages.append(label_stringIdx)

In [11]:
# Capture the current column names; `cols` is reused later when selecting
# the columns to keep after the pipeline transform.
cols = data.columns
print(cols)

In [12]:
# Transform all features into a vector using VectorAssembler
numericCols = ["episode_id"]
# Build a real list: on Python 3 map() returns an iterator, which cannot be
# concatenated with a list. The comprehension behaves the same on Python 2.
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [13]:
# Wrap the collected stages in a single Pipeline.
pipeline = Pipeline(stages=stages)

# fit() computes whatever feature statistics the stages need;
# transform() then applies the fitted stages to the same DataFrame.
pipelineModel = pipeline.fit(data)
data = pipelineModel.transform(data)

# Keep label/features plus the original columns captured in `cols`.
selectedcols = ["label", "features"]
selectedcols.extend(cols)
data = data.select(selectedcols)
display(data)

In [14]:
# Split the data 70/30 into training and test sets with a fixed seed.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=100)
# print(...) with a single argument prints the same on Python 2 and is
# valid syntax on Python 3, unlike the bare print statement.
print(trainingData.count())
print(testData.count())

In [15]:
from pyspark.ml.classification import LogisticRegression

# Initial logistic-regression estimator, capped at 10 iterations.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    maxIter=10,
)

# Fit it on the training split.
lrModel = lr.fit(trainingData)

In [16]:
# Make predictions on test data using the transform() method.
# LogisticRegression.transform() will only use the 'features' column.
# The result is bound to `predictions`, a name later cells re-assign for
# the other models.
predictions = lrModel.transform(testData)

In [17]:
# Print the schema of the logistic-regression predictions DataFrame.
predictions.printSchema()

In [18]:
# Show the label, predicted class and class probabilities next to the
# source text.
selected = predictions.select("label", "prediction", "probability", "normalized_text")
display(selected)

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Evaluate model
# The label is indexed from the three-way sentiment column
# (positive / negative / neutral), so a binary ROC evaluator does not apply.
# Score the predicted class with a multiclass metric instead.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
evaluator.evaluate(predictions)

In [20]:
# Report which metric the evaluator computes.
evaluator.getMetricName()

In [21]:
# List the logistic-regression parameters. print(...) with one argument
# works on both Python 2 and Python 3.
print(lr.explainParams())

In [22]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyper-parameter grid for cross validation: three candidate values each for
# regularisation strength, elastic-net mixing and iteration cap.
lr_grid_builder = ParamGridBuilder()
lr_grid_builder = lr_grid_builder.addGrid(lr.regParam, [0.01, 0.5, 2.0])
lr_grid_builder = lr_grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
lr_grid_builder = lr_grid_builder.addGrid(lr.maxIter, [1, 5, 10])
paramGrid = lr_grid_builder.build()

In [23]:
# 5-fold cross validation of `lr` over `paramGrid`, scored by `evaluator`.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)

# Fitting builds and tests many models, so this will likely take a fair
# amount of time.
cvModel = cv.fit(trainingData)

In [24]:
# Score the test split with the cross-validated logistic-regression model.
predictions = cvModel.transform(testData)

In [25]:
# cvModel uses the best model found from the Cross Validation.
# Evaluate the test-split predictions that cvModel produced in the previous cell.
evaluator.evaluate(predictions)

In [26]:
# Python 3 compatible print. The two spaces after the colon reproduce the
# separator the Python 2 print statement inserted between its two arguments.
print('Model Intercept:  {}'.format(cvModel.bestModel.interceptVector))

In [27]:
from pyspark.ml.classification import DecisionTreeClassifier

# Initial decision-tree estimator, limited to depth 3.
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=3,
)

# Fit it on the training split.
dtModel = dt.fit(trainingData)

In [28]:
# Python 3 compatible prints. The two spaces after "=" reproduce the
# separator the Python 2 print statement inserted between its two arguments.
print("numNodes =  {}".format(dtModel.numNodes))
print("depth =  {}".format(dtModel.depth))

In [29]:
# Make predictions on test data using the Transformer.transform() method.
# Re-binds `predictions` to the decision-tree output.
predictions = dtModel.transform(testData)

In [30]:
# Print the schema of the decision-tree predictions DataFrame.
predictions.printSchema()

In [31]:
# View the model's predictions and the probabilities of each prediction
# class, next to the source text and its sentiment value.
selected = predictions.select("label", "prediction", "probability", "normalized_text", "sentiment")
display(selected)

In [32]:
## Random Forest

In [33]:
from pyspark.ml.classification import RandomForestClassifier

# Initial random-forest estimator with default hyper-parameters.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
)

# Fit it on the training split.
rfModel = rf.fit(trainingData)

In [34]:
# Make predictions on test data using the Transformer.transform() method,
# re-binding `predictions` to the random-forest output, then print its schema.
predictions = rfModel.transform(testData)
predictions.printSchema()

In [35]:
# View the model's predictions and the probabilities of each prediction
# class, next to the source text and its sentiment value.
selected = predictions.select("label", "prediction", "probability", "normalized_text", "sentiment")
display(selected)

In [36]:
## Evaluate using Binary classification

In [37]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Evaluate model
# The label is indexed from the three-way sentiment column
# (positive / negative / neutral), so a binary ROC evaluator does not apply.
# Score the predicted class with a multiclass metric instead.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
evaluator.evaluate(predictions)

In [38]:
# Create ParamGrid for Cross Validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Random-forest search grid: tree depth, bin count and number of trees.
rf_grid_builder = ParamGridBuilder()
rf_grid_builder = rf_grid_builder.addGrid(rf.maxDepth, [2, 4, 6])
rf_grid_builder = rf_grid_builder.addGrid(rf.maxBins, [20, 60])
rf_grid_builder = rf_grid_builder.addGrid(rf.numTrees, [5, 20])
paramGrid = rf_grid_builder.build()

In [39]:
# 5-fold cross validation of `rf` over `paramGrid`, scored by `evaluator`.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)

# Run cross validations. This can take about 6 minutes since it is training
# over 20 trees!
cvModel = cv.fit(trainingData)

In [40]:
# cvModel uses the best model found from the Cross Validation.
# Re-score the test split with it first: without this, `predictions` still
# holds the output of the untuned rfModel and the tuned model is never
# actually evaluated.
predictions = cvModel.transform(testData)
# Evaluate best model
evaluator.evaluate(predictions)

In [41]:
# View Best model's predictions and probabilities of each prediction class.
# NOTE(review): this shows whatever `predictions` was last assigned to;
# confirm it was produced by the cross-validated model before relying on it.
selected = predictions.select("label", "prediction", "probability", "normalized_text", "sentiment")
display(selected)

In [42]:
# Keep a handle on the best model selected by the cross validator.
bestModel = cvModel.bestModel

In [43]:
# Generate predictions for entire dataset.
# NOTE(review): `data` is the frame the training split was drawn from, so
# these predictions include rows the model was trained on.
finalPredictions = bestModel.transform(data)

In [44]:
# Evaluate best model on the predictions for the entire dataset.
# NOTE(review): that dataset includes the training rows, so this is not a
# held-out score.
evaluator.evaluate(finalPredictions)

In [45]:
# Register the final predictions as the "finalPredictions" view so SQL
# cells can query it.
finalPredictions.createOrReplaceTempView("finalPredictions")

In [46]:
%sql
-- Return every row and column of the registered finalPredictions view.
select * from finalPredictions