In [1]:
##Part 2:
##BUilding Logistic Regrssion Model building
##importing required libraries 
import pandas as pd
import sys
import csv
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import json
from textblob import TextBlob
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
import pyspark.sql.types as tp
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row
import datetime

# initializing spark session
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)
    
# define the schema
my_schema = tp.StructType([
  tp.StructField(name= 'label',       dataType= tp.IntegerType(),  nullable= True),
  tp.StructField(name= 'id',          dataType= tp.IntegerType(),  nullable= True),
  tp.StructField(name= 'Day',       dataType= tp.StringType(),   nullable= True),
  tp.StructField(name= 'device',       dataType= tp.StringType(),   nullable= True),
  tp.StructField(name= 'user',       dataType= tp.StringType(),   nullable= True),
  tp.StructField(name= 'tweet',       dataType= tp.StringType(),   nullable= True)
])
    
# read the training dataset  
my_data = spark.read.csv('testdata.manual.2009.06.14.csv',
                         schema=my_schema,
                         header=True)

# view the data
my_data.show(5)

# print the schema of the file
my_data.printSchema()

+-----+---+--------------------+-------+------------+--------------------+
|label| id|                 Day| device|        user|               tweet|
+-----+---+--------------------+-------+------------+--------------------+
|    4|  4|Mon May 11 03:18:...|kindle2|      vcu451|Reading my kindle...|
|    4|  5|Mon May 11 03:18:...|kindle2|      chadfu|Ok, first assesme...|
|    4|  6|Mon May 11 03:19:...|kindle2|       SIX15|@kenburbary You'l...|
|    4|  7|Mon May 11 03:21:...|kindle2|    yamarama|@mikefish  Fair e...|
|    4|  8|Mon May 11 03:22:...|kindle2|GeorgeVHulme|@richardebaker no...|
+-----+---+--------------------+-------+------------+--------------------+
only showing top 5 rows

root
 |-- label: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- Day: string (nullable = true)
 |-- device: string (nullable = true)
 |-- user: string (nullable = true)
 |-- tweet: string (nullable = true)



In [2]:

#stage 1: tokenizing the tweet into tokens   
stage_1 = RegexTokenizer(inputCol= 'tweet' , outputCol= 'tokens', pattern= '\\W')
#stage 2: removing the stop words
stage_2 = StopWordsRemover(inputCol= 'tokens', outputCol= 'filtered_words')
#stage 3: creating the word vector of size 100
stage_3 = Word2Vec(inputCol= 'filtered_words', outputCol= 'vector', vectorSize= 100)
# define stage 4: Logistic Regression Model
model = LogisticRegression(featuresCol= 'vector', labelCol= 'label')

In [3]:
# setting up the pipeline
pipeline = Pipeline(stages= [stage_1, stage_2, stage_3, model])

# fit the pipeline model with the training data
pipelineFit = pipeline.fit(my_data)

In [None]:
# define a function to compute sentiments of the received tweets
import time
x_value=0
def get_prediction(tweet_text):
	try:
		global x_value   
		fieldnames = ["x_value","total_1","pospercent","neupercent"]

 # filter the tweets whose length is greater than 0
		tweet_text = tweet_text.filter(lambda x: len(x) > 0)
    # create a dataframe with column name 'tweet' and each row will contain the tweet
		rowRdd = tweet_text.map(lambda w: Row(tweet=w))
    # create a spark dataframe
		wordsDataFrame = spark.createDataFrame(rowRdd)
    # run the model using the pipeline and get the predicted value

		total=pipelineFit.transform(wordsDataFrame).select('tweet','prediction')
		negtweets=total.where(total['prediction']=='0.0').select('tweet','prediction')
		postweets=total.where(total['prediction']=='4.0').select('tweet','prediction')
		nuetraltweet=total.where(total['prediction']=='2.0').select('tweet','prediction')        
		percent=((negtweets.count()/total.count())*100)
		pospercent=((postweets.count()/total.count())*100)
		neupercent=((nuetraltweet.count()/total.count())*100)        
		print("Negative tweets percentage:%",percent)
		print("Positive tweets percentage:%",pospercent)
		print("Neutral tweets percentage:%",neupercent)        
		total.show()
    ##Writing the csv file with the % share of tweets from each fetch
		with open('percent.csv', 'a') as csv_file:
			csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
			info = {
			"total_1": percent,
			"pospercent": pospercent,
			"neupercent": neupercent                    
			}
			csv_writer.writerow(info)

		time.sleep(1)

	except : 
		print('No data')
		e = sys.exc_info()[0]
		print("This Error: %s" % e)
    
# Initialize the streaming context 
ssc = StreamingContext(sc, batchDuration= 3)##Time interval of 3 seconds

# Creating the DStream which connects to hostname:port, like localhost:9099
lines = ssc.socketTextStream('localhost', 9099) ##ports to be noted before fetching tweets

# split the tweet text by a keyword 'TWEET_APP' so that we can identify which set of words is from a single tweet
words = lines.flatMap(lambda line : line.split('TWEET_APP'))

##Filter to seperate tweets for a Particular Store (Costco or Walmart)
words=words.filter(lambda s:'costco' in s)

##Calling the get_Prediction function to predict the sentiments 
words.foreachRDD(get_prediction)

# Start streaming
ssc.start()             

# Wait for the computation to terminate
ssc.awaitTermination() 

Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @SusanLiTV: #C...|       0.0|
|Concerns over a #...|       0.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 0.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 100.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|Have fun in your ...|       2.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 0.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 100.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|@Walmart

No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 66.66666666666666
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 33.33333333333333
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|@IamTheBeard10 @T...|       0.0|
|* You don’t think...|       0.0|
|Nice approach at ...|       2.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|NOW @Costco IS "U...|       0.0|
|FUCK THEM AND THE...|       0.0|
|😡😡😡😡… https:/...|       0.0|
|@TreadLightly_RE ...|       0.0|
+---------

No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 0.0
Positive tweets percentage:% 100.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|Someone let me us...|       4.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|bro what i wouldn...|       0.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 0.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 100.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|R

No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 50.0
Positive tweets percentage:% 50.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|I want Costco coo...|       4.0|
|I had a temp job ...|       0.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 0.0
Positive tweets percentage:% 100.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @SEDLAW15: Quo...|       4.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No dat

No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 0.0
Positive tweets percentage:% 100.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|@giirlybabe I got...|       4.0|
|RT @SEDLAW15: Quo...|       4.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 66.66666666666666
Positive tweets percentage:% 16.666666666666664
Neutral tweets percentage:% 16.666666666666664
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @BardsOfWar: C...|       0.0|
|              Masks.|       0.0|
|  Social distancing.|       0.0|
|     Food rationing.|       2.0|
|    You are witness…|       0.0|
|RT @SEDLAW15: Quo...|       4.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>

Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|@krismeetsworld T...|       0.0|
+--------------------+----------+

Edn
Negative tweets percentage:% 0.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 100.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|@BrandiKruse @EME...|       2.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|I will always fuc...|       0.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tw

No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 0.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 100.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|That is one well-...|       2.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 80.0
Positive tweets percentage:% 20.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @MrHodl: Alway...|       4.0|
|https://t.co/QPU1...|       0.0|
|RT @Sam2323_43433...|       0.0|
|        🚧🚧🚧🚧🚧🚧|       0.0|
|All, Employees An...|       0.0|
+--------------------+---------

No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 75.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 25.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @V_actually: C...|       0.0|
|Is this a ploy to...|       2.0|
|Or a push for lab...|       0.0|
|Costco, Kroger ra...|       0.0|
+--------------------+----------+

Edn
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @Sam2323_43433...|       0.0|
|        🚧🚧🚧🚧🚧🚧|       0.0|
|All, Employees An...|       0.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 66.66666666666666
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 33.33333333333333
+--------------------+----------+
|               tweet|prediction|
+---------

Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|Support your loca...|       0.0|
| #PopowichMeats #Yeg|       0.0|
+--------------------+----------+

Edn
Negative tweets percentage:% 40.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 60.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
| RT @COsweda: Thread|       2.0|
|      What a shame. |       0.0|
|@Timcast went bac...|       2.0|
|It looks like he'...|       0.0|
|RT @Renata564: "W...|       2.0|
+--------------------+----------+

Edn
Negative tweets percentage:% 0.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 100.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|As I havd said.  ...|       2.0|
+--------------------+----------+

Edn
No da

No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|@RinTohsaka710 @B...|       0.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @Sam2323_43433...|       0.0|
|        🚧🚧🚧🚧🚧🚧|       0.0|
|All, Employees An...|       0.0|
+--------------------+----------+

Edn
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|       

No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|Like damn did y’a...|       0.0|
+--------------------+----------+

Edn
Negative tweets percentage:% 75.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 25.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @Sam2323_43433...|       0.0|
|        🚧🚧🚧🚧🚧🚧|       0.0|
|All, Employees An...|       0.0|
|When I am DONE wi...|       2.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 50.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 50.0
+-------

Negative tweets percentage:% 0.0
Positive tweets percentage:% 100.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @SEDLAW15: Quo...|       4.0|
+--------------------+----------+

Edn
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|@kefimochi @piq91...|       0.0|
+--------------------+----------+

Edn
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @nypost: The m...|       0.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @ForbesLorne: ...|       2.0|
|#PopowichMeats #Y...|       0.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|@PamelaGeller @Ox...|       0.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 0.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 100.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|@takintime @Costc...|       2.0|
+--------------------+----------+

Edn
Negative tweets percentage:% 100.0
Positive tweets percentage:% 0.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|pre

+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|@LisaMei62 Costco...|       0.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 0.0
Positive tweets percentage:% 100.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|RT @PeterFotopoul...|       4.0|
+--------------------+----------+

Edn
No data
This Error: <class 'ValueError'>
Negative tweets percentage:% 0.0
Positive tweets percentage:% 100.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|So much ass in th...|       4.0|
+--------------------+----------+

Edn
Negative tweets percentage:% 0.0
Positive tweets percentage:% 100.0
Neutral tweets percentage:% 0.0
+--------------------+----------+
|               tweet|prediction|
+--------------------+---