In [None]:
from pyspark.sql import SparkSession


In [None]:
spark = SparkSession.builder.appName('CoronavirusNLP').getOrCreate()

In [None]:
data = spark.read.csv('Corona_NLP_train.csv', header = True)

In [None]:
data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|            UserName|          ScreenName|            Location|             TweetAt|       OriginalTweet|Sentiment|
+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|                3799|               48751|              London|          16-03-2020|@MeNyrbie @Phil_G...|  Neutral|
|                3800|               48752|                  UK|          16-03-2020|advice Talk to yo...| Positive|
|                3801|               48753|           Vagabonds|          16-03-2020|Coronavirus Austr...| Positive|
|                3802|               48754|                null|          16-03-2020|My food stock is ...|     null|
|              PLEASE|         don't panic| THERE WILL BE EN...|                null|                null|     null|
|           Stay calm|          stay safe.|                null|

In [None]:
data.describe()

DataFrame[summary: string, UserName: string, ScreenName: string, Location: string, TweetAt: string, OriginalTweet: string, Sentiment: string]

In [None]:
data.columns

['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']

In [None]:
print((data.count(),len(data.columns)))

(68046, 6)


In [1]:
#Data preparation


In [None]:
from pyspark.sql.functions import length

In [None]:
data = data.withColumn('Tweet_length', length(data['OriginalTweet']))

In [None]:
data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+---------+------------+
|            UserName|          ScreenName|            Location|             TweetAt|       OriginalTweet|Sentiment|Tweet_length|
+--------------------+--------------------+--------------------+--------------------+--------------------+---------+------------+
|                3799|               48751|              London|          16-03-2020|@MeNyrbie @Phil_G...|  Neutral|         111|
|                3800|               48752|                  UK|          16-03-2020|advice Talk to yo...| Positive|         237|
|                3801|               48753|           Vagabonds|          16-03-2020|Coronavirus Austr...| Positive|         131|
|                3802|               48754|                null|          16-03-2020|My food stock is ...|     null|          51|
|              PLEASE|         don't panic| THERE WILL BE EN...|                null|     

In [None]:
sentiments= ['Positive','Negative','Neutral','Extremely Positive','Extremely Negative']

In [None]:
#data.groupby('Sentiment').mean().show()

In [None]:
df = data.filter(data.Sentiment.isin(sentiments))

In [None]:
df.select('Sentiment').distinct().show()

+------------------+
|         Sentiment|
+------------------+
|Extremely Negative|
|           Neutral|
|          Positive|
|          Negative|
|Extremely Positive|
+------------------+



In [None]:
df.select('Sentiment').distinct().count()

5

In [None]:
df.groupby('Sentiment').count().show()

+------------------+-----+
|         Sentiment|count|
+------------------+-----+
|Extremely Negative| 3751|
|           Neutral| 5224|
|          Positive| 7718|
|          Negative| 6857|
|Extremely Positive| 4412|
+------------------+-----+



## Here we have grouped the type of sentiments i.e. extremely negative,neutral,positive,negative,extremely positive


In [None]:
df.groupby('Sentiment').mean().show()

+------------------+------------------+
|         Sentiment| avg(Tweet_length)|
+------------------+------------------+
|Extremely Negative| 209.6656891495601|
|           Neutral| 151.2949846860643|
|          Positive|193.66195905675045|
|          Negative| 189.6651596908269|
|Extremely Positive| 215.0605167724388|
+------------------+------------------+



## We have calculated the mean of tweets

In [None]:
df.groupby('Location').mean().show()

+--------------------+------------------+
|            Location| avg(Tweet_length)|
+--------------------+------------------+
|                 ...|             197.0|
| Mumbai, Maharashtra|154.66666666666666|
| Brisbane, Australia|             207.0|
|West Woofle-Dust ...|             157.0|
|   St Petersburg, FL|169.57142857142858|
| All across Michigan|             224.0|
|     Northumberland |             280.0|
|     stoke on trent |             187.0|
|some where around...|             126.0|
|           Bangalore|176.21052631578948|
|           Norn Iron|             244.0|
|Horsham, Pennsylv...|             189.0|
|       Shimla  India|              89.0|
|Ferrara, Emilia R...|             230.0|
|      Luton, England|             198.0|
|              Heaven|             198.0|
|       St George, UT|             188.0|
|Just to the left ...|             205.0|
|           Worcester|             258.5|
|      Nellore/Canada|             280.0|
+--------------------+------------

In [None]:
df.groupby('Location').count().show()

+--------------------+-----+
|            Location|count|
+--------------------+-----+
|                 ...|    1|
| Mumbai, Maharashtra|    3|
| Brisbane, Australia|    4|
|West Woofle-Dust ...|    1|
|   St Petersburg, FL|    7|
| All across Michigan|    1|
|     Northumberland |    1|
|     stoke on trent |    1|
|some where around...|    1|
|           Bangalore|   19|
|           Norn Iron|    1|
|Horsham, Pennsylv...|    1|
|       Shimla  India|    1|
|Ferrara, Emilia R...|    1|
|      Luton, England|    1|
|              Heaven|    1|
|       St George, UT|    1|
|Just to the left ...|    1|
|           Worcester|    2|
|      Nellore/Canada|    1|
+--------------------+-----+
only showing top 20 rows



In [None]:
df.show()

+--------+----------+--------------------+----------+--------------------+------------------+------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|         Sentiment|Tweet_length|
+--------+----------+--------------------+----------+--------------------+------------------+------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|           Neutral|         111|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...|          Positive|         237|
|    3801|     48753|           Vagabonds|16-03-2020|Coronavirus Austr...|          Positive|         131|
|    3804|     48756|ÜT: 36.319708,-82...|16-03-2020|As news of the re...|          Positive|         249|
|    3805|     48757|35.926541,-78.753267|16-03-2020|"Cashier at groce...|          Positive|         184|
|    3807|     48759|     Atlanta, GA USA|16-03-2020|Due to COVID-19 o...|          Positive|         280|
|    3808|     48760|    BHAVNAGAR,GU

In [None]:
print((df.count(),len(df.columns)))

(27962, 7)


In [None]:
from pyspark.sql.functions import isnan,when,count,col

In [None]:
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+--------+----------+--------+-------+-------------+---------+------------+
|UserName|ScreenName|Location|TweetAt|OriginalTweet|Sentiment|Tweet_length|
+--------+----------+--------+-------+-------------+---------+------------+
|       0|         0|    6152|      0|            0|        0|           0|
+--------+----------+--------+-------+-------------+---------+------------+



In [None]:
#Feature Transformation

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, RegexTokenizer

In [None]:
tokenizer = Tokenizer(inputCol = "OriginalTweet", outputCol = "token_text")
stopremove = StopWordsRemover(inputCol="token_text", outputCol = "stop_tokens")
#Cleaned version of Tokens
#Counting Occurnace of tokens
count_vec = CountVectorizer(inputCol = "stop_tokens", outputCol = "c_vec")
idf = IDF(inputCol = "c_vec", outputCol = "tf_idf")

Corona_to_num = StringIndexer(inputCol = "Sentiment", outputCol = "label")

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [None]:
clean_up = VectorAssembler(inputCols =["tf_idf", "Tweet_length"], outputCol = "features")

In [None]:
#Model

In [None]:
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, DecisionTreeClassifier

In [None]:
NB= NaiveBayes()
RF = RandomForestClassifier(numTrees = 50)
DTC = DecisionTreeClassifier (maxDepth = 10)

In [None]:
#pipeline

In [None]:
from pyspark.ml import Pipeline

In [None]:
data_prep_pipeline = Pipeline(stages =[Corona_to_num,tokenizer, stopremove,count_vec,idf,clean_up])

In [None]:
cleaner = data_prep_pipeline.fit(df)

In [None]:
clean_data = cleaner.transform(df)

In [None]:
clean_data.show()

+--------+----------+--------------------+----------+--------------------+------------------+------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|         Sentiment|Tweet_length|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+--------+----------+--------------------+----------+--------------------+------------------+------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|           Neutral|         111|  2.0|[@menyrbie, @phil...|[@menyrbie, @phil...|(78305,[14499,289...|(78305,[14499,289...|(78306,[14499,289...|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...|          Positive|         237|  0.0|[advice, talk, t

In [None]:
clean_data = clean_data.select(['label','features'])

In [None]:
clean_data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  2.0|(78306,[14499,289...|
|  0.0|(78306,[13,14,133...|
|  0.0|(78306,[8,14,37,7...|
|  0.0|(78306,[7,8,31,47...|
|  0.0|(78306,[3,6,18,60...|
|  0.0|(78306,[1,6,8,13,...|
|  1.0|(78306,[11,13,14,...|
|  2.0|(78306,[48,70,147...|
|  3.0|(78306,[13,14,23,...|
|  0.0|(78306,[8,10,23,5...|
|  0.0|(78306,[4,8,24,38...|
|  4.0|(78306,[1,4,9,11,...|
|  1.0|(78306,[4,21,44,7...|
|  3.0|(78306,[10,37,54,...|
|  1.0|(78306,[4,8,24,33...|
|  4.0|(78306,[1,7,11,36...|
|  1.0|(78306,[1,4,7,34,...|
|  2.0|(78306,[5,47,48,6...|
|  0.0|(78306,[8,12,23,2...|
|  1.0|(78306,[6,28,33,9...|
+-----+--------------------+
only showing top 20 rows



In [None]:
#ML Training

In [None]:
(training,testing)=clean_data.randomSplit([0.8,0.2])

In [None]:
PredictNB = NB.fit(training)

In [None]:
PredictRF= RF.fit(training)

In [None]:
#testing the model

In [None]:
NB_results = PredictNB.transform(testing)

In [None]:
RF_results = PredictRF.transform(testing)

In [None]:
#DTC_results = PredictDTC.transform(testing)

In [None]:
NB_results.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(78306,[0,1,2,7,1...|[-1207.2654728345...|[1.23172711334096...|       4.0|
|  0.0|(78306,[0,1,2,12,...|[-1155.6053393367...|[4.49669991632944...|       3.0|
|  0.0|(78306,[0,1,2,16,...|[-1247.5586027990...|[0.99992116013007...|       0.0|
|  0.0|(78306,[0,1,2,16,...|[-1388.3384940412...|[5.52243859179497...|       4.0|
|  0.0|(78306,[0,1,2,29,...|[-1850.6003542964...|[2.37600010972831...|       4.0|
|  0.0|(78306,[0,1,2,30,...|[-2398.4784545940...|[0.99998786596188...|       0.0|
|  0.0|(78306,[0,1,2,40,...|[-1477.6725368856...|[6.24553590260341...|       2.0|
|  0.0|(78306,[0,1,3,4,6...|[-1168.4420450442...|[6.68937393488242...|       4.0|
|  0.0|(78306,[0,1,3,4,1...|[-1221.7800777800...|[5.24914792017073...|       2.0|
|  0.0|(78306,[0

In [None]:
RF_results.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(78306,[0,1,2,7,1...|[13.9174724015062...|[0.27834944803012...|       0.0|
|  0.0|(78306,[0,1,2,12,...|[14.4229994994904...|[0.28845998998980...|       0.0|
|  0.0|(78306,[0,1,2,16,...|[13.9739187701817...|[0.27947837540363...|       0.0|
|  0.0|(78306,[0,1,2,16,...|[13.8970632598946...|[0.27794126519789...|       0.0|
|  0.0|(78306,[0,1,2,29,...|[13.7672246487022...|[0.27534449297404...|       0.0|
|  0.0|(78306,[0,1,2,30,...|[14.1322917944135...|[0.28264583588827...|       0.0|
|  0.0|(78306,[0,1,2,40,...|[14.0790984865649...|[0.28158196973129...|       0.0|
|  0.0|(78306,[0,1,3,4,6...|[14.0381486927753...|[0.28076297385550...|       0.0|
|  0.0|(78306,[0,1,3,4,1...|[14.0958876813158...|[0.28191775362631...|       0.0|
|  0.0|(78306,[0

In [None]:
#DTC_results.show()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
eva = MulticlassClassificationEvaluator()
acc_NB = eva.evaluate(NB_results)

In [None]:
eva = MulticlassClassificationEvaluator()
acc_RF = eva.evaluate(RF_results)

In [None]:
print("accuracy of the NB and RF is ::", acc_NB, acc_RF)

accuracy of the NB and RF is :: 0.4050110016069719 0.11208634595726202
