In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.functions import when
from pyspark.sql.types import StringType
from textblob import TextBlob

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover

In [4]:
# !pip install pyspark

In [126]:
# Start (or reuse) the Spark session for this notebook
session_builder = SparkSession.builder.appName("Catch_tweets")
spark = session_builder.getOrCreate()

# Read the tweets CSV: the first row is the header and column types are inferred
tweets_reader = spark.read.options(header=True, inferSchema=True)
tweets_data = tweets_reader.csv("/content/drive/MyDrive/Data/tweets.csv")

# Define a user-defined function (UDF) for sentiment analysis using TextBlob
def analyze_sentiment(text):
    """Classify the sentiment of ``text`` from its TextBlob polarity.

    Args:
        text: Tweet text to classify. May be ``None`` or empty; Spark hands
            SQL NULL cells to a Python UDF as ``None``.

    Returns:
        ``'positive'`` when the polarity is above zero, ``'negative'`` when it
        is below zero, and ``'neutral'`` otherwise. Missing or empty text is
        reported as ``'neutral'`` without invoking TextBlob.
    """
    # TextBlob rejects non-string input with a TypeError, which would abort
    # the whole Spark job on the first null tweet.
    if not text:
        return 'neutral'

    # Read the polarity once instead of once per comparison
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

In [127]:
# Wrap analyze_sentiment as a Spark UDF that produces a string column
sentiment_udf = udf(f=analyze_sentiment, returnType=StringType())

In [128]:
# Append a "sentiments" column computed from each tweet's text
tweets_data = tweets_data.withColumn("sentiments", sentiment_udf(tweets_data["tweets"]))

In [129]:
# Preview the first 20 rows (long cell values are truncated)
tweets_data.show(n=20, truncate=True)

+----------+-------------------+--------+---------------+--------------------+----------+
|        id|               date|    flag|       username|              tweets|sentiments|
+----------+-------------------+--------+---------------+--------------------+----------+
|1467810672|2009-04-06 22:19:49|NO_QUERY|  scotthamilton|is upset that he ...|   neutral|
|1467810917|2009-04-06 22:19:53|NO_QUERY|       mattycus|@Kenichan I dived...|  positive|
|1467811184|2009-04-06 22:19:57|NO_QUERY|        ElleCTF|my whole body fee...|  positive|
|1467811193|2009-04-06 22:19:57|NO_QUERY|         Karoli|@nationwideclass ...|  negative|
|1467811372|2009-04-06 22:20:00|NO_QUERY|       joy_wolf|@Kwesidei not the...|  positive|
|1467811592|2009-04-06 22:20:03|NO_QUERY|        mybirch|         Need a hug |   neutral|
|1467811594|2009-04-06 22:20:03|NO_QUERY|           coZZ|@LOLTrish hey  lo...|  positive|
|1467811795|2009-04-06 22:20:05|NO_QUERY|2Hood4Hollywood|@Tatiana_K nope t...|   neutral|
|146781202

In [130]:
# Encode the sentiment string as an integer class id for the classifiers
sentiment_mapping = {"positive": 1, "negative": 2, "neutral": 0}

# Build the 'label' column: any value other than 'positive' / 'negative'
# falls through to the neutral id.
tweets_data = tweets_data.withColumn(
    "label",
    when(tweets_data["sentiments"] == "positive", sentiment_mapping["positive"])
    .when(tweets_data["sentiments"] == "negative", sentiment_mapping["negative"])
    .otherwise(sentiment_mapping["neutral"])
)


In [131]:
# Spot-check the new label column on the first four rows
tweets_data.show(4)

+----------+-------------------+--------+-------------+--------------------+----------+-----+
|        id|               date|    flag|     username|              tweets|sentiments|label|
+----------+-------------------+--------+-------------+--------------------+----------+-----+
|1467810672|2009-04-06 22:19:49|NO_QUERY|scotthamilton|is upset that he ...|   neutral|    0|
|1467810917|2009-04-06 22:19:53|NO_QUERY|     mattycus|@Kenichan I dived...|  positive|    1|
|1467811184|2009-04-06 22:19:57|NO_QUERY|      ElleCTF|my whole body fee...|  positive|    1|
|1467811193|2009-04-06 22:19:57|NO_QUERY|       Karoli|@nationwideclass ...|  negative|    2|
+----------+-------------------+--------+-------------+--------------------+----------+-----+
only showing top 4 rows



In [132]:
# Keep only the model inputs: the tweet text and its integer label
data = tweets_data.select(['tweets', 'label'])
data.show(n=20)

+--------------------+-----+
|              tweets|label|
+--------------------+-----+
|is upset that he ...|    0|
|@Kenichan I dived...|    1|
|my whole body fee...|    1|
|@nationwideclass ...|    2|
|@Kwesidei not the...|    1|
|         Need a hug |    0|
|@LOLTrish hey  lo...|    1|
|@Tatiana_K nope t...|    0|
|@twittera que me ...|    0|
|spring break in p...|    2|
|I just re-pierced...|    0|
|@caregiving I cou...|    0|
|@octolinz16 It it...|    0|
|@smarrison i woul...|    1|
|@iamjazzyfizzle I...|    0|
|Hollis' death sce...|    0|
|about to file taxes |    0|
|@LettyA ahh ive a...|    1|
|@FakerPattyPattz ...|    0|
|@alydesigns i was...|    1|
+--------------------+-----+
only showing top 20 rows



In [133]:
# Divide data into 70% for training, 30% for testing.
# A fixed seed keeps the split, and every metric computed from it,
# reproducible across runs.
(trainingData, testingData) = data.randomSplit([0.7, 0.3], seed=42)

# train_rows = trainingData.count()
# test_rows = testingData.count()

# print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)

In [138]:
def clean_text(text):
    """Strip mentions, hashtags and special characters from a tweet.

    Whitespace-separated words that start with ``@`` or ``#`` are dropped
    entirely. The remaining words are joined with single spaces, and every
    character that is neither alphanumeric nor whitespace is then removed.
    A word made only of special characters therefore leaves two adjacent
    spaces behind.

    Args:
        text: Raw tweet text, or ``None``.

    Returns:
        The cleaned text, or an empty string when ``text`` is ``None``.
    """
    if text is None:
        # A null cell would otherwise raise AttributeError on .split()
        return ''

    kept_words = [word for word in text.split() if not word.startswith(('@', '#'))]
    joined = ' '.join(kept_words)
    return ''.join(ch for ch in joined if ch.isalnum() or ch.isspace())

In [139]:
# Wrap clean_text as a Spark UDF that produces a string column
clean_udf = udf(f=clean_text, returnType=StringType())

In [140]:
# Replace the raw training tweets with their cleaned text
trainingData = trainingData.withColumn("tweets", clean_udf(trainingData["tweets"]))

***Clean training data***

In [141]:
# Split each cleaned tweet into individual word tokens
tokenizer = Tokenizer(outputCol="tweetWords", inputCol="tweets")
tokenizedTrain = tokenizer.transform(dataset=trainingData)
tokenizedTrain.show(n=5, truncate=False)

+----------------------------------------------------------------------------+-----+-------------------------------------------------------------------------------------------+
|tweets                                                                      |label|tweetWords                                                                                 |
+----------------------------------------------------------------------------+-----+-------------------------------------------------------------------------------------------+
|exhausted                                                                   |2    |[exhausted]                                                                                |
|is so sad for my APL friend                                                 |2    |[is, so, sad, for, my, apl, friend]                                                        |
|I HAVE NOOOOOOOOOO FRIENDS ON TWITTER IT MAKES ME SAD WILL SOMEONE FOLLOW ME|2    |[i, have, noooooooooo, friends,

In [142]:
# Drop stop words, which are unimportant as features
stop_word_input = tokenizer.getOutputCol()
swr = StopWordsRemover(inputCol=stop_word_input, outputCol="MeaningfulWords")
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show(n=5, truncate=False)

+----------------------------------------------------------------------------+-----+-------------------------------------------------------------------------------------------+------------------------------------------------------------+
|tweets                                                                      |label|tweetWords                                                                                 |MeaningfulWords                                             |
+----------------------------------------------------------------------------+-----+-------------------------------------------------------------------------------------------+------------------------------------------------------------+
|exhausted                                                                   |2    |[exhausted]                                                                                |[exhausted]                                                 |
|is so sad for my APL friend                    

In [144]:
# Convert the word lists into numerical feature vectors with HashingTF for model training
hashTF = HashingTF(outputCol="features", inputCol=swr.getOutputCol())
hashedTrain = hashTF.transform(SwRemovedTrain)
numericTrainData = hashedTrain.select('label', 'MeaningfulWords', 'features')
numericTrainData.show(n=3, truncate=False)

+-----+------------------------------------------------------------+---------------------------------------------------------------------------------------+
|label|MeaningfulWords                                             |features                                                                               |
+-----+------------------------------------------------------------+---------------------------------------------------------------------------------------+
|2    |[exhausted]                                                 |(262144,[148003],[1.0])                                                                |
|2    |[sad, apl, friend]                                          |(262144,[74520,74989,125638],[1.0,1.0,1.0])                                            |
|2    |[noooooooooo, friends, twitter, makes, sad, someone, follow]|(262144,[1512,125638,130047,148039,182401,199581,213767],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
+-----+---------------------------------------------------

***Modeling***

In [145]:
# Fit a logistic-regression classifier on the hashed training features
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    regParam=0.01,
    maxIter=10,
)
model = lr.fit(numericTrainData)
print("Training is done!")

Training is done!


In [148]:
# Apply the same text cleaning to the test tweets
testingData = testingData.withColumn("tweets", clean_udf(testingData["tweets"]))

In [149]:
# Run the test split through the same tokenizer, stop-word remover and hasher used for training
tokenizedTest = tokenizer.transform(testingData)
SwRemovedTest = swr.transform(tokenizedTest)
hashedTest = hashTF.transform(SwRemovedTest)
# NOTE(review): the source column is 'label'; selecting it as 'Label' presumably relies on
# Spark's case-insensitive column resolution. Later cells refer to the column as 'Label'.
numericTest = hashedTest.select('Label', 'MeaningfulWords', 'features')
numericTest.show(n=2, truncate=False)

+-----+----------------------------+------------------------------------------------------+
|Label|MeaningfulWords             |features                                              |
+-----+----------------------------+------------------------------------------------------+
|1    |[miss, much, already]       |(262144,[2306,76764,232735],[1.0,1.0,1.0])            |
|1    |[missed, new, moon, trailer]|(262144,[64344,89833,165360,201103],[1.0,1.0,1.0,1.0])|
+-----+----------------------------+------------------------------------------------------+
only showing top 2 rows



In [150]:
# Score the test set, then report accuracy as matching rows / total rows
prediction = model.transform(numericTest)
predictionFinal = prediction.select("MeaningfulWords", "prediction", "Label")
predictionFinal.show(n=4, truncate=False)

# A row is correct when the predicted class equals the true label
is_correct = predictionFinal['prediction'] == predictionFinal['Label']
correctPrediction = predictionFinal.filter(is_correct).count()
totalData = predictionFinal.count()
print("correct prediction:", correctPrediction, ", total data:", totalData,
      ", accuracy:", correctPrediction/totalData)

+----------------------------------+----------+-----+
|MeaningfulWords                   |prediction|Label|
+----------------------------------+----------+-----+
|[miss, much, already]             |1.0       |1    |
|[missed, new, moon, trailer]      |1.0       |1    |
|[head, feels, like, bowling, ball]|0.0       |0    |
|[heart, hurts, badly]             |2.0       |2    |
+----------------------------------+----------+-----+
only showing top 4 rows

correct prediction: 424919 , total data: 480624 , accuracy: 0.8840985885016146


In [155]:
# 1000-row subset of the training features.
# NOTE(review): the fit cells further down train on numericTrainData, not on this subset.
limit_train_data = numericTrainData.limit(num=1000)

In [158]:
# limit_train_data.show(n=3)

In [153]:
# Random-forest classifier with 10 trees over the hashed features
rf = RandomForestClassifier(numTrees=10, featuresCol="features", labelCol="label")

In [159]:
# Fit the random forest on the full training features
model_rf = rf.fit(dataset=numericTrainData)

In [160]:
# Score the test features with the fitted random forest
predictions_rf = model_rf.transform(dataset=numericTest)

In [None]:
# Accuracy evaluator shared by the RandomForest and NaiveBayes cells.
# The label column must be the numeric 'Label' column that numericTest carries
# into the prediction frames; 'sentiments' is a string column and is not part
# of those frames, so evaluating against it fails.
evaluator = MulticlassClassificationEvaluator(labelCol="Label", predictionCol="prediction", metricName="accuracy")

In [None]:
# Accuracy of the random forest on the test predictions
accuracy_rf = evaluator.evaluate(dataset=predictions_rf)
print(f"RandomForest Accuracy: {accuracy_rf}")

In [None]:
# Multinomial naive Bayes over the hashed features
nb = NaiveBayes(
    modelType="multinomial",
    smoothing=1.0,
    featuresCol="features",
    labelCol="label",
)


In [None]:
# Fit naive Bayes on the full training features
model_nb = nb.fit(dataset=numericTrainData)

In [None]:
# Score the test features with the fitted naive Bayes model
predictions_nb = model_nb.transform(dataset=numericTest)

In [None]:
# Accuracy of naive Bayes on the test predictions, using the shared evaluator
accuracy_nb = evaluator.evaluate(dataset=predictions_nb)
print(f"NaiveBayes Accuracy: {accuracy_nb}")

In [None]:
# Save the labelled tweets as CSV with a header row, replacing any previous output.
# coalesce(1) reduces the frame to a single partition before writing.
single_partition = tweets_data.coalesce(1)
single_partition.write.csv(path='sentiments.csv', mode='overwrite', header=True)

In [None]:
# Load the exported CSV into pandas.
# error_bad_lines was deprecated in pandas 1.3 and removed in 2.0; on_bad_lines='warn'
# keeps the old behaviour of skipping malformed rows while emitting a warning.
df3 = pd.read_csv('/content/sentiments.csv/open.csv', on_bad_lines='warn')

In [None]:
# Show the loaded frame as the cell's output
df3

In [None]:
# import csv
# field = ["id","date","flag","username","tweets","sentiments"]
# def spark_to_csv(data, file_path):
#     """ Converts spark dataframe to CSV file """
#     with open(file_path, "w") as f:
#         writer = csv.DictWriter(f, fieldnames= field)
#         writer.writerow(dict(zip(fieldnames, fieldnames)))
#         for row in data.toLocalIterator():
#             writer.writerow(row.asDict())

In [None]:
# read_csv leaves 'date' as strings, so it must be parsed before the .dt accessor
# can be used; calling .dt on the raw column raises AttributeError.
parsed_dates = pd.to_datetime(df3['date'])
# Normalise to second resolution via the original format string
df3['date'] = pd.to_datetime(parsed_dates.dt.strftime('%Y-%m-%d %H:%M:%S'))

In [None]:
# Show the frame again after the date conversion
df3