### Installing Dependencies

**Step 1: Install Dependencies**
We need to install following components to run pyspark seamlessly:
OpenJDK 8,
Spark Environment,
FindSpark package

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

**Step 2: Add environment variables**
After installing dependencies, we need to some variables to the environment so that pyspark knows where to look for using dependencies. We can do that using following commands:

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"

**Step 3: Initilize pyspark**
Finally, we just need to initilize pyspark which can be easily achieved using third-party package named findspark as shown below:

In [None]:
import findspark
findspark.init()

You can try running following commands to check if pyspark is properly installed or not:

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
sc

## Mounting Google Drive

In [None]:
from google.colab import drive

# Mount your Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/cloud_computing/Project

/content/drive/MyDrive/cloud_computing/Project


# Cleaning dataframes

#### Reading CSV file

In [None]:
import pandas as pd

# read in CSV file with low_memory=False
df = pd.read_csv('/content/drive/MyDrive/cloud_computing/Project/microsoft_tweets_2022.csv', engine='python')

In [None]:
df.shape

(164294, 8)

In [None]:
df.head()

Unnamed: 0,date,rawContent,replyCount,retweetCount,likeCount,quoteCount,hashtags,cashtags
0,2022-12-30 23:46:44+00:00,@start9labs @AdlaiAriella @TheRealBrandMcN @JW...,3.0,1.0,7.0,0,,
1,2022-12-30 23:41:10+00:00,🌎 How many cards were dealt in 2022? ☀️\n\nYou...,4.0,4.0,19.0,1,,
2,2022-12-30 23:37:52+00:00,"@NEWSMAX No, really he divested himself of eno...",3.0,3.0,54.0,0,,
3,2022-12-30 23:34:12+00:00,2022 was a crazy year.\n\nJanuary: Microsoft a...,129.0,84.0,2254.0,3,,
4,2022-12-30 23:24:46+00:00,A judge has set January 3rd for the first pre-...,4.0,5.0,49.0,1,,"['MSFT', 'ATVI']"


In [None]:
# Checking for null values
df.isna().sum()

date                 0
rawContent           2
replyCount           4
retweetCount         4
likeCount            4
quoteCount           5
hashtags        124972
cashtags        160294
dtype: int64

#### Dropping null values

In [None]:
df.dropna(subset=['date','rawContent'],inplace = True)

In [None]:
df.shape

(164292, 8)

#### Dropping columns

In [None]:
df.drop(['hashtags', 'cashtags'], axis=1, inplace=True)

In [None]:
df.columns

Index(['date', 'rawContent', 'replyCount', 'retweetCount', 'likeCount',
       'quoteCount'],
      dtype='object')

#### Filling null values

In [None]:
df[['replyCount','retweetCount','likeCount','quoteCount']] = df[['replyCount','retweetCount','likeCount','quoteCount']].fillna(0)

In [None]:
df.isna().sum()

date            0
rawContent      0
replyCount      0
retweetCount    0
likeCount       0
quoteCount      0
dtype: int64

In [None]:
df.head()

Unnamed: 0,date,rawContent,replyCount,retweetCount,likeCount,quoteCount
0,2022-12-30 23:46:44+00:00,@start9labs @AdlaiAriella @TheRealBrandMcN @JW...,3.0,1.0,7.0,0
1,2022-12-30 23:41:10+00:00,🌎 How many cards were dealt in 2022? ☀️\n\nYou...,4.0,4.0,19.0,1
2,2022-12-30 23:37:52+00:00,"@NEWSMAX No, really he divested himself of eno...",3.0,3.0,54.0,0
3,2022-12-30 23:34:12+00:00,2022 was a crazy year.\n\nJanuary: Microsoft a...,129.0,84.0,2254.0,3
4,2022-12-30 23:24:46+00:00,A judge has set January 3rd for the first pre-...,4.0,5.0,49.0,1


In [None]:
df.dtypes

date             object
rawContent       object
replyCount      float64
retweetCount    float64
likeCount       float64
quoteCount       object
dtype: object

#### Converting column datatype

In [None]:
from numpy import float64
df['quoteCount'] = df['quoteCount'].astype('str')

In [None]:
# Dropping row 
df = df.drop(df[df['quoteCount'] == "['editor', 'amediting']"].index)

In [None]:
df['quoteCount'] = df['quoteCount'].astype('float64')

In [None]:
df.dtypes

date             object
rawContent       object
replyCount      float64
retweetCount    float64
likeCount       float64
quoteCount      float64
dtype: object

In [None]:
pwd

'/content/drive/MyDrive/cloud_computing/Project'

In [None]:
df.to_csv('Cleaned Data/cleaned_microsoft_tweets.csv',index = False)

#### Reading cleaned csv file into pyspark dataframe

In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/cloud_computing/Project/Cleaned Data/cleaned_microsoft_tweets.csv', engine = 'python')

In [None]:
df.shape

(164291, 6)

In [None]:
# convert the Pandas DataFrame to a PySpark DataFrame
ms_twts_df = spark.createDataFrame(df)

  for column, series in pdf.iteritems():


In [None]:
ms_twts_df.count()

164291

In [None]:
ms_twts_df.show(10)

+--------------------+--------------------+----------+------------+---------+----------+
|                date|          rawContent|replyCount|retweetCount|likeCount|quoteCount|
+--------------------+--------------------+----------+------------+---------+----------+
|2022-12-30 23:46:...|@start9labs @Adla...|       3.0|         1.0|      7.0|       0.0|
|2022-12-30 23:41:...|🌎 How many cards...|       4.0|         4.0|     19.0|       1.0|
|2022-12-30 23:37:...|@NEWSMAX No, real...|       3.0|         3.0|     54.0|       0.0|
|2022-12-30 23:34:...|2022 was a crazy ...|     129.0|        84.0|   2254.0|       3.0|
|2022-12-30 23:24:...|A judge has set J...|       4.0|         5.0|     49.0|       1.0|
|2022-12-30 22:58:...|Battlefield takin...|       0.0|         8.0|     17.0|       0.0|
|2022-12-30 22:55:...|First pre-trial h...|       8.0|        12.0|     49.0|       1.0|
|2022-12-30 22:38:...|These things have...|       2.0|         3.0|     16.0|       0.0|
|2022-12-30 22:31:...|

### Filtering stock tweets

In [None]:
import re

def extract_stock_tweets(tweet):
    # Define a regular expression pattern to match stock market-related keywords and hashtags
    pattern = r'\b$msft\b|\bstock market\b|\bstocks\b|\bshares\b|\btrading\b|\binvesting\b|\binvestor\b|\bbullish\b|\bbearish\b|\bportfolio\b|\bETF\b|\bindex\b|\b#stockmarket\b|\bstockmarkets\b|\bstockmarketnews\b|\bstockmarketinvesting\b|\bindianstockmarket\b|\bstockmarketindia\b|\bstockmarketeducation\b|\bstockmarketcrash\b|\bstockmarkettips\b|\bstockmarketquotes\b|\bstockmarkettrader\b|\bpakistanstockmarket\b|\bstockmarketanalysis\b|\bstockmarketlab\b|\bstockmarketinvestor\b|\bstockmarketmemes\b|\bstockmarketca\b|\bstockmarketmindgames\b|\bwoodstockmarket\b|\bstockmarketing\b|\bstockmarketmonitor\b|\bstockmarkettrading\b|\bstockmarketcourse\b|\bstockmarketupdate\b|\busstockmarket\b|\bstockmarket101\b|\bphilippinestockmarket\b|\bstockmarketopportunities\b|\binstastockmarket\b|\bstockmarketprice\b|\bstockmarketph\b|\bthestockmarket\b|\bstockmarketgame\b|\bstockmarketcrash2022\b|\bstockmarketadvisory\b|\bstockmarkettraining\b|\blearnstockmarket\b|\bstockmarketgains\b|\bstockmarketleader\b|\blivestockmarketing\b'

    if re.search(pattern, tweet, re.IGNORECASE):
        return True
    else:
        return False

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType,StringType

# Convert the custom function into a PySpark UDF
custom_filter_udf = udf(extract_stock_tweets, BooleanType())

ms_stock_twts_df = ms_twts_df.filter(custom_filter_udf(ms_twts_df['rawContent']))

In [None]:
ms_stock_twts_df.show(10)

+--------------------+--------------------+----------+------------+---------+----------+
|                date|          rawContent|replyCount|retweetCount|likeCount|quoteCount|
+--------------------+--------------------+----------+------------+---------+----------+
|2022-12-30 16:10:...|Microsoft Excel F...|       1.0|        25.0|     87.0|       0.0|
|2022-12-30 15:45:...|Stuff like this i...|      41.0|        15.0|    281.0|       0.0|
|2022-12-30 00:13:...|8/ Here’s a data ...|       1.0|         1.0|     10.0|       0.0|
|2022-12-30 00:04:...|trading microsoft...|       2.0|        14.0|     48.0|       0.0|
|2022-12-29 21:39:...|How the largest s...|       6.0|         9.0|     88.0|       2.0|
|2022-12-29 17:45:...|✅@Sony and @LEGO_...|       1.0|         1.0|     12.0|       0.0|
|2022-12-29 16:13:...|Stocks that lost ...|       1.0|        15.0|     47.0|       0.0|
|2022-12-29 14:00:...|A researcher shar...|       0.0|        12.0|     46.0|       0.0|
|2022-12-29 00:53:...

In [None]:
ms_stock_twts_df.count()

2995

### Preprocessing the filtered stock tweets

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
stopwords = stopwords.words('english') 
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Define a function to perform lemmatization
def lemmatization(tweet):
    words = tweet.split()
    lem_sentence = " ".join([lemmatizer.lemmatize(word) for word in words])
    return lem_sentence

def remove_stop_words(tweet):
  words = tweet.split()  # split the sentence into words
  filtered_words = [word for word in words if word.casefold() not in stopwords]
  filtered_text = ' '.join(filtered_words)  # join the filtered words into a sentence
  return filtered_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.2/103.2 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


In [None]:
import re
import contractions

def preprocess_tweets(tweet):
    tweet = tweet.lower() #converting lowercase
    tweet = re.sub(r'@\w+', ' ', tweet) #removes all mentions from the tweet
    tweet = re.sub(r'#\w+', ' ', tweet) #removes all hashtags from the tweet
    tweet = re.sub(r'http\S+', ' ', tweet) #removes all URLs from the tweet
    tweet = re.sub(r'www\S+', ' ', tweet) #removes all URLs that begin with "www" from the tweet
    tweet = re.sub(r'\d+\w*|\w*\d+\w*', ' ', tweet) #replaces alphanumeric or numeric with word number
    tweet = re.sub(r'(.)\1\1+', r'\1\1', tweet) #keep two consecutive duplicates and remove the rest
    tweet = re.sub(r"'s\b", ' ',tweet) # remove 's
    tweet = re.sub(r'[^\w\s]', ' ', tweet) #removes all non-alphanumeric characters and non-whitespace characters from the tweet
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)  #removes all non-ASCII characters from the tweet 
    tweet = re.sub(r'[^\w\s#@/:%.,_-]', ' ', tweet) #removes any remaining non-alphanumeric characters
    tweet = contractions.fix(tweet)
    tweet = lemmatization(tweet)
    tweet = remove_stop_words(tweet)
    return tweet

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType,StringType

# Convert the preprocessing function into a PySpark UDF
custom_filter_udf = udf(preprocess_tweets, StringType())
ms_stock_twts_cleaned_df = ms_stock_twts_df.withColumn("cleanedTweets",custom_filter_udf(ms_stock_twts_df['rawContent']))

In [None]:
ms_stock_twts_cleaned_df.show(10)

+--------------------+--------------------+----------+------------+---------+----------+--------------------+
|                date|          rawContent|replyCount|retweetCount|likeCount|quoteCount|       cleanedTweets|
+--------------------+--------------------+----------+------------+---------+----------+--------------------+
|2022-12-30 16:10:...|Microsoft Excel F...|       1.0|        25.0|     87.0|       0.0|microsoft excel f...|
|2022-12-30 15:45:...|Stuff like this i...|      41.0|        15.0|    281.0|       0.0|stuff like always...|
|2022-12-30 00:13:...|8/ Here’s a data ...|       1.0|         1.0|     10.0|       0.0|data point manage...|
|2022-12-30 00:04:...|trading microsoft...|       2.0|        14.0|     48.0|       0.0|trading microsoft...|
|2022-12-29 21:39:...|How the largest s...|       6.0|         9.0|     88.0|       2.0|largest stock per...|
|2022-12-29 17:45:...|✅@Sony and @LEGO_...|       1.0|         1.0|     12.0|       0.0|investing epic ga...|
|2022-12-2

### Calculating sentiment score for tweets

In [None]:
# Download the Vader lexicon if it is not already downloaded
nltk.download('vader_lexicon')

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Define a UDF to calculate sentiment score for a text
def calculate_sentiment_score(text):
    # Analyze the sentiment of the text
    scores = analyzer.polarity_scores(text)
    
    # Return the compound sentiment score
    return scores['compound']

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
# Register the UDF
sentiment_udf = udf(calculate_sentiment_score, DoubleType())

ms_stock_twts_cleaned_df = ms_stock_twts_cleaned_df.withColumn("sentimentScore",sentiment_udf(ms_stock_twts_cleaned_df['cleanedTweets']))

In [None]:
ms_stock_twts_cleaned_df.show(10)

+--------------------+--------------------+----------+------------+---------+----------+--------------------+--------------+
|                date|          rawContent|replyCount|retweetCount|likeCount|quoteCount|       cleanedTweets|sentimentScore|
+--------------------+--------------------+----------+------------+---------+----------+--------------------+--------------+
|2022-12-30 16:10:...|Microsoft Excel F...|       1.0|        25.0|     87.0|       0.0|microsoft excel f...|        0.4588|
|2022-12-30 15:45:...|Stuff like this i...|      41.0|        15.0|    281.0|       0.0|stuff like always...|        0.3612|
|2022-12-30 00:13:...|8/ Here’s a data ...|       1.0|         1.0|     10.0|       0.0|data point manage...|           0.0|
|2022-12-30 00:04:...|trading microsoft...|       2.0|        14.0|     48.0|       0.0|trading microsoft...|        0.4588|
|2022-12-29 21:39:...|How the largest s...|       6.0|         9.0|     88.0|       2.0|largest stock per...|        0.1779|


# Grouping by date

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def extract_date(date):
  date_value = date.split()[0]
  return date_value

# Register the UDF
sentiment_udf = udf(extract_date, StringType())

ms_stock_twts_cleaned_df = ms_stock_twts_cleaned_df.withColumn("date",sentiment_udf(ms_stock_twts_cleaned_df['date']))

In [None]:
ms_stock_twts_cleaned_df.show(10)

+----------+--------------------+----------+------------+---------+----------+--------------------+--------------+
|      date|          rawContent|replyCount|retweetCount|likeCount|quoteCount|       cleanedTweets|sentimentScore|
+----------+--------------------+----------+------------+---------+----------+--------------------+--------------+
|2022-12-30|Microsoft Excel F...|       1.0|        25.0|     87.0|       0.0|microsoft excel f...|        0.4588|
|2022-12-30|Stuff like this i...|      41.0|        15.0|    281.0|       0.0|stuff like always...|        0.3612|
|2022-12-30|8/ Here’s a data ...|       1.0|         1.0|     10.0|       0.0|data point manage...|           0.0|
|2022-12-30|trading microsoft...|       2.0|        14.0|     48.0|       0.0|trading microsoft...|        0.4588|
|2022-12-29|How the largest s...|       6.0|         9.0|     88.0|       2.0|largest stock per...|        0.1779|
|2022-12-29|✅@Sony and @LEGO_...|       1.0|         1.0|     12.0|       0.0|in

In [None]:
from pyspark.sql.functions import col, count, sum, avg, expr, percentile_approx, abs

# Group the tweets by date and calculate the daywise positive score, negative score,
# number of positive tweets, and number of negative tweets

grouped_ms_stock_twts_df = ms_stock_twts_cleaned_df.groupBy("date").agg(
    (sum(expr("sentimentScore * likeCount")) / sum("likeCount")).alias("weighted_avg_sentiment_score"),
    # Calculate the median positive sentiment score
    percentile_approx(expr("CASE WHEN sentimentScore > 0 THEN sentimentScore END"), 0.5).alias("daywise_median_positive_score"),
    # Calculate the median negative sentiment score
    abs(percentile_approx(expr("CASE WHEN sentimentScore < 0 THEN sentimentScore END"), 0.5)).alias("daywise_median_negative_score"),
    sum((col("sentimentScore") > 0.0).cast("int")).alias("daywise_positive_tweet_count"),
    sum((col("sentimentScore") < 0.0).cast("int")).alias("daywise_negative_tweet_count"),
    avg(col("replyCount")).alias("daywise_avg_replyCount"),
    avg((col("retweetCount")).cast("int")).alias("daywise_avg_retweetCount"),
    avg((col("likeCount")).cast("int")).alias("daywise_avg_likeCount"),
    avg((col("quoteCount")).cast("int")).alias("daywise_avg_quoteCount")
)

In [None]:
grouped_ms_stock_twts_df.show(10)

+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+
|      date|weighted_avg_sentiment_score|daywise_median_positive_score|daywise_median_negative_score|daywise_positive_tweet_count|daywise_negative_tweet_count|daywise_avg_replyCount|daywise_avg_retweetCount|daywise_avg_likeCount|daywise_avg_quoteCount|
+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+
|2022-01-01|          0.5562448275862069|                       0.4019|                         null|                           2|                           0|                   2.5|                     0.5|                 14.5|            

In [None]:
# replacing null values with 0
grouped_ms_stock_twts_df = grouped_ms_stock_twts_df.na.fill(value = 0.0)
grouped_ms_stock_twts_df.show(10)

+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+
|      date|weighted_avg_sentiment_score|daywise_median_positive_score|daywise_median_negative_score|daywise_positive_tweet_count|daywise_negative_tweet_count|daywise_avg_replyCount|daywise_avg_retweetCount|daywise_avg_likeCount|daywise_avg_quoteCount|
+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+
|2022-01-01|          0.5562448275862069|                       0.4019|                          0.0|                           2|                           0|                   2.5|                     0.5|                 14.5|            

In [None]:
grouped_ms_stock_twts_df.count()

363

In [None]:
# Rounding off the values using ceil
from pyspark.sql.functions import ceil

# Assume `df` is a PySpark DataFrame with a column `col` that contains values to be rounded up
grouped_ms_stock_twts_df = grouped_ms_stock_twts_df.withColumn("daywise_avg_replyCount", ceil(grouped_ms_stock_twts_df.daywise_avg_replyCount))
grouped_ms_stock_twts_df = grouped_ms_stock_twts_df.withColumn("daywise_avg_retweetCount", ceil(grouped_ms_stock_twts_df.daywise_avg_retweetCount))
grouped_ms_stock_twts_df = grouped_ms_stock_twts_df.withColumn("daywise_avg_likeCount", ceil(grouped_ms_stock_twts_df.daywise_avg_likeCount))
grouped_ms_stock_twts_df = grouped_ms_stock_twts_df.withColumn("daywise_avg_quoteCount", ceil(grouped_ms_stock_twts_df.daywise_avg_quoteCount))


In [None]:
company_id = {'google':1,'microsoft':2, 'tesla':3, 'amazon':4, 'apple':5}

In [None]:
from pyspark.sql.functions import lit
grouped_ms_stock_twts_df = grouped_ms_stock_twts_df.withColumn('company',lit(company_id['microsoft']))

In [None]:
grouped_ms_stock_twts_df.show(10)

+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+-------+
|      date|weighted_avg_sentiment_score|daywise_median_positive_score|daywise_median_negative_score|daywise_positive_tweet_count|daywise_negative_tweet_count|daywise_avg_replyCount|daywise_avg_retweetCount|daywise_avg_likeCount|daywise_avg_quoteCount|company|
+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+-------+
|2022-01-01|          0.5562448275862069|                       0.4019|                          0.0|                           2|                           0|                     3|                       1|          

## Saving the file

In [None]:
# converting pyspark dataframe to pandas dataframe
DF = grouped_ms_stock_twts_df.toPandas()

DF.to_csv('Preprocessed Data/microsoft_preprocessed.csv', index = False)

**Note**: Similarly preprocessing is done for other companies like AMAZON, GOOGLE, TESLA, MICROSOFT, APPLE