### Installing Dependencies

**Step 1: Install Dependencies**
We need to install following components to run pyspark seamlessly:
OpenJDK 8,
Spark Environment,
FindSpark package

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

**Step 2: Add environment variables**
After installing dependencies, we need to some variables to the environment so that pyspark knows where to look for using dependencies. We can do that using following commands:

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"

**Step 3: Initilize pyspark**
Finally, we just need to initilize pyspark which can be easily achieved using third-party package named findspark as shown below:

In [3]:
import findspark
findspark.init()

You can try running following commands to check if pyspark is properly installed or not:

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
sc

## Mounting Google Drive

In [5]:
from google.colab import drive

# Mount your Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
%cd /content/drive/MyDrive/cloud_computing/Project

/content/drive/MyDrive/cloud_computing/Project


# Cleaning dataframes

#### Reading CSV file

In [8]:
import pandas as pd

# read in CSV file with low_memory=False
df = pd.read_csv('/content/drive/MyDrive/cloud_computing/Project/Data Collection/tesla_tweets_2022.csv', engine='python')

In [9]:
df.shape

(404011, 8)

In [10]:
df.head()

Unnamed: 0,date,rawContent,replyCount,retweetCount,likeCount,quoteCount,hashtags,cashtags
0,2022-12-30 23:57:02+00:00,"Oh, so thats what getting hacked looks like, d...",1.0,8.0,161.0,0,,
1,2022-12-30 23:55:23+00:00,Tesla Offers Discount on Model S &amp; X in US...,4.0,21.0,179.0,1,,
2,2022-12-30 23:55:15+00:00,Tesla Offers Discount on Model S &amp; X in US...,3.0,10.0,75.0,0,,
3,2022-12-30 23:52:27+00:00,@OttawaWebPro @garyblack00 Have you tried orde...,4.0,2.0,53.0,0,,
4,2022-12-30 23:52:20+00:00,Who is Nikola Tesla? Why is his work important...,4.0,10.0,43.0,0,,


In [11]:
# Checking for null values
df.isna().sum()

date                 0
rawContent           9
replyCount          10
retweetCount        10
likeCount           10
quoteCount          10
hashtags        346146
cashtags        363334
dtype: int64

#### Dropping null values

In [12]:
df.dropna(subset=['date','rawContent'],inplace = True)

In [13]:
df.shape

(404002, 8)

#### Dropping columns

In [14]:
df.drop(['hashtags', 'cashtags'], axis=1, inplace=True)

In [15]:
df.columns

Index(['date', 'rawContent', 'replyCount', 'retweetCount', 'likeCount',
       'quoteCount'],
      dtype='object')

#### Filling null values

In [16]:
df[['replyCount','retweetCount','likeCount','quoteCount']] = df[['replyCount','retweetCount','likeCount','quoteCount']].fillna(0)

In [17]:
df.isna().sum()

date            0
rawContent      0
replyCount      0
retweetCount    0
likeCount       0
quoteCount      0
dtype: int64

In [19]:
df.head()

Unnamed: 0,date,rawContent,replyCount,retweetCount,likeCount,quoteCount
0,2022-12-30 23:57:02+00:00,"Oh, so thats what getting hacked looks like, d...",1.0,8.0,161.0,0
1,2022-12-30 23:55:23+00:00,Tesla Offers Discount on Model S &amp; X in US...,4.0,21.0,179.0,1
2,2022-12-30 23:55:15+00:00,Tesla Offers Discount on Model S &amp; X in US...,3.0,10.0,75.0,0
3,2022-12-30 23:52:27+00:00,@OttawaWebPro @garyblack00 Have you tried orde...,4.0,2.0,53.0,0
4,2022-12-30 23:52:20+00:00,Who is Nikola Tesla? Why is his work important...,4.0,10.0,43.0,0


In [20]:
df.dtypes

date             object
rawContent       object
replyCount      float64
retweetCount    float64
likeCount       float64
quoteCount       object
dtype: object

#### Converting column datatype

In [21]:
from numpy import float64
df['quoteCount'] = df['quoteCount'].astype('str')

In [22]:
# Dropping row 
df = df.drop(df[df['quoteCount'] == "['Dogecoin']"].index)

In [23]:
df['quoteCount'] = df['quoteCount'].astype('float64')

In [24]:
df.dtypes

date             object
rawContent       object
replyCount      float64
retweetCount    float64
likeCount       float64
quoteCount      float64
dtype: object

In [25]:
pwd

'/content/drive/MyDrive/cloud_computing/Project'

In [26]:
df.to_csv('Cleaned Data/cleaned_tesla_tweets.csv',index = False)

#### Reading cleaned csv file into pyspark dataframe

In [27]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/cloud_computing/Project/Cleaned Data/cleaned_tesla_tweets.csv', engine = 'python')

In [28]:
df.shape

(404001, 6)

In [29]:
# convert the Pandas DataFrame to a PySpark DataFrame
tsla_twts_df = spark.createDataFrame(df)

  for column, series in pdf.iteritems():


In [30]:
tsla_twts_df.count()

404001

In [31]:
tsla_twts_df.show(10)

+--------------------+--------------------+----------+------------+---------+----------+
|                date|          rawContent|replyCount|retweetCount|likeCount|quoteCount|
+--------------------+--------------------+----------+------------+---------+----------+
|2022-12-30 23:57:...|Oh, so thats what...|       1.0|         8.0|    161.0|       0.0|
|2022-12-30 23:55:...|Tesla Offers Disc...|       4.0|        21.0|    179.0|       1.0|
|2022-12-30 23:55:...|Tesla Offers Disc...|       3.0|        10.0|     75.0|       0.0|
|2022-12-30 23:52:...|@OttawaWebPro @ga...|       4.0|         2.0|     53.0|       0.0|
|2022-12-30 23:52:...|Who is Nikola Tes...|       4.0|        10.0|     43.0|       0.0|
|2022-12-30 23:50:...|Tesla is offering...|       5.0|         6.0|     74.0|       0.0|
|2022-12-30 23:50:...|Tesla numbers han...|       7.0|         5.0|     36.0|       4.0|
|2022-12-30 23:48:...|+2,000 followers ...|      15.0|        15.0|    253.0|       2.0|
|2022-12-30 23:48:...

### Filtering stock tweets

In [32]:
import re

def extract_stock_tweets(tweet):
    # Define a regular expression pattern to match stock market-related keywords and hashtags
    pattern = r'\b$tsla\b|\bstock market\b|\bstocks\b|\bshares\b|\btrading\b|\binvesting\b|\binvestor\b|\bbullish\b|\bbearish\b|\bportfolio\b|\bETF\b|\bindex\b|\b#stockmarket\b|\bstockmarkets\b|\bstockmarketnews\b|\bstockmarketinvesting\b|\bindianstockmarket\b|\bstockmarketindia\b|\bstockmarketeducation\b|\bstockmarketcrash\b|\bstockmarkettips\b|\bstockmarketquotes\b|\bstockmarkettrader\b|\bpakistanstockmarket\b|\bstockmarketanalysis\b|\bstockmarketlab\b|\bstockmarketinvestor\b|\bstockmarketmemes\b|\bstockmarketca\b|\bstockmarketmindgames\b|\bwoodstockmarket\b|\bstockmarketing\b|\bstockmarketmonitor\b|\bstockmarkettrading\b|\bstockmarketcourse\b|\bstockmarketupdate\b|\busstockmarket\b|\bstockmarket101\b|\bphilippinestockmarket\b|\bstockmarketopportunities\b|\binstastockmarket\b|\bstockmarketprice\b|\bstockmarketph\b|\bthestockmarket\b|\bstockmarketgame\b|\bstockmarketcrash2022\b|\bstockmarketadvisory\b|\bstockmarkettraining\b|\blearnstockmarket\b|\bstockmarketgains\b|\bstockmarketleader\b|\blivestockmarketing\b'

    if re.search(pattern, tweet, re.IGNORECASE):
        return True
    else:
        return False

In [33]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType,StringType

# Convert the custom function into a PySpark UDF
custom_filter_udf = udf(extract_stock_tweets, BooleanType())

tsla_stock_twts_df = tsla_twts_df.filter(custom_filter_udf(tsla_twts_df['rawContent']))

In [34]:
tsla_stock_twts_df.show(10)

+--------------------+--------------------+----------+------------+---------+----------+
|                date|          rawContent|replyCount|retweetCount|likeCount|quoteCount|
+--------------------+--------------------+----------+------------+---------+----------+
|2022-12-30 23:25:...|Tesla's CFO exerc...|     177.0|       360.0|   3838.0|      70.0|
|2022-12-30 23:13:...|@elonmusk @teslao...|       1.0|         0.0|     27.0|       0.0|
|2022-12-30 23:12:...|@S3XYev @teslaown...|      13.0|         1.0|     24.0|       0.0|
|2022-12-30 23:03:...|People are starti...|       5.0|        17.0|     89.0|       0.0|
|2022-12-30 22:47:...|@elonmusk @teslao...|      12.0|        11.0|    196.0|       0.0|
|2022-12-30 22:45:...|Tesla short Mark ...|      31.0|        28.0|    551.0|       6.0|
|2022-12-30 21:57:...|Tesla shares fell...|      26.0|        41.0|    110.0|      16.0|
|2022-12-30 21:34:...|@JPSartre_NoExit ...|       2.0|         0.0|     11.0|       0.0|
|2022-12-30 21:15:...

In [35]:
tsla_stock_twts_df.count()

16121

### Preprocessing the filtered stock tweets

In [36]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
stopwords = stopwords.words('english') 
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Define a function to perform lemmatization
def lemmatization(tweet):
    words = tweet.split()
    lem_sentence = " ".join([lemmatizer.lemmatize(word) for word in words])
    return lem_sentence

def remove_stop_words(tweet):
  words = tweet.split()  # split the sentence into words
  filtered_words = [word for word in words if word.casefold() not in stopwords]
  filtered_text = ' '.join(filtered_words)  # join the filtered words into a sentence
  return filtered_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [37]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.2/103.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


In [38]:
import re
import contractions

def preprocess_tweets(tweet):
    tweet = tweet.lower() #converting lowercase
    tweet = re.sub(r'@\w+', ' ', tweet) #removes all mentions from the tweet
    tweet = re.sub(r'#\w+', ' ', tweet) #removes all hashtags from the tweet
    tweet = re.sub(r'http\S+', ' ', tweet) #removes all URLs from the tweet
    tweet = re.sub(r'www\S+', ' ', tweet) #removes all URLs that begin with "www" from the tweet
    tweet = re.sub(r'\d+\w*|\w*\d+\w*', ' ', tweet) #replaces alphanumeric or numeric with word number
    tweet = re.sub(r'(.)\1\1+', r'\1\1', tweet) #keep two consecutive duplicates and remove the rest
    tweet = re.sub(r"'s\b", ' ',tweet) # remove 's
    tweet = re.sub(r'[^\w\s]', ' ', tweet) #removes all non-alphanumeric characters and non-whitespace characters from the tweet
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)  #removes all non-ASCII characters from the tweet 
    tweet = re.sub(r'[^\w\s#@/:%.,_-]', ' ', tweet) #removes any remaining non-alphanumeric characters
    tweet = contractions.fix(tweet)
    tweet = lemmatization(tweet)
    tweet = remove_stop_words(tweet)
    return tweet

In [39]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType,StringType

# Convert the preprocessing function into a PySpark UDF
custom_filter_udf = udf(preprocess_tweets, StringType())
tsla_stock_twts_cleaned_df = tsla_stock_twts_df.withColumn("cleanedTweets",custom_filter_udf(tsla_stock_twts_df['rawContent']))

In [None]:
tsla_stock_twts_cleaned_df.show(10)

+--------------------+--------------------+----------+------------+---------+----------+--------------------+
|                date|          rawContent|replyCount|retweetCount|likeCount|quoteCount|       cleanedTweets|
+--------------------+--------------------+----------+------------+---------+----------+--------------------+
|2022-12-30 23:25:...|Tesla's CFO exerc...|     177.0|       360.0|   3838.0|      70.0|tesla cfo exercis...|
|2022-12-30 23:13:...|@elonmusk @teslao...|       1.0|         0.0|     27.0|       0.0|dump tesla share ...|
|2022-12-30 23:12:...|@S3XYev @teslaown...|      13.0|         1.0|     24.0|       0.0|ok tired attentio...|
|2022-12-30 23:03:...|People are starti...|       5.0|        17.0|     89.0|       0.0|people starting t...|
|2022-12-30 22:47:...|@elonmusk @teslao...|      12.0|        11.0|    196.0|       0.0|short term stock ...|
|2022-12-30 22:45:...|Tesla short Mark ...|      31.0|        28.0|    551.0|       6.0|tesla short mark ...|
|2022-12-3

### Calculating sentiment score for tweets

In [None]:
# Download the Vader lexicon if it is not already downloaded
nltk.download('vader_lexicon')

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Define a UDF to calculate sentiment score for a text
def calculate_sentiment_score(text):
    # Analyze the sentiment of the text
    scores = analyzer.polarity_scores(text)
    
    # Return the compound sentiment score
    return scores['compound']

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
# Register the UDF
sentiment_udf = udf(calculate_sentiment_score, DoubleType())

tsla_stock_twts_cleaned_df = tsla_stock_twts_cleaned_df.withColumn("sentimentScore",sentiment_udf(tsla_stock_twts_cleaned_df['cleanedTweets']))

In [None]:
tsla_stock_twts_cleaned_df.show(10)

+--------------------+--------------------+----------+------------+---------+----------+--------------------+--------------+
|                date|          rawContent|replyCount|retweetCount|likeCount|quoteCount|       cleanedTweets|sentimentScore|
+--------------------+--------------------+----------+------------+---------+----------+--------------------+--------------+
|2022-12-30 23:25:...|Tesla's CFO exerc...|     177.0|       360.0|   3838.0|      70.0|tesla cfo exercis...|        0.5267|
|2022-12-30 23:13:...|@elonmusk @teslao...|       1.0|         0.0|     27.0|       0.0|dump tesla share ...|       -0.1027|
|2022-12-30 23:12:...|@S3XYev @teslaown...|      13.0|         1.0|     24.0|       0.0|ok tired attentio...|        0.2023|
|2022-12-30 23:03:...|People are starti...|       5.0|        17.0|     89.0|       0.0|people starting t...|         0.765|
|2022-12-30 22:47:...|@elonmusk @teslao...|      12.0|        11.0|    196.0|       0.0|short term stock ...|       -0.4215|


# Grouping by date

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def extract_date(date):
  date_value = date.split()[0]
  return date_value

# Register the UDF
sentiment_udf = udf(extract_date, StringType())

tsla_stock_twts_cleaned_df = tsla_stock_twts_cleaned_df.withColumn("date",sentiment_udf(tsla_stock_twts_cleaned_df['date']))

In [None]:
tsla_stock_twts_cleaned_df.show(10)

+----------+--------------------+----------+------------+---------+----------+--------------------+--------------+
|      date|          rawContent|replyCount|retweetCount|likeCount|quoteCount|       cleanedTweets|sentimentScore|
+----------+--------------------+----------+------------+---------+----------+--------------------+--------------+
|2022-12-30|Tesla's CFO exerc...|     177.0|       360.0|   3838.0|      70.0|tesla cfo exercis...|        0.5267|
|2022-12-30|@elonmusk @teslao...|       1.0|         0.0|     27.0|       0.0|dump tesla share ...|       -0.1027|
|2022-12-30|@S3XYev @teslaown...|      13.0|         1.0|     24.0|       0.0|ok tired attentio...|        0.2023|
|2022-12-30|People are starti...|       5.0|        17.0|     89.0|       0.0|people starting t...|         0.765|
|2022-12-30|@elonmusk @teslao...|      12.0|        11.0|    196.0|       0.0|short term stock ...|       -0.4215|
|2022-12-30|Tesla short Mark ...|      31.0|        28.0|    551.0|       6.0|te

In [None]:
from pyspark.sql.functions import col, count, sum, avg, expr, percentile_approx, abs

# Group the tweets by date and calculate the daywise positive score, negative score,
# number of positive tweets, and number of negative tweets

grouped_tsla_stock_twts_df = tsla_stock_twts_cleaned_df.groupBy("date").agg(
    (sum(expr("sentimentScore * likeCount")) / sum("likeCount")).alias("weighted_avg_sentiment_score"),
    # Calculate the median positive sentiment score
    percentile_approx(expr("CASE WHEN sentimentScore > 0 THEN sentimentScore END"), 0.5).alias("daywise_median_positive_score"),
    # Calculate the median negative sentiment score
    abs(percentile_approx(expr("CASE WHEN sentimentScore < 0 THEN sentimentScore END"), 0.5)).alias("daywise_median_negative_score"),
    sum((col("sentimentScore") > 0.0).cast("int")).alias("daywise_positive_tweet_count"),
    sum((col("sentimentScore") < 0.0).cast("int")).alias("daywise_negative_tweet_count"),
    avg(col("replyCount")).alias("daywise_avg_replyCount"),
    avg((col("retweetCount")).cast("int")).alias("daywise_avg_retweetCount"),
    avg((col("likeCount")).cast("int")).alias("daywise_avg_likeCount"),
    avg((col("quoteCount")).cast("int")).alias("daywise_avg_quoteCount")
)

In [None]:
grouped_tsla_stock_twts_df.show(10)

+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+
|      date|weighted_avg_sentiment_score|daywise_median_positive_score|daywise_median_negative_score|daywise_positive_tweet_count|daywise_negative_tweet_count|daywise_avg_replyCount|daywise_avg_retweetCount|daywise_avg_likeCount|daywise_avg_quoteCount|
+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+
|2022-01-01|          0.3480822370617696|                       0.7096|                       0.4678|                          15|                           8|     9.153846153846153|      7.8076923076923075|    115.1923076923077|    1.076923

In [None]:
# replacing null values with 0
grouped_tsla_stock_twts_df = grouped_tsla_stock_twts_df.na.fill(value = 0.0)
grouped_tsla_stock_twts_df.show(10)

+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+
|      date|weighted_avg_sentiment_score|daywise_median_positive_score|daywise_median_negative_score|daywise_positive_tweet_count|daywise_negative_tweet_count|daywise_avg_replyCount|daywise_avg_retweetCount|daywise_avg_likeCount|daywise_avg_quoteCount|
+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+
|2022-01-01|          0.3480822370617696|                       0.7096|                       0.4678|                          15|                           8|     9.153846153846153|      7.8076923076923075|    115.1923076923077|    1.076923

In [None]:
grouped_tsla_stock_twts_df.count()

364

In [None]:
# Rounding off the values using ceil
from pyspark.sql.functions import ceil

# Assume `df` is a PySpark DataFrame with a column `col` that contains values to be rounded up
grouped_tsla_stock_twts_df = grouped_tsla_stock_twts_df.withColumn("daywise_avg_replyCount", ceil(grouped_tsla_stock_twts_df.daywise_avg_replyCount))
grouped_tsla_stock_twts_df = grouped_tsla_stock_twts_df.withColumn("daywise_avg_retweetCount", ceil(grouped_tsla_stock_twts_df.daywise_avg_retweetCount))
grouped_tsla_stock_twts_df = grouped_tsla_stock_twts_df.withColumn("daywise_avg_likeCount", ceil(grouped_tsla_stock_twts_df.daywise_avg_likeCount))
grouped_tsla_stock_twts_df = grouped_tsla_stock_twts_df.withColumn("daywise_avg_quoteCount", ceil(grouped_tsla_stock_twts_df.daywise_avg_quoteCount))


In [None]:
company_id = {'google':1,'microsoft':2, 'tesla':3, 'amazon':4, 'apple':5}

In [None]:
from pyspark.sql.functions import lit
grouped_tsla_stock_twts_df = grouped_tsla_stock_twts_df.withColumn('company',lit(company_id['tesla']))

In [None]:
grouped_tsla_stock_twts_df.show(10)

+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+-------+
|      date|weighted_avg_sentiment_score|daywise_median_positive_score|daywise_median_negative_score|daywise_positive_tweet_count|daywise_negative_tweet_count|daywise_avg_replyCount|daywise_avg_retweetCount|daywise_avg_likeCount|daywise_avg_quoteCount|company|
+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+-------+
|2022-01-01|          0.3480822370617696|                       0.7096|                       0.4678|                          15|                           8|                    10|                       8|          

## Saving the file

In [None]:
# converting pyspark dataframe to pandas dataframe
DF = grouped_tsla_stock_twts_df.toPandas()

DF.to_csv('Preprocessed Data/tesla_preprocessed.csv', index = False)

**Note**: Similarly preprocessing is done for other companies like AMAZON, GOOGLE, TESLA, MICROSOFT, APPLE