In [5]:
import pandas as pd
import yfinance as yf
import re
import emoji
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import os

In [None]:
# Analysis window; reused further down when the tweets are filtered by date.
start_date, end_date = '2017-02-01', '2020-12-31'

# Fetch the S&P 500 index history (ticker ^GSPC) for that window.
# `sp500` holds whatever DataFrame yfinance returns for the request.
sp500 = yf.download('^GSPC', start=start_date, end=end_date)

In [7]:
# Drop the columns that are not used in the analysis.
# `errors='ignore'` skips labels that are not present, so this cell no longer
# raises KeyError when it is re-run or when the download lacks one of them.
# NOTE(review): newer yfinance releases appear to omit 'Adj Close' by default — confirm.
sp500 = sp500.drop(columns=['Volume', 'Adj Close'], errors='ignore')

# Independent copy of the price frame; it later receives the TextBlob columns
# while `sp500` receives the VADER columns.
sp500_tb = sp500.copy()

In [8]:
# Write the price data to ./stocks/sp500.csv, creating the directory if needed.
# A forward-slash path is used (as in the './tweets' and './sentiment' cells):
# a backslash followed by 's' in a normal string literal is an invalid escape
# sequence, and outside Windows it would name a directory literally called '.\stocks'.
target_directory = './stocks'
if not os.path.exists(target_directory):
    os.makedirs(target_directory)

csv_filename = 'sp500.csv'
# index=True keeps the frame's index as the first CSV column.
sp500.to_csv(os.path.join(target_directory, csv_filename), index=True, encoding='utf-8')

In [9]:
# Locations of the per-author tweet CSV files; all of them live in the same
# GitHub repository, so each URL is the shared prefix plus the file name.
_tweets_repo_prefix = 'https://raw.githubusercontent.com/maorisraelii/twitter-sentiment-analysis/main/'

url_musk = _tweets_repo_prefix + 'Musk(2014-2019).csv'
url_biden = _tweets_repo_prefix + 'Biden(2007-2020).csv'
url_trump = _tweets_repo_prefix + 'Trump(2017-2021).csv'
url_bill = _tweets_repo_prefix + 'Bill_Gates.csv'
url_jeff = _tweets_repo_prefix + 'Jeff_Bezos.csv'
url_tim = _tweets_repo_prefix + 'Tim_Cook.csv'

In [10]:
# Load each author's CSV into its own DataFrame.
# Every file is decoded with 'unicode_escape'; only the Trump file additionally
# skips rows the parser cannot read.
_read_options = {'encoding': 'unicode_escape'}

musk = pd.read_csv(url_musk, **_read_options)
biden = pd.read_csv(url_biden, **_read_options)
trump = pd.read_csv(url_trump, on_bad_lines='skip', **_read_options)
bill = pd.read_csv(url_bill, **_read_options)
jeff = pd.read_csv(url_jeff, **_read_options)
tim = pd.read_csv(url_tim, **_read_options)

In [11]:
# Give every frame a datetime 'time' column, parsed from that frame's own
# timestamp column. Each entry is (frame, source column, dayfirst flag);
# only the Biden file is parsed day-first.
_time_sources = (
    (trump, 'time', False),
    (biden, 'time', True),
    (musk, 'date', False),
    (bill, 'time_stamp_UTC', False),
    (jeff, 'created_at', False),
    (tim, 'created_at', False),
)
for _frame, _source_column, _day_first in _time_sources:
    _frame['time'] = pd.to_datetime(_frame[_source_column], dayfirst=_day_first)

In [12]:
# Standardise the tweet-text column name to 'tweet' in every frame.
# The Trump, Biden and Musk frames need no rename: mapping 'tweet' to 'tweet'
# is a no-op, so only the frames with a differently named text column are touched.
bill = bill.rename(columns={'tweet_text': 'tweet'})
jeff = jeff.rename(columns={'text': 'tweet'})
tim = tim.rename(columns={'text': 'tweet'})

In [13]:
# Helper that turns emojis into their textual descriptions
def replace_emojis_with_words(text):
    """Return `text` with each emoji replaced by its description (via `emoji.demojize`)."""
    return emoji.demojize(text)

In [14]:
# Regular expressions used by `clean_dataset` to match links and emojis.
# link_pattern: 'http://' or 'https://' followed by a run of non-whitespace characters.
link_pattern = r'https?://\S+'
# emoji_pattern: one or more consecutive characters from the Unicode ranges listed below.
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+", flags=re.UNICODE)

def clean_dataset(dataset, tweet_column = 'tweet', date_column = 'time'):
    """Return a cleaned copy of a tweet DataFrame.

    Steps, in order:
      1. Missing tweets become empty strings.
      2. Tweets that consist only of links (and whitespace) are removed.
      3. Tweets containing a character matched by `emoji_pattern` are removed.
      4. Remaining tweets are passed through `replace_emojis_with_words`.
      5. Rows and columns that are entirely NaN are dropped.
      6. Rows with a missing `date_column` value are dropped.
      7. The index is reset to 0..n-1.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the tweets; it is not modified.
    tweet_column : str
        Name of the column with the tweet text.
    date_column : str
        Name of the column with the tweet timestamp.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    Raises
    ------
    KeyError
        If `tweet_column` or `date_column` is not a column of the frame.
    """
    # Work on a copy so the caller's frame is left untouched.
    dataset = dataset.copy()
    dataset[tweet_column] = dataset[tweet_column].fillna('')

    # A tweet is "only links" when the whole text is one or more links separated
    # by optional whitespace. `fullmatch` is required here: `match` only anchors
    # at the start and would also discard tweets that merely begin with a link.
    only_links_pattern = rf'\s*(?:{link_pattern}\s*)+'
    is_only_links = dataset[tweet_column].str.fullmatch(only_links_pattern, na=False)
    dataset = dataset[~is_only_links]

    # NOTE(review): rows matching `emoji_pattern` are removed before the
    # demojize step below, so that step only affects emojis outside the
    # pattern's ranges — confirm that dropping these rows is intended.
    has_emoji = dataset[tweet_column].str.contains(emoji_pattern, na=False)
    # Explicit copy so the column assignment below does not write into a slice.
    dataset = dataset[~has_emoji].copy()
    dataset[tweet_column] = dataset[tweet_column].apply(replace_emojis_with_words)

    dataset = dataset.dropna(axis=0, how='all')  # rows with no data at all
    dataset = dataset.dropna(axis=1, how='all')  # columns with no data at all
    dataset = dataset.dropna(subset=[date_column])  # rows without a timestamp

    # Reset last, so the returned index is contiguous after every row removal.
    return dataset.reset_index(drop=True)

In [15]:
# Run every author's frame through `clean_dataset` (default column names).
musk, biden, trump, bill, jeff, tim = [
    clean_dataset(raw_tweets)
    for raw_tweets in (musk, biden, trump, bill, jeff, tim)
]

In [16]:
# Keep only the tweets whose 'time' lies between start_date and end_date
# (both bounds compared inclusively).
trump, biden, musk, bill, jeff, tim = [
    tweets[(tweets['time'] >= start_date) & (tweets['time'] <= end_date)]
    for tweets in (trump, biden, musk, bill, jeff, tim)
]

In [17]:
# Persist the cleaned, date-filtered tweets under ./tweets, one CSV per author.
target_directory = './tweets'
if not os.path.exists(target_directory):
    os.makedirs(target_directory)

_tweet_exports = (
    ('trump.csv', trump),
    ('biden.csv', biden),
    ('musk.csv', musk),
    ('bill.csv', bill),
    ('jeff.csv', jeff),
    ('tim.csv', tim),
)
for _csv_name, _tweets in _tweet_exports:
    # The index is not written to the file.
    _tweets.to_csv(os.path.join(target_directory, _csv_name), index=False, encoding='utf-8')

In [18]:
# Function to perform sentiment analysis using Vader
def perform_sentiment_analysis_v(df):
    """Score every tweet in `df` with VADER.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'tweet' column; each value is passed to
        `SentimentIntensityAnalyzer.polarity_scores`. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with two extra columns: 'sentiment' (the label returned
        by `get_sentiment_label_v`) and 'polarity' (the compound score, float64).

    Raises
    ------
    KeyError
        If `df` has no 'tweet' column.
    """
    analyzed_df = df.copy()

    # One analyzer instance is shared by all tweets.
    analyzer = SentimentIntensityAnalyzer()

    # Single pass over the tweets; each item is a (label, compound score) pair.
    labelled = [
        get_sentiment_label_v(analyzer.polarity_scores(tweet))
        for tweet in analyzed_df['tweet']
    ]

    # Build both columns at once, positionally matched to the frame's rows.
    # An explicit float dtype keeps 'polarity' numeric (also for an empty frame),
    # so later aggregations such as a daily mean do not run on an object column.
    analyzed_df['sentiment'] = pd.Series(
        [label for label, _ in labelled], index=analyzed_df.index, dtype='object'
    )
    analyzed_df['polarity'] = pd.Series(
        [score for _, score in labelled], index=analyzed_df.index, dtype='float64'
    )

    return analyzed_df

# Translate a Vader score dict into a (label, compound score) pair
def get_sentiment_label_v(sentiment_scores):
    """Return ``(label, compound)`` for a score mapping with a 'compound' key.

    The label is 'positive' for compound >= 0.05, 'negative' for
    compound <= -0.05 and 'neutral' for anything else.
    """
    compound = sentiment_scores['compound']

    if compound >= 0.05:
        return 'positive', compound
    if compound <= -0.05:
        return 'negative', compound
    return 'neutral', compound

In [19]:
# Score each author's tweets with the Vader-based analysis.
(
    analyzed_musk_vader,
    analyzed_biden_vader,
    analyzed_trump_vader,
    analyzed_jeff_vader,
    analyzed_tim_vader,
    analyzed_bill_vader,
) = [
    perform_sentiment_analysis_v(tweets)
    for tweets in (musk, biden, trump, jeff, tim, bill)
]

In [20]:
# Daily mean Vader polarity per author: group each scored frame by the
# calendar date of 'time' and average the 'polarity' column.
(
    day_grouped_musk_vader,
    day_grouped_trump_vader,
    day_grouped_biden_vader,
    day_grouped_jeff_vader,
    day_grouped_tim_vader,
    day_grouped_bill_vader,
) = [
    scored.groupby(scored['time'].dt.date)['polarity'].mean()
    for scored in (
        analyzed_musk_vader,
        analyzed_trump_vader,
        analyzed_biden_vader,
        analyzed_jeff_vader,
        analyzed_tim_vader,
        analyzed_bill_vader,
    )
]

In [21]:
# Add one column per author to the price frame, holding that author's daily
# mean Vader polarity. The dict order fixes the resulting column order.
_vader_columns = {
    'biden': day_grouped_biden_vader,
    'musk': day_grouped_musk_vader,
    'trump': day_grouped_trump_vader,
    'tim': day_grouped_tim_vader,
    'bill': day_grouped_bill_vader,
    'jeff': day_grouped_jeff_vader,
}
for _author, _daily_polarity in _vader_columns.items():
    sp500[_author] = _daily_polarity

In [22]:
# Fill the missing values in the table: linear interpolation first, then a
# back-fill followed by a forward-fill for anything interpolation left empty.
# `.bfill()` / `.ffill()` replace the deprecated `fillna(method=...)` form.
# NOTE(review): back-filling copies later values into earlier rows — confirm
# that this is acceptable for the downstream use of the table.
sp500 = sp500.interpolate(method='linear').bfill().ffill()

In [23]:
# Write the price + Vader sentiment table to ./sentiment/data_vader.csv,
# creating the directory on first use.
target_directory = './sentiment'
_directory_missing = not os.path.exists(target_directory)
if _directory_missing:
    os.makedirs(target_directory)

_vader_csv_path = os.path.join(target_directory, 'data_vader.csv')
sp500.to_csv(_vader_csv_path, encoding='utf-8', index=True)

In [24]:
# Function to perform sentiment analysis using TextBlob
def perform_sentiment_analysis_tb(df):
    """Score every tweet in `df` with TextBlob.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'tweet' column; each value is wrapped in `TextBlob` and
        its `.sentiment` is read. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with two extra columns: 'sentiment' (the label returned
        by `get_sentiment_label_tb`) and 'polarity' (the polarity score, float64).

    Raises
    ------
    KeyError
        If `df` has no 'tweet' column.
    """
    analyzed_df = df.copy()

    # Single pass over the tweets; each item is a (label, polarity) pair.
    labelled = [
        get_sentiment_label_tb(TextBlob(tweet).sentiment)
        for tweet in analyzed_df['tweet']
    ]

    # Build both columns at once, positionally matched to the frame's rows.
    # An explicit float dtype keeps 'polarity' numeric (also for an empty frame),
    # so later aggregations such as a daily mean do not run on an object column.
    analyzed_df['sentiment'] = pd.Series(
        [label for label, _ in labelled], index=analyzed_df.index, dtype='object'
    )
    analyzed_df['polarity'] = pd.Series(
        [score for _, score in labelled], index=analyzed_df.index, dtype='float64'
    )

    return analyzed_df

# Function to get sentiment label and polarity based on a TextBlob sentiment
def get_sentiment_label_tb(sentiment):
    """Return ``(label, polarity)`` for an object exposing a `polarity` attribute.

    The label is 'positive' for polarity > 0, 'negative' for polarity < 0 and
    'neutral' otherwise. Only `polarity` is read; subjectivity plays no part
    in the result.
    """
    polarity = sentiment.polarity

    if polarity > 0:
        sentiment_label = 'positive'
    elif polarity < 0:
        sentiment_label = 'negative'
    else:
        sentiment_label = 'neutral'

    return sentiment_label, polarity

In [25]:
# Score each author's tweets with the TextBlob-based analysis.
(
    analyzed_musk_textblob,
    analyzed_biden_textblob,
    analyzed_trump_textblob,
    analyzed_jeff_textblob,
    analyzed_tim_textblob,
    analyzed_bill_textblob,
) = [
    perform_sentiment_analysis_tb(tweets)
    for tweets in (musk, biden, trump, jeff, tim, bill)
]

In [26]:
# Daily mean TextBlob polarity per author: group each scored frame by the
# calendar date of 'time' and average the 'polarity' column.
(
    day_grouped_musk_textblob,
    day_grouped_trump_textblob,
    day_grouped_biden_textblob,
    day_grouped_jeff_textblob,
    day_grouped_tim_textblob,
    day_grouped_bill_textblob,
) = [
    scored.groupby(scored['time'].dt.date)['polarity'].mean()
    for scored in (
        analyzed_musk_textblob,
        analyzed_trump_textblob,
        analyzed_biden_textblob,
        analyzed_jeff_textblob,
        analyzed_tim_textblob,
        analyzed_bill_textblob,
    )
]

In [27]:
# Add one column per author to the TextBlob copy of the price frame, holding
# that author's daily mean TextBlob polarity. The dict order fixes the column order.
_textblob_columns = {
    'biden': day_grouped_biden_textblob,
    'musk': day_grouped_musk_textblob,
    'trump': day_grouped_trump_textblob,
    'tim': day_grouped_tim_textblob,
    'bill': day_grouped_bill_textblob,
    'jeff': day_grouped_jeff_textblob,
}
for _author, _daily_polarity in _textblob_columns.items():
    sp500_tb[_author] = _daily_polarity

In [28]:
# Fill the missing values in the table: linear interpolation first, then a
# back-fill followed by a forward-fill for anything interpolation left empty.
# `.bfill()` / `.ffill()` replace the deprecated `fillna(method=...)` form.
# NOTE(review): back-filling copies later values into earlier rows — confirm
# that this is acceptable for the downstream use of the table.
sp500_tb = sp500_tb.interpolate(method='linear').bfill().ffill()

In [29]:
# Write the price + TextBlob sentiment table to ./sentiment/data_textblob.csv,
# creating the directory on first use.
target_directory = './sentiment'
_directory_missing = not os.path.exists(target_directory)
if _directory_missing:
    os.makedirs(target_directory)

_textblob_csv_path = os.path.join(target_directory, 'data_textblob.csv')
sp500_tb.to_csv(_textblob_csv_path, encoding='utf-8', index=True)