# **Sentiment Analysis of Elon Musk and impact on stocks**


In [219]:
import itertools
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
import sys

import pandas_datareader.data as web
import os
import datetime
from alpha_vantage.timeseries import TimeSeries

# preprocess
import nltk
from nltk.corpus import stopwords
from textblob import Word, TextBlob

import plotly.graph_objects as go 
import plotly.express as px
from plotly.subplots import make_subplots


In [220]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists plus
# the WordNet / Open Multilingual WordNet data.
for nltk_package in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mtdra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mtdra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mtdra\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## **Extract Tweets**

In [221]:
from pandas import DataFrame

def search_tweets(query: str,
                     since: str = None,
                     until: str = None,
                     exclude_retweets: bool = False,
                     exclude_replies: bool = False,
                     username:str = None,
                     near:str = None,
                     radius:str = None,
                     lang:str = None,
                     max_tweets:int =-1,
                     quiet=False) -> DataFrame: 
    """
    Search tweets according to keyword arguments specified using snscrape.

    Parameters
    ----------
    query (str): A query text to be matched.
    since (str. "yyyy-mm-dd"): A lower bound date to restrict search. Default is 7 days before `until`.
    until (str. "yyyy-mm-dd"): An upper bound date to restrict search. Default is today (local date).
    exclude_retweets (Bool): Add the "exclude:retweets" filter to the search.
    exclude_replies (Bool): Add the "exclude:replies" filter to the search.
    username (str or iterable of str): Optional account name(s) to restrict the search to (with or without "@").
        Several names are combined with OR. Default is no username restriction.
    near (str): A reference location area (e.g. Milan). Default is no reference area.
    radius (str): A distance radius (e.g. 15km) around "near". Meaningful only if "near" is set.
    lang (str): Restrict language of the tweets retrieved (e.g. en, it). Default is no language restriction.
    max_tweets (int): The maximum number of tweets to be retrieved. If it is None or lower than 1,
        every available tweet is retrieved. Default is -1.
    quiet: Currently unused; kept so existing callers keep working.

    Returns
    -------
    DataFrame: one row per item yielded by the snscrape search (empty if nothing matched).
    """
    date_format = '%Y-%m-%d'
    if until is None:
        until = datetime.date.today().strftime(date_format)
    if since is None:
        week_earlier = datetime.datetime.strptime(until, date_format) - datetime.timedelta(days=7)
        since = week_earlier.strftime(date_format)

    # islice treats a stop of None as "no limit" and rejects negative stops, so
    # every "unlimited" request (None, 0, any negative number) is mapped to None.
    limit = max_tweets if max_tweets is not None and max_tweets >= 1 else None

    # NOTE(review): the surrounding double quotes also enclose the since:/until:
    # operators. Confirm the search backend still honours them as date filters
    # rather than as part of an exact phrase.
    criteria = f'"{query} since:{since} until:{until}"' 

    if exclude_retweets:
        criteria += " exclude:retweets"
    if exclude_replies:
        criteria += " exclude:replies"
    if username is not None:
        # Accept one name or several, and drop the optional leading "@".
        names = [username] if isinstance(username, str) else list(username)
        author_filters = [f"from:{str(name).lstrip('@')}" for name in names if str(name).lstrip('@')]
        if len(author_filters) == 1:
            criteria += f" {author_filters[0]}"
        elif author_filters:
            criteria += " (" + " OR ".join(author_filters) + ")"
    if near is not None:
        criteria += f" near:{near.replace(' ', '&')}"
    if radius is not None:
        criteria += f" within:{radius}"
    if lang is not None:
        criteria += f" lang:{lang}"

    scraped_items = sntwitter.TwitterSearchScraper(criteria).get_items()
    return pd.DataFrame(itertools.islice(scraped_items, limit))

In [222]:

def daily_tweets(n_daily_tweets: int = 100, t_days: int = 30, query: str = '', username: str = '@elonmusk') -> DataFrame:
    '''
    Returns a dataframe of at most n daily tweets for a given query for a specified period of time ending today.

    Parameters
    ------------
    n_daily_tweets (int): The maximum number of tweets to extract for each day.
    t_days (int): The number of days prior to today for which you want to extract tweets.
        Zero or a negative number yields an empty dataframe.
    query (str): The hashtag or text to search for; it is wrapped in double quotes before searching.
    username (str): Account to restrict the search to, or None for tweets from any account.

    Returns
    ------------
    DataFrame: the per-day search results stacked newest day first, with a fresh 0..n-1 index.
        Each row keeps its position within its own day in the "index" column.
    '''
    # Read the clock once so every window is derived from the same instant.
    today = datetime.datetime.today()
    daily_frames = []

    for day_n in range(t_days):
        # One-day window: [today - (day_n + 1), today - day_n)
        window_start = (today - datetime.timedelta(days=day_n + 1)).strftime("%Y-%m-%d")
        window_end = (today - datetime.timedelta(days=day_n)).strftime("%Y-%m-%d")

        df_day = search_tweets(
            query='"{}"'.format(query),
            since=window_start,
            until=window_end,
            exclude_retweets=True,
            exclude_replies=False,
            max_tweets=n_daily_tweets,
            lang='en',
            username=username,
        )

        # reset_index() turns the per-day row position into an "index" column.
        daily_frames.append(df_day.reset_index())

    if not daily_frames:
        # No day was requested: return an empty frame rather than failing.
        return pd.DataFrame()

    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(daily_frames, ignore_index=True)


In [223]:
# Collect up to 20,000 tweets per day containing "#elonmusk" over the last 30 days.
# username=None means no author filter is applied.
df = daily_tweets(
    n_daily_tweets=20000,
    t_days=30,
    query='#elonmusk',
    username=None,
)

#### **Preprocess tweets**

In [224]:
# Topic-specific words to drop from every tweet in addition to the generic list.
custom_stopwords = ['elon', 'musk', 'tesla']
# Generic English stop words shipped with NLTK.
stop_words = stopwords.words('english')

In [225]:
import re

# Matches every character that is neither a word character nor whitespace.
_NON_WORD_CHARS = re.compile(r'[^\w\s]')


def clean_tweet(tweet , custom_stopwords):
    """
    Normalise the text of one tweet for sentiment scoring.

    Steps, applied to each whitespace-separated token:
      1. drop @handles (checked first, while the "@" is still present);
      2. strip punctuation;
      3. drop stop words from the module-level `stop_words` list and from
         `custom_stopwords`, compared case-insensitively;
      4. lemmatize what is left with textblob's `Word.lemmatize`.

    Parameters
    ----------
    tweet (str): Raw tweet text.
    custom_stopwords (iterable of str): Extra words to remove.

    Returns
    -------
    str: The surviving tokens joined by single spaces (may be empty).
    """
    # The stop-word lists are lowercase, so match on lowercased tokens.
    blocked = {word.lower() for word in stop_words}
    blocked.update(word.lower() for word in custom_stopwords)

    kept = []
    for token in tweet.split():
        if token.startswith('@'):
            continue
        bare = _NON_WORD_CHARS.sub('', token)
        if not bare:
            # The token was punctuation only.
            continue
        # Test both spellings so that "don't" and "Tesla," are both caught.
        if token.lower() in blocked or bare.lower() in blocked:
            continue
        kept.append(Word(bare).lemmatize())

    return ' '.join(kept)


In [226]:
# Keep the timestamp, the rendered text and the engagement counters only,
# then store the cleaned version of each tweet next to the original text.
df = df.loc[
    :,
    ['date', 'renderedContent', 'replyCount', 'retweetCount', 'likeCount', 'quoteCount'],
]
df['clean_tweet'] = df['renderedContent'].apply(clean_tweet, args=(custom_stopwords,))

**Calculate Sentiment**

In [227]:
# TextBlob's sentiment is a (polarity, subjectivity) pair:
#   polarity     - from -1 (most negative) to +1 (most positive)
#   subjectivity - from 0 (objective) to 1 (subjective)
# Analyse each tweet once and reuse the result for both columns.
tweet_sentiments = [TextBlob(text).sentiment for text in df['clean_tweet']]
df['polarity'] = pd.Series([s[0] for s in tweet_sentiments], index=df.index, dtype=float)
df['subjectivity'] = pd.Series([s[1] for s in tweet_sentiments], index=df.index, dtype=float)

**Group by day**

In [228]:
# Free the 'date' name for a pure calendar-date column; `inplace` takes a real bool.
df.rename(columns={'date': 'datetime'}, inplace=True)

In [229]:
# Calendar date (time of day dropped) taken from the 'datetime' column.
df['date'] = df['datetime'].dt.date

In [230]:
# Mean polarity and subjectivity per calendar date.
df_time = df.groupby('date')[['polarity', 'subjectivity']].mean()

**Including Error Bars**

The distribution of the daily sentiment scores could be skewed, so it is better to use asymmetrical error bars and take the 95% interval directly from the percentiles (the 2.5th and 97.5th). This is preferable to assuming that the data are normally distributed, in which case the interval [μ - 2σ, μ + 2σ] would cover about 95.5% of the data and 2 * std could be used to estimate the 95% interval.

In [231]:
def error95interval(df:DataFrame , grouped_by:str , y:str , upper_lower:str = 'lower' ,
                    lower_q:float = 0.025 , upper_q:float = 0.975) -> DataFrame:
    '''
    Return the lower or upper bound of `y` within each group of `grouped_by`, taken from percentiles.

    No normal distribution is assumed: the bounds are the per-group quantiles, so the
    resulting error bars may be asymmetrical. With the default quantiles (2.5% and 97.5%)
    the two bounds enclose a 95% interval.

    Parameters
    ------------
    df (DataFrame): Data containing the columns `grouped_by` and `y`.
    grouped_by (str): Column to group on.
    y (str): Column whose per-group quantile is computed.
    upper_lower (str): 'upper' selects `upper_q`; any other value selects `lower_q`.
    lower_q (float): Quantile used for the lower bound. Default 0.025.
    upper_q (float): Quantile used for the upper bound. Default 0.975.

    Returns
    ------------
    DataFrame: indexed by the `grouped_by` values, with the single column `y` holding the bound.
    '''
    quantile = upper_q if upper_lower == 'upper' else lower_q
    return df[[grouped_by, y]].groupby(grouped_by).quantile(quantile)

In [232]:
# Attach the lower and upper percentile bounds of each sentiment metric to the daily means.
for metric in ('polarity', 'subjectivity'):
    for bound in ('lower', 'upper'):
        df_time.loc[:, f'{metric}_{bound}'] = error95interval(
            df=df,
            grouped_by='date',
            y=metric,
            upper_lower=bound,
        )

# **Quick Guide to the Stock Market**

* The S&P 500, Nasdaq, the Dow, and others are examples of market indexes. An index provides a summary of the market by tracking a sample of top stocks in that market.

    * The Dow (Dow Jones Industrial Average) tracks 30 large, well-established U.S. companies.

    * The Nasdaq market index, known as the Nasdaq composite, tracks the roughly 3,000 companies that are traded on the Nasdaq Exchange.

    * Difference between the Nasdaq 100 and the Nasdaq Composite Index: *The Nasdaq Composite Index comprises all domestic and international stocks listed on the Nasdaq Stock Market, while the Nasdaq 100 index is a large-cap growth index that includes 100 of the top domestic and international non-financial companies based on market capitalization.*

    * The S&P 500 tracks 500 large U.S. companies across a span of industries and sectors.


* The stock market is a “forward-looking mechanism” or “discounting mechanism.” That is, participants are always looking forward and prices are always adjusted according to the anticipation of future events.

* Investors vs Traders: Investors like to buy and hold for many years whereas Traders like to buy and sell stocks more quickly, maybe holding them for only an hour, a day, a week, or a month.

* Market cap — or market capitalization — refers to the total value of all a company’s shares of stock. It is calculated by multiplying the price of a stock by its total number of outstanding shares. For example, a company with 20 million shares selling at $50 a share would have a market cap of $1 billion.

* Every stock has a bid price and an offer (or “ask”) price. The bid is the price at which someone is willing to buy the stock. The offer is the price at which someone is willing to sell the stock.

* A liquid stock is defined as a stock where you can buy or sell a lot of shares without moving the stock too much. Liquid stocks in the U.S. usually have a bid-ask spread of just a penny or two. Eg: Microsoft.

In [233]:
# Read the Alpha Vantage API key from a local file. The context manager closes
# the file, and strip() removes the trailing newline most editors add.
with open('alphavantage_api_key.txt') as key_file:
    key = key_file.read().strip()

In [234]:
# Alpha Vantage client returning pandas objects; fetch the daily TSLA series
# (outputsize='full') together with the metadata of the response.
ts = TimeSeries(key=key, output_format='pandas')
df_stock, meta = ts.get_daily(symbol='TSLA', outputsize='full')

In [235]:
# Turn the index into a regular 'datetime' column and add a pure calendar
# 'date' column next to it. `inplace` takes a real bool.
df_stock.reset_index(inplace=True)
df_stock.rename(columns={'date': 'datetime'}, inplace=True)
df_stock['date'] = df_stock['datetime'].dt.date

In [236]:

# Cut-off 30 days before now, formatted as 'YYYY-MM-DD', then keep the rows
# whose 'datetime' is on or after it.
tminus30 = (datetime.datetime.today() - datetime.timedelta(days=30)).strftime("%Y-%m-%d")

last30 = df_stock.loc[df_stock['datetime'] >= tminus30, :]


In [237]:
# Plot the closing price against the trading date. After reset_index() the
# frame's index is only a row number, so the 'datetime' column is the x axis.
px.line(last30 , x='datetime' , y='4. close')

Inner join data frames and plot on same axis

In [238]:
# Inner join on 'date': keep only the dates present in both the stock data
# and the daily sentiment summary.
df_all = df_stock.merge(df_time, how='inner', on='date')

### **Plot both**

In [243]:
import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

# Read the Chart Studio API key from a local file. The context manager closes
# the file, and strip() removes the trailing newline most editors add.
with open("plotly_chart_studio_api_key.txt") as api_key_file:
    api_key = api_key_file.read().strip()

username = "mutombii"

# Store the credentials for later chart_studio calls.
chart_studio.tools.set_credentials_file(username=username , api_key=api_key)

In [240]:
# Two stacked panels, each with a secondary y axis for the share price.
fig = make_subplots(
    rows=2,
    cols=1,
    specs=[[{"secondary_y": True}], [{"secondary_y": True}]],
)

# One panel per sentiment metric:
# (subplot row, df_all column, legend / axis label, line colour)
panels = [
    (1, 'polarity', 'Polarity', '#636EFA'),
    (2, 'subjectivity', 'subjectivity', '#00CC96'),
]

for panel_row, metric, label, colour in panels:
    # Daily mean of the metric.
    fig.add_trace(
        go.Scatter(
            x=df_all['date'],
            y=df_all[metric],
            name=label,
            mode='lines',
            line=dict(color=colour),
        ),
        row=panel_row, col=1,
    )

    # Percentile band. The upper bound is added first so that the lower
    # bound's fill='tonexty' shades the area up to it.
    fig.add_trace(
        go.Scatter(
            name='Upper Bound',
            x=df_all['date'],
            y=df_all[f'{metric}_upper'],
            mode='lines',
            marker=dict(color="#444"),
            line=dict(width=0),
            showlegend=False,
        ),
        row=panel_row, col=1,
    )
    fig.add_trace(
        go.Scatter(
            name='Lower Bound',
            x=df_all['date'],
            y=df_all[f'{metric}_lower'],
            mode='lines',
            marker=dict(color="#444"),
            line=dict(width=0),
            fillcolor='rgba(68, 68, 68, 0.3)',
            fill='tonexty',
            showlegend=False,
        ),
        row=panel_row, col=1,
    )

    # Closing price on the secondary axis. Only the first panel leaves the
    # legend entry at its default, so 'TSLA' is not listed twice.
    price_kwargs = dict(
        x=df_all['date'],
        y=df_all['4. close'],
        name='TSLA',
        mode='lines',
        line=dict(color='#EF553B'),
    )
    if panel_row == 2:
        price_kwargs['showlegend'] = False
    fig.add_trace(go.Scatter(**price_kwargs), row=panel_row, col=1, secondary_y=True)

# Axis styling, applied once all traces are in place.
for panel_row, metric, label, colour in panels:
    fig.update_yaxes(title='Share Price (USD)', row=panel_row, col=1, secondary_y=True,
                     gridwidth=0.1, gridcolor='Grey', showgrid=False)
    fig.update_yaxes(title=label, row=panel_row, col=1, secondary_y=False,
                     gridwidth=0.1, gridcolor='Grey', showgrid=False)
    fig.update_xaxes(showgrid=True, gridwidth=0.1, gridcolor='Grey', row=panel_row, col=1)

# Transparent background with white text.
fig.update_layout(
    hovermode="x",
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    font_color='rgba(255,255,255,1)',
    height=900,
)

In [242]:
#py.plot(fig , filename='elons_tweet_sentiment' , auto_open=False)
#py.plot(fig , filename='elons_tweet_publicsentiment' , auto_open=False)

'https://plotly.com/~mutombii/28/'