In [233]:
from textblob import TextBlob
from textblob import Blobber
import nltk
from nltk.corpus import stopwords
from stop_words import get_stop_words
import re
from nltk.util import ngrams

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [234]:
# from twitterscraper import query_tweets

# def get_tweets(ticker, name, start='2019-01-01', end='2020-01-01'):
#     list_of_tweets = query_tweets(f'(${ticker} OR {name}) (from:wsj OR from:reuters OR from:business OR from:cnbc OR from:RANsquawk OR from:wsjmarkets) until:{end} since:{start}', 100)
#     return list_of_tweets

In [235]:
# tsla_tweets = get_tweets('TSLA', 'Tesla', start='2019-01-01', end='2020-01-01')
# tsla_tweets = [text]

In [236]:
df_original = pd.read_json('tesla_tweets.json', encoding='utf-8')
df = df_original.copy()

In [237]:
df = df[['timestamp', 'text', 'username']]
df.head()

Unnamed: 0,timestamp,text,username
0,2011-07-20 19:55:32,Tesla wins $100 million Toyota supply deal | h...,Bloomberg
1,2011-05-25 01:22:17,Tesla CEO: No interest in selling electric car...,Bloomberg
2,2011-07-20 19:55:32,Tesla wins $100 million Toyota supply deal | h...,Bloomberg
3,2011-05-25 01:22:17,Tesla CEO: No interest in selling electric car...,Bloomberg
4,2012-05-07 23:45:12,Toyota to sell electric RAV4 with Tesla Motor ...,Bloomberg


In [238]:
df.set_index(pd.to_datetime(df['timestamp']), inplace=True)
df.drop(columns='timestamp', inplace=True)
df.sort_index(inplace=True)

In [239]:
def remove_URLs(text):
    
    tweet = text
    tweet = re.sub('http.+?$', '', tweet)
    
    return tweet

In [240]:
df.text = df.text.map(remove_URLs)
df.head()

Unnamed: 0_level_0,text,username
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-06-29 01:46:32,#Tesla Motors Raises $226 Million in First IPO...,Bloomberg
2010-06-29 01:46:32,#Tesla Motors Raises $226 Million in First IPO...,Bloomberg
2010-06-29 22:31:42,Carmaker Tesla's stock zooms 40 percent on fir...,Reuters
2010-06-29 22:31:42,Carmaker Tesla's stock zooms 40 percent on fir...,Reuters
2010-07-02 20:21:02,"Tesla shot up 57% after its IPO this week, the...",The Wall Street Journal


In [241]:
df.shape

(3638, 2)

In [242]:
df.drop_duplicates(subset=['text'], keep='first', inplace=True)

In [243]:
df.shape

(2354, 2)

In [244]:
def remove_singles(text):
    
    tweet = ' '.join([word for word in text.split(' ') if len(word) > 2])
    
    return tweet

In [245]:
df.text = df.text.map(remove_singles)
df.head()

Unnamed: 0_level_0,text,username
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-06-29 01:46:32,#Tesla Motors Raises $226 Million First IPO U....,Bloomberg
2010-06-29 22:31:42,Carmaker Tesla's stock zooms percent first day,Reuters
2010-07-02 20:21:02,"Tesla shot 57% after its IPO this week, then f...",The Wall Street Journal
2010-07-06 22:09:12,#Tesla Shares Fall Below Electric-Car Maker's ...,Bloomberg
2010-07-12 16:44:50,"Toyota, Tesla Said Prepare Battery-Powered RAV...",Bloomberg


In [246]:
pattern = ("([a-zA-Z]+(?:'[a-z]+)?)")
cv = CountVectorizer(token_pattern=pattern, 
                     stop_words=nltk.corpus.stopwords.words('english'),
                     ngram_range=(1,2))

cv_texts = cv.fit_transform(df.text)
cv_df = pd.DataFrame(cv_texts.toarray())
cv_df.columns = cv.vocabulary_
cv_df

Unnamed: 0,tesla,motors,raises,million,first,ipo,u,carmaker,years,tesla motors,...,weeks start,start building,building plant,plant germany,germany risks,risks big,big delay,delay government,government minister,minister warns
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2349,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [247]:
cv_df.set_index(df.index, inplace=True)
cv_df.head()

Unnamed: 0_level_0,tesla,motors,raises,million,first,ipo,u,carmaker,years,tesla motors,...,weeks start,start building,building plant,plant germany,germany risks,risks big,big delay,delay government,government minister,minister warns
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-06-29 01:46:32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2010-06-29 22:31:42,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2010-07-02 20:21:02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2010-07-06 22:09:12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2010-07-12 16:44:50,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [248]:
cv_df['text'] = df.text

In [249]:
def sentiment(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment[0]

In [250]:
cv_df['sentiment_score'] = cv_df['text'].map(sentiment)
cv_df.head()

Unnamed: 0_level_0,tesla,motors,raises,million,first,ipo,u,carmaker,years,tesla motors,...,start building,building plant,plant germany,germany risks,risks big,big delay,delay government,government minister,minister warns,sentiment_score
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-06-29 01:46:32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.25
2010-06-29 22:31:42,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0.25
2010-07-02 20:21:02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.018519
2010-07-06 22:09:12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.136364
2010-07-12 16:44:50,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [251]:
cv_df.shape

(2354, 17513)

In [252]:
df_sp_orig = pd.read_csv('TSLA.csv')
df_sp = df_sp_orig.copy()
df_sp.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,19.0,25.0,17.540001,23.889999,23.889999,18766300
1,2010-06-30,25.790001,30.42,23.299999,23.83,23.83,17187100
2,2010-07-01,25.0,25.92,20.27,21.959999,21.959999,8218800
3,2010-07-02,23.0,23.1,18.709999,19.200001,19.200001,5139800
4,2010-07-06,20.0,20.0,15.83,16.110001,16.110001,6866900


In [253]:
def price_change(df):
    df_output = df.copy()
    df_output['price_change_%'] = round(((df_output['Close'] - df_output['Open'])/df_output['Open']*100), ndigits=2)
    return df_output

In [254]:
df_sp = price_change(df_sp)
df_sp.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,price_change_%
0,2010-06-29,19.0,25.0,17.540001,23.889999,23.889999,18766300,25.74
1,2010-06-30,25.790001,30.42,23.299999,23.83,23.83,17187100,-7.6
2,2010-07-01,25.0,25.92,20.27,21.959999,21.959999,8218800,-12.16
3,2010-07-02,23.0,23.1,18.709999,19.200001,19.200001,5139800,-16.52
4,2010-07-06,20.0,20.0,15.83,16.110001,16.110001,6866900,-19.45


In [255]:
df_sp.set_index(pd.to_datetime(df_sp['Date']), inplace=True)
df_sp.drop(columns='Date', inplace=True)
df_sp.sort_index(inplace=True)
df_sp.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,price_change_%
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,19.0,25.0,17.540001,23.889999,23.889999,18766300,25.74
2010-06-30,25.790001,30.42,23.299999,23.83,23.83,17187100,-7.6
2010-07-01,25.0,25.92,20.27,21.959999,21.959999,8218800,-12.16
2010-07-02,23.0,23.1,18.709999,19.200001,19.200001,5139800,-16.52
2010-07-06,20.0,20.0,15.83,16.110001,16.110001,6866900,-19.45


In [256]:
cv_df = cv_df.groupby(pd.Grouper(freq='D')).mean()
cv_df.sort_index(inplace=True)
cv_df.head()

Unnamed: 0_level_0,tesla,motors,raises,million,first,ipo,u,carmaker,years,tesla motors,...,start building,building plant,plant germany,germany risks,risks big,big delay,delay government,government minister,minister warns,sentiment_score
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-06-29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0.5
2010-06-30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2010-07-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2010-07-02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.018519
2010-07-03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


# Random Text Cleaning Functions

In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import *
from nltk import pos_tag
import re
from sklearn.feature_extraction.text import CountVectorizer
import string
string.punctuation

In [None]:
def get_wordnet_pos(treebank_tag):
    '''
    Translate nltk POS to wordnet tags
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [None]:
def clean_text(text_in):
    
    pattern = '[0-9]'
    wordnet_lemmatizer = WordNetLemmatizer()
    
    text = text_in
    text2 = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text3 = re.sub(pattern, '', text2)
    text4 = [word.lower() for word in word_tokenize(text3)]
    text5 = [word for word in text4 if word not in string.punctuation]
    text6 = [word for word in text5 if word not in stopwords.words('english')]
    text7 = [word for word in text6 if len(word) > 2]
    text8 = pos_tag(text7)
    text9 = [(word[0], get_wordnet_pos(word[1])) for word in text8]
    text_out = [wordnet_lemmatizer.lemmatize(word[0],word[1]) for word in text9]
    
    return text_out