In [1]:
pip install vaderSentiment

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [3]:
TeslaR = pd.read_csv("BareTesla.csv")

In [14]:
TeslaR.shape

(9414, 13)

In [4]:
TeslaAksje = pd.read_csv("TeslaLabelFinal1.csv")

In [5]:
#Merge datasettene
merge = TeslaR.merge(TeslaAksje, how = "inner", on = "Date")

In [6]:
#Create a function to get the subjecticity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

#Create a fuction to get the polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [7]:
#Create two new columns "Subjectivity" and "Polarity"
merge["Subjectivity"] = merge["article_content"].apply(getSubjectivity)
merge["Polarity"] = merge["article_content"].apply(getPolarity)

In [8]:
merge["article_content"] = merge["article_content"].str.lower()

In [9]:
import nltk

from nltk.corpus import stopwords
stop = stopwords.words('english')

In [10]:
merge['article_content_without_stopwords'] = merge['article_content'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [11]:
from nltk.stem import PorterStemmer

port = PorterStemmer()

In [12]:
def stem_words(article_content_without_stopwords):
    return " ".join([port.stem(word) for word in article_content_without_stopwords.split()])

merge["EtterStemming"] = merge["article_content_without_stopwords"].apply(lambda article_content_without_stopwords: stem_words(article_content_without_stopwords))
merge.head()

Unnamed: 0.1,Unnamed: 0,Date,X,X_id,article_link,article_title,article_time,author_name,author_link,article_content,...,Low,Close,Volume,Dividends,Stock Splits,Label,Subjectivity,Polarity,article_content_without_stopwords,EtterStemming
0,1,2018-03-13,62,{'$oid': '5ab8cb373efc2c03c520c0cb'},https://www.nasdaq.com/article/can-tesla-inc-h...,"Can Tesla, Inc. Hit Its Model 3 Production Tar...",{'$date': '2018-03-13T11:05:00Z'},"By Daniel Sparks, Motley Fool",https://www.nasdaq.com/author/fool,with just a few weeks left in tesla's (nasdaq:...,...,65.300003,68.367996,29829000,0,0.0,1,0.513533,0.184208,weeks left tesla's (nasdaq: tsla) first quarte...,"week left tesla' (nasdaq: tsla) first quarter,..."
1,2,2018-03-13,370,{'$oid': '5ab8d0e43efc2c03c520cd51'},https://www.nasdaq.com/article/6-energy-stocks...,6 Energy Stocks I'd Avoid at All Costs,{'$date': '2018-03-13T06:05:00Z'},"By Travis Hoium, Motley Fool",https://www.nasdaq.com/author/fool,the energy industry has been one of the most s...,...,65.300003,68.367996,29829000,0,0.0,1,0.512743,0.221662,energy industry one stable investors last cent...,energi industri one stabl investor last centur...
2,4,2018-03-13,787,{'$oid': '5ab8d5ed3efc2c03c520d8a7'},https://www.nasdaq.com/article/why-we-might-be...,Why We Might Be Hitting Peak Smartphone,{'$date': '2018-03-13T16:48:13Z'},"By Chris Hill, Motley Fool",https://www.nasdaq.com/author/fool,"in to this episode ofmarketfoolery , analyst s...",...,65.300003,68.367996,29829000,0,0.0,1,0.499273,0.135207,"episode ofmarketfoolery , analyst simon ericks...","episod ofmarketfooleri , analyst simon erickso..."
3,5,2018-03-13,793,{'$oid': '5ab8d61e3efc2c03c520d915'},https://www.nasdaq.com/article/nasdaq-streak-s...,"Nasdaq Streak Snapped on Tech Pullback, White ...",{'$date': '2018-03-13T16:26:43Z'},"By pmartin@sir-inc.com, Schaeffer's Research",https://www.nasdaq.com/author/schaeffers,"stocks once again started the day off strong, ...",...,65.300003,68.367996,29829000,0,0.0,1,0.463206,-0.015142,"stocks started day strong, thanks encouragingi...","stock start day strong, thank encouraginginfl ..."
4,9,2018-03-13,1113,{'$oid': '5ab8d62a3efc2c03c520d933'},https://www.nasdaq.com/article/tesla-bulls-bew...,Tesla Bulls Beware: Volkswagen Wants to Be the...,{'$date': '2018-03-13T16:20:00Z'},"By Benjamin Rains, Zacks.com",https://www.nasdaq.com/author/zacks,german automotive giant volkswagen vlkaydetail...,...,65.300003,68.367996,29829000,0,0.0,1,0.435281,0.023066,german automotive giant volkswagen vlkaydetail...,german automot giant volkswagen vlkaydetail pl...


In [15]:
#Create a function to get the sentiment scores
def getSIA(EtterStemming):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(EtterStemming)
    return sentiment

In [16]:
#Get the sentiment scores for each day
compound = []
neg = []
pos = []
neu = []
SIA = 0

for i in range(0, len(merge["EtterStemming"])):
    SIA = getSIA(merge["EtterStemming"][i])
    compound.append(SIA["compound"])
    neg.append(SIA["neg"])
    neu.append(SIA["neu"])
    pos.append(SIA["pos"])

In [17]:
#Store the sentiment scores in the merge data set:
merge["Compound"] = compound
merge["Negative"] = neg
merge["Neutral"] = neu
merge["Positive"] = pos

In [18]:
#Show the merge data
merge.head(3)

Unnamed: 0.1,Unnamed: 0,Date,X,X_id,article_link,article_title,article_time,author_name,author_link,article_content,...,Stock Splits,Label,Subjectivity,Polarity,article_content_without_stopwords,EtterStemming,Compound,Negative,Neutral,Positive
0,1,2018-03-13,62,{'$oid': '5ab8cb373efc2c03c520c0cb'},https://www.nasdaq.com/article/can-tesla-inc-h...,"Can Tesla, Inc. Hit Its Model 3 Production Tar...",{'$date': '2018-03-13T11:05:00Z'},"By Daniel Sparks, Motley Fool",https://www.nasdaq.com/author/fool,with just a few weeks left in tesla's (nasdaq:...,...,0.0,1,0.513533,0.184208,weeks left tesla's (nasdaq: tsla) first quarte...,"week left tesla' (nasdaq: tsla) first quarter,...",0.944,0.044,0.876,0.08
1,2,2018-03-13,370,{'$oid': '5ab8d0e43efc2c03c520cd51'},https://www.nasdaq.com/article/6-energy-stocks...,6 Energy Stocks I'd Avoid at All Costs,{'$date': '2018-03-13T06:05:00Z'},"By Travis Hoium, Motley Fool",https://www.nasdaq.com/author/fool,the energy industry has been one of the most s...,...,0.0,1,0.512743,0.221662,energy industry one stable investors last cent...,energi industri one stabl investor last centur...,0.9851,0.074,0.808,0.119
2,4,2018-03-13,787,{'$oid': '5ab8d5ed3efc2c03c520d8a7'},https://www.nasdaq.com/article/why-we-might-be...,Why We Might Be Hitting Peak Smartphone,{'$date': '2018-03-13T16:48:13Z'},"By Chris Hill, Motley Fool",https://www.nasdaq.com/author/fool,"in to this episode ofmarketfoolery , analyst s...",...,0.0,1,0.499273,0.135207,"episode ofmarketfoolery , analyst simon ericks...","episod ofmarketfooleri , analyst simon erickso...",0.9997,0.036,0.8,0.164


In [19]:
#Create a list of column to keep
keep_columns = ["Open", "Low", "High", "Volume", "Subjectivity", "Polarity", "Compound", "Negative", "Neutral", "Positive", "Label"]
df = merge[keep_columns]
df

Unnamed: 0,Open,Low,High,Volume,Subjectivity,Polarity,Compound,Negative,Neutral,Positive,Label
0,65.722000,65.300003,69.442001,29829000,0.513533,0.184208,0.9440,0.044,0.876,0.080,1
1,65.722000,65.300003,69.442001,29829000,0.512743,0.221662,0.9851,0.074,0.808,0.119,1
2,65.722000,65.300003,69.442001,29829000,0.499273,0.135207,0.9997,0.036,0.800,0.164,1
3,65.722000,65.300003,69.442001,29829000,0.463206,-0.015142,0.8402,0.064,0.845,0.091,1
4,65.722000,65.300003,69.442001,29829000,0.435281,0.023066,0.9627,0.005,0.933,0.061,1
...,...,...,...,...,...,...,...,...,...,...,...
8672,43.147999,42.405998,43.759998,27081000,0.511702,0.085723,0.9337,0.066,0.836,0.098,0
8673,43.147999,42.405998,43.759998,27081000,0.540350,0.054140,-0.9693,0.125,0.793,0.082,0
8674,43.147999,42.405998,43.759998,27081000,0.454072,0.102247,0.9992,0.030,0.878,0.093,0
8675,43.147999,42.405998,43.759998,27081000,0.469294,0.179761,0.9766,0.059,0.838,0.104,0


In [20]:
#Create the feature data set
X = df
X = np.array(X.drop(["Label"], 1))

#Create the target data set
y = np.array(df["Label"])

  X = np.array(X.drop(["Label"], 1))


In [21]:
#Split the data into 80% trainig and 20% testing data set
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [22]:
#Create and train the model
model = LinearDiscriminantAnalysis().fit(x_train, y_train)

In [23]:
#Show the models predictions
predictions = model.predict(x_test)
predictions

array([1, 0, 1, ..., 1, 1, 1])

In [24]:
y_test

array([1, 0, 1, ..., 1, 1, 1])

In [25]:
#Show the model metrics
print( classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86       922
           1       0.84      0.84      0.84       814

    accuracy                           0.85      1736
   macro avg       0.85      0.85      0.85      1736
weighted avg       0.85      0.85      0.85      1736

