In [1]:
# imports
import json
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from nltk.stem import PorterStemmer
from num2words import num2words
from nltk.corpus import words
from nltk.corpus import wordnet
from nltk.corpus import words as nltk_words
nltk.download('words')
from collections import defaultdict
from sklearn.ensemble import RandomForestRegressor

[nltk_data] Downloading package words to /Users/Nelson/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Load in Data

with open("Tesla_Comments_p.json", "r") as file:
    tesla_data_p = json.load(file)
with open("Tesla_Comments_c.json", "r") as file:
    tesla_data_c = json.load(file)
    
with open("GE_Comments_p.json", "r") as file:
    ge_data_p = json.load(file)
with open("GE_Comments_c.json", "r") as file:
    ge_data_c = json.load(file)
     
with open("AMD_Comments_p.json", "r") as file:
    amd_data_p = json.load(file)
with open("AMD_Comments_c.json", "r") as file:
    amd_data_c = json.load(file)
    
with open("NVDA_Comments_p.json", "r") as file:
    nvda_data_p = json.load(file)
with open("NVDA_Comments_c.json", "r") as file:
    nvda_data_c = json.load(file)

    
tesla_stock_data = pd.read_csv("TSLA.csv")
ge_stock_data = pd.read_csv("GE.csv")
amd_stock_data = pd.read_csv("AMD.csv")
nvda_stock_data = pd.read_csv("NVDA.csv")

posts = [tesla_data_p, ge_data_p, amd_data_p, nvda_data_p]
comms = [tesla_data_c, ge_data_c, amd_data_c, nvda_data_c]
stocs = [tesla_stock_data, ge_stock_data, amd_stock_data, nvda_stock_data]
tickers = ["TSLA","GE","AMD","NVDA"]

In [3]:
# Combining and Cleaning Finance and Social Media data

combined = pd.DataFrame()
for comm, post, stoc, ticker in zip(comms, posts, stocs, tickers):
    
    # create columns
    df_comments = pd.json_normalize(comm).transpose()
    df_posts = pd.json_normalize(post).transpose()
    #df_posts.rename(columns={0,"Posts"})
    df_posts.columns = ['Posts']
    
    df_stock_info = stoc.set_index("Date")
    
    # join columns
    joined_df = df_stock_info.join(df_comments)
    joined_df = joined_df.join(df_posts)

    # add price column chance
    joined_df["Price_Change"] = joined_df["Close"] - joined_df["Open"]

    def dire(x):
        if x>0:
            return "up"
        elif x<0:
            return "down"
        else:
            return "no change"

    joined_df["Direction"] = joined_df["Price_Change"].apply(dire) 

    joined_df["Ticker"] = ticker


    joined_df2 = joined_df.rename(columns={"Open":"Open_Price","High":"High_Price","Low":"Low_Price","Close":"Close_Price",
                              "Volume":"Trade_Volume","Direction":"Price_Direction",0:"Comments"})
    
    joined_df3 = joined_df2
    
    combined = pd.concat([combined,joined_df3])
    
# drop rows where posts are empty so comments are nan
combined = combined.dropna()

In [134]:
# Sentiment Creation

Post_Sentiment = []
Comment_Sentiment = []

sia = SentimentIntensityAnalyzer()

Total_Positive_Sentiment_Comm = []
Total_Negative_Sentiment_Comm = []
Total_Positive_Sentiment_Post = []
Total_Negative_Sentiment_Post = []
Number_of_Comments = []

for p, c in zip(combined["Posts"], combined["Comments"]):
    
    # Comment Sentiment
    tot_pos = 0
    tot_neg = 0

    for comment in c:
        pos = sia.polarity_scores(comment)["pos"]
        tot_pos = tot_pos + pos
        
        neg = sia.polarity_scores(comment)["neg"]
        tot_neg = tot_neg + neg
        
    if tot_pos > tot_neg:
        Comment_Sentiment.append("Pos")
    else:
        Comment_Sentiment.append("Neg")
        
    Total_Positive_Sentiment_Comm.append(tot_pos)
    Total_Negative_Sentiment_Comm.append(tot_neg)
       
        
    # Post Sentiment
    tot_pos = 0
    tot_neg = 0

    for post in p:
        pos = sia.polarity_scores(post)["pos"]
        tot_pos = tot_pos + pos
        
        neg = sia.polarity_scores(post)["neg"]
        tot_neg = tot_neg + neg

    if tot_pos > tot_neg:
        Post_Sentiment.append("Pos")
    else:
        Post_Sentiment.append("Neg")
    
    Total_Positive_Sentiment_Post.append(tot_pos)
    Total_Negative_Sentiment_Post.append(tot_neg) 
    
    Number_of_Comments.append(len(c))
    
    
combined["Post_Sentiment"] = Post_Sentiment
combined["Comment_Sentiment"] = Comment_Sentiment
combined["Total_Positive_Sentiment_Comm"] = Total_Positive_Sentiment_Comm
combined["Total_Negative_Sentiment_Comm"] = Total_Negative_Sentiment_Comm
combined["Total_Positive_Sentiment_Post"] = Total_Positive_Sentiment_Post
combined["Total_Negative_Sentiment_Post"] =Total_Negative_Sentiment_Post
combined["Number_of_Comments"] = Number_of_Comments

combined2 = combined[["Ticker","Posts","Post_Sentiment","Total_Positive_Sentiment_Post",
                      "Total_Negative_Sentiment_Post", "Comments", "Comment_Sentiment",
                      "Total_Positive_Sentiment_Comm", 
                      "Total_Negative_Sentiment_Comm","Number_of_Comments","Price_Change",
                      "Price_Direction","Open_Price","Close_Price","High_Price",
                      "Low_Price","Trade_Volume"]]

# Remove rows with empty posts.
mask = []
for val in combined2["Posts"]:
    if val[0] == "":
        mask.append(False)
    else:
        mask.append(True)
        
comb3 = combined2[mask]

In [135]:
## CAREFUL NOT TO RANDOMIZE DATA HERE SINCE USING TIME SERIES APPROACH

In [148]:
variables = pd.DataFrame()

variables['Post_Sentiment'] = np.where(comb3['Post_Sentiment']=="Pos", 1, 0)
variables['Comment_Sentiment'] = np.where(comb3['Comment_Sentiment']=="Pos", 1, 0)



variables["Numb_Comm_ized"] = (np.array(comb3["Number_of_Comments"]) - 
                               np.mean(np.array(comb3["Number_of_Comments"]))
                              ) / np.std(np.array(comb3["Number_of_Comments"]))


variables["Numb_Comm_ized"] = (np.array(comb3["Number_of_Comments"]) - 
                               np.mean(np.array(comb3["Number_of_Comments"]))
                              ) / np.std(np.array(comb3["Number_of_Comments"]))

# this feature will be about amount of change, and not direction
comb3['Price_Change'] = np.where(comb3['Price_Change']>=0, comb3['Price_Change'], abs(comb3['Price_Change']))
variables["Abs_Val_Price_Change"] = (np.array(comb3["Price_Change"]) - 
                               np.mean(np.array(comb3["Price_Change"]))
                              ) / np.std(np.array(comb3["Price_Change"]))

variables["Open_Price"] = (np.array(comb3["Open_Price"]) - 
                               np.mean(np.array(comb3["Open_Price"]))
                              ) / np.std(np.array(comb3["Open_Price"]))

variables["Close_Price"] = (np.array(comb3["Close_Price"]) - 
                               np.mean(np.array(comb3["Close_Price"]))
                              ) / np.std(np.array(comb3["Close_Price"]))

variables["High_Price"] = (np.array(comb3["High_Price"]) - 
                               np.mean(np.array(comb3["High_Price"]))
                              ) / np.std(np.array(comb3["High_Price"]))

variables["Low_Price"] = (np.array(comb3["Low_Price"]) - 
                               np.mean(np.array(comb3["Low_Price"]))
                              ) / np.std(np.array(comb3["Low_Price"]))

variables["Trade_Volume"] = (np.array(comb3["Trade_Volume"]) - 
                               np.mean(np.array(comb3["Trade_Volume"]))
                              ) / np.std(np.array(comb3["Trade_Volume"]))

y_var = pd.DataFrame()
y_var['Price_Direction'] = np.where(comb3['Price_Direction']=="up", 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [149]:
variables

Unnamed: 0,Post_Sentiment,Comment_Sentiment,Numb_Comm_ized,Abs_Val_Price_Change,Open_Price,Close_Price,High_Price,Low_Price,Trade_Volume
0,1,0,-1.084598,0.140257,0.700466,0.678465,0.735682,0.713167,0.451718
1,0,1,-0.903058,-0.291997,0.707764,0.717920,0.715643,0.729689,-0.229869
2,1,0,-0.963571,-0.124322,0.733013,0.748918,0.719111,0.756894,-0.633023
3,1,1,-0.963571,-0.186384,0.782243,0.796049,0.771306,0.795585,-0.491689
4,1,0,-1.145111,-0.389988,0.808161,0.804357,0.799338,0.818097,-0.460842
...,...,...,...,...,...,...,...,...,...
334,1,1,-1.205624,0.528954,1.105813,1.070709,1.071010,1.114569,-1.256931
335,1,1,-1.024084,0.162031,1.076990,1.054427,1.039984,1.077672,-1.235608
336,0,1,-0.297925,-0.466205,1.056509,1.060873,1.041444,1.089729,-1.262528
337,1,1,-0.721518,0.131548,1.066228,1.091051,1.062067,1.098047,-1.227421


In [164]:
def f(row, df, ys):
    row_number = int(row.name)

    try:
        xs1 = df.loc[row_number-3,:]
        xs2 = df.loc[row_number-2,:]
        xs3 = df.loc[row_number-1,:]

        y = y_var.loc[row_number,:][0]
        
        row = []
        for x1 in xs1:
            row.append(x1)
        for x2 in xs2:
            row.append(x2)
        for x3 in xs3:
            row.append(x3)
        row.append(y)


        return row
        
    except KeyError:
        pass


new_df = variables.apply(f, axis = 1, df = variables, ys = y_var)

In [165]:
df = pd.DataFrame()
for row in new_df.values[3:]:
    df = pd.concat([df, pd.DataFrame(row).T])

In [166]:
y_vars = np.array(df.loc[:,27])
df2 = df.drop([27], axis=1)
vs = df2.to_numpy()

In [167]:
X_train, X_test, y_train, y_test = train_test_split(vs, y_vars, test_size=0.25)

In [168]:
# Random Forest ML Model Based on Last 3 Days

# Builds out 1000 trees 
model = RandomForestRegressor(n_estimators=1000)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [176]:
c = 0
w = 0
for prediction, actual in zip(model.predict(X_test), y_test):
    if prediction >=.5:
        prediction = 1
    else:
        prediction = 0
    
    if prediction - actual == 0:
        c += 1
    else:
        w += 1

print("3 Day Accuracy:", c / (w + c))

3 Day Accuracy: 0.5357142857142857


In [198]:






# Let's try to increase the window to 10 days.







In [199]:
def f2(row, df, ys):
    row_number = int(row.name)

    try:
        xs1 = df.loc[row_number-10,:]
        xs2 = df.loc[row_number-9,:]
        xs3 = df.loc[row_number-8,:]
        xs4 = df.loc[row_number-7,:]
        xs5 = df.loc[row_number-6,:]
        xs6 = df.loc[row_number-5,:]
        xs7 = df.loc[row_number-4,:]
        xs8 = df.loc[row_number-3,:]
        xs9 = df.loc[row_number-2,:]
        xs10 = df.loc[row_number-1,:]


        y = y_var.loc[row_number,:][0]
        
        row = []
        for x1 in xs1:
            row.append(x1)
        for x2 in xs2:
            row.append(x2)
        for x3 in xs3:
            row.append(x3)
        for x4 in xs4:
            row.append(x4)
        for x5 in xs5:
            row.append(x5)
        for x6 in xs6:
            row.append(x6)
        for x7 in xs7:
            row.append(x7)
        for x8 in xs8:
            row.append(x8)
        for x9 in xs9:
            row.append(x9)
        for x10 in xs10:
            row.append(x10)
        row.append(y)
        
        return row
        
    except KeyError:
        pass


new_df2 = variables.apply(f2, axis = 1, df = variables, ys = y_var)


df2 = pd.DataFrame()
for row in new_df2.values[10:]:
    df2 = pd.concat([df2, pd.DataFrame(row).T])
    
y_vars = np.array(df2.loc[:,90])
df3 = df2.drop([90], axis=1)
xs = df3.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(xs, y_vars, test_size=0.25)

# Random Forest ML Model Based on Last 10 Days

# Builds out 1000 trees 
model = RandomForestRegressor(n_estimators=1000)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [200]:
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,1.0,0.0,-1.084598,0.140257,0.700466,0.678465,0.735682,0.713167,0.451718,0.0,...,1.0,0.0,-0.903058,0.593196,0.682181,0.644673,0.660747,0.678827,-0.721690,1.0
0,0.0,1.0,-0.903058,-0.291997,0.707764,0.717920,0.715643,0.729689,-0.229869,1.0,...,1.0,0.0,-1.084598,-0.450960,0.657901,0.662593,0.658776,0.680582,-0.793048,0.0
0,1.0,0.0,-0.963571,-0.124322,0.733013,0.748918,0.719111,0.756894,-0.633023,1.0,...,0.0,0.0,-1.084598,1.507778,0.646506,0.577684,0.620560,0.576225,-0.286309,1.0
0,1.0,1.0,-0.963571,-0.186384,0.782243,0.796049,0.771306,0.795585,-0.491689,1.0,...,1.0,1.0,-0.842545,0.997134,0.657045,0.711289,0.694218,0.681460,-0.494548,0.0
0,1.0,0.0,-1.145111,-0.389988,0.808161,0.804357,0.799338,0.818097,-0.460842,1.0,...,0.0,1.0,-0.963571,1.440274,0.767906,0.701453,0.784483,0.736061,-0.482264,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1.0,1.0,-0.842545,-0.050285,1.123539,1.108264,1.090355,1.139828,-1.264503,1.0,...,0.0,1.0,-1.024084,0.575771,1.078815,1.118845,1.084880,1.108845,-1.178335,0.0
0,1.0,0.0,-1.024084,0.048799,1.107154,1.129165,1.091085,1.109570,-1.188148,1.0,...,1.0,1.0,-1.205624,0.528954,1.105813,1.070709,1.071010,1.114569,-1.256931,0.0
0,1.0,1.0,-1.145111,-0.100365,1.141004,1.127452,1.108057,1.148642,-1.181770,1.0,...,1.0,1.0,-1.024084,0.162031,1.076990,1.054427,1.039984,1.077672,-1.235608,1.0
0,1.0,1.0,0.186181,-0.067703,1.134487,1.152526,1.112620,1.173138,-1.242193,1.0,...,0.0,1.0,-0.297925,-0.466205,1.056509,1.060873,1.041444,1.089729,-1.262528,1.0


In [196]:
c = 0
w = 0
for prediction, actual in zip(model.predict(X_test), y_test):
    if prediction >=.5:
        prediction = 1
    else:
        prediction = 0
    
    if prediction - actual == 0:
        c += 1
    else:
        w += 1

print("10 Day Accuracy:", c / (w + c))

3 Day Accuracy: 0.5060240963855421


In [None]:





# Now 20 Day Moving Window





In [202]:
def f3(row, df, ys):
    row_number = int(row.name)

    try:
        row = []
        for i in range(1,21):
            xs = df.loc[row_number-i,:]
            for x in xs:
                row.append(x)

        y = y_var.loc[row_number,:][0]
        row.append(y)
        return row
        
    except KeyError:
        pass


new_df2 = variables.apply(f3, axis = 1, df = variables, ys = y_var)


df2 = pd.DataFrame()
for row in new_df2.values[20:]:
    df2 = pd.concat([df2, pd.DataFrame(row).T])

In [203]:
y_vars = np.array(df2.loc[:,180])
df3 = df2.drop([180], axis=1)
xs = df3.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(xs, y_vars, test_size=0.25)

# Random Forest ML Model Based on Last 10 Days

# Builds out 1000 trees 
model = RandomForestRegressor(n_estimators=1000)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [204]:
c = 0
w = 0
for prediction, actual in zip(model.predict(X_test), y_test):
    if prediction >=.5:
        prediction = 1
    else:
        prediction = 0
    
    if prediction - actual == 0:
        c += 1
    else:
        w += 1

print("20 Day Accuracy:", c / (w + c))

20 Day Accuracy: 0.5625


In [None]:




# Now 50 Day Moving Window





In [207]:
def f4(row, df, ys):
    row_number = int(row.name)

    try:
        row = []
        for i in range(1,51):
            xs = df.loc[row_number-i,:]
            for x in xs:
                row.append(x)

        y = y_var.loc[row_number,:][0]
        row.append(y)
        return row
        
    except KeyError:
        pass


new_df2 = variables.apply(f4, axis = 1, df = variables, ys = y_var)


df2 = pd.DataFrame()
for row in new_df2.values[50:]:
    df2 = pd.concat([df2, pd.DataFrame(row).T])

In [208]:
y_vars = np.array(df2.loc[:,450])
df3 = df2.drop([450], axis=1)
xs = df3.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(xs, y_vars, test_size=0.25)

# Random Forest ML Model Based on Last 10 Days

# Builds out 1000 trees 
model = RandomForestRegressor(n_estimators=1000)
model.fit(X_train, y_train)


c = 0
w = 0
for prediction, actual in zip(model.predict(X_test), y_test):
    if prediction >=.5:
        prediction = 1
    else:
        prediction = 0
    
    if prediction - actual == 0:
        c += 1
    else:
        w += 1

print("20 Day Accuracy:", c / (w + c))

20 Day Accuracy: 0.5068493150684932


In [None]:

# next steps:
  # test with 1 day back here.  if still perect, do own creation get numbers.
  # game plan, try this technique on other data, etc.
  # Deep Learning technique (done for year)
    


