In [9]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

import os
from graphs import *


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/oscarpinon/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [10]:
# import prices

prices=pd.read_csv('datasets/values-of-top-nasdaq-copanies-from-2010-to-2020/CompanyValues.csv')
# format date
prices['day_date'] = pd.to_datetime(prices['day_date'], format="%Y-%m-%d").dt.date.astype('datetime64[ns]')
prices=prices.sort_values(by=['day_date']).reset_index()
# rename date column for consistency
prices=prices.rename(columns={"day_date": "date"})



In [11]:
# import tweets
tweets=pd.read_csv('datasets/tweets-about-the-top-companies-from-2015-to-2020/Tweet.csv')
company_tweet=pd.read_csv('datasets/tweets-about-the-top-companies-from-2015-to-2020/Company_Tweet.csv')

tweets=tweets.merge(company_tweet,how='left',on='tweet_id')
# format dates
tweets['date'] = pd.to_datetime(tweets['post_date'], unit='s').dt.date
tweets.date=pd.to_datetime( tweets.date,errors='coerce')
tweets['time'] = pd.to_datetime(tweets['post_date'], unit='s').dt.time


In [12]:
# augment vocab
sia = SentimentIntensityAnalyzer()


positive_words='buy bull long support undervalued underpriced cheap upward rising trend moon rocket hold breakout call beat support buying holding high profit'
negative_words='sell bear bubble bearish short overvalued overbought overpriced expensive downward falling sold sell low put miss resistance squeeze cover seller '

#MEJORAR
financial_lingo_value = 4


dictOfpos = { i : financial_lingo_value for i in positive_words.split(" ") }
dictOfneg = { i : -financial_lingo_value for i in negative_words.split(" ")  }
Financial_Lexicon = {**dictOfpos, **dictOfneg}

sia.lexicon.update(Financial_Lexicon)

In [None]:
df = tweets

df.loc[:,('score')]=df.loc[:,'body'].apply(lambda x: sia.polarity_scores(x)['compound'])
df.loc[:,('label')]=pd.cut(np.array(df.loc[:,'score']),bins=[-1, -0.33, 0.33, 1],right=True ,labels=["bad", "neutral", "good"])

df.to_pickle("my_data.pkl")

In [13]:
#loading df

df= pd.read_pickle("my_data.pkl")

In [14]:
df

Unnamed: 0,tweet_id,writer,post_date,body,comment_num,retweet_num,like_num,ticker_symbol,date,time,score,label
0,550441509175443456,VisualStockRSRC,1420070457,"lx21 made $10,008 on $AAPL -Check it out! htt...",0,0,1,AAPL,2015-01-01,00:00:57,0.0000,neutral
1,550441672312512512,KeralaGuy77,1420070496,Insanity of today weirdo massive selling. $aap...,0,0,0,AAPL,2015-01-01,00:01:36,-0.8271,bad
2,550441732014223360,DozenStocks,1420070510,S&P100 #Stocks Performance $HD $LOW $SBUX $TGT...,0,0,0,AMZN,2015-01-01,00:01:50,0.0000,neutral
3,550442977802207232,ShowDreamCar,1420070807,$GM $TSLA: Volkswagen Pushes 2014 Record Recal...,0,0,1,TSLA,2015-01-01,00:06:47,0.0000,neutral
4,550443807834402816,i_Know_First,1420071005,Swing Trading: Up To 8.91% Return In 14 Days h...,0,0,1,AAPL,2015-01-01,00:10:05,0.0000,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...
4336440,1212159838882533376,ShortingIsFun,1577836401,In 2020 I may start Tweeting out positive news...,0,0,1,TSLA,2019-12-31,23:53:21,-0.5423,bad
4336441,1212160015332728833,Commuternyc,1577836443,Patiently Waiting for the no twitter sitter tw...,0,0,5,TSLA,2019-12-31,23:54:03,-0.2960,neutral
4336442,1212160410692046849,MoriaCrypto,1577836537,I don't discriminate. I own both $aapl and $ms...,1,0,1,AAPL,2019-12-31,23:55:37,0.0000,neutral
4336443,1212160410692046849,MoriaCrypto,1577836537,I don't discriminate. I own both $aapl and $ms...,1,0,1,MSFT,2019-12-31,23:55:37,0.0000,neutral


In [15]:
# Aggregating the tweets data by date and ticker_symbol
tweets_agg = df.groupby(['date', 'ticker_symbol']).agg({
    'tweet_id': 'count',
    'comment_num': 'sum',
    'retweet_num': 'sum',
    'like_num': 'sum',
    'score': 'mean'
}).rename(columns={'tweet_id': 'tweet_count', 'score': 'avg_sentiment_score'}).reset_index()

# Display the aggregated dataframe

tweets_agg

Unnamed: 0,date,ticker_symbol,tweet_count,comment_num,retweet_num,like_num,avg_sentiment_score
0,2015-01-01,AAPL,299,51,1250,398,0.145556
1,2015-01-01,AMZN,131,45,699,192,-0.159267
2,2015-01-01,GOOG,60,2,12,22,0.184912
3,2015-01-01,GOOGL,45,1,3,30,0.189336
4,2015-01-01,MSFT,54,2,19,51,0.121148
...,...,...,...,...,...,...,...
10943,2019-12-31,AMZN,474,159,231,1023,0.209775
10944,2019-12-31,GOOG,114,14,27,104,0.155536
10945,2019-12-31,GOOGL,120,30,53,203,0.125038
10946,2019-12-31,MSFT,218,62,244,621,0.211606


In [16]:
# Convert date to datetime in prices dataframe
prices['date'] = pd.to_datetime(prices['date'])

# Merge prices with aggregated tweets data
merged_df = prices.merge(tweets_agg, how='left', on=['date', 'ticker_symbol'])

# Fill NaN values resulting from the merge
merged_df.fillna({'tweet_count': -1, 'comment_num': 0, 'retweet_num': 0, 'like_num': 0, 'avg_sentiment_score': 0}, inplace=True)

# Display the merged dataframe

df = merged_df[merged_df['tweet_count'] != -1]

df


Unnamed: 0,index,ticker_symbol,date,close_value,volume,open_value,high_value,low_value,tweet_count,comment_num,retweet_num,like_num,avg_sentiment_score
5954,13895,MSFT,2015-01-01,46.45,21551090,46.73,47.44,46.450,54.0,2.0,19.0,51.0,0.121148
5955,2517,AAPL,2015-01-01,110.38,41304780,112.82,113.13,110.210,299.0,51.0,1250.0,398.0,0.145556
5956,16960,TSLA,2015-01-01,222.41,2392947,223.09,225.68,222.250,99.0,25.0,47.0,95.0,0.138119
5957,10810,GOOG,2015-01-01,526.40,1367110,531.25,532.60,525.800,60.0,2.0,12.0,22.0,0.184912
5958,5602,AMZN,2015-01-01,310.35,2048676,311.55,312.98,310.010,131.0,45.0,699.0,192.0,-0.159267
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16905,3188,AMZN,2019-12-31,1847.84,2510380,1842.00,1853.26,1832.230,474.0,159.0,231.0,1023.0,0.209775
16906,6273,GOOGL,2019-12-31,1339.39,976061,1335.79,1340.66,1332.130,120.0,30.0,53.0,203.0,0.125038
16907,9358,GOOG,2019-12-31,1337.02,962468,1330.11,1338.00,1329.085,114.0,14.0,27.0,104.0,0.155536
16908,11481,MSFT,2019-12-31,157.70,18393380,156.77,157.77,156.450,218.0,62.0,244.0,621.0,0.211606


In [17]:
df['daily_return'] = (merged_df['close_value'] - merged_df['open_value']) / merged_df['open_value']
df['volatility'] = (merged_df['high_value'] - merged_df['low_value']) / merged_df['open_value']

df

Unnamed: 0,index,ticker_symbol,date,close_value,volume,open_value,high_value,low_value,tweet_count,comment_num,retweet_num,like_num,avg_sentiment_score,daily_return,volatility
5954,13895,MSFT,2015-01-01,46.45,21551090,46.73,47.44,46.450,54.0,2.0,19.0,51.0,0.121148,-0.005992,0.021186
5955,2517,AAPL,2015-01-01,110.38,41304780,112.82,113.13,110.210,299.0,51.0,1250.0,398.0,0.145556,-0.021627,0.025882
5956,16960,TSLA,2015-01-01,222.41,2392947,223.09,225.68,222.250,99.0,25.0,47.0,95.0,0.138119,-0.003048,0.015375
5957,10810,GOOG,2015-01-01,526.40,1367110,531.25,532.60,525.800,60.0,2.0,12.0,22.0,0.184912,-0.009129,0.012800
5958,5602,AMZN,2015-01-01,310.35,2048676,311.55,312.98,310.010,131.0,45.0,699.0,192.0,-0.159267,-0.003852,0.009533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16905,3188,AMZN,2019-12-31,1847.84,2510380,1842.00,1853.26,1832.230,474.0,159.0,231.0,1023.0,0.209775,0.003170,0.011417
16906,6273,GOOGL,2019-12-31,1339.39,976061,1335.79,1340.66,1332.130,120.0,30.0,53.0,203.0,0.125038,0.002695,0.006386
16907,9358,GOOG,2019-12-31,1337.02,962468,1330.11,1338.00,1329.085,114.0,14.0,27.0,104.0,0.155536,0.005195,0.006702
16908,11481,MSFT,2019-12-31,157.70,18393380,156.77,157.77,156.450,218.0,62.0,244.0,621.0,0.211606,0.005932,0.008420


In [18]:
%pip install scikit-learn
import sklearn
features = ['volume', 'volatility', 'tweet_count', 'comment_num', 'retweet_num', 'like_num', 'avg_sentiment_score']
X = df[features]
y = df['daily_return']  # Adjust target variable as needed

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Note: you may need to restart the kernel to use updated packages.
Mean Squared Error: 0.0001669807988197405


In [19]:
from sklearn.model_selection import GridSearchCV
import joblib

In [11]:

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f'Best Parameters: {best_params}')
print(f'Best Score: {best_score}')


joblib.dump(model, 'stock_price_predictor.pkl')

NameError: name 'GridSearchCV' is not defined

In [20]:
loaded_model = joblib.load('stock_price_predictor.pkl')


sample_data = X_test.iloc[52].values.reshape(1, -1)

print(sample_data)
prediction = loaded_model.predict(sample_data)
print(f'Predicted Close Value: {prediction}')

y_test.iloc[52]

[[4.46404800e+06 1.32018636e-02 1.80000000e+02 2.90000000e+01
  7.90000000e+01 3.68000000e+02 2.23290556e-01]]
Predicted Close Value: [-0.00241336]


-0.005955934320042156