In [2]:
!pip install xgboost
!pip install hvplot

Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 828 kB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2
Collecting hvplot
  Downloading hvplot-0.8.1-py2.py3-none-any.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 568 kB/s eta 0:00:01
Collecting holoviews>=1.11.0
  Downloading holoviews-1.15.1-py2.py3-none-any.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 559 kB/s eta 0:00:01
Collecting colorcet>=2
  Downloading colorcet-3.0.1-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 1.1 MB/s eta 0:00:01
Collecting pyviz-comms>=0.7.4
  Downloading pyviz_comms-2.2.1-py2.py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 799 kB/s eta 0:00:01
[?25hCollecting panel>=0.13.1
  Downloading panel-0.14.0-py2.py3-none-any.whl (17.3 MB)
[K     |███████

In [9]:
import numpy as np
import pandas as pd

from xgboost import XGBRegressor
%matplotlib inline
from sklearn import metrics

In [17]:
#Load APPL.csv, using its "Date" column (parsed as dates) as the index.
#Per the original note the file holds open, high, low, close, Adj close and Volume of Apple stock
#together with twitter polarity scores and twitter volumes.
#NOTE(review): the recorded run of this cell ended in FileNotFoundError for this relative path - confirm where the file lives.
df = pd.read_csv(
    '../Desktop/Twitter-and-Stock-prices/APPL.csv',
    index_col="Date",
    parse_dates=True,
    infer_datetime_format=True,
)

#Discard every row that holds a null value, then preview the last rows
df = df.dropna()
df.tail()

FileNotFoundError: [Errno 2] File ../Desktop/Twitter-and-Stock-prices/APPL.csv does not exist: '../Desktop/Twitter-and-Stock-prices/APPL.csv'

In [None]:
#Keep only the modelling columns: Adj Close, ts_polarity, twitter_volume.
#.copy() makes the selection an independent frame, so adding columns to it
#afterwards does not raise pandas' SettingWithCopyWarning.
df = df[["Adj Close", "ts_polarity", "twitter_volume"]].copy()
df.head()

In [None]:
#Add the percentage change of "Adj Close" between consecutive rows as "Pct_change".
#assign() returns a new frame instead of writing a column into a selection in place,
#which avoids pandas' SettingWithCopyWarning.
df = df.assign(Pct_change=df["Adj Close"].pct_change())

#Drop rows with null values (pct_change has no value for the first row)
df = df.dropna()
df.head()

In [None]:
#Creation of vectors X and y
#X = input features / y = target vector

def window_data(df, window, feature_col_number1, feature_col_number2, feature_col_number3, target_col_number):
    """Build sliding-window feature rows and next-step targets from a DataFrame.

    For every start position ``i`` the ``window`` consecutive values of each of
    the three feature columns (rows ``i`` .. ``i + window - 1``) are laid side
    by side to form one row of ``X``; the matching target is the value of the
    target column at row ``i + window``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; columns are addressed by integer position.
    window : int
        Number of consecutive rows in each sample. Must be at least 1.
    feature_col_number1, feature_col_number2, feature_col_number3 : int
        Positional indices of the three feature columns. Their windows appear
        in this order inside each row of ``X``.
    target_col_number : int
        Positional index of the column to predict.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_samples, 3 * window)``.
    y : numpy.ndarray
        Shape ``(n_samples, 1)``.

    ``n_samples`` is ``max(len(df) - window, 0)``; when the frame is too short
    for a single window both arrays are empty but still two-dimensional.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    n_samples = max(len(df) - window, 0)

    feature_blocks = []
    for col_number in (feature_col_number1, feature_col_number2, feature_col_number3):
        #Slice a plain numpy column once instead of calling df.iloc on every iteration
        column = df.iloc[:, col_number].to_numpy()
        rows = [column[start:start + window] for start in range(n_samples)]
        #reshape keeps the block 2-D even when there are no samples at all
        feature_blocks.append(np.array(rows, dtype=column.dtype).reshape(n_samples, window))

    #Targets are the values that directly follow each window
    X = np.hstack(feature_blocks)
    y = np.array(df.iloc[window:, target_col_number]).reshape(-1, 1)
    return X, y

In [None]:
#Build the feature matrix X and the target vector y with window_data().
#Column positions follow the frame prepared earlier in the notebook:
#0 = "Adj Close", 1 = "ts_polarity", 2 = "twitter_volume".
#NOTE(review): no visible cell defines X and y, so this call was added. The window
#length (3) and the use of "Adj Close" as the target are assumptions - confirm them.
window_size = 3
feature_col_number1 = 0
feature_col_number2 = 1
feature_col_number3 = 2
target_col_number = 0
X, y = window_data(df, window_size, feature_col_number1, feature_col_number2, feature_col_number3, target_col_number)

#Use 70% of the data for training and 30% for testing.
#The rows are split in order (no shuffling), so the test set is the most recent 30%.
X_split = int(0.7 * len(X))
y_split = int(0.7 * len(y))

#Set X_train, X_test, y_train, y_test
X_train = X[:X_split]
X_test = X[X_split:]
y_train = y[:y_split]
y_test = y[y_split:]

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
#Use MinMaxScaler to scale data between 0 and 1.
#The scalers are fitted on the training data only and reused for the test data:
#fitting separate scalers on the test set would put the test features and targets
#on a different scale from the one the model is trained on (and leak test statistics).
x_train_scaler = MinMaxScaler()
y_train_scaler = MinMaxScaler()

#Fit the scalers on the training data and scale it
X_train = x_train_scaler.fit_transform(X_train)
y_train = y_train_scaler.fit_transform(y_train)

#Keep the *_test_scaler names that later cells use, pointing at the training-fitted scalers
x_test_scaler = x_train_scaler
y_test_scaler = y_train_scaler

#Scale the test data with the training-fitted scalers
#(test values outside the training range land outside [0, 1])
X_test = x_test_scaler.transform(X_test)
y_test = y_test_scaler.transform(y_test)

In [None]:
#Instantiate the XGBoost regressor with a squared-error objective and n_estimators=1000
model = XGBRegressor(
    n_estimators=1000,
    objective="reg:squarederror",
)

In [None]:
#Train the regressor on the scaled training data; the target is flattened to 1-D first
model.fit(X_train, y_train.reshape(-1))

In [None]:
#Predict the target for every row of the scaled test features.
#The model was fitted on scaled targets, so these predictions are in scaled units too.
predicted = model.predict(X_test)

In [None]:
#Score the predictions against the scaled test targets
rmse = np.sqrt(metrics.mean_squared_error(y_test, predicted))
r_squared = metrics.r2_score(y_test, predicted)

print("Root Mean Squared Error: ", rmse)
print("R-squared: ", r_squared)

In [None]:
#Recover the original prices instead of the scaled version.
#The model was trained on targets scaled by y_train_scaler, so its predictions are
#mapped back with that scaler; the test targets are mapped back with the scaler
#that was used to scale them.
predicted_prices = y_train_scaler.inverse_transform(predicted.reshape(-1, 1))
real_prices = y_test_scaler.inverse_transform(y_test.reshape(-1, 1))

In [None]:
#Create a dataframe of Real and Predicted values, indexed by the last len(real_prices) entries of df.index.
#The slice start is computed explicitly because df.index[-0:] would return the whole index.
stocks = pd.DataFrame({
    "Real": real_prices.ravel(),
    "Predicted": predicted_prices.ravel(),
}, index=df.index[len(df) - len(real_prices):])
stocks.head()

In [None]:
#Importing hvplot.pandas registers the .hvplot accessor on pandas objects;
#without it stocks.hvplot raises AttributeError.
import hvplot.pandas  # noqa: F401

#Plot the real vs predicted values as a line chart.
#The title names Apple because the data is loaded from APPL.csv (it previously said TSLA).
stocks.hvplot(title = "Real vs Predicted values of AAPL")