__Support Vector Machine Implementation for Predicting Stock Price Movement__

In [1]:
# Implementing Support Vector machines model on dataset for stock price prediction
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import yfinance as yf
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Importing the dataset
#df = pd.read_csv('SP500_stock_dataset_PEandMarketCap.csv', delimiter=',')
df = pd.read_csv('SP500_stock_prices.csv', delimiter=',')
# Create a new column 'Price Movement'
df['Price Movement'] = np.where(df['Close'] > df['Close'].shift(1), 1, 0)

df.head()



Unnamed: 0,Date,Ticker,Open,Low,High,Close,Volume,Sector,Price Movement
0,2016-01-04,MMM,148.050003,145.399994,148.320007,146.820007,3277200,Industrials,0
1,2016-01-05,MMM,146.820007,145.610001,147.5,147.460007,2688100,Industrials,1
2,2016-01-06,MMM,145.589996,143.419998,145.759995,144.490005,2997100,Industrials,0
3,2016-01-07,MMM,142.520004,140.630005,143.130005,140.970001,3553500,Industrials,0
4,2016-01-08,MMM,141.360001,140.220001,142.5,140.490005,2664000,Industrials,0


In [2]:
# Make date column the index
df = df.set_index('Date')



In [3]:
# only display year, month and day in index
df.index = pd.to_datetime(df.index).date
data = df
data

Unnamed: 0,Ticker,Open,Low,High,Close,Volume,Sector,Price Movement
2016-01-04,MMM,148.050003,145.399994,148.320007,146.820007,3277200,Industrials,0
2016-01-05,MMM,146.820007,145.610001,147.500000,147.460007,2688100,Industrials,1
2016-01-06,MMM,145.589996,143.419998,145.759995,144.490005,2997100,Industrials,0
2016-01-07,MMM,142.520004,140.630005,143.130005,140.970001,3553500,Industrials,0
2016-01-08,MMM,141.360001,140.220001,142.500000,140.490005,2664000,Industrials,0
...,...,...,...,...,...,...,...,...
2020-03-26,ZTS,109.510002,109.510002,118.690002,117.910004,4159000,Health Care,1
2020-03-27,ZTS,114.519997,111.040001,117.779999,111.769997,2870100,Health Care,0
2020-03-30,ZTS,113.629997,112.430000,117.750000,116.680000,2031900,Health Care,1
2020-03-31,ZTS,115.680000,113.900002,120.250000,117.690002,3956000,Health Care,1


In [4]:


print(data['Sector'].unique())
# assign each different Sector string its own number
data['Sector'] = data['Sector'].map({'Consumer Discretionary': 0, 'Consumer Staples': 1, 'Energy': 2, 'Financials': 3, 'Health Care': 4, 'Industrials': 5, 'Information Technology': 6, 'Materials': 7, 'Real Estate': 8, 'Communication Services': 9, 'Utilities': 10})

['Industrials' 'Health Care' 'Information Technology'
 'Communication Services' 'Consumer Staples' 'Consumer Discretionary'
 'Utilities' 'Financials' 'Materials' 'Real Estate' 'Energy']


In [5]:
data
# save data as csv file
data.to_csv('dummy.csv', index=True)


In [6]:
# Make the tickers to categoricals, but keep the original order of the tickers
data['Ticker'] = pd.Categorical(data['Ticker'], categories=data['Ticker'].unique(), ordered=True)
data['Ticker'] = data['Ticker'].cat.codes
data

Unnamed: 0,Ticker,Open,Low,High,Close,Volume,Sector,Price Movement
2016-01-04,0,148.050003,145.399994,148.320007,146.820007,3277200,5,0
2016-01-05,0,146.820007,145.610001,147.500000,147.460007,2688100,5,1
2016-01-06,0,145.589996,143.419998,145.759995,144.490005,2997100,5,0
2016-01-07,0,142.520004,140.630005,143.130005,140.970001,3553500,5,0
2016-01-08,0,141.360001,140.220001,142.500000,140.490005,2664000,5,0
...,...,...,...,...,...,...,...,...
2020-03-26,497,109.510002,109.510002,118.690002,117.910004,4159000,4,1
2020-03-27,497,114.519997,111.040001,117.779999,111.769997,2870100,4,0
2020-03-30,497,113.629997,112.430000,117.750000,116.680000,2031900,4,1
2020-03-31,497,115.680000,113.900002,120.250000,117.690002,3956000,4,1


In [7]:
# Calculate daily returns
returns = df['Close'].pct_change()

# Calculate volatility
volatility = returns.std()

In [8]:
returns

2016-01-04         NaN
2016-01-05    0.004359
2016-01-06   -0.020141
2016-01-07   -0.024362
2016-01-08   -0.003405
                ...   
2020-03-26    0.089641
2020-03-27   -0.052074
2020-03-30    0.043930
2020-03-31    0.008656
2020-04-01   -0.049027
Name: Close, Length: 523991, dtype: float64

In [9]:
volatility

0.08349674045972443

In [1]:
# Splitting the dataset into the Training set and Test set according to date
# Define the specific date to split the DataFrame
split_date = pd.to_datetime('2020-01-01')


# Split the DataFrame into training and test sets based on the specific date
train = data.loc[data.index < split_date]
test = data.loc[data.index >= split_date]
# Split the data into input (X) and output (y) variables
X_train = train[['Ticker','Open', 'Low', 'High', 'Volume', 'Sector']]
y_train = train['Price Movement']
X_test = test[['Ticker', 'Open', 'Low', 'High', 'Volume', 'Sector']]
y_test = test['Price Movement']

NameError: name 'pd' is not defined

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# # Split the data into training and testing sets
# X = data[['Open', 'Low', 'High', 'Volume', 'Sector']]
# y = data['Price Movement']
# #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, shuffle=False)


# Standardize the input features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the SVM model
model = SVC(kernel='rbf') # linear, rbf, poly, sigmoid
model.fit(X_train, y_train)

# Make a prediction using the trained SVM model
y_pred = model.predict(X_test)

# Evaluate the performance of the model
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy}")



Accuracy: 0.5855827043622873


In [78]:
# print entire array of predictions next to actual values
predictions = np.concatenate((y_pred.reshape(len(y_pred),1), y_test.values.reshape(len(y_test),1)),1)
# save predictions as csv file
np.savetxt('predictions.csv', np.concatenate((y_pred.reshape(len(y_pred),1), y_test.values.reshape(len(y_test),1)),1), delimiter=',', fmt='%d')

31268


In [111]:
# load predictions from csv file
predictions = pd.read_csv('predictions.csv', delimiter=',', header=None)
# calculate the total number of unique stocks
# Splitting the dataset into the Training set and Test set according to date
# Load the dataset without categorical tickers
data = pd.read_csv('dummy.csv')
# Convert the index column to datetime object
data['Unnamed: 0'] = pd.to_datetime(data['Unnamed: 0'])
# Set the date as the index
data = data.set_index('Unnamed: 0')
# rename the index to 'Date'
data.index.names = ['Date']
# Define the specific date to split the DataFrame
split_date = pd.to_datetime('2020-01-01')

# Split the DataFrame into training and test sets based on the specific date
#train = data.loc[data.index < split_date]
test = data.loc[data.index >= split_date]
# Split the data into input (X) and output (y) variables
X_train = train[['Ticker','Open', 'Low', 'High', 'Volume', 'Sector']]
y_train = train['Price Movement']
X_test = test[['Ticker', 'Open', 'Low', 'High', 'Volume', 'Sector']]
y_test = test['Price Movement']
# Extract the stock tickers from the X_test dataset
tickers_test = X_test['Ticker']
# convert the index to a column
tickers_test = tickers_test.reset_index(drop=False)
# Create a new dataframe with the tickers, dates and the predictions
#predictions_df = pd.DataFrame({'Date': tickers_test['Date'], 'Ticker': tickers_test['Ticker'], 'Prediction': predictions[0], 'Actual': predictions[1]})
predictions_df = pd.DataFrame({'Ticker': tickers_test['Ticker'], 'Prediction': predictions[0]})
# Count the number of predictions with value 1 for each ticker
predictions_df = predictions_df.groupby('Ticker').sum()
# sort the dataframe by the number of predictions with value 1
predictions_df = predictions_df.sort_values(by=['Prediction'], ascending=False)
predictions_df
# Only keep the tickers with at least 63 predictions with value 1
predictions_df = predictions_df[predictions_df['Prediction'] >= 63]

# save predictions as csv file
predictions_df.to_csv('predictions_df.csv', index=True)
# Choose 10 random stocks with at least prediction sum 63 and make a portfolio

portfolio = predictions_df.sample(n=10, random_state=42)
portfolio

Unnamed: 0_level_0,Prediction
Ticker,Unnamed: 1_level_1
AMD,63
FOXA,63
AES,63
ATVI,63
NWL,63
WY,63
ADM,63
BWA,63
SEE,63
IPG,63


In [118]:
# make portfolio as equally weighted portfolio
portfolio['Weight'] = 1/10
portfolio
# list with the start and end price for each stock in the portfolio
start_price = [{'AMD': 48.03, 'FOXA': 37.41, 'AES': 19.97, 'ATVI': 59.91, 'NWL': 19.43, 'WY': 30.21, 'ADM': 46.57, 'BWA': 44.00, 'SEE': 40.03, 'IPG': 23.27}]
end_price = [{'AMD': 43.66, 'FOXA': 22.34, 'AES': 12.80, 'ATVI': 57.50, 'NWL': 12.38, 'WY': 15.87, 'ADM': 33.87, 'BWA': 22.66, 'SEE': 24.52, 'IPG': 14.54}]
# calculate the return for each of the stocks in the portfolio
return_list = []
for i in range(0,10):
    return_list.append((end_price[0][portfolio.index[i]] - start_price[0][portfolio.index[i]])/start_price[0][portfolio.index[i]])
# calculate the return of the portfolio
portfolio_return = (sum(return_list)/10)*100
print('Our optimal portfolio return during the test period: ', portfolio_return, '%')
print('In the same period the SP500 index had a return of: ',-19.1,'%')

Our optimal portfolio return during the test period:  -32.50930368844083 %
In the same period the SP500 index had a return of:  -19.1 %
