In [8]:
#imports
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV


#read data using pandas
data = pd.read_csv("sp500_27270.csv")




In [9]:
#Filtering data satisfying correlation >=0.95 or <=-0.95
filtered_companies = []
correlation = data.corr()
for i in range(len(correlation)):
    if(correlation['SP500'][i]>=0.95 or correlation['SP500'][i]<=-0.95):
        filtered_companies.append(correlation['SP500'].keys().values[i])

#extracting the columns of the companies satisfying the correlation limits
filtered_data = data.filter(filtered_companies)

In [10]:
#specifying the inputs (columns of companies stock prices)
#specifying our output as the SP500 index column
Y = np.array(filtered_data['SP500'])
X = np.array(filtered_data.drop(columns=['SP500']))

#Normalizing Data (using mean and standard deviation)
X_scaled = preprocessing.scale(X)
Y_scaled = preprocessing.scale(Y)

#divide the dataset into training and testing sets in the ratio 80:20
X_train,X_test,Y_train,Y_test = train_test_split(X_scaled,Y_scaled,test_size=0.2)



In [12]:
#define the MLP regressor with max iterations
regressor = MLPRegressor(max_iter=3000)

#specifying the different parameters to perform the grid search on
param_grid = {'hidden_layer_sizes': [(15, 2), (10, 2)], 'activation': ['relu', 'tanh'], 'solver':['sgd', 'adam']}

#perform gridsearch on all possible combinations of parameters with cross validation division 
clf = GridSearchCV(regressor, param_grid=param_grid, cv=3)
clf.fit(X_train,Y_train)

#print the best architecture parameters and the score
print(clf.best_params_)
clf.score(X_test, Y_test)

{'activation': 'tanh', 'hidden_layer_sizes': (10, 2), 'solver': 'adam'}


0.9808503659585882

# TASK 2

In [13]:
#sort the dataframe by the SP500 index value descendingly
sorted_dataframe = correlation.sort_values(['SP500'], ascending=False)

#drop the value of the first row (one with the maximum SP500 value in the SP500 column) 
#since this represents the correlation of SP500 with itself. We actually looking for the most correlated company
#which will be the second highest value in the SP500 index column
sorted_dataframe = sorted_dataframe[1:]

#extract the company name (the key) of this maximum value
maximum_company = sorted_dataframe['SP500'].keys().values[0]

#extract the column of this maximum company from the data frame
maximum_company_data = data[maximum_company]

#initialize empty arrays for inputs and output
X = []
Y = []

#loop over the whole column
#in each iteration append 5 values representing days from (t-4 till t) to the input array
#and the following 4 values representing days (t+1 till t+4) to the output array
#shift the dataframe to the left by 1 step (-1) to take following values
for value in range(0,len(maximum_company_data)-9):
    x_element = list(maximum_company_data[:5])
    y_element = list(maximum_company_data[5:9])
    X.append(x_element)
    Y.append(y_element)
    maximum_company_data = maximum_company_data.shift(-1)


In [14]:
#Normalizing Data
X_scaled = preprocessing.scale(X)
Y_scaled = preprocessing.scale(Y)

#divide the dataset into training and testing sets with ratio 80:20
X_train,X_test,Y_train,Y_test = train_test_split(X_scaled,Y_scaled,test_size=0.2)

#define the MLP regressor with maximum iterations
regressor = MLPRegressor(max_iter=3000)

#specifying the different parameters to perform the grid search on
param_grid = {'hidden_layer_sizes': [(15, 2), (10, 2)], 'activation': ['relu', 'tanh'], 'solver':['sgd', 'adam']}

#perform gridsearch on all possible combinations of parameters with cross validation division
clf = GridSearchCV(regressor, param_grid=param_grid, cv=3)
clf.fit(X_train,Y_train)

#print the best architecture parameters and the score
print(clf.best_params_)
clf.score(X_test, Y_test)

{'activation': 'relu', 'hidden_layer_sizes': (10, 2), 'solver': 'sgd'}


0.974538146972305