In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the cleaned IPO dataset; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer="cleaned_ipo_data.csv")

In [4]:
# Preview the first five rows to check the columns that were read.
data.head(n=5)

Unnamed: 0,initialPrice,sector,daysBeatSP,spvalue,priceChange100Days
0,17.6,Health Care,131,1946.17,0.669318
1,9.37,Technology,125,1698.67,0.338314
2,34.91,Health Care,138,1418.55,0.295617
3,19.5,Health Care,70,2575.21,1.777436
4,22.16,Finance,126,2013.43,-0.210289


In [5]:
# Target array for the first model. Select the column by name rather than by
# position so the extraction keeps working if the CSV's column order changes.
daysBeatSP = data["daysBeatSP"].to_numpy()
daysBeatSP

array([131, 125, 138, ..., 115, 115, 104], dtype=int64)

In [6]:
# Target array for the second model. Select the column by name rather than by
# position so the extraction keeps working if the CSV's column order changes.
priceChangeAfter100 = data["priceChange100Days"].to_numpy()
priceChangeAfter100

array([ 0.66931818,  0.33831377,  0.2956173 , ...,  0.00426649,
       -0.41153846, -0.26129032])

In [7]:
# One-hot encode the sector. drop_first=True omits the first category, so a row
# with every sector_* column equal to 0 belongs to that omitted sector.
sector_dummies = pd.get_dummies(data["sector"], prefix="sector", drop_first=True)

# Feature matrix: everything except the two targets and the raw sector text,
# followed by the sector indicator columns.
feature_variables = pd.concat(
    [
        data.drop(columns=["daysBeatSP", "priceChange100Days", "sector"]),
        sector_dummies,
    ],
    axis=1,
)

feature_variables

Unnamed: 0,initialPrice,spvalue,sector_Capital Goods,sector_Consumer Durables,sector_Consumer Non-Durables,sector_Consumer Services,sector_Energy,sector_Finance,sector_Health Care,sector_Miscellaneous,sector_Public Utilities,sector_Technology,sector_Transportation
0,17.60,1946.17,0,0,0,0,0,0,1,0,0,0,0
1,9.37,1698.67,0,0,0,0,0,0,0,0,0,1,0
2,34.91,1418.55,0,0,0,0,0,0,1,0,0,0,0
3,19.50,2575.21,0,0,0,0,0,0,1,0,0,0,0
4,22.16,2013.43,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1233,219.80,2029.55,0,0,0,0,0,0,1,0,0,0,0
1234,16.50,2133.04,0,0,0,0,0,0,0,0,0,0,1
1235,30.47,1513.17,0,0,0,0,0,0,1,0,0,0,0
1236,13.00,2384.20,0,0,0,0,0,0,1,0,0,0,0


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Both targets are split with the
# same options, including the same seed.
split_options = {"test_size": 0.2, "random_state": 0}

# Split for the daysBeatSP target.
X_train_SP, X_test_SP, y_train_SP, y_test_SP = train_test_split(
    feature_variables, daysBeatSP, **split_options
)

# Split for the priceChange100Days target.
X_train_100, X_test_100, y_train_100, y_test_100 = train_test_split(
    feature_variables, priceChangeAfter100, **split_options
)

In [9]:
# NOTE: StandardScaler is imported here but not used in this cell.
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Fit a 500-tree random forest on the daysBeatSP training split
# (fit() returns the estimator itself, so the call can be chained).
regressor_SP = RandomForestRegressor(n_estimators=500, random_state=0).fit(
    X_train_SP, y_train_SP
)
y_pred_SP = regressor_SP.predict(X_test_SP)

# Root-mean-squared error on the held-out rows.
mse_SP = mean_squared_error(y_test_SP, y_pred_SP)
np.sqrt(mse_SP)

24.118536711350757

In [10]:
# Fit a 500-tree random forest on the priceChange100Days training split
# (fit() returns the estimator itself, so the call can be chained).
regressor_100 = RandomForestRegressor(n_estimators=500, random_state=0).fit(
    X_train_100, y_train_100
)
y_pred_100 = regressor_100.predict(X_test_100)

# Root-mean-squared error on the held-out rows.
mse_100 = mean_squared_error(y_test_100, y_pred_100)
np.sqrt(mse_100)

0.5018243080392298

In [18]:
# Description of the IPO to score; every value in the dict is a string.
a = {"Month":"2021-07","Name":"Coinbase","Price":"55","Shares":"5555555","sector":"Consumer Durables"}
price = a['Price']
spvalue = 3974.54
# Name of the one-hot column that corresponds to this IPO's sector.
sector = "sector_"+str(a['sector'])


def build_argument_df(price, spvalue, sector, feature_columns):
    """Build a one-row feature frame laid out like the training features.

    Parameters
    ----------
    price : str or float
        Value for the ``initialPrice`` column; converted with ``float``.
    spvalue : str or float
        Value for the ``spvalue`` column; converted with ``float``.
    sector : str
        Name of the sector indicator column to set to 1 (for example
        ``"sector_Consumer Durables"``).
    feature_columns : iterable of str
        Column names, in order, that the returned frame must have.

    Returns
    -------
    pandas.DataFrame
        A single row with exactly ``feature_columns`` as columns and float
        values: ``initialPrice`` and ``spvalue`` hold the given numbers, the
        column named ``sector`` holds 1.0, and every other column holds 0.0.

    Raises
    ------
    ValueError
        If ``price`` or ``spvalue`` cannot be converted to ``float``.

    Notes
    -----
    If ``sector`` matches none of ``feature_columns`` the indicator columns
    are all left at 0.0. With ``drop_first=True`` dummies that is also the
    encoding of the omitted category, so no error is raised for it.
    """
    # Start from an all-zero row, then overwrite the cells that apply.
    row = dict.fromkeys(feature_columns, 0.0)
    if sector in row:
        row[sector] = 1.0
    if "initialPrice" in row:
        row["initialPrice"] = float(price)
    if "spvalue" in row:
        row["spvalue"] = float(spvalue)
    # Build the frame in one step; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame([row], columns=list(feature_columns))


argument_df = build_argument_df(price, spvalue, sector, feature_variables.columns)

# A bare `return` is a SyntaxError at cell level; end with the value to display it.
argument_df

Unnamed: 0,initialPrice,spvalue,sector_Capital Goods,sector_Consumer Durables,sector_Consumer Non-Durables,sector_Consumer Services,sector_Energy,sector_Finance,sector_Health Care,sector_Miscellaneous,sector_Public Utilities,sector_Technology,sector_Transportation
0,55.0,3974.54,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
