In [None]:
# data from here: https://www.kaggle.com/cnic92/200-financial-indicators-of-us-stocks-20142018?select=2015_Financial_Data.csv

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [2]:
df1v1 = pd.read_csv(os.path.join("..", "2014_Financial_Data.csv"))
df2v1 = pd.read_csv(os.path.join("..", "2015_Financial_Data.csv"))

In [3]:
df1v1

Unnamed: 0.1,Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,...,Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Sector,2015 PRICE VAR [%],Class
0,PG,7.440100e+10,-0.0713,3.903000e+10,3.537100e+10,0.000000e+00,2.146100e+10,2.146100e+10,1.391000e+10,7.090000e+08,...,-0.0187,-0.0217,0.0359,0.0316,0.1228,0.0000,-0.1746,Consumer Defensive,-9.323276,0
1,VIPS,3.734148e+09,1.1737,2.805625e+09,9.285226e+08,1.083303e+08,3.441414e+08,7.939267e+08,1.345959e+08,1.214869e+07,...,,,,,,1.6484,1.7313,Consumer Defensive,-25.512193,0
2,KR,9.837500e+10,0.0182,7.813800e+10,2.023700e+10,0.000000e+00,1.519600e+10,1.751200e+10,2.725000e+09,4.430000e+08,...,0.0618,0.0981,0.1886,0.3268,0.2738,0.0000,0.0234,Consumer Defensive,33.118297,1
3,RAD,2.552641e+10,0.0053,1.820268e+10,7.323734e+09,0.000000e+00,6.561162e+09,6.586482e+09,7.372520e+08,4.245910e+08,...,0.0211,-0.0510,-0.0189,0.1963,-0.0458,0.0000,-0.0060,Consumer Defensive,2.752291,1
4,GIS,1.790960e+10,0.0076,1.153980e+10,6.369800e+09,0.000000e+00,3.474300e+09,3.412400e+09,2.957400e+09,3.024000e+08,...,0.0257,0.0090,0.0215,0.0274,0.1025,0.0000,-0.0220,Consumer Defensive,12.897715,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3803,TSRI,4.952987e+07,0.1028,4.125164e+07,8.278229e+06,0.000000e+00,8.253061e+06,8.253061e+06,2.516800e+04,0.000000e+00,...,-0.0384,0.0000,-0.0041,-0.0049,0.0000,0.0000,0.0213,Technology,29.362884,1
3804,TZOO,1.532400e+08,-0.1019,1.917400e+07,1.340660e+08,1.132600e+07,1.125130e+08,1.162560e+08,1.781000e+07,0.000000e+00,...,0.1529,0.0000,-0.1872,0.1823,0.0000,0.2830,-0.0637,Technology,-31.167763,0
3805,USATP,4.200000e+07,,2.700000e+07,1.500000e+07,,1.400000e+07,1.500000e+07,0.000000e+00,0.000000e+00,...,,,,,,,,Technology,-23.558900,0
3806,WSTG,3.407580e+08,0.1344,3.159480e+08,2.481000e+07,0.000000e+00,1.651300e+07,1.651300e+07,8.297000e+06,-4.720000e+05,...,-0.0333,0.1338,0.0023,0.0890,0.0000,0.0000,0.0650,Technology,7.779579,1


In [4]:
# note: hypothesized financial indicators listed below
hypothesized_indicators = ['Revenue Growth', 'Net Income', 'EPS', 'EBITDA Margin', 'priceToSalesRatio', 'priceEarningsRatio', 'priceToFreeCashFlowsRatio', 'Debt to Equity', 
                          'ROIC', 'Sector']

def makeAdjustments(df, year):
    # filter for relevant columns    
    tickers = df['Unnamed: 0']
    df = df[['Revenue Growth', 'Net Income', 'EPS', 'EBITDA Margin', 'priceToSalesRatio', 'priceEarningsRatio', 'priceToFreeCashFlowsRatio', 'Debt to Equity', 'ROIC', 'Sector',
              str(year+1) + ' PRICE VAR [%]']]
    df = pd.concat([df, tickers], axis=1)
    df = df.rename(columns={'Unnamed: 0': 'Ticker',
                           'Revenue Growth': str(year) + ' Revenue Growth'})
    
    # drop NaN and reset index
    df = df.dropna()
    df = df.reset_index()
    df.drop(['index'], axis=1, inplace=True)
    
    # create increase/decrease variable -- increase = 1, decrease = 0
    temp = []
    for i in range(len(df)):
        yr_return = df[str(year+1) + ' PRICE VAR [%]'][i]
        if yr_return > 0 and yr_return < 10:
            temp.append('average')
        elif yr_return > 10:
            temp.append('outperform')
        else:
            temp.append('below')
    df['Target'] = temp
    
    # create dummy variables for sector categorical variable
    cat_variables = df[['Sector']]
    cat_dummies = pd.get_dummies(cat_variables)
    df.drop(['Sector'], axis=1, inplace=True)
    df = pd.concat([df, cat_dummies], axis=1)
    
    # remove outliers?????
    
    # change datatypes??????
    
    return df    

In [5]:
df1v2 = makeAdjustments(df1v1, 2014)
#df2v2 = makeAdjustments(df2v1, 2015)

In [6]:
df1v2

Unnamed: 0,2014 Revenue Growth,Net Income,EPS,EBITDA Margin,priceToSalesRatio,priceEarningsRatio,priceToFreeCashFlowsRatio,Debt to Equity,ROIC,2015 PRICE VAR [%],...,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,Sector_Technology,Sector_Utilities
0,-0.0713,1.164300e+10,4.1900,0.2470,2.8583,18.7566,21.0348,0.5061,0.0753,-9.323276,...,0,0,1,0,0,0,0,0,0,0
1,1.1737,1.358227e+08,0.2396,0.0107,0.0443,81.5526,1.3589,1.5093,0.0000,-25.512193,...,0,0,1,0,0,0,0,0,0,0
2,0.0182,1.519000e+09,1.4700,0.0450,0.1858,12.0340,14.6302,2.1007,0.0859,33.118297,...,0,0,1,0,0,0,0,0,0,0
3,0.0053,2.494140e+08,4.6000,0.0420,0.2491,28.6087,17.2736,-2.7237,0.1062,2.752291,...,0,0,1,0,0,0,0,0,0,0
4,0.0076,1.824400e+09,2.9000,0.2010,1.8610,18.7034,17.6902,1.3445,0.1041,12.897715,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2634,0.0553,3.984000e+06,0.5600,0.0810,0.5289,16.7321,11.2440,0.2524,0.1048,-2.453386,...,0,0,0,0,0,0,0,0,1,0
2635,0.1028,-8.593600e+04,-0.0400,-0.0010,0.1216,0.0000,12.2691,0.0000,0.0035,29.362884,...,0,0,0,0,0,0,0,0,1,0
2636,-0.1019,1.306200e+07,0.8800,0.1360,1.2131,14.3409,0.0000,0.0279,0.0000,-31.167763,...,0,0,0,0,0,0,0,0,1,0
2637,0.1344,5.760000e+06,1.2400,0.0250,0.2476,13.8790,14.7618,0.0000,0.3154,7.779579,...,0,0,0,0,0,0,0,0,1,0


In [7]:
def runRandomForest(df, year):
    y = df['Target']
    
    data = df.drop(['Target', str(year+1) + ' PRICE VAR [%]', 'Ticker'], axis=1)
    feature_names = data.columns

    X = df[feature_names]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    # Create the model with 100 trees
    model = RandomForestClassifier(n_estimators=100, 
                                   bootstrap = True,
                                   max_features = 'sqrt')
    # Fit on training data
    model.fit(X_train, y_train)


    # Actual class predictions
    rf_predictions = model.predict(X_test)
    # Probabilities for each class
    rf_probs = model.predict_proba(X_test)[:, 1]

    # Calculate roc auc
    roc_value = roc_auc_score(y_test, rf_probs)
    return roc_value, model

In [8]:
result = runRandomForest(df1v2, 2014)
print(result[0])
print(result[1])

ValueError: multi_class must be in ('ovo', 'ovr')