# ML in Finance Group Project
### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch
## 1. Preparation of Datasets
In this part of the code, the data is imported from the Wharton data source. It is then cleaned up and put into usable attribute matrices for further feature selection.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

### Market Data from Wharton
The financial data from the Wharton Database is loaded. The redundant identification columns and the columns in which more than 2/3 of the values are empty are dropped.  

In [2]:
# Load the raw market data exported from the Wharton Database
wharton = pd.read_csv('Data/WhartonData.csv', sep=',', header=0)

# Delete redundant identification columns.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally to DataFrame.drop is no longer accepted by pandas >= 2.0.
wharton = wharton.drop(columns=['NAMEENDT', 'SHRCD', 'EXCHCD', 'SICCD', 'TICKER', 'COMNAM',
                                'NCUSIP', 'TSYMBOL', 'PERMCO', 'ISSUNO', 'HEXCD', 'HSICCD'])
# Delete the columns that are mostly empty
wharton = wharton.drop(columns=['DLAMT', 'DLPDT', 'DLSTCD', 'NEXTDT', 'HSICMG', 'HSICIG', 'DIVAMT',
                                'SHRCLS', 'ACPERM', 'ACCOMP', 'NWPERM', 'DLRETX', 'DLPRC', 'DLRET',
                                'NMSIND', 'MMCNT', 'NSDINX', 'DCLRDT', 'PAYDT', 'RCRDDT', 'DISTCD',
                                'FACPR', 'FACSHR', 'TRTSCD'])

# Formatting date and permno.
# The first remaining column is renamed to 'permno' via rename(): writing into
# `columns.values` mutates the index buffer in place, which pandas does not
# guarantee to pick up (and which fails when that buffer is read-only).
wharton = wharton.rename(columns={wharton.columns[0]: 'permno'})

# Reduce the date to 'YYYY-MM' (characters 0-3 and 4-5 of the stringified date).
# Assumes the raw dates are YYYYMMDD values -- TODO confirm against the CSV.
date_str = wharton['date'].astype(str)
wharton['date'] = date_str.str[:4] + '-' + date_str.str[4:6]

# Calculate SPREAD manually.
# NOTE(review): BID - ASK is non-positive whenever ASK >= BID; a bid-ask spread
# is usually defined as ASK - BID. Kept as-is so downstream features do not
# change sign -- confirm which definition is intended.
wharton['SPREAD'] = wharton['BID'] - wharton['ASK']

# print(wharton.isnull().sum())
# display(wharton.head())

### Financial Ratios from Wharton
The financial ratios from the Wharton Database are loaded. The columns adate and qdate are dropped because they are not relevant. The column "divyield" is converted from a percentage string to a float.

In [12]:
# Load the financial ratios
ratios = pd.read_csv('Data/Ratios.csv', sep=',', header=0)

# Delete unusable columns.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally to DataFrame.drop is no longer accepted by pandas >= 2.0.
ratios = ratios.drop(columns=['adate', 'qdate'])

# Rename the second remaining column to 'date' via rename(): writing into
# `columns.values` mutates the index buffer in place, which pandas does not
# guarantee to pick up (and which fails when that buffer is read-only).
ratios = ratios.rename(columns={ratios.columns[1]: 'date'})

# Reduce the date to 'YYYY-MM' (characters 6+ and 3-4 of the date string).
# Assumes the raw dates are 'DD/MM/YYYY'-style strings -- TODO confirm.
ratios['date'] = ratios['date'].str[6:] + '-' + ratios['date'].str[3:5]

# Strip the trailing '%' from "divyield" and divide by 100 to get a decimal fraction
ratios['divyield'] = ratios['divyield'].str.rstrip('%').astype('float') / 100

# print(ratios.isnull().sum())
display(ratios.head())

Unnamed: 0,permno,date,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,...,sale_nwc,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,divyield,PEG_1yrforward,PEG_ltgforward
0,10107,2006-01,33.861,0.176,15.245,23.074,23.264,23.856,23.856,7.211,...,1.296,0.151,0.025,0.0,0.055,6.019,11.538,0.0114,15.506,2.109
1,10107,2006-02,31.756,0.163,14.951,21.496,21.846,22.392,22.392,6.713,...,1.323,0.151,0.025,0.0,0.036,6.281,10.28,0.0134,14.555,1.838
2,10107,2006-03,31.82,0.163,14.951,21.768,22.122,22.675,22.675,6.727,...,1.323,0.151,0.025,0.0,0.036,6.293,10.41,0.0132,14.739,1.842
3,10107,2006-04,28.176,0.163,14.951,19.32,19.634,20.125,20.125,5.957,...,1.323,0.151,0.025,0.0,0.036,5.573,9.239,0.0149,13.081,1.666
4,10107,2006-05,25.921,0.151,15.12,17.695,17.835,17.976,17.976,5.419,...,1.388,0.15,0.025,0.0,0.024,5.496,0.709,0.0159,-5.842,1.48


In [4]:
# Join market data and financial ratios on month and firm identifier.
# Only (date, permno) pairs present in both frames are kept (inner join).
data = wharton.merge(ratios, how='inner', on=['date', 'permno'])
display(data.head())

Unnamed: 0,permno,date,NAICS,PRIMEXCH,TRDSTAT,SECSTAT,CUSIP,SHRFLG,SHRENDDT,BIDLO,...,sale_nwc,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,divyield,PEG_1yrforward,PEG_ltgforward
0,10107,2006-01,511210.0,Q,A,R,59491810,0.0,20060330.0,26.28,...,1.296,0.151,0.025,0.0,0.055,6.019,11.538,0.0114,15.506,2.109
1,10107,2006-02,511210.0,Q,A,R,59491810,0.0,,26.39,...,1.323,0.151,0.025,0.0,0.036,6.281,10.28,0.0134,14.555,1.838
2,10107,2006-03,511210.0,Q,A,R,59491810,0.0,20060423.0,26.85,...,1.323,0.151,0.025,0.0,0.036,6.293,10.41,0.0132,14.739,1.842
3,10107,2006-04,511210.0,Q,A,R,59491810,0.0,20060629.0,24.15,...,1.323,0.151,0.025,0.0,0.036,5.573,9.239,0.0149,13.081,1.666
4,10107,2006-05,511210.0,Q,A,R,59491810,0.0,,22.56,...,1.388,0.15,0.025,0.0,0.024,5.496,0.709,0.0159,-5.842,1.48


In [14]:
# Creating a response vector and an attribute matrix
forcast_periods = [1, 3, 6, 12]
for i in forcast_periods:
    # Gross price ratio between this row and the row i positions earlier.
    # It is only kept when that earlier row belongs to the same permno.
    # NOTE(review): the ratio looks *back* i rows while the attributes come
    # from the current row -- confirm this is the intended alignment for a
    # forecast. Presumably rows are ordered by permno and month; verify.
    # NOTE(review): PRC is used as-is (no abs(), no split adjustment) -- confirm.
    same_firm = data['permno'] == data['permno'].shift(i)
    price_ratio = data['PRC'] / data['PRC'].shift(i)
    # Series.where fills the rejected rows with NaN, so the column stays
    # float64; np.where(..., None) produced an object-dtype column instead.
    data['return_' + str(i)] = price_ratio.where(same_firm)

# Keep only the rows that have a valid 1-period return
data_1 = data.dropna(subset=['return_1'])
# Binary response: 1 if the price did not fall over the period, else 0
response_1 = np.where(data_1['return_1'] >= 1, 1, 0)

# The return columns were appended last, so they are cut off the end of the
# attribute matrices.
n_return_cols = len(forcast_periods)
# NOTE(review): 28 is a hard-coded column position, presumably where the
# ratio-related columns start -- confirm after any change to the dropped columns.
attributes_ratios_1 = data_1.iloc[:, 28:-n_return_cols]
attributes_additional_1 = data_1.iloc[:, 2:-n_return_cols]

Unnamed: 0,permno,date,NAICS,PRIMEXCH,TRDSTAT,SECSTAT,CUSIP,SHRFLG,SHRENDDT,BIDLO,...,accrual,ptb,PEG_trailing,divyield,PEG_1yrforward,PEG_ltgforward,return_1,return_3,return_6,return_12
1,10107,2006-02,511210.0,Q,A,R,59491810,0.0,,26.3900,...,0.036,6.281,10.280,0.0134,14.555,1.838,0.954529,,,
2,10107,2006-03,511210.0,Q,A,R,59491810,0.0,20060423.0,26.8500,...,0.036,6.293,10.410,0.0132,14.739,1.842,1.01265,,,
3,10107,2006-04,511210.0,Q,A,R,59491810,0.0,20060629.0,24.1500,...,0.036,5.573,9.239,0.0149,13.081,1.666,0.887541,0.857904,,
4,10107,2006-05,511210.0,Q,A,R,59491810,0.0,,22.5600,...,0.024,5.496,0.709,0.0159,-5.842,1.480,0.937888,0.842948,,
5,10107,2006-06,511210.0,Q,A,R,59491810,0.0,20060817.0,21.5100,...,0.024,5.577,0.730,0.0155,-6.010,1.522,1.0287,0.856303,,
6,10107,2006-07,511210.0,Q,A,R,59491810,0.0,,22.2600,...,0.024,5.759,0.754,0.0150,,,1.03262,0.996273,0.854707,
7,10107,2006-08,511210.0,Q,A,R,59491810,0.0,20060907.0,23.9900,...,0.026,6.389,1.691,0.0140,1.511,1.692,1.06816,1.13466,0.956457,
8,10107,2006-09,511210.0,Q,A,R,59491810,0.0,20061022.0,25.4300,...,0.026,6.697,1.799,0.0132,1.608,1.838,1.0642,1.17382,1.00515,
9,10107,2006-10,511210.0,Q,A,R,59491810,0.0,20061228.0,27.3600,...,0.026,7.038,1.889,0.0125,1.787,2.019,1.04973,1.19327,1.18882,
10,10107,2006-11,511210.0,Q,A,R,59491810,0.0,,28.7300,...,0.018,7.995,1.077,0.0136,1.644,1.997,1.02264,1.14241,1.29625,


Unnamed: 0,sprtrn,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,pcf,...,sale_nwc,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,divyield,PEG_1yrforward,PEG_ltgforward
0,0.025467,33.861,0.176,15.245,23.074,23.264,23.856,23.856,7.211,17.185,...,1.296,0.151,0.025,0.0,0.055,6.019,11.538,0.0114,15.506,2.109
1,0.000453,31.756,0.163,14.951,21.496,21.846,22.392,22.392,6.713,17.868,...,1.323,0.151,0.025,0.0,0.036,6.281,10.280,0.0134,14.555,1.838
2,0.011065,31.820,0.163,14.951,21.768,22.122,22.675,22.675,6.727,17.905,...,1.323,0.151,0.025,0.0,0.036,6.293,10.410,0.0132,14.739,1.842
3,0.012187,28.176,0.163,14.951,19.320,19.634,20.125,20.125,5.957,15.854,...,1.323,0.151,0.025,0.0,0.036,5.573,9.239,0.0149,13.081,1.666
4,-0.030917,25.921,0.151,15.120,17.695,17.835,17.976,17.976,5.419,15.303,...,1.388,0.150,0.025,0.0,0.024,5.496,0.709,0.0159,-5.842,1.480
5,0.000087,26.301,0.151,15.120,18.203,18.346,18.492,18.492,5.498,15.527,...,1.388,0.150,0.025,0.0,0.024,5.577,0.730,0.0155,-6.010,1.522
6,0.005086,27.159,0.151,15.120,18.797,18.945,19.095,19.095,5.678,16.034,...,1.388,0.150,0.025,0.0,0.024,5.759,0.754,0.0150,,
7,0.021274,25.198,0.171,12.624,19.922,20.078,21.417,21.417,5.786,17.789,...,1.667,0.149,0.028,0.0,0.026,6.389,1.691,0.0140,1.511,1.692
8,0.024566,26.412,0.171,12.624,21.202,21.367,22.792,22.792,6.065,18.646,...,1.667,0.149,0.028,0.0,0.026,6.697,1.799,0.0132,1.608,1.838
9,0.031508,27.755,0.171,12.624,22.256,22.430,23.925,23.925,6.374,19.594,...,1.667,0.149,0.028,0.0,0.026,7.038,1.889,0.0125,1.787,2.019
