# ML in Finance Group Project
### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch
## 1. Preparation of Datasets
In this part of the code the data is imported from the Wharton data source. It's then cleaned up and put into usable attribute matrices for further feature selection.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

### Market Data from Wharton
The financial data from the Wharton Database is loaded in. The redundant identification columns and columns which are more than 2/3 empty values are dropped.  

In [2]:
# Load the market data exported from the Wharton Database.
wharton = pd.read_csv('Data/Whartondata.csv', sep=',', header=0)

# Columns that are not used further (per the notebook text: redundant
# identification columns and columns that are more than 2/3 empty).
unusable_columns = [
    'NAMEENDT', 'SHRCD', 'EXCHCD', 'SICCD', 'TICKER', 'COMNAM', 'NCUSIP', 'TSYMBOL',
    'PERMCO', 'ISSUNO', 'HEXCD', 'HSICCD',
    'DLAMT', 'DLPDT', 'DLSTCD', 'NEXTDT', 'HSICMG', 'HSICIG', 'DIVAMT',
    'SHRCLS', 'ACPERM', 'ACCOMP', 'NWPERM', 'DLRETX', 'DLPRC', 'DLRET', 'NMSIND',
    'MMCNT', 'NSDINX', 'DCLRDT', 'PAYDT', 'RCRDDT', 'DISTCD', 'FACPR', 'FACSHR',
    'TRTSCD',
]
# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# is rejected by pandas >= 2.0.
wharton = wharton.drop(columns=unusable_columns)

# Rename the first column (the PERMNO identifier) to 'permno'.
# `rename` is used instead of writing into `columns.values`, because mutating
# the Index's underlying array in place is not supported and can leave
# column lookups stale.
wharton = wharton.rename(columns={wharton.columns[0]: 'permno'})

# Reformat the date into a 'YYYY-MM' string: the first four characters are
# taken as the year and the next two as the month.
date_digits = wharton['date'].astype(str)
wharton['date'] = date_digits.str[:4] + '-' + date_digits.str[4:6]

# Calculate SPREAD manually as bid minus ask.
# NOTE(review): a quoted spread is usually ask - bid; with BID - ASK the values
# are non-positive whenever ASK >= BID. Sign kept as in the original - confirm.
wharton['SPREAD'] = wharton['BID'] - wharton['ASK']

# print(wharton.isnull().sum())
# display(wharton.head())

### Financial Ratios from Wharton
The financial ratios from the Wharton Database (received on OLAT) are loaded in. The adate and qdate columns are dropped because they are not relevant. The "divyield" column is converted from a percentage string to a float.

In [3]:
# Load the financial ratios (file received via OLAT).
ratios = pd.read_csv('Data/Ratios.csv', sep=',', header=0)

# Drop the adate and qdate columns, which are not used.
ratios = ratios.drop(columns=['adate', 'qdate'])

# Rename the second remaining column to 'date'.
# `rename` is used instead of writing into `columns.values`, because mutating
# the Index's underlying array in place is not supported and can leave
# column lookups stale.
ratios = ratios.rename(columns={ratios.columns[1]: 'date'})

# Reformat the date string to 'YYYY-MM': characters from position 6 onwards
# are taken as the year and characters 3-4 as the month.
# NOTE(review): this assumes a fixed 'DD?MM?YYYY'-style layout - confirm
# against the raw file.
ratios['date'] = ratios['date'].str[6:] + '-' + ratios['date'].str[3:5]

# Strip the trailing '%' from "divyield" and divide by 100 so the column
# holds the yield as a decimal fraction.
ratios['divyield'] = ratios['divyield'].str.rstrip('%').astype('float') / 100

# print(ratios.isnull().sum())
# display(ratios.head())

### Merging the Dataset

In [4]:
# Combine market data and ratios: keep the rows whose (date, permno) pair
# occurs in both frames (default inner join).
merge_keys = ['date', 'permno']
data = wharton.merge(ratios, on=merge_keys)
display(data.head())

Unnamed: 0,permno,date,NAICS,PRIMEXCH,TRDSTAT,SECSTAT,CUSIP,SHRFLG,SHRENDDT,BIDLO,...,sale_nwc,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,divyield,PEG_1yrforward,PEG_ltgforward
0,10107,2006-01,511210.0,Q,A,R,59491810,0.0,20060330.0,26.28,...,1.296,0.151,0.025,0.0,0.055,6.019,11.538,0.0114,15.506,2.109
1,10107,2006-02,511210.0,Q,A,R,59491810,0.0,,26.39,...,1.323,0.151,0.025,0.0,0.036,6.281,10.28,0.0134,14.555,1.838
2,10107,2006-03,511210.0,Q,A,R,59491810,0.0,20060423.0,26.85,...,1.323,0.151,0.025,0.0,0.036,6.293,10.41,0.0132,14.739,1.842
3,10107,2006-04,511210.0,Q,A,R,59491810,0.0,20060629.0,24.15,...,1.323,0.151,0.025,0.0,0.036,5.573,9.239,0.0149,13.081,1.666
4,10107,2006-05,511210.0,Q,A,R,59491810,0.0,,22.56,...,1.388,0.15,0.025,0.0,0.024,5.496,0.709,0.0159,-5.842,1.48


### Creating Attribute Matrices and a Response Vector using only Ratios

In [5]:
# Build one forward-return column per forecast horizon.
# 'return_<i>' is the gross return from the current row's price to the price
# i rows later for the same stock (PRC[t+i] / PRC[t]), so the attributes of a
# row are paired with a return that lies in the future. The previous version
# used shift(i), which yields the trailing return PRC[t] / PRC[t-i] instead.
# Assumes rows are ordered by date within each permno and that each row is
# one month - TODO confirm there are no gaps.
# NOTE(review): if PRC can be negative in this data set, the ratio is
# distorted; consider taking the absolute value first - confirm.
forcast_periods = [1, 3, 6, 12]
for i in forcast_periods:
    # Shifting within each permno group never compares prices of two
    # different stocks; the last i rows of every stock become NaN.
    future_price = data.groupby('permno')['PRC'].shift(-i)
    # Plain float division keeps the column numeric (NaN for missing values)
    # rather than an object column filled with None.
    data['return_' + str(i)] = future_price / data['PRC']

# Response vector and attribute matrix for the one-month horizon
# (other forecast periods can be handled the same way later).
data_1 = data.dropna(subset=['return_1'])
# Binary response: 1 if the price did not fall (gross return >= 1), else 0.
response_1 = pd.DataFrame(np.where(data_1.return_1 >= 1, 1, 0))
# Ratio attributes: from column 28 (CAPEI in the dtype listing) up to, but
# excluding, the return columns appended above - one per forecast period.
attributes_ratios_1 = data_1.iloc[:, 28:-len(forcast_periods)]

# Export the attribute matrix and response vector
response_1.to_csv('Data/generated_datasets/response_1.csv', index=False)
attributes_ratios_1.to_csv('Data/generated_datasets/attributes_ratios_1.csv', index=False)

### Adding additional Features

In [6]:
# Reporting period as a seasonality attribute: not implemented yet.

# Inspect the column dtypes to see which columns hold categorical data.
column_dtypes = data.dtypes
print(column_dtypes)
# TODO: break down the categorical columns, e.g.
#data['NAICS'] = data['NAICS'].astype(str)
#data = pd.get_dummies(data)

permno               int64
date                object
NAICS              float64
PRIMEXCH            object
TRDSTAT             object
SECSTAT             object
CUSIP               object
SHRFLG             float64
SHRENDDT           float64
BIDLO              float64
ASKHI              float64
PRC                float64
VOL                float64
RET                 object
BID                float64
ASK                float64
SHROUT             float64
CFACPR             float64
CFACSHR            float64
ALTPRC             float64
SPREAD             float64
ALTPRCDT           float64
RETX                object
vwretd             float64
vwretx             float64
ewretd             float64
ewretx             float64
sprtrn             float64
CAPEI              float64
bm                 float64
                    ...   
dltt_be            float64
debt_assets        float64
debt_capital       float64
de_ratio           float64
intcov             float64
intcov_ratio       float64
c

### Creating Attribute Matrices and a Response Vector using additional attributes

In [7]:
# One-month horizon again: keep only the rows that have a 'return_1' value
# (other forecast periods can be handled the same way later).
has_return_1 = data['return_1'].notna()
data_1 = data.loc[has_return_1]

# Binary response: 1 where return_1 is at least 1, otherwise 0.
went_up = data_1['return_1'] >= 1
response_1 = pd.DataFrame(np.where(went_up, 1, 0))

# Extended attribute matrix: every column from position 2 onwards except the
# last four (the return columns).
# NOTE(review): this slice keeps PRC, RET and RETX from the market data;
# confirm they do not leak the response.
attributes_additional_1 = data_1.iloc[:, 2:-4]
#display(attributes_additional_1)
attributes_additional_1.to_csv(
    'Data/generated_datasets/attributes_additional_1.csv',
    index=False,
)