In [1]:
# import wichtige packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
plt.style.use('seaborn-whitegrid')

  from pandas.core import datetools


In [13]:
# Daten wharton einlesen
df = pd.read_csv('Data/WhartonData.csv', parse_dates = ["date"], dayfirst =True)
df.head(3)


Unnamed: 0,PERMNO,date,SICCD,TICKER,COMNAM,HSICIG,DIVAMT,BIDLO,ASKHI,PRC,BID,ASK,SPREAD,vwretd,vwretx
0,10107,2006-01-31,7370.0,MSFT,MICROSOFT CORP,,,26.28,28.15,28.15,28.15,28.13,,0.040044,0.038938
1,10107,2006-02-28,7370.0,MSFT,MICROSOFT CORP,,0.09,26.39,28.04,26.87,26.87,26.88,,-0.001637,-0.003552
2,10107,2006-03-31,7370.0,MSFT,MICROSOFT CORP,,,26.85,27.89,27.21,27.24,27.24,,0.019053,0.017585


In [14]:
# Daten ratios einlesen
ratios = pd.read_csv('Data/Ratios.csv', parse_dates = ["adate", "qdate"], dayfirst = True)
ratios.rename(columns = {"public_date": "date"}, inplace = True)
ratios.date = pd.to_datetime(ratios.date).dt.date.astype('datetime64[ns]')
ratios.head(3)

Unnamed: 0,permno,adate,qdate,date,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,...,sale_nwc,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,divyield,PEG_1yrforward,PEG_ltgforward
0,10107,2005-06-30,2005-09-30,2006-01-31,33.861,0.176,15.245,23.074,23.264,23.856,...,1.296,0.151,0.025,0.0,0.055,6.019,11.538,1.14%,15.506,2.109
1,10107,2005-06-30,2005-12-31,2006-02-28,31.756,0.163,14.951,21.496,21.846,22.392,...,1.323,0.151,0.025,0.0,0.036,6.281,10.28,1.34%,14.555,1.838
2,10107,2005-06-30,2005-12-31,2006-03-31,31.82,0.163,14.951,21.768,22.122,22.675,...,1.323,0.151,0.025,0.0,0.036,6.293,10.41,1.32%,14.739,1.842


In [11]:
# checken für NaN df
df.isnull().sum()


PERMNO       0
date         0
SICCD        1
TICKER       1
COMNAM       1
HSICIG    3416
DIVAMT    2472
BIDLO        1
ASKHI        1
PRC          1
BID          1
ASK          1
SPREAD    3579
vwretd       0
vwretx       0
dtype: int64

In [12]:
# Berechnen der % der missing values einer "Test-Kolonne"
df["NAMEENDT"].isnull().sum()/(len(df))

KeyError: 'NAMEENDT'

In [6]:
# drop alle Kolonnen mit > 40%
df = df.drop(["NAMEENDT", "SHRCLS", "DCLRDT", "DLAMT","DLPDT", "DLSTCD","NEXTDT", "PAYDT","RCRDDT", "HSICMG", "HSICIG","ACPERM", "ACCOMP","NWPERM","DLRETX","DLPRC","DLRET", "TRTSCD", "NMSIND","MMCNT","NSDINX"], axis=1)

ratios = ratios.drop(["adate", "qdate"], 1)

In [7]:
# restliche Kolonnen mit wenig missing values mit forwardfill auffüllen (?? haben dann Train daten Test daten INFOS??)
df = df.fillna(axis= "rows", method = "ffill")

In [8]:
# grosse Restkolonnen dropen
df = df.drop(["DISTCD","DIVAMT","FACPR","FACSHR",], axis = 1)
   
ratios = ratios.drop(["sale_nwc"], axis = 1)

# drop einige Variablen, welche "von Hand" ausgesucht wurden
df = df.drop(["COMNAM", "HSICCD", "ISSUNO", "PERMCO","SHRFLG","TSYMBOL","SHRCD"],axis = 1)

In [9]:
# Spread erstellen
df.SPREAD = df.BID - df.ASK

In [10]:
# RETX & RET value C austauschen und zu float konvertieren
df["RETX"] = df["RETX"].replace(["C"], method = "ffill")
df.RETX = df.RETX.astype("float32")
df["RET"] = df["RET"].replace(["C"], method = "ffill")
df.RET = df.RET.astype("float32")

In [11]:
# alle wichtigen Variablen konvertieren und mit get dummies verändern
#df["CFACPR"] = df["CFACPR"].astype(str)
#df["CFACSHR"] = df["CFACSHR"].astype(str)
#df["EXCHCD"] = df["EXCHCD"].astype(str)
#df["HEXCD"] = df["HEXCD"].astype(str)
#df["NAICS"] = df["NAICS"].astype(str)
#df["NCUSIP"] = df["NCUSIP"].astype(str)
#df["PRIMEXCH"] = df["PRIMEXCH"].astype(str)
#df["SECSTAT"] = df["SECSTAT"].astype(str)
#df["SICCD"] = df["SICCD"].astype(str)
#df["TICKER"] = df["TICKER"].astype(str)
#df["TRDSTAT"] = df["TRDSTAT"].astype(str)

#df = pd.get_dummies(df)

In [12]:
# PERMNO und date als index
df.rename(columns = {'PERMNO': 'PERMNO', 'date': 'DATE'}, inplace = True)
ratios.rename(columns = {'permno': 'PERMNO', 'date': 'DATE'}, inplace = True)


df = df.set_index(['PERMNO', "DATE"])  
ratios = ratios.set_index(["PERMNO", "DATE"])

In [13]:
# divyield % wegbekommen
ratios['divyield'] = ratios['divyield'].str.rstrip('%').astype('float')/100

In [14]:
# NaNs von ratios wegbekommen
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values= np.nan, strategy = "mean" , axis=0)
imp = imp.fit(ratios.values)
imputed_ratios = pd.DataFrame(imp.transform(ratios.values))
imputed_ratios.columns = ratios.columns
imputed_ratios.index = ratios.index

In [15]:
# shifted Price berechnen nach PERMNO Gruppe
grouped_by_permno = pd.DataFrame()

for df_key in df.groupby("PERMNO").groups: 
    permno_group = df.groupby("PERMNO").get_group(df_key) 
    permno_group["PRC_RET"] = np.log(permno_group["PRC"]/ permno_group["PRC"].shift(1)) 
    grouped_by_permno = pd.concat([grouped_by_permno, permno_group])

df_edited = grouped_by_permno

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [16]:
df_edited.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,EXCHCD,SICCD,NCUSIP,TICKER,NAICS,PRIMEXCH,TRDSTAT,SECSTAT,HEXCD,CUSIP,...,ALTPRC,SPREAD,ALTPRCDT,RETX,vwretd,vwretx,ewretd,ewretx,sprtrn,PRC_RET
PERMNO,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10107,2006-01-31,3.0,7370.0,59491810,MSFT,511210.0,Q,A,R,3,59491810,...,28.15,0.02,20060131.0,0.076482,0.040044,0.038938,0.076354,0.07515,0.025467,
10107,2006-02-28,3.0,7370.0,59491810,MSFT,511210.0,Q,A,R,3,59491810,...,26.87,-0.01,20060228.0,-0.045471,-0.001637,-0.003552,0.004836,0.003476,0.000453,-0.046537
10107,2006-03-31,3.0,7370.0,59491810,MSFT,511210.0,Q,A,R,3,59491810,...,27.21,0.0,20060331.0,0.012653,0.019053,0.017585,0.036978,0.035232,0.011065,0.012574


In [17]:
# Delete unused variables
del df_key, permno_group, grouped_by_permno

In [18]:
# Set 0, 1 for PRC_RET
def set_mov(mydata):
    if mydata['PRC_RET'] > 0:
        return 'UP'
    elif mydata['PRC_RET'] < 0:
        return 'DOWN'
    elif mydata['PRC_RET'] == 0:
        return 'UP'
    
df_edited = df_edited.assign(MOVEMENT = df_edited.apply(set_mov, axis = 1))
df_edited['PRC_MOV'] = df_edited['MOVEMENT'].factorize()[0]

df_edited = df_edited.drop('MOVEMENT', axis =1)

In [19]:
df_edited.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,EXCHCD,SICCD,NCUSIP,TICKER,NAICS,PRIMEXCH,TRDSTAT,SECSTAT,HEXCD,CUSIP,...,SPREAD,ALTPRCDT,RETX,vwretd,vwretx,ewretd,ewretx,sprtrn,PRC_RET,PRC_MOV
PERMNO,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10107,2006-01-31,3.0,7370.0,59491810,MSFT,511210.0,Q,A,R,3,59491810,...,0.02,20060131.0,0.076482,0.040044,0.038938,0.076354,0.07515,0.025467,,-1
10107,2006-02-28,3.0,7370.0,59491810,MSFT,511210.0,Q,A,R,3,59491810,...,-0.01,20060228.0,-0.045471,-0.001637,-0.003552,0.004836,0.003476,0.000453,-0.046537,0
10107,2006-03-31,3.0,7370.0,59491810,MSFT,511210.0,Q,A,R,3,59491810,...,0.0,20060331.0,0.012653,0.019053,0.017585,0.036978,0.035232,0.011065,0.012574,1


In [20]:
imputed_ratios.isnull().sum()

CAPEI              0
bm                 0
evm                0
pe_op_basic        0
pe_op_dil          0
pe_exi             0
pe_inc             0
ps                 0
pcf                0
dpr                0
npm                0
opmbd              0
opmad              0
gpm                0
ptpm               0
cfm                0
roa                0
roe                0
roce               0
efftax             0
aftret_eq          0
aftret_invcapx     0
aftret_equity      0
pretret_noa        0
pretret_earnat     0
GProf              0
equity_invcap      0
debt_invcap        0
totdebt_invcap     0
capital_ratio      0
                  ..
profit_lct         0
ocf_lct            0
cash_debt          0
fcf_ocf            0
lt_ppent           0
dltt_be            0
debt_assets        0
debt_capital       0
de_ratio           0
intcov             0
intcov_ratio       0
cash_ratio         0
quick_ratio        0
curr_ratio         0
cash_conversion    0
inv_turn           0
at_turn      

In [21]:
# Indexe entfernen, um Datum als Variable zu haben
df_edited = df_edited.reset_index()
imputed_ratios = imputed_ratios.reset_index()

# Datum auf 1. des Monats setzen (weil tw. 2 verschiedene Tage in Monaat)
df_edited['DATE'] = df_edited['DATE'].apply(lambda dt: dt.replace(day=1))
imputed_ratios['DATE'] = imputed_ratios['DATE'].apply(lambda dt: dt.replace(day=1))

# join datasets
df_edited = df_edited.set_index(["PERMNO", "DATE"])
imputed_ratios = imputed_ratios.set_index(["PERMNO", "DATE"])
data = df_edited.join(imputed_ratios, how = 'outer')

In [22]:
# reset index
data = data.reset_index()

grouped_by_permno = pd.DataFrame()

In [23]:
# Group by PERMNO-Code and then remove first (0) row because it is nan in every PRC_RET group
for df_key in data.groupby('PERMNO').groups:
    permno_group = data.groupby('PERMNO').get_group(df_key)
    permno_group = permno_group[1:]
    grouped_by_permno = pd.concat([grouped_by_permno, permno_group])

data = grouped_by_permno

In [24]:
# alle unnützen Variablen löschen
del df_edited, ratios, df_key, permno_group, grouped_by_permno

In [25]:
# alle wichtigen Variablen konvertieren und mit get dummies verändern
data["CFACPR"] = data["CFACPR"].astype(str)
data["CFACSHR"] = data["CFACSHR"].astype(str)
data["EXCHCD"] = data["EXCHCD"].astype(str)
data["HEXCD"] = data["HEXCD"].astype(str)
data["NAICS"] = data["NAICS"].astype(str)
data["NCUSIP"] = data["NCUSIP"].astype(str)
data["PRIMEXCH"] = data["PRIMEXCH"].astype(str)
data["SECSTAT"] = data["SECSTAT"].astype(str)
data["SICCD"] = data["SICCD"].astype(str)
data["TICKER"] = data["TICKER"].astype(str)
data["TRDSTAT"] = data["TRDSTAT"].astype(str)

data = pd.get_dummies(data)

In [27]:
data.head()

Unnamed: 0,PERMNO,DATE,SHRENDDT,BIDLO,ASKHI,PRC,VOL,RET,BID,ASK,...,CFACPR_1.1134979999999999,CFACPR_2.0,CFACPR_4.0,CFACPR_7.0,CFACPR_8.0,CFACSHR_1.0,CFACSHR_2.0,CFACSHR_4.0,CFACSHR_7.0,CFACSHR_8.0
1,10107,2006-02-01,20060330.0,26.39,28.04,26.87,11088149.0,-0.042273,26.87,26.88,...,0,0,0,0,0,1,0,0,0,0
2,10107,2006-03-01,20060423.0,26.85,27.89,27.21,14514337.0,0.012653,27.24,27.24,...,0,0,0,0,0,1,0,0,0,0
3,10107,2006-04-01,20060629.0,24.15,27.74,24.15,14689919.0,-0.112459,24.16,24.16,...,0,0,0,0,0,1,0,0,0,0
4,10107,2006-05-01,20060629.0,22.56,24.29,22.65,23651189.0,-0.058385,22.7,22.7,...,0,0,0,0,0,1,0,0,0,0
5,10107,2006-06-01,20060817.0,21.51,23.4702,23.3,19980809.0,0.028698,23.38,23.31,...,0,0,0,0,0,1,0,0,0,0


In [28]:
data = data.set_index(["PERMNO", "DATE"])
imp = Imputer(missing_values=np.nan, strategy = "mean", axis=0)
imputed_data = pd.DataFrame(imp.fit_transform(data))
imputed_data.columns = data.columns
imputed_data.index = data.index

In [29]:
# Variable PRC_mov mit lag erstellen
imputed_data['PREDICTION'] = imputed_data['PRC_MOV']
grouped_by_permno = pd.DataFrame()

for df_key in imputed_data.groupby('PERMNO').groups:
    permno_group = imputed_data.groupby('PERMNO').get_group(df_key)
    permno_group['PREDICTION'] = permno_group['PREDICTION'].shift(-1)
    permno_group = permno_group[pd.notnull(permno_group['PREDICTION'])]
    grouped_by_permno = pd.concat([grouped_by_permno, permno_group])
imputed_data = grouped_by_permno


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [31]:
display(imputed_data.PREDICTION.head())

PERMNO  DATE      
10107   2006-02-01    1.0
        2006-03-01    0.0
        2006-04-01    0.0
        2006-05-01    1.0
        2006-06-01    1.0
Name: PREDICTION, dtype: float64

In [32]:
# delete unnütze Variabeln
del df_key, permno_group, grouped_by_permno

In [33]:
imputed_data.to_csv("Q:/UZH/FS18/Machine Learning/Projekt/Code/imputed_data.csv")