# 5. ML y modelos predictivos

### Finanzas Cuantitativas y Ciencia de Datos
#### Rodrigo Lugo Frias y León Berdichevsky Acosta
#### ITAM Primavera 2019

_INSTRUCCIONES:_
* Todas las celdas se corren haciendo __Shift + Enter__ o __Ctrl + Enter__

_NOTAS:_
* _Notebook adaptado de distintas fuentes y proyectos personales_
___

## Contenido

1. __Preparar la data__
2. __Moving average__
3. __Exponential Moving average__
4. __Regresión Lineal__
5. __k Nearest Neighbors__
6. __ARIMA__
7. __Long Short Term Memory__

___

In [None]:
# import utils.frontera_eficiente

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.5)
import datetime as dt
# Inline command for matplotlib
%matplotlib inline

#Silence all warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
stocks = 'data/ALSEA MM Equity.csv'
alsea  = pd.read_csv(stocks)

In [None]:
alsea.sample()

In [None]:
alsea.Open.tail(50).plot()
plt.show()

In [None]:
def change_date( df ):
    """Return a copy of ``df`` indexed by its parsed ``Date`` column.

    The ``Date`` column is expected to hold values whose string form is
    ``YYYYMMDD`` (e.g. ``20190102``). It is parsed to timestamps, used as the
    index (named ``Date``) and removed from the columns of the result.

    The input frame is left untouched: the previous implementation overwrote
    ``df.Date`` and re-indexed the caller's frame in place, and it removed
    whichever column happened to be first instead of removing ``Date`` by name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with a ``DatetimeIndex`` and every column except ``Date``.

    Raises
    ------
    KeyError
        If ``df`` has no ``Date`` column.
    ValueError
        If a date value does not match the ``%Y%m%d`` format.
    """
    # Parse the whole column at once rather than one pd.to_datetime call per row
    dates = pd.to_datetime(df['Date'].astype(str), format = "%Y%m%d")
    # drop() returns a new frame, so re-indexing it cannot touch the caller's df
    indexed = df.drop('Date', axis = 1)
    indexed.index = pd.DatetimeIndex(dates.values, name = 'Date')
    return indexed

alsea = change_date(alsea)

# Take a sample of the data: the first 3000 rows
# (the original note describes this as "the first 12 years" -- TODO confirm against the file)
df = alsea.head(3000)
df.tail()

In [None]:
df.Open.tail(50).plot()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(16,6))
ax.set_title('Alsea')
df['Last'].plot(ax=ax)
plt.show()

In [None]:
play_data = df.copy()[['Last']].reset_index()
play_data.info()

In [None]:
# Splitting data into train and validation

train_size = 2./3.

# Compute the cut point once so both halves always meet at the same row
split_at = int(len(play_data)*train_size)

# .copy(): prediction columns are added to these frames later on; without it
# those assignments would target slices of play_data (chained assignment)
df_train = play_data[:split_at].copy()
df_valid = play_data[split_at:].copy()

In [None]:
play_data.shape, df_train.shape, df_valid.shape

In [None]:
print ('Training dates: {0} - {1}'.format(df_train['Date'].min(), df_train['Date'].max()))
print ('Validation dates: {0} - {1}'.format(df_valid['Date'].min(), df_valid['Date'].max()))

### Moving average

In [None]:
# Recursive moving-average forecast: each prediction is the mean of a
# 1000-point window made of the remaining training tail plus every
# prediction produced so far.
window = 1000
tail_start = len(df_train) - window

preds = []
for step in range(window):
    observed_part = df_train['Last'][tail_start + step:].sum()
    window_total = observed_part + sum(preds)
    preds.append(window_total / window)

In [None]:
rms=np.sqrt(np.mean(np.power((np.array(df_valid['Last'])-preds),2)))
rms

In [None]:
from sklearn.metrics import mean_squared_error

np.sqrt(mean_squared_error(df_valid['Last'].tolist(),preds))

In [None]:
#plot
fig, ax = plt.subplots(figsize=(16,6))
df_valid['Predictions'] = 0.
df_valid['Predictions'] = preds
plt.plot(df_train['Last'])
plt.plot(df_valid[['Last', 'Predictions']])
plt.show()

### Exponential Moving Average

In [None]:
_preds_ = df_train.Last.ewm(com=0.25).mean().tail(1000).tolist()
df_valid['_Predictions_'] = 0
df_valid['_Predictions_'] = _preds_
fig, ax = plt.subplots(figsize=(16,6))

plt.plot(df_train['Last'])
plt.plot(df_valid[['Last', '_Predictions_']])
plt.show()

In [None]:
rms=np.sqrt(np.mean(np.power((np.array(df_valid['Last'])-_preds_),2)))
rms

In [None]:
np.sqrt(mean_squared_error(df_valid['Last'].tolist(),_preds_))

In [None]:
from utils.frontera_eficiente import getData

In [None]:
kk = pd.read_csv('data/ALSEA MM Equity.csv')
kk.columns = ['Date', 'DOpen', 'DHigh', 'DLow', 'DClose', 'Volume', 'PE']
kk.to_csv('data/alsea.csv')

In [None]:
!ls data

In [None]:
#########################################
#---------------------------------------#
# getData 							    #
#---------------------------------------#
#########################################
class getData:
    """Load a price CSV and derive true-range / ATR based series from it.

    On construction the file is read with its first column as the index,
    re-indexed by date through the module-level ``change_date`` helper, and
    the derived series are computed in dependency order. The date-indexed
    frame must provide the columns ``DOpen``, ``DHigh``, ``DLow`` and
    ``DClose``.

    Attributes set by ``__init__``:
        file        -- path that was loaded
        n           -- span used for the exponentially weighted ATR (22)
        timeseries  -- date-indexed input frame
        truerange   -- Series ``TrueRange``
        atr         -- Series ``ATR_<n>``
        atr_return  -- Series ``ATR_RET``
        cum_sum     -- Series ``PATR``
        dataframe   -- prices joined with all derived columns

    NOTE: every derived attribute shadows the method of the same name once
    ``__init__`` has run, so on a constructed instance ``obj.atr`` is a
    Series, not a callable. The attribute names are kept because callers
    read them (e.g. ``obj.dataframe``).
    """

    def __init__( self, file ):
        self.file = file
        self.n = 22  # span of the exponentially weighted ATR
        raw = pd.read_csv(self.file, index_col = 0)
        self.timeseries = self.index_to_datetime(raw)
        # Order matters: each call below reads attributes assigned above it
        self.truerange  = self.truerange()
        self.atr        = self.atr()
        self.atr_return = self.atr_return()
        self.cum_sum    = self.cum_sum()
        self.dataframe  = self.dataframe()

    def index_to_datetime( self, df ):
        """Return ``df`` re-indexed by date via the module-level ``change_date``."""
        return change_date(df)

    def truerange( self ):
        """Return the row-wise maximum of |High-Low|, |High-prevClose| and |Low-prevClose|."""
        adf = self.timeseries
        prev_close = adf.DClose.shift()
        candidates = pd.concat(
            [
                np.abs(adf.DHigh - adf.DLow),
                np.abs(adf.DHigh - prev_close),
                np.abs(adf.DLow  - prev_close),
            ],
            axis = 1,
        )
        # max() skips NaN, so the first row (no previous close) is High-Low
        return candidates.max(axis = 1).rename('TrueRange')

    def atr( self ):
        """Return the exponentially weighted mean of the true range (span ``n``).

        Uses ``Series.ewm`` because the top-level ``pd.ewma`` function is no
        longer available in pandas. The first ``n - 1`` values are NaN
        because ``min_periods`` equals ``n``.
        """
        n = self.n
        ATR = self.truerange.ewm(span = n, min_periods = n).mean()
        return ATR.rename('ATR_{}'.format(n))

    def atr_return( self ):
        """Return the close-to-close change divided by the previous row's ATR."""
        tday    = self.timeseries.DClose
        yday    = self.timeseries.DClose.shift()
        atryday = self.atr.shift()
        atr_ret = (tday - yday) / atryday
        return atr_ret.rename('ATR_RET')

    def cum_sum( self ):
        """Return the cumulative sum of ``atr_return`` (Series ``PATR``)."""
        return self.atr_return.cumsum(axis = 0).rename('PATR')

    def dataframe( self ):
        """Return prices plus all derived series as a single frame."""
        # The ATR column name follows self.n instead of a hard-coded 22
        cols =  ['DOpen', 'DHigh', 'DLow', 'DClose', 'TrueRange', 'ATR_{}'.format(self.n)]
        cols += ['ATR_RET', 'PATR']
        adf = self.timeseries.join([self.truerange,self.atr,self.atr_return,self.cum_sum])
        return adf[cols]

    def plot( self, Series, *args):
        """Plot one column of ``dataframe``.

        Parameters
        ----------
        Series : str
            Column name to plot; also used as the y-axis label.
        *args
            If given, the first extra argument is used as the plot title.
        """
        fig, ax = plt.subplots(1,figsize=(10, 7))
        ser = self.dataframe[Series]
        ser.plot()
        plt.xlabel('Year')
        plt.ylabel(Series)
        if len(args) != 0:
            plt.title(args[0])
        plt.show()
#########################################
# END: getData  						#
#########################################

In [None]:
dd = getData('data/alsea.csv')

In [None]:
df = dd.dataframe.PATR[23:3023]
df.head()

In [None]:
len(df)

In [None]:
df_ = pd.DataFrame(df.copy()).reset_index()

In [None]:
train_size = 2./3.

# The cut point must come from the frame being split (df_), not from play_data;
# the two only coincide when both happen to have the same number of rows
split_at = int(len(df_)*train_size)

# .copy(): prediction columns are added to these frames later on; without it
# those assignments would target slices of df_ (chained assignment)
df_train = df_[:split_at].copy()
df_valid = df_[split_at:].copy()

In [None]:
# Recursive moving-average forecast on PATR: each prediction is the mean of a
# 1000-point window made of the remaining training tail plus every
# prediction produced so far.
window = 1000
tail_start = len(df_train) - window

preds = []
for step in range(window):
    observed_part = df_train['PATR'][tail_start + step:].sum()
    window_total = observed_part + sum(preds)
    preds.append(window_total / window)

In [None]:
np.sqrt(mean_squared_error(df_valid['PATR'].tolist(),preds))

In [None]:
#plot
fig, ax = plt.subplots(figsize=(16,6))
df_valid['Predictions'] = 0.
df_valid['Predictions'] = preds
plt.plot(df_train['PATR'])
plt.plot(df_valid[['PATR', 'Predictions']])
plt.show()

In [None]:
_preds_ = df_train.PATR.ewm(com=0.25).mean().tail(1000).tolist()
df_valid['_Predictions_'] = 0
df_valid['_Predictions_'] = _preds_
fig, ax = plt.subplots(figsize=(16,6))

plt.plot(df_train['PATR'])
plt.plot(df_valid[['PATR', '_Predictions_']])
plt.show()

In [None]:
np.sqrt(mean_squared_error(df_valid['PATR'].tolist(),_preds_))

### Moving average: different rolling windows

In [None]:
# Compare simple rolling means of the training PATR for several window sizes,
# storing each one as a column of df_valid and printing its RMS error.
roll_windows = [1,30,90]
actual_patr = df_valid['PATR'].tolist()
for win in roll_windows:
    column_name = 'roll_window_{0}'.format(win)
    rolling_mean = df_train.PATR.rolling(win).mean()
    _preds_ = rolling_mean.tail(1000).tolist()
    df_valid[column_name] = _preds_
    error_ = np.sqrt(mean_squared_error(actual_patr, _preds_))
    print('RMS = {0}'.format(error_))

# Plot the training series together with every non-date column of df_valid
fig, ax = plt.subplots(figsize=(16,6))
plt.plot(df_train['PATR'])
columns_to_plot = df_valid.columns[1:]
df_valid[columns_to_plot].plot(ax=ax)
plt.show()

### Regresión lineal

In [None]:
df = df_train.copy()
df['_index_'] = df.index
df.head()

In [None]:
import statsmodels.formula.api as smf

# Initialise and fit linear regression model using `statsmodels`
model = smf.ols('PATR ~ _index_', data = df )
model = model.fit()

In [None]:
model.params

In [None]:
_preds_ = model.predict()

print(len(_preds_))

In [None]:
fig, ax = plt.subplots(figsize=(16,6))
plt.plot(df_train['PATR'])
plt.plot(_preds_)
plt.show()

### Jugando con el data set

In [None]:
x = dd.dataframe[1700:1850].copy().reset_index()

fig, ax = plt.subplots(figsize=(16,6))
x.DClose.plot(ax=ax)
plt.show()

In [None]:
from utils.frontera_eficiente import RegressionML

reg = RegressionML(x.DClose)

In [None]:
reg.Results()

In [None]:
reg.Plot('s')

In [None]:
x_ = dd.dataframe[1850:1851].copy().reset_index()
x_

In [None]:
to_predict = np.array(150)
pred       = reg.model.predict(to_predict)

In [None]:
print('Valor t+1\n\tReal:\t{0}\n\tPred:\t{1}'.format(x_.DClose.iloc[0],pred[0]))
print('\tRMS:\t{0}'.format(np.sqrt(mean_squared_error(x_.DClose.tolist(),pred))))

#### More than one feature

In [None]:
df_mf = dd.dataframe.copy()[23:3023]
df_mf = df_mf[['DOpen', 'DHigh', 'DLow', 'DClose','PATR']]
df_mf.head()

In [None]:
# The cut point must come from the frame being split (df_mf), not from play_data
split_at = int(len(df_mf)*train_size)

# .copy(): a 'pred' column is assigned to df_mf_valid later on; without it the
# assignment would target a slice of df_mf (chained assignment)
df_mf_train = df_mf[:split_at].copy()
df_mf_valid = df_mf[split_at:].copy()

In [None]:
from sklearn.linear_model import LinearRegression

# Build linear regression model to get DClose price
# Split data into predictors X and output Y
predictors = ['DLow','DHigh','DOpen']
X = df_mf_train[predictors]
y = df_mf_train['DClose']

# Initialise and fit model
lm    = LinearRegression()
model = lm.fit(X, y)

In [None]:
df_mf_train.corr()

In [None]:
print(f'alpha = {model.intercept_}')
print(f'betas = {model.coef_}')

In [None]:
_preds_ = model.predict(X)

In [None]:
fig, ax = plt.subplots(figsize=(16,6))
plt.plot(df_mf_train['DClose'].tolist())
plt.plot(_preds_)
plt.show()

In [None]:
X_new = df_mf_valid[predictors]
_preds_valid = model.predict(X_new)

df_mf_valid['pred'] = _preds_valid

In [None]:
fig, ax = plt.subplots(figsize=(16,6))
plt.plot(df_mf_train['DClose'])
plt.plot(df_mf_valid.DClose)
plt.plot(df_mf_valid.pred)
plt.show()

### Regresión Logística

In [None]:
x = dd.dataframe[2700:2950].copy().reset_index() 
x.DClose.plot()
plt.show()

In [None]:
rets = x.DClose.diff()
rets.plot.density()
plt.show()

In [None]:
func_log = 1./(1+np.exp(-rets))
x['Log'] = func_log
x.Log.plot()
plt.show()

In [None]:
# New library for technical indicators
import talib as ta

In [None]:
df = dd.dataframe[50:3050].copy()[['DOpen','DHigh','DLow','DClose','ATR_22']]
df['S_10'] = df['DClose'].rolling(window=10).mean()
df['Open-Close'] = df['DOpen'] - df['DClose'].shift(1)
df['Open-Open']  = df['DOpen'] - df['DOpen'].shift(1)
# Relative Strength Index
df['RSI'] = ta.RSI(np.array(df['DClose']), timeperiod =10)
df.RSI.plot()
plt.show()

In [None]:
df.head()

In [None]:
# Drop the warm-up rows whose rolling/shifted indicators are still NaN
df = df.dropna()
# Treat both +inf and -inf as missing (previously only +inf was caught), using
# replace() instead of a boolean-mask assignment on the dropna() result
df = df.replace([np.inf, -np.inf], np.nan)
# Fill whatever is now missing with the column mean
df = df.fillna(df.mean())
df.tail()

In [None]:
# Move the date index into a column so rows are addressed by position
X = df.reset_index()
# Columns 1..9 are taken as features (column 0 is the former index);
# .ffill() replaces the deprecated fillna(method='pad') spelling of forward-fill
X = X.iloc[:,1:10].ffill()
X.iloc[2000]

In [None]:
# Define target
y = np.where (X['DClose'].shift(-1) > X['DClose'],1,-1)
y

In [None]:
# train-test split
split = 250

X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]

In [None]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
#from sklearn.model_selection import cross_val_score

In [None]:
# Train the model
logmodel = LogisticRegression()
logmodel = logmodel.fit(X_train,y_train)

In [None]:
pd.DataFrame(dict(zip(X.columns, np.transpose(logmodel.coef_)))).T

In [None]:
X_train.corr()

In [None]:
probability = logmodel.predict_proba(X_test)
probability

In [None]:
probs = pd.DataFrame(probability,columns=['PVenta','PCompra'])
probs['Close'] = X_test.DClose.reset_index(drop=True).values
probs.head()

In [None]:
predicted  = logmodel.predict(X_test)
resultados = probs.copy()
resultados['Prediccion'] = predicted
resultados['Real']       = y_test
resultados.head()

In [None]:
# Validacion del modelo

conf_mat = pd.DataFrame(metrics.confusion_matrix(y_test, predicted),
            columns=['Pred: VENTA','Pred: COMPRA'])
conf_mat['ind'] = ['Real: VENTA','Real: COMPRA']
conf_mat.set_index('ind')

In [None]:
metrics.confusion_matrix(y_test, predicted)



In [None]:
x = metrics.confusion_matrix(y_test, predicted)

x.trace()
x.sum()

In [None]:
print (metrics.classification_report(y_test, predicted))

In [None]:
print (logmodel.score(X_test,y_test))

In [None]:
np.mean(y == -1) 

In [None]:
logmodel = LogisticRegression()

In [None]:
logmodel = LogisticRegression()

def accuracy( matrix ):
    """Return the fraction of correct predictions in a confusion matrix.

    Computed as the sum of the diagonal divided by the sum of all entries.
    The previous implementation ignored ``matrix`` and read the notebook
    global ``x``, so it returned the same value whatever was passed in.

    Parameters
    ----------
    matrix : array-like
        Square confusion matrix (a numpy array or nested lists).

    Returns
    -------
    float
        ``trace / total``.
    """
    # asarray lets nested lists work as well as numpy arrays
    counts = np.asarray(matrix)
    return counts.trace()/counts.sum()

def explorar_hiperparametros( df, roll_win, split):
    """Fit a logistic regression for one (rolling window, split) pair and score it.

    Builds the feature table on a copy of ``df``, labels each row with the
    direction of the next close, splits the rows in order at position
    ``split``, trains ``LogisticRegression`` on the first part and returns the
    accuracy of its predictions on the rest.

    Fixes over the previous version:
    * the target ``y`` is built from the ``X`` constructed here, not read from
      a notebook global whose length need not match ``X``;
    * the caller's ``df`` is no longer modified;
    * both ``+inf`` and ``-inf`` are treated as missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``DOpen`` and ``DClose`` columns.
    roll_win : int
        Window of the rolling mean of ``DClose``. The column keeps the name
        ``S_10`` whatever the window is.
    split : int
        Number of leading rows used for training; the rest are the test set.

    Returns
    -------
    float
        Result of ``accuracy`` applied to the test-set confusion matrix.
    """
    # Work on a copy so feature columns do not leak into the caller's frame
    feats = df.copy()
    feats['S_10'] = feats['DClose'].rolling(window=roll_win).mean()
    feats['Open-Close'] = feats['DOpen'] - feats['DClose'].shift(1)
    feats['Open-Open']  = feats['DOpen'] - feats['DOpen'].shift(1)
    # Relative Strength Index
    feats['RSI'] = ta.RSI(np.array(feats['DClose']), timeperiod =10)
    # Drop warm-up rows, then treat infinities as missing and fill with the mean
    feats = feats.dropna()
    feats = feats.replace([np.inf, -np.inf], np.nan)
    feats = feats.fillna(feats.mean())
    # Select the features: columns 1..9 after the index becomes column 0
    X = feats.reset_index()
    X = X.iloc[:,1:10].ffill()
    # Target: +1 when the next close is higher than the current one, else -1.
    # Built from this X so labels and features always have the same length.
    y = np.where(X['DClose'].shift(-1) > X['DClose'], 1, -1)
    # Train/test split in row order
    X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]
    # Train the model
    logmodel = LogisticRegression()
    logmodel = logmodel.fit(X_train,y_train)
    # Predict on the held-out rows
    predicted  = logmodel.predict(X_test)
    # Confusion matrix
    mat_conf = metrics.confusion_matrix(y_test, predicted)

    return accuracy(mat_conf)
    
    
    

In [None]:
df.columns

In [None]:
df = dd.dataframe[50:3050].copy()[['DOpen','DHigh','DLow','DClose','ATR_22']]
explorar_hiperparametros(df, 20, 2500)

In [None]:
# Window 0 is excluded: a zero-length rolling mean is NaN on every row, so the
# feature table would be empty after dropna() and the model could not be fitted
roll_windows = range(10,50,10)
splits       = range(500,2500,100)

# One [window, split, accuracy] row per grid point
accs = [
    [roll, splt, explorar_hiperparametros(df, roll, splt)]
    for roll in roll_windows
    for splt in splits
]

In [None]:
# Long-format results: one row per (rolling window, split) pair
hm = pd.DataFrame(accs, columns=['roll_win', 'split', 'accuracy'])

import seaborn as sns
# Pivot into a window x split grid so the colour scale encodes accuracy only;
# plotting the raw table would mix window sizes, split sizes and accuracies
sns.heatmap( data=hm.pivot(index='roll_win', columns='split', values='accuracy') )

In [None]:
hm