In [1]:
import pandas as pd
#modelos ml de sklearn
from sklearn import svm, neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
import pymongo
import logging
import os

In [17]:
import numpy as np

In [2]:
def read_stock_csv(ticker, file_dir):
	"""Load the price history CSV of one ticker, indexed by its ``Date`` column.

	Args:
		ticker: Ticker symbol; the file read is ``<file_dir>/<ticker>.csv``.
		file_dir: Directory that holds the per-ticker CSV files.

	Returns:
		pandas.DataFrame with the CSV contents and ``Date`` as the index.

	Raises:
		FileNotFoundError: if the CSV file does not exist.
		KeyError: if the CSV has no ``Date`` column.
	"""
	csv_path = '{}/{}.csv'.format(file_dir, ticker)
	return pd.read_csv(csv_path).set_index('Date')
def get_adj_close_df(ticker, data_dir):
	"""Return the price history of ``ticker`` without its OHLCV columns.

	The CSV is loaded with ``read_stock_csv`` and the ``Open``, ``Close``,
	``Volume``, ``High`` and ``Low`` columns are removed; any other column
	in the file is kept.

	Args:
		ticker: Ticker symbol, used as the CSV file name.
		data_dir: Directory that holds the per-ticker CSV files.

	Returns:
		pandas.DataFrame indexed by ``Date`` without the five dropped columns.

	Raises:
		KeyError: if any of the five columns is missing from the CSV.
	"""
	df = read_stock_csv(ticker, data_dir)
	# `columns=` replaces the positional axis argument (`drop(labels, 1)`),
	# which pandas deprecated in 1.3 and rejects from 2.0 onwards.
	return df.drop(columns=['Open', 'Close', 'Volume', 'High', 'Low'])
def modelo_fit_predict_and_score(modelo, x_train, y_train, x_test, y_test):
	"""Fit ``modelo`` on the training data, then predict and score on the test data.

	Args:
		modelo: Object exposing ``fit``, ``predict`` and ``score`` methods.
		x_train: Training features, passed to ``modelo.fit``.
		y_train: Training labels, passed to ``modelo.fit``.
		x_test: Test features, passed to ``modelo.predict`` and ``modelo.score``.
		y_test: Test labels, passed to ``modelo.score``.

	Returns:
		Tuple ``(accuracy, y_pred)``: the value of ``modelo.score(x_test, y_test)``
		and the output of ``modelo.predict(x_test)``.
	"""
	# Train first; prediction and scoring both use the fitted model.
	modelo.fit(x_train, y_train)
	predicciones = modelo.predict(x_test)
	return modelo.score(x_test, y_test), predicciones
def get_tickers_to_process(*tickers, ticker_col = False):
	"""Return the list of tickers to process.

	Args:
		*tickers: Ticker symbols given explicitly; used when ``ticker_col`` is falsy.
		ticker_col: When truthy, the value looked up under the ``ticker_market``
			key in the ``tickers_list`` collection of the ``stock_metadata``
			database; the ``ticker_list`` field of the matching document is
			returned and ``*tickers`` is ignored.

	Returns:
		``list(tickers)`` when ``ticker_col`` is falsy, otherwise the stored
		``ticker_list`` value.
	"""
	if not ticker_col:
		return list(tickers)
	almacen = Mongo_stock_metadata('stock_metadata', 'tickers_list')
	documento = almacen.read_by_key_from_mongodb('ticker_market', ticker_col)
	return documento['ticker_list']
class Mongo_stock_metadata:
	"""Small wrapper around a single MongoDB collection.

	Instance attributes set by ``__init__``: ``client`` (``pymongo.MongoClient``),
	``bbdd`` (database name), ``db`` (database handle), ``coleccion``
	(collection handle) and ``logger``.
	"""

	def __init__(self, bbdd, coleccion, host='localhost', port=27017):
		"""Open a client and bind the given database and collection.

		Args:
			bbdd: Name of the MongoDB database.
			coleccion: Name of the collection inside ``bbdd``.
			host: MongoDB host; defaults to the previously hard-coded ``'localhost'``.
			port: MongoDB port; defaults to the previously hard-coded ``27017``.
		"""
		self.client = pymongo.MongoClient(host, port)
		self.bbdd = bbdd
		# Root logger, i.e. the same object the notebook binds to its global
		# `logger`; fetched here so the class no longer needs that global to
		# have been defined before an instance is created.
		self.logger = logging.getLogger()
		self.db = self.client[self.bbdd]
		self.coleccion = self.db[coleccion]

	def insert_in_mongodb(self, dict_key_value):
		"""Insert one document (dict) or several documents (list of dicts).

		An empty dict or list is not sent to MongoDB; a warning is logged instead.

		Args:
			dict_key_value: A dict, or a list of dicts, to store.
		"""
		self.logger.info('Insertando datos en la coleccion de MongoDB')
		if len(dict_key_value) == 0:
			self.logger.warning('El documento a insertar esta vacio')
		elif isinstance(dict_key_value, list):
			# The previous check compared type(...) with the string 'list',
			# which is never true, so lists were handed to insert_one.
			self.coleccion.insert_many(dict_key_value)
		else:
			self.coleccion.insert_one(dict_key_value)

	def read_by_key_from_mongodb(self, key, value):
		"""Return the first document where ``key`` equals ``value`` (``find_one`` result)."""
		self.logger.info('Leyendo datos en la coleccion de MongoDB')
		return self.coleccion.find_one({key: value})

	def read_all_from_mongodb(self):
		"""Return the cursor produced by ``find()`` over the whole collection."""
		self.logger.info('Leyendo datos en la coleccion de MongoDB')
		return self.coleccion.find()

	def read_doc_keys_from_collection(self):
		"""Return a list with the ``keys()`` view of every document in the collection."""
		self.logger.info('Leyendo datos en la coleccion de MongoDB')
		return [document.keys() for document in self.coleccion.find()]

	def query_collections(self, my_query):
		"""Return the cursor produced by ``find(my_query)``."""
		self.logger.info('Leyendo query en la coleccion de MongoDB')
		return self.coleccion.find(my_query)

	def count_elements_collection(self):
		"""Return the number of documents in the collection."""
		# Cursor.count() was removed in PyMongo 4; count_documents({}) is the
		# supported way to count every document.
		return self.coleccion.count_documents({})

	def delete_collection(self):
		"""Drop the collection and return whatever ``drop()`` returns."""
		return self.coleccion.drop()

def logger_for_my_stock_app():
	"""Configure file logging for the notebook and return the root logger.

	Logging is set up through ``logging.basicConfig`` to write INFO-and-above
	records to ``scrapingstock.log`` (opened with mode ``'w'``), formatted with
	the module-level ``LOG_FORMAT`` and ``DATETIME_FORMAT`` constants, which
	must exist when this function is called.

	Returns:
		The root ``logging.Logger``.
	"""
	logging.basicConfig(
		filename='scrapingstock.log',
		filemode='w',
		level=logging.INFO,
		format=LOG_FORMAT,
		datefmt=DATETIME_FORMAT,
	)
	return logging.getLogger()

In [10]:
def porcentaje_variacion_dias(df, n_dias, columna):
	"""Add forward-looking relative-change columns for 1..``n_dias`` rows ahead.

	For each ``i`` in ``1..n_dias`` a column ``porc_variacion_dia_<i>`` is
	added holding ``(value i rows later - value) / value`` of ``columna``.
	Afterwards every NaN in the frame is replaced by 0, so the last ``i`` rows
	(which have no value ``i`` rows later) get 0 in column ``i``.

	The frame is modified in place and also returned.

	Args:
		df: DataFrame containing ``columna``.
		n_dias: Number of look-ahead offsets to compute.
		columna: Name of the column the changes are computed from.

	Returns:
		The same ``df`` object with the new columns added.
	"""
	# Compute every offset from the untouched values. Filling NaNs inside the
	# loop (as before) turned a missing value into 0 after the first pass, so
	# later passes divided by 0 and produced inf / -100% variations.
	valores = df[columna]
	for i in range(1, n_dias + 1):
		df['porc_variacion_dia_{}'.format(i)] = (valores.shift(-i) - valores) / valores
	# NOTE(review): the zero-filled tail rows are indistinguishable from a real
	# 0% change for whatever consumes these columns - confirm that is intended.
	df.fillna(0, inplace=True)
	return df

In [44]:
def get_variables_indep_y_target(df, col_target):
	"""Split a labelled frame into a feature matrix and a label vector.

	Every column whose name contains ``'target'`` and the ``'Adj Close'``
	column are left out of the features; all remaining columns are kept.
	``df`` itself is not modified.

	NOTE(review): in this notebook the remaining columns are the
	``porc_variacion_dia_*`` columns, and ``get_data_model_sube_baja_mantiene``
	derives each ``target_dia_<i>`` directly from ``porc_variacion_dia_<i>``.
	The features therefore appear to contain the label's own source - confirm
	whether this leakage is intended.

	Args:
		df: DataFrame with feature columns, target columns and ``'Adj Close'``.
		col_target: Name of the column to use as label.

	Returns:
		Tuple ``(x, y)`` of NumPy arrays: the feature values and the values of
		``df[col_target]``.

	Raises:
		KeyError: if ``'Adj Close'`` or ``col_target`` is not a column of ``df``.
	"""
	cols_to_drop = [col for col in df.columns if 'target' in col]
	cols_to_drop.append('Adj Close')
	# `columns=` replaces the positional axis argument (`drop(labels, 1)`),
	# which pandas deprecated in 1.3 and rejects from 2.0 onwards.
	x = df.drop(columns=cols_to_drop).to_numpy()
	y = df[col_target].to_numpy()
	return x, y


In [52]:
def get_data_model_sube_baja_mantiene(df_porc, porcentaje_cambio_etiqueta, dias_analisis_variacion):
	"""Label each look-ahead variation as up (1), down (-1) or flat (0).

	For each ``i`` in ``1..dias_analisis_variacion`` a column ``target_dia_<i>``
	is added from ``porc_variacion_dia_<i>``:

	* ``1``  when the variation is ``>= porcentaje_cambio_etiqueta``
	* ``-1`` when the variation is ``<= -porcentaje_cambio_etiqueta``
	* ``0``  otherwise (strictly inside the band, or NaN)

	The frame is modified in place and also returned.

	Args:
		df_porc: DataFrame holding the ``porc_variacion_dia_<i>`` columns.
		porcentaje_cambio_etiqueta: Threshold as a fraction (e.g. ``0.04``).
		dias_analisis_variacion: Number of ``porc_variacion_dia_<i>`` columns to label.

	Returns:
		The same ``df_porc`` object with the target columns added.
	"""
	umbral = porcentaje_cambio_etiqueta
	for i in range(1, dias_analisis_variacion + 1):
		variacion = df_porc['porc_variacion_dia_{}'.format(i)]
		# Plain comparisons replace Series.between(..., inclusive=False): the
		# boolean form of `inclusive` is rejected by pandas >= 2.0. The "down"
		# test now uses -umbral explicitly; it used to compare against +umbral
		# and only gave the right labels because np.select takes the first
		# matching condition.
		df_porc['target_dia_{}'.format(i)] = np.select(
			[variacion >= umbral, variacion <= -umbral],
			[1, -1],
			default=0)
	return df_porc

In [58]:
def porcentaje_acierto_modelo(y_pred, y_test):
	"""Return the percentage (0-100) of positions where prediction and label match.

	Positions ``0..len(y_pred)-1`` are compared by index, so ``y_test`` must be
	at least as long as ``y_pred``.

	Args:
		y_pred: Indexable sequence of predicted labels.
		y_test: Indexable sequence of true labels.

	Returns:
		``matches / len(y_pred) * 100`` as a float.

	Raises:
		ZeroDivisionError: if ``y_pred`` is empty.
	"""
	total = len(y_pred)
	aciertos = sum(1 for pos in range(total) if y_pred[pos] == y_test[pos])
	return (aciertos / total) * 100

In [3]:
# Logging configuration shared by the notebook: record layout and timestamp format.
LOG_FORMAT = '%(asctime)s %(levelname)s - %(message)s'
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Write INFO-and-above records to scrapingstock.log (opened with mode 'w').
logging.basicConfig(filename='scrapingstock.log', level=logging.INFO, format = LOG_FORMAT, filemode ='w', datefmt=DATETIME_FORMAT)
# Root logger; the Mongo_stock_metadata methods read this global name.
logger = logging.getLogger()

In [4]:
# Data to process: directory-name suffixes for the training and test sets
# (presumably YYYYMM_YYYYMM date ranges - confirm).
annomes_procesar_train = '200001_201812' #train
annomes_procesar_test = '201901_201912' #test
data_dir_train = 'datos_y_funciones/historic_stock_data_{}'.format(annomes_procesar_train)
data_dir_test = 'datos_y_funciones/historic_stock_data_{}'.format(annomes_procesar_test)

# Directory for predicted data (not referenced by the cells visible here).
data_pred_dir = 'datos_y_funciones/predicted_data'

In [5]:
# NOTE(review): the list read from MongoDB on the first line is overwritten by
# the hard-coded single ticker on the second, so the lookup only adds a
# dependency on a running MongoDB - confirm which of the two is intended.
tickers_list = get_tickers_to_process(ticker_col='tickers_ibex35')
tickers_list = ['IDR.MC']

In [6]:
# Mongo wrapper bound to the model_scores collection of the stock_metadata
# database, intended for storing model scores.
mongo_scores = Mongo_stock_metadata('stock_metadata', 'model_scores')

In [7]:
# Parameters used to build the training/test data:
# labelling threshold as a fraction (0.04 = 4%) and number of look-ahead
# offsets for which variations and targets are computed.
porcentaje_cambio_etiqueta = 0.04
dias_analisis_variacion = 5

In [8]:
# Flags for choosing which models to run (presumably KNeighborsClassifier and
# VotingClassifier - confirm); not read by any cell visible here.
modelo_knc = False
modelo_vc = False

In [9]:
# Ticker analysed by the cells below.
ticker = 'IDR.MC'

In [13]:
	# Load the price history of `ticker` (OHLCV columns removed) for the
	# training and test directories.
	df_train = get_adj_close_df(ticker, data_dir_train)
	df_test = get_adj_close_df(ticker, data_dir_test)

In [14]:
	# Add the porc_variacion_dia_<i> columns. The function modifies the frame it
	# receives and returns it, so df_porc_* are the same objects as df_*.
	df_porc_train = porcentaje_variacion_dias(df = df_train, n_dias=dias_analisis_variacion, columna='Adj Close')
	df_porc_test = porcentaje_variacion_dias(df = df_test, n_dias=dias_analisis_variacion, columna='Adj Close')

In [53]:
	# Add the target_dia_<i> label columns (1 / -1 / 0); again the input frames
	# are modified in place and returned.
	df_class_train = get_data_model_sube_baja_mantiene(df_porc_train, porcentaje_cambio_etiqueta, dias_analisis_variacion)
	df_class_test = get_data_model_sube_baja_mantiene(df_porc_test, porcentaje_cambio_etiqueta, dias_analisis_variacion)

In [54]:
	# Build feature matrices and label vectors; the label is the target column
	# of the furthest offset, i.e. 'target_dia_<dias_analisis_variacion>'.
	x_train, y_train = get_variables_indep_y_target(df_class_train, 'target_dia' + '_' + str(dias_analisis_variacion))
	x_test, y_test = get_variables_indep_y_target(df_class_test, 'target_dia' + '_' + str(dias_analisis_variacion))

In [55]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
clasificador = neighbors.KNeighborsClassifier()

In [56]:
# Fit on the training set; keep the test-set score and the test-set predictions.
accuracy_knc, y_pred = modelo_fit_predict_and_score(clasificador, x_train, y_train, x_test, y_test)

In [59]:
# Hit rate recomputed by hand, as a percentage (0-100), to cross-check the model's own score.
mi_accuracy = porcentaje_acierto_modelo(y_pred, y_test)

In [61]:
# accuracy_knc is the value returned by the model's score(); mi_accuracy is a 0-100 percentage.
print(accuracy_knc)
print(mi_accuracy)

0.9568627450980393
95.68627450980392


In [62]:
# Display the predicted labels for the test set.
y_pred

array([ 1,  1,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
        0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0,
       -1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  1,  1,  1,  1,  0, -1, -1, -1, -1, -1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1,
       -1, -1,  1,  0,  0,  0,  0, -1, -1, -1, -1,  0,  0,  0, -1, -1, -1,
       -1, -1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  0,  0,  0, -1,  0,  0,  0,  0,  0, -1, -1,  0,  0,
        0,  0, -1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  1,  0,  0,  1,  1,  1,  1,  1,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [68]:
# Display the labelled test frame (prices, variations and targets).
df_class_test

Unnamed: 0_level_0,Adj Close,porc_variacion_dia_1,porc_variacion_dia_2,porc_variacion_dia_3,porc_variacion_dia_4,porc_variacion_dia_5,target_dia_1,target_dia_2,target_dia_3,target_dia_4,target_dia_5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-01-02,8.125,-0.015385,0.009846,-0.009231,0.007385,0.056000,0,0,0,0,1
2019-01-03,8.000,0.025625,0.006250,0.023125,0.072500,0.066875,0,0,0,1,1
2019-01-04,8.205,-0.018891,-0.002437,0.045704,0.040219,0.033516,0,0,1,1,0
2019-01-07,8.050,0.016770,0.065838,0.060248,0.053416,0.030435,0,1,1,1,0
2019-01-08,8.185,0.048259,0.042761,0.036041,0.013439,0.015272,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2019-12-23,10.110,0.000989,0.000989,0.018793,0.007913,0.000000,0,0,0,0,0
2019-12-24,10.120,0.000000,0.017787,0.006917,0.000000,0.000000,0,0,0,0,0
2019-12-25,10.120,0.017787,0.006917,0.000000,0.000000,0.000000,0,0,0,0,0
2019-12-27,10.300,-0.010680,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0
