In [2]:
import pandas as pd

#modelos de regresion lineal de sklearn y regresion de SVM
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR

In [14]:
import pymongo
import logging

In [109]:
import os

In [3]:
def read_stock_csv(ticker, file_dir):
	"""Load the CSV of one ticker and index it by its 'Date' column.

	Parameters
	----------
	ticker : str
		Ticker symbol; the file read is ``<file_dir>/<ticker>.csv``.
	file_dir : str
		Directory that holds the per-ticker CSV files.

	Returns
	-------
	pandas.DataFrame
		The CSV contents with 'Date' as index (no date parsing is requested).
	"""
	csv_path = '{}/{}.csv'.format(file_dir, ticker)
	return pd.read_csv(csv_path).set_index('Date')

In [4]:
def get_adj_close_df(ticker, data_dir):
	"""Read a ticker's CSV and drop the OHLC and volume columns.

	Parameters
	----------
	ticker : str
		Ticker symbol, forwarded to ``read_stock_csv``.
	data_dir : str
		Directory containing the ticker CSV, forwarded to ``read_stock_csv``.

	Returns
	-------
	pandas.DataFrame
		Frame indexed by 'Date' without the 'Open', 'Close', 'Volume',
		'High' and 'Low' columns.

	Raises
	------
	KeyError
		If any of the columns to drop is missing from the CSV.
	"""
	df = read_stock_csv(ticker, data_dir)
	# Use the `columns=` keyword: passing the axis positionally
	# (`drop(labels, 1)`) was deprecated and is rejected by pandas >= 2.0.
	return df.drop(columns=['Open', 'Close', 'Volume', 'High', 'Low'])

In [5]:
def modelo_fit_predict_and_score(modelo, x_train, y_train, x_test, y_test):
	"""Fit a model, predict on the test inputs and score it on the test set.

	Parameters
	----------
	modelo : object
		Estimator exposing ``fit``, ``predict`` and ``score``.
	x_train, y_train
		Training inputs and targets passed to ``modelo.fit``.
	x_test, y_test
		Test inputs and targets passed to ``modelo.predict`` / ``modelo.score``.

	Returns
	-------
	tuple
		``(score, predictions)``: the value of ``modelo.score(x_test, y_test)``
		followed by the output of ``modelo.predict(x_test)``.
	"""
	# Train on the training split.
	modelo.fit(x_train, y_train)
	# Predict first, then score, both on the test split.
	predicciones = modelo.predict(x_test)
	return modelo.score(x_test, y_test), predicciones

In [6]:
def get_tickers_to_process(*tickers, ticker_col = False):
	"""Return the list of tickers to process.

	Parameters
	----------
	*tickers : str
		Tickers given explicitly; used only when ``ticker_col`` is falsy.
	ticker_col : str or bool, keyword-only
		When truthy, the value looked up under the 'ticker_market' key in the
		'tickers_list' collection of the 'stock_metadata' MongoDB database;
		the 'ticker_list' field of the matching document is returned.

	Returns
	-------
	list
		The tickers passed positionally, or the list stored in MongoDB.
	"""
	if not ticker_col:
		return list(tickers)
	metadata_store = Mongo_stock_metadata('stock_metadata', 'tickers_list')
	documento = metadata_store.read_by_key_from_mongodb('ticker_market', ticker_col)
	return documento['ticker_list']

In [9]:
class Mongo_stock_metadata:
	"""Small helper around one MongoDB collection served at localhost:27017."""

	def __init__(self, bbdd, coleccion):
		"""Open a client and bind to ``bbdd[coleccion]``.

		Parameters
		----------
		bbdd : str
			Database name.
		coleccion : str
			Collection name inside that database.
		"""
		self.client = pymongo.MongoClient('localhost', 27017)
		self.bbdd = bbdd
		# Take the root logger directly instead of relying on a module-level
		# `logger` variable that is only defined in a different cell.
		self.logger = logging.getLogger()
		self.db = self.client[self.bbdd]
		self.coleccion = self.db[coleccion]

	def insert_in_mongodb(self, dict_key_value):
		"""Insert one document (dict) or several documents (list of dicts).

		An empty dict/list (or None) is not inserted; a warning is logged.
		"""
		self.logger.info('Insertando datos en la coleccion de MongoDB')
		if not dict_key_value:
			self.logger.warning('El documento a insertar esta vacio')
			return
		# isinstance check: the previous `type(x) == 'list'` compared a type
		# with a string, was always False, and sent lists to insert_one.
		if isinstance(dict_key_value, list):
			self.coleccion.insert_many(dict_key_value)
		else:
			self.coleccion.insert_one(dict_key_value)

	def read_by_key_from_mongodb(self, key, value):
		"""Return the first document where ``key == value`` (``find_one`` result)."""
		self.logger.info('Leyendo datos en la coleccion de MongoDB')
		return self.coleccion.find_one({key: value})

	def read_all_from_mongodb(self):
		"""Return the result of ``find()`` over the whole collection."""
		self.logger.info('Leyendo datos en la coleccion de MongoDB')
		return self.coleccion.find()

	def read_doc_keys_from_collection(self):
		"""Return a list with the ``keys()`` view of every document."""
		self.logger.info('Leyendo datos en la coleccion de MongoDB')
		return [document.keys() for document in self.coleccion.find()]

	def query_collections(self, my_query):
		"""Return the result of ``find(my_query)`` on the collection."""
		self.logger.info('Leyendo query en la coleccion de MongoDB')
		return self.coleccion.find(my_query)

	def count_elements_collection(self):
		"""Return the number of documents in the collection."""
		# Cursor.count() was removed in PyMongo 4; count_documents is the
		# supported replacement (available since PyMongo 3.7).
		return self.coleccion.count_documents({})

	def delete_collection(self):
		"""Drop the collection and return whatever ``drop()`` returns."""
		return self.coleccion.drop()

In [15]:
# Layout of each log line (timestamp, level, message) and of its timestamp.
LOG_FORMAT = '%(asctime)s %(levelname)s - %(message)s'
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'

# Send INFO-and-above records from the root logger to 'scrapingstock.log';
# filemode 'w' opens the file in write mode when the configuration is applied.
logging.basicConfig(
	filename='scrapingstock.log',
	filemode='w',
	level=logging.INFO,
	format=LOG_FORMAT,
	datefmt=DATETIME_FORMAT,
)
logger = logging.getLogger()

In [16]:
def logger_for_my_stock_app():
	"""Configure root logging to 'scrapingstock.log' and return the root logger.

	Uses the module-level ``LOG_FORMAT`` and ``DATETIME_FORMAT`` constants,
	INFO level and file mode 'w'.

	Returns
	-------
	logging.Logger
		The root logger.
	"""
	logging.basicConfig(
		filename='scrapingstock.log',
		filemode='w',
		level=logging.INFO,
		format=LOG_FORMAT,
		datefmt=DATETIME_FORMAT,
	)
	return logging.getLogger()

In [23]:
# Period tags of the train and test datasets (they look like
# YYYYMM_YYYYMM ranges); each tag is the suffix of its data directory name.
annomes_procesar_train = '200001_201812'  # train
annomes_procesar_test = '201901_201912'  # test
data_dir_train = 'historic_stock_data_' + annomes_procesar_train
data_dir_test = 'historic_stock_data_' + annomes_procesar_test

In [17]:
# Load the ticker list stored in MongoDB under ticker_market == 'tickers_ibex35'
# (needs a MongoDB server on localhost:27017 and the `logger` global defined).
tickers_list = get_tickers_to_process(ticker_col='tickers_ibex35')

In [19]:
# Override the ticker list with a single ticker; any value assigned to
# `tickers_list` before this cell is discarded.
tickers_list = ['IDR.MC']

In [20]:
# Number of rows 'Adj Close' is shifted back to build the 'prediccion_n_dias'
# target, i.e. the prediction horizon in rows of the price series.
n_dias_pred = 15

In [112]:
# Output directory (relative path) where the prediction CSV files are written.
data_pred_dir = 'datos_y_funciones/predicted_data'

In [21]:
# Ticker analysed in the cells that follow; also the CSV file stem that is read.
ticker='IDR.MC'

In [79]:
# Load the train and test price frames for the selected ticker, each from its
# own period directory, with the OHLC/volume columns already dropped.
df_train = get_adj_close_df(ticker, data_dir_train)
df_test  = get_adj_close_df(ticker, data_dir_test)

In [80]:
# Target column: the 'Adj Close' value found n_dias_pred rows later.
# The last n_dias_pred rows have no later value and therefore become NaN.
df_train['prediccion_n_dias'] = df_train['Adj Close'].shift(-n_dias_pred)
df_test['prediccion_n_dias'] = df_test['Adj Close'].shift(-n_dias_pred)

In [81]:
	# Remove, in place, the rows whose target is NaN (the tail rows left
	# without a shifted value).
	df_train.dropna(subset=['prediccion_n_dias'], inplace=True)
	df_test.dropna(subset=['prediccion_n_dias'], inplace=True)

In [82]:
	# Features are every column except the target; the target is every column
	# except 'Adj Close'. Both stay DataFrames (2-D). `columns=` is used because
	# passing the axis positionally (`drop(labels, 1)`) is rejected by pandas >= 2.0.
	x_train = df_train.drop(columns=['prediccion_n_dias'])
	y_train = df_train.drop(columns=['Adj Close'])
	x_test = df_test.drop(columns=['prediccion_n_dias'])
	y_test = df_test.drop(columns=['Adj Close'])

In [85]:
	# Convert the feature/target frames to NumPy arrays, reusing the same names.
	# NOTE: not idempotent - running this cell twice fails because NumPy arrays
	# have no `to_numpy` method.
	x_train = x_train.to_numpy()
	y_train = y_train.to_numpy()
	x_test = x_test.to_numpy() 
	y_test = y_test.to_numpy() 


In [87]:
# Ordinary linear regression estimator with scikit-learn's default settings.
regresion_lineal = LinearRegression()

In [88]:
# Fit on the train split, then predict and score on the test split.
accuracy_regresion, y_pred_regresion = modelo_fit_predict_and_score(regresion_lineal, x_train, y_train, x_test, y_test)

In [89]:
# Value returned by `modelo.score` on the test split.
# NOTE(review): for scikit-learn regressors `score` is documented as R^2, not a
# classification accuracy - confirm and consider renaming the variable.
accuracy_regresion

0.5010365939534506

In [93]:
# Sanity check: number of predictions produced for the test split.
len(y_pred_regresion)

240

In [94]:
# Non-null count per column of the test frame, to compare with the number of predictions.
df_test.count()

Adj Close            240
prediccion_n_dias    240
dtype: int64

In [95]:
# Put the predictions next to the test rows. The prediction frame is given
# df_test's own index: with its default 0..n-1 index, concat(axis=1) would
# align on index labels and return NaN-padded, unmatched rows instead.
result = pd.concat([df_test, pd.DataFrame(y_pred_regresion, index=df_test.index)], axis=1)

In [101]:
# Predictions as a DataFrame with a default integer index and a column named 0.
y_df = pd.DataFrame(y_pred_regresion)

In [103]:
# Move 'Date' from the index to a column so the test frame gets a 0..n-1 index.
y_df_test = df_test.reset_index()

In [106]:
# Join on the integer index: row i of the test frame is paired with prediction i.
result = y_df_test.join(y_df)

In [107]:
# Display the joined frame (test rows plus the prediction column named 0).
result

Unnamed: 0,Date,Adj Close,prediccion_n_dias,0
0,2019-01-02,8.125,8.89,8.213349
1,2019-01-03,8.000,9.00,8.093514
2,2019-01-04,8.205,9.12,8.290044
3,2019-01-07,8.050,9.06,8.141448
4,2019-01-08,8.185,9.03,8.270871
...,...,...,...,...
235,2019-12-02,9.350,10.11,9.387733
236,2019-12-03,9.210,10.12,9.253517
237,2019-12-04,9.325,10.12,9.363765
238,2019-12-05,9.185,10.30,9.229551


In [121]:
# Restore 'Date' as the index, in place.
# NOTE: not idempotent - a second run raises KeyError because the 'Date'
# column no longer exists after the first run.
result.set_index('Date', inplace=True)

In [122]:
# Display the frame again, now indexed by 'Date'.
result

Unnamed: 0_level_0,Adj Close,prediccion_n_dias,0
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-02,8.125,8.89,8.213349
2019-01-03,8.000,9.00,8.093514
2019-01-04,8.205,9.12,8.290044
2019-01-07,8.050,9.06,8.141448
2019-01-08,8.185,9.03,8.270871
...,...,...,...
2019-12-02,9.350,10.11,9.387733
2019-12-03,9.210,10.12,9.253517
2019-12-04,9.325,10.12,9.363765
2019-12-05,9.185,10.30,9.229551


In [108]:
# Display the raw prediction array (one single-element row per test sample).
y_pred_regresion

array([[ 8.21334933],
       [ 8.09351429],
       [ 8.29004368],
       [ 8.14144849],
       [ 8.27087055],
       [ 8.64954879],
       [ 8.6064081 ],
       [ 8.55368039],
       [ 8.37632505],
       [ 8.39070559],
       [ 8.57285444],
       [ 8.64475528],
       [ 8.84128466],
       [ 8.86525222],
       [ 8.82690504],
       [ 8.94674008],
       [ 9.05219458],
       [ 9.16723611],
       [ 9.1097158 ],
       [ 9.08095474],
       [ 9.02822794],
       [ 9.03302054],
       [ 8.95632619],
       [ 9.08574825],
       [ 9.26789801],
       [ 9.22475733],
       [ 9.17202962],
       [ 9.07616122],
       [ 9.18641015],
       [ 9.20558328],
       [ 9.34938588],
       [ 9.51715419],
       [ 9.56508839],
       [ 9.57946893],
       [ 9.58426244],
       [ 9.66095679],
       [ 9.53153473],
       [ 9.819139  ],
       [ 9.67533641],
       [ 9.59384854],
       [ 9.56029488],
       [ 9.54591526],
       [ 9.88145281],
       [10.04922205],
       [ 9.86227968],
       [ 9

In [123]:
def join_test_df_with_pred_array_to_csv(y_pred, df_test, ticker, data_pred_dir, modelo):
	"""Attach the predictions to the test frame and save the result as CSV.

	Parameters
	----------
	y_pred : array-like
		Model predictions, one row per row of ``df_test``; stored in a column
		named 'prediccion_modelo'.
	df_test : pandas.DataFrame
		Test frame whose index is named 'Date'.
	ticker : str
		Ticker symbol, first part of the output file name.
	data_pred_dir : str
		Output directory; created (with parents) when it does not exist.
	modelo : str
		Model label, second part of the output file name.

	Returns
	-------
	None
		The frame is written to ``<data_pred_dir>/<ticker>_<modelo>.csv``.
	"""
	# Predictions as a one-column frame with a default 0..n-1 index.
	pred_df = pd.DataFrame(y_pred, columns=['prediccion_modelo'])
	# Reset the test index so both frames are joined positionally (row i <-> row i).
	resultado_df = df_test.reset_index().join(pred_df)
	# Put 'Date' back as the index.
	resultado_df = resultado_df.set_index('Date')
	# exist_ok=True avoids the race between checking for the directory and creating it.
	os.makedirs(data_pred_dir, exist_ok=True)
	nombre_fichero = '{}_{}.csv'.format(ticker, modelo)
	resultado_df.to_csv(os.path.join(data_pred_dir, nombre_fichero))

In [124]:
# Save the test rows plus the linear-regression predictions to
# <data_pred_dir>/<ticker>_<model label>.csv.
# NOTE(review): the label 'regresion_linal' looks like a typo of
# 'regresion_lineal'; it becomes part of the file name, so check what reads
# that file before renaming it.
join_test_df_with_pred_array_to_csv(y_pred_regresion, df_test, ticker, data_pred_dir, 'regresion_linal')

In [125]:
# Inspect the type of the score returned by the model.
type(accuracy_regresion)

numpy.float64

In [126]:
# Display the test-split score of the linear regression once more.
accuracy_regresion

0.5010365939534506