# Labeling

This notebook includes a script for labeling candlestick charts, which are randomly chosen from daily NASDAQ data.

Load necessary libraries

In [None]:
!pip install mpl_finance

In [None]:
import os
import pickle
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_finance import candlestick2_ohlc
warnings.filterwarnings('ignore')

Load stock data from Yahoo Finance

In [None]:
# Dates (first column of date.csv) and the positions of the symbols to keep
date = pd.read_csv('Data_stock_yahoo/date.csv', header=None)
aux = date.iloc[:, 0].values
validSymbols = pd.read_csv('Data_stock_yahoo/selectedSymbols.csv', header=None)
# First row of the file minus one; presumably the file stores 1-based
# column numbers and this shifts them to 0-based positions -- TODO confirm
validCols = validSymbols.iloc[0].values - 1
# Close/open/low/high/volume frames, each restricted to the selected columns
close_quotes, open_quotes, low_quotes, high_quotes, volume_quotes = [
    pd.read_csv(csv_path, header=None, usecols=validCols)
    for csv_path in (
        'Data_stock_yahoo/close.csv',
        'Data_stock_yahoo/open.csv',
        'Data_stock_yahoo/low.csv',
        'Data_stock_yahoo/high.csv',
        'Data_stock_yahoo/volume.csv',
    )
]

In [None]:
# Relabel the quote frames: columns become ticker symbols, rows become dates.
ticker = pd.read_csv('Data_stock_yahoo/ticker.csv', header=None)
# read_csv(usecols=...) ignores the order of the requested positions and
# returns the columns in file order without duplicates. Looking the tickers up
# in ascending, de-duplicated position order keeps every name attached to the
# column it belongs to even if selectedSymbols.csv is not sorted.
valid_stock_tickers = ticker.loc[np.unique(validCols), 0].values
for quotes in (close_quotes, open_quotes, high_quotes, low_quotes, volume_quotes):
    quotes.columns = valid_stock_tickers  # column names --> stock ticker names
    quotes.index = aux                    # row index --> date index

In [None]:
# Sanity check: display the (rows, columns) shape of the volume frame.
# Rows are indexed by date and columns by ticker after the renaming cell.
volume_quotes.shape

#### Labeling loop
The following code is a loop that:
1. Picks a random stock and a random position in its price series to create a 40-day window.
2. Draws the candlestick chart 
3. Asks the user to enter a number to label the chart.

First choose the size of the window and the number of examples you want to generate.  
You have the option to discard an example if you do not want to include it (e.g. possibly ambiguous).


Parameters

In [None]:
window = 40  # number of consecutive rows (trading days) shown in each chart
n = 300  # number of examples to go through in the labeling loop (earlier note read "300.000" -- meaning unclear, TODO confirm)

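Labeling loop (a `while` loop that runs until `n` examples have been recorded; windows containing missing close prices are drawn again)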

In [None]:
def _parse_label(raw):
    """Convert the text typed by the user into a label.

    Returns 0, 1 or 2 when ``raw`` (ignoring surrounding whitespace) is exactly
    one of those digits, and ``None`` for any other input.
    """
    text = raw.strip()
    return int(text) if text in ('0', '1', '2') else None


etiquetas = np.zeros(n)  # label entered for each example
idfechas = np.zeros(n)   # first row position of each sampled window
idstocks = np.zeros(n)   # column position of each sampled stock
i = 0                    # number of examples recorded so far
while i < n:
    idx_stock = np.random.randint(0, close_quotes.shape[1])
    # randint's upper bound is exclusive, so the +1 keeps the last full
    # window (starting at rows - window) reachable.
    idx = np.random.randint(0, close_quotes.shape[0] - window + 1)
    windata_close = close_quotes.iloc[idx:idx+window, idx_stock]
    if windata_close.isna().any():
        continue  # window has missing close prices: draw another one
    # Save case info
    idfechas[i] = idx
    idstocks[i] = idx_stock
    windata_open = open_quotes.iloc[idx:idx+window, idx_stock]
    windata_high = high_quotes.iloc[idx:idx+window, idx_stock]
    windata_low = low_quotes.iloc[idx:idx+window, idx_stock]
    fig, ax = plt.subplots(figsize=(6,4))
    candlestick2_ohlc(ax, windata_open, windata_high,
                      windata_low, windata_close,
                      colorup='g', colordown='r', width=0.66, alpha=0.8)
    plt.show()
    plt.close(fig)  # release the figure so hundreds of charts do not pile up
    print(f'Ejemplo {i + 1} de {n}')
    print("Etiquete la siguiete grafica como:")
    print(" [1] Doble suelo, [0] Sin doble suelo, [2] Descartar ejemplo")
    opcion_usuario = input()
    # Only the exact answers 0, 1 and 2 are accepted; a plain string
    # comparison would also let through inputs such as '10' or '1abc'.
    label = _parse_label(opcion_usuario)
    if label is None:
        print('Error en la opción introducida. Ejemplo descartado')
        label = 2
    etiquetas[i] = label
    i += 1


Save info

In [None]:
# Collect the labeling results into one table with columns
# Date, DateIndex, Ticker and Label (one row per example).
# The builtin int replaces np.int, which was removed in NumPy 1.24.
date_positions = idfechas.astype(int)
doble_suelo_labels = pd.DataFrame({
    'Date': aux[date_positions],       # date at the first row of the window
    'DateIndex': date_positions,       # row position of that date
    'Ticker': idstocks.astype(int),    # column position of the stock
    'Label': etiquetas,                # value entered by the user
})

In [None]:
# Display the assembled label table for a visual check
doble_suelo_labels

Discarded examples (label 2) are removed from the table.

In [None]:
# Keep every row whose Label is not 2 (2 is the "discard" option)
doble_suelo_labels = doble_suelo_labels.loc[doble_suelo_labels['Label'].ne(2)]

In [None]:
# Count how many examples carry each label, to see how unbalanced the classes are
doble_suelo_labels['Label'].value_counts()

Save the CSV file for further processing

In [None]:
# Write the labeled examples to CSV. The output folder is created first so the
# manual labeling work is not lost to a missing-directory error.
os.makedirs('Data_labeled', exist_ok=True)
doble_suelo_labels.to_csv('Data_labeled/ejemplo25_doble_suelo.csv')