# Transfer Learning & Self-Training

1.   The main objective of this notebook is to develop the semi-supervised learning task. For that purpose, a classifier is first built on top of the ResNet network; this classifier is then used as the base model of the Self-Training algorithm
2.  The generated data is available in the Data_stock_transfer_learning folder
3.  The model generated using the ResNet network was saved and it can also be found on that folder

Therefore, one can skip all the subsequent steps and go directly to the Self-Training section



Load necessary libraries

In [None]:
!pip install mpl_finance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from mpl_finance import candlestick2_ohlc
import warnings
warnings.filterwarnings('ignore')
from tensorflow.keras.utils import to_categorical
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

### DATA PREPARATION

Load stock data

In [None]:
def load_data_stock():
    '''
    Loads stock data previously gathered from Yahoo Finance
    Every table is read from the Data_stock_yahoo folder without a header row.
    Only the columns listed in selectedSymbols.csv (1-based there, shifted to
    0-based here) are kept; they are renamed with the matching entries of
    ticker.csv and the rows are indexed with the values of date.csv.
    # Args: None
    # Returns: close, open, high, low and volume quotes
    '''
    def _read(name, **kwargs):
        # All source files share the same folder and have no header row
        return pd.read_csv('Data_stock_yahoo/' + name + '.csv', header=None, **kwargs)

    # Row labels: first column of the date file
    date_index = _read('date').iloc[:, 0].values
    # Column positions of the selected symbols, shifted to 0-based
    selected_cols = _read('selectedSymbols').iloc[0, :].values - 1

    # Quote tables restricted to the selected columns
    quotes = {}
    for name in ('close', 'open', 'low', 'high', 'volume'):
        quotes[name] = _read(name, usecols=selected_cols)

    # Col names --> stock ticker names, row index --> date index
    ticker_names = _read('ticker').loc[selected_cols, 0].values
    for table in quotes.values():
        table.columns = ticker_names
        table.index = date_index

    return (quotes['close'], quotes['open'], quotes['high'],
            quotes['low'], quotes['volume'])

Download files that have already been labeled

In [None]:
def download_files(): 
    '''
    Loads the examples that have been already labeled using the Labeling notebook.
    Every regular file found directly inside the Data_labeled folder is read as a
    CSV, keeping only the columns at positions 1 to 4. Files are processed in
    sorted name order so that the row order of the result is reproducible
    (listdir returns entries in arbitrary order).
    # Args: None
    # Return: data labeled using the Labeling notebook, as a single DataFrame with
    #         the rows of all files stacked (each file keeps its own row index)
    # Raises: ValueError (from pd.concat) when the folder contains no files
    '''
    folder = 'Data_labeled'
    file_names = sorted(f for f in listdir(folder) if isfile(join(folder, f)))
    frames = [pd.read_csv(join(folder, f), usecols=[1, 2, 3, 4]) for f in file_names]
    return pd.concat(frames)

In [None]:
# Load every labeled example and display the resulting table
data = download_files()
data

In [None]:
# Load the five quote tables returned by load_data_stock
close_quotes, open_quotes, high_quotes, low_quotes, volume_quotes = load_data_stock()

In [None]:
# Preview the first 10 rows of the close quotes
close_quotes.head(10)

Dataset creation: from the labeled examples, create a dataset in which each row is a new sample: a 40-day sliding window starting at a specific date for a specific stock

In [None]:
def get_hist_window(hist, idx_date, idx_stock, window_size):
    '''
    Extracts a window of consecutive rows of one column, by position
    # Args:
      hist: table to slice
      idx_date: position of the first row of the window
      idx_stock: position of the column
      window_size: number of rows requested
    # Returns: the selected rows of that column (fewer rows if the table ends first)
    '''
    window_rows = slice(idx_date, idx_date + window_size)
    return hist.iloc[window_rows, idx_stock]

In [None]:
def get_df_sample(data, close_quotes, open_quotes, high_quotes, low_quotes, volume_quotes, sample, window_size=40):
    '''
    Builds the OHLC + volume window of one example of the dataset
    # Args: 
      data: dataset created using the download_files function; its Ticker and DateIndex
            columns are used as the positional column / first row of the window.
      close_quotes, open_quotes, high_quotes, low_quotes, volume_quotes:
            quote tables (rows are dates, columns are stocks), e.g. from load_data_stock.
      sample: number of example from the dataset to be analyzed.
      window_size: number of consecutive rows in the window (default 40).
    # Returns:
      df: a dataframe with columns open_price, close_price, high, low and volume for
          window_size days of a specific stock starting on a specific date. It has
          fewer rows when the window runs past the end of the quote tables.
    '''
    idx_stock = data.Ticker.iloc[sample]
    idx_date = data.DateIndex.iloc[sample]
    # Order matters: it must match the column names assigned below
    ordered_quotes = (open_quotes, close_quotes, high_quotes, low_quotes, volume_quotes)
    windows = [get_hist_window(quotes, idx_date, idx_stock, window_size)
               for quotes in ordered_quotes]
    df = pd.concat(windows, axis=1)
    df.columns = ['open_price', 'close_price', 'high', 'low', 'volume']
    # NOTE(review): the inner to_datetime parses the index without a format, so the
    # '%d-%m-%Y' passed to the outer call is presumably never applied - confirm the
    # date format of the source files before relying on day/month order.
    df.index = pd.to_datetime(pd.to_datetime(df.index), format='%d-%m-%Y')
    return df
    

In [None]:
# Build the window of the first labeled example (sample 0)
df = get_df_sample(data, close_quotes, open_quotes, high_quotes, low_quotes, volume_quotes, 0)

In [None]:
# Display the window built for the example
df

### FIGURES GENERATOR

Generate figures to train the ResNet Model

In [None]:
# Try one example: draw the candlestick chart of the window currently held in df
from PIL import Image, ImageChops

fig2save, ax = plt.subplots(figsize=(6,4))
_ = candlestick2_ohlc(ax, df.open_price, df.high,
                             df.low , df.close_price,
                             colorup='g', colordown='r', width=0.66, alpha=1)
# Hide the axes so that only the candles remain in the picture
_ = plt.axis('off')


Generate the figures for the examples that have already been labeled and save them into separate folders

In [None]:
# Render one candlestick figure per labeled example and store it in the folder
# of its class (Figures_labeled/True for label 1, Figures_labeled/False otherwise)
for i in range(data.shape[0]):
    df = get_df_sample(data, close_quotes, open_quotes, high_quotes, low_quotes, volume_quotes, i)
    fig2save, ax = plt.subplots(figsize=(6,4))
    _ = candlestick2_ohlc(ax, df.open_price, df.high,
                                df.low , df.close_price,
                                colorup='g', colordown='r', width=0.66, alpha=1)
    _ = plt.axis('off')
    label_folder = 'True' if data.Label.iloc[i] == 1 else 'False'
    filename = "Figures_labeled/%s/ej_%i.jpg" % (label_folder, i)
    fig2save.savefig(filename)
    # Release the figure: pyplot keeps every figure alive until it is closed,
    # so memory would otherwise grow with the number of examples
    plt.close(fig2save)


Figure preparation: trim and center the images to help the classifier identify the patterns

In [None]:
def trim(im):
    '''
    Trim the uniform border around a specific image.
    The colour of the top-left pixel is taken as the background. The difference
    between the image and a canvas filled with that colour is computed, small
    differences are suppressed, and the image is cropped to the bounding box
    of what remains.
    # Args: the image to be cropped
    # Returns: the cropped image, or the image unchanged when nothing differs
    #          enough from the background to define a bounding box
    '''
    bg = Image.new(im.mode, im.size, im.getpixel((0,0)))
    diff = ImageChops.difference(im, bg)
    # (diff + diff) / 2.0 - 100: differences below 100 are clipped to zero
    diff = ImageChops.add(diff, diff, 2.0, -100)
    bbox = diff.getbbox()
    if bbox:
        return im.crop(bbox)
    # No content detected: return the input instead of None so that callers
    # can keep calling .save() / .resize() on the result
    return im

In [None]:
def trimFiles(label): 
  '''
  Trim all the images generated before.
  Images are read from Figures_labeled/True (label 1) or Figures_labeled/False
  (any other label), i.e. the folders where the labeled figures are saved, and
  the trimmed copies are written to the matching *_truncated folder as
  ej_0.jpg, ej_1.jpg, ... following the sorted order of the source file names.
  # Arg: the class label of the images to be cropped.
  # Returns: None
  '''
  class_name = 'True' if label == 1 else 'False'
  source_dir = 'Figures_labeled/%s' % class_name
  # Sorted so that the numbering of the output files is reproducible
  file_names = sorted(f for f in listdir(source_dir) if isfile(join(source_dir, f)))
  for i, file_ in enumerate(file_names):
    image = Image.open(join(source_dir, file_))
    trimmed = trim(image)
    if trimmed is None:
      # trim found nothing to crop: keep the original picture
      trimmed = image
    filename = "Figures_labeled/%s_truncated/ej_%i.jpg" % (class_name, i)
    trimmed.save(filename)

In [None]:
# Trim all files: first the images of class 1, then the images of the other class
trimFiles(1)
trimFiles(0)

Convert images to numpy arrays

In [None]:
# Try one example to get dimensions
# List the trimmed images of the True class and open the first one returned
onlyfiles = [f for f in listdir('Figures_labeled/True_truncated') if isfile(join('Figures_labeled/True_truncated', f))]
dir = 'Figures_labeled/True_truncated/' + onlyfiles[0]
image = Image.open(dir)
# Resize to 230 x 230 with bilinear interpolation
image = image.resize((230, 230), Image.BILINEAR) 
image_array = np.array(image)
# Shape of the array obtained from one resized image
sizes = image_array.shape
sizes

Images are resized and then converted into numpy arrays.

In [None]:
def convert_images2array(dir): 
    '''
    Loads every image of a folder, resizes it to 230 x 230 and stacks the results
    # Args: directory of the folder
    # Return: np array with one entry per image file found in the folder
    '''
    arrays = []
    for name in listdir(dir):
        path = join(dir, name)
        if not isfile(path):
            # Sub-folders and other non-file entries are ignored
            continue
        picture = Image.open(path).resize((230, 230), Image.BILINEAR)  # square filters
        arrays.append(np.array(picture))
    return np.array(arrays)

In [None]:
# Arrays of the trimmed images of the True class
x_true = convert_images2array('Figures_labeled/True_truncated')

In [None]:
# Arrays of the trimmed images of the False class
x_false = convert_images2array('Figures_labeled/False_truncated')

In [None]:
# Add label: a column vector holding 1 for every True image followed by 0 for
# every False image, in the same order in which the images are stacked in X
n_true = len(x_true)
n_false = len(x_false)
y = np.concatenate([np.ones((n_true, 1)), np.zeros((n_false, 1))], axis=0)
X = np.vstack([x_true, x_false])
print('X shape:',X.shape)
print('y shape:',y.shape)

### TRAIN RESNET MODEL

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

Train and test split

In [None]:
# Split stratified on y, so both parts keep the class proportions
x_train, x_test, y_train, y_test = train_test_split(X,y, stratify=y)
# One-hot encode the labels of both parts
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

Run the cells below if you want to load the already processed data instead

In [None]:
import pickle

In [None]:
# Load data
# Each split is stored in its own pickle file. The with-blocks close the files
# even when loading fails.
# NOTE: pickle can execute arbitrary code while loading - only open files you trust.
with open('Data_stock_transfer_learning/y_train.pckl', 'rb') as f:
    y_train = pickle.load(f)

with open('Data_stock_transfer_learning/y_test.pckl', 'rb') as f:
    y_test = pickle.load(f)

with open('Data_stock_transfer_learning/x_train.pckl', 'rb') as f:
    x_train = pickle.load(f)

with open('Data_stock_transfer_learning/x_test.pckl', 'rb') as f:
    x_test = pickle.load(f)

Transfer learning model construction

In [None]:
from tensorflow.keras import layers, models, Model, losses

In [None]:
from tensorflow.keras.applications import ResNet152 
from tensorflow import keras

In [None]:
# Shape of a single training image, used as the input shape of the network
sizes = x_train[0].shape
sizes

In [None]:
# ResNet152 with ImageNet weights and without its top classification layers
# (the next cell builds the same model again before freezing it)
base_model = ResNet152(weights='imagenet', include_top=False, input_shape=sizes, classes=2)

In [None]:
# ResNet152 with ImageNet weights and without its top classification layers
base_model = ResNet152(weights='imagenet', include_top=False, input_shape=sizes, classes=2)
# Mark every layer of the base network as non-trainable
for layer in base_model.layers:
  layer.trainable = False


In [None]:
# Build own classifier
# Flatten the output of the base network, then a 10-unit ReLU layer and a
# 2-unit softmax output
x = layers.Flatten()(base_model.output)
x = layers.Dense(10, activation='relu')(x)
predictions = layers.Dense(2, activation = 'softmax')(x)

In [None]:
# Full model: input of the base network --> output of the new classifier
head_model = Model(inputs = base_model.input, outputs = predictions)
# Adam optimizer, categorical cross-entropy loss, accuracy reported as metric
head_model.compile(optimizer='adam', loss=losses.categorical_crossentropy, metrics=['accuracy'])

Train the model

In [None]:
# 100 epochs with batches of 128; note that the test split is used as validation data
history = head_model.fit(x_train, y_train, batch_size=128, epochs=100, validation_data=(x_test, y_test))

In [None]:
# Save trained model (HDF5 file; the Self-Training section loads it from this path)
head_model.save('Data_stock_transfer_learning/transfer_learning_model.h5')

In [None]:
def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    This function will make a pretty plot of an sklearn Confusion Matrix cm using a Seaborn heatmap visualization.
    Arguments
    ---------
    cf:            confusion matrix to be passed in (2D numpy array)
    group_names:   List of strings that represent the labels row by row to be shown in each square.
                   Ignored unless it has exactly one entry per cell of cf.
    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'
    count:         If True, show the raw number in the confusion matrix. Default is True.
    percent:       If True, show each cell as a proportion of the total count. Default is True.
    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.
    xyticks:       If True, show x and y ticks. Default is True.
    xyplotlabels:  If True, show 'True Label' and 'Predicted Label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure. Default is True.
                   For a 2x2 matrix precision, recall and F1 are shown besides accuracy; each of
                   them is reported as 0 when its denominator is 0.
    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.
    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html
                   
    title:         Title for the heatmap. Default is None.
    '''
    # Imported here so the function does not depend on a later cell having run
    import seaborn as sns

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = ['' for _ in range(cf.size)]

    if group_names and len(group_names)==cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(value) for value in cf.flatten()/np.sum(cf)]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels,group_counts,group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0],cf.shape[1])


    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        #Accuracy is sum of diagonal divided by total observations
        accuracy  = np.trace(cf) / float(np.sum(cf))

        #if it is a binary confusion matrix, show some more stats
        if len(cf)==2:
            #Metrics for Binary Confusion Matrices
            #Guard the divisions: a class that is never predicted / never present
            #would otherwise produce nan and a runtime warning
            predicted_positive = np.sum(cf[:,1])
            actual_positive = np.sum(cf[1,:])
            precision = cf[1,1] / predicted_positive if predicted_positive else 0.0
            recall    = cf[1,1] / actual_positive if actual_positive else 0.0
            pr_sum = precision + recall
            f1_score  = 2*precision*recall / pr_sum if pr_sum else 0.0
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy,precision,recall,f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""


    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        #Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        #Do not show categories if xyticks is False
        categories=False


    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf,annot=box_labels,fmt="",cmap=cmap,cbar=cbar,xticklabels=categories,yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)
    
    if title:
        plt.title(title)

 Visualize the results

In [None]:
import seaborn as sns

In [None]:
from sklearn.metrics import confusion_matrix
# Class probabilities predicted by the trained model for the test images
y_pred = head_model.predict(x_test)
# True class of each test image (y_test is one-hot encoded). The predictions
# are made on the test split, so they must be compared with y_test, not y_train.
y_test_aux = pd.DataFrame(y_test).idxmax(axis=1).values
cf_matrix = confusion_matrix(y_test_aux, np.argmax(y_pred, axis=1))
labels = ['True Neg','False Pos','False Neg','True Pos']
categories = ['0', '1']
make_confusion_matrix(cf_matrix, 
                      group_names=labels,
                      categories=categories)

In [None]:
from sklearn import metrics

# True class of each test image (y_test is one-hot encoded)
y_true_roc = np.argmax(y_test, axis=1)
# Predicted probability of class 1. The ROC curve needs a continuous score;
# hard argmax labels would collapse it to a single operating point.
y_score_roc = head_model.predict(x_test)[:, 1]
auc = metrics.roc_auc_score(y_true_roc, y_score_roc)
fpr, tpr, _ = metrics.roc_curve(y_true_roc, y_score_roc)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

# Self-Training algorithm

Import Transfer-Learning model

In [None]:
 # Load the model from the path where the transfer-learning section saves it
 model = tf.keras.models.load_model('Data_stock_transfer_learning/transfer_learning_model.h5')

Load data used in the previous model

In [None]:
# Each split is stored in its own pickle file. The with-blocks close the files
# even when loading fails.
# NOTE: pickle can execute arbitrary code while loading - only open files you trust.
with open('Data_stock_transfer_learning/y_train.pckl', 'rb') as f:
    y_train = pickle.load(f)

with open('Data_stock_transfer_learning/y_test.pckl', 'rb') as f:
    y_test = pickle.load(f)

with open('Data_stock_transfer_learning/x_train.pckl', 'rb') as f:
    x_train = pickle.load(f)

with open('Data_stock_transfer_learning/x_test.pckl', 'rb') as f:
    x_test = pickle.load(f)

Generate data to label

In [None]:
# Number of random (stock, date) windows to draw
n_samples = 20000 
# Empty table with the same three columns used by the labeled dataset
data_unlabeled = pd.DataFrame(columns=['Ticker', 'DateIndex', 'Label'])

In [None]:
# get_df_sample reads 40 consecutive rows starting at DateIndex, so the start
# position is drawn only among the rows from which a complete window fits
# (a start within the last 39 rows would give a shorter window).
window_size = 40
n_valid_starts = max(close_quotes.shape[0] - window_size + 1, 1)
data_unlabeled.Ticker = np.random.randint(low=0, high=close_quotes.shape[1], size=(n_samples,))
data_unlabeled.DateIndex = np.random.randint(low=0, high=n_valid_starts, size=(n_samples,))
# -1 marks the samples as not labeled yet
data_unlabeled.Label = np.ones((n_samples,))*(-1)

In [None]:
# Preview the first 10 random samples
data_unlabeled.head(10)

In [None]:
from PIL import Image, ImageChops

In [None]:
def fig2img(fig):
    """
    @brief Convert a Matplotlib figure to a PIL Image in RGB format and return it
    @param fig a matplotlib figure
    @return a Python Imaging Library ( PIL ) image in "RGB" mode
    """
    # put the figure pixmap into a numpy array
    buf = fig2data(fig)
    w, h, _ = buf.shape
    # tobytes() replaces the deprecated ndarray.tostring() and yields the same bytes
    return Image.frombytes("RGB", (w, h), buf.tobytes())

In [None]:
def fig2data(fig):
    """
    @brief Convert a Matplotlib figure to a 3D numpy array with RGB channels and return it
    @param fig a matplotlib figure
    @return a uint8 numpy array of RGB values with shape (w, h, 3), where (w, h) is
            what fig.canvas.get_width_height() returns; the bytes are kept in the
            order delivered by the canvas
    """
    # draw the renderer
    canvas = fig.canvas
    canvas.draw()

    # Get the RGB buffer from the figure
    w, h = canvas.get_width_height()
    if hasattr(canvas, 'tostring_rgb'):
        raw = canvas.tostring_rgb()
    else:
        # Canvases without tostring_rgb: drop the alpha channel of the RGBA buffer
        raw = np.asarray(canvas.buffer_rgba())[..., :3].tobytes()

    # frombuffer replaces the deprecated binary mode of np.fromstring. The copy
    # makes the array writable, as the former np.roll result was (that roll
    # shifted a 3-channel axis by 3 positions and therefore changed nothing).
    return np.frombuffer(raw, dtype=np.uint8).reshape(w, h, 3).copy()

For-loop to create the data to label

In [None]:
# NOTE: from here on `data` is the list of image arrays, no longer the labeled table
data = []
# Flag per sample: 1 when an image was generated for it
validCols = np.zeros((data_unlabeled.shape[0]))
# Position of the first sample to process
idx_start = 0
for i in range(idx_start, data_unlabeled.shape[0]):
    df = get_df_sample(data_unlabeled, close_quotes, open_quotes, high_quotes, low_quotes, volume_quotes, i)
    if (df.dropna().shape ==  df.shape): # there is no NaNs
      fig2save, ax = plt.subplots(figsize=(6,4))
      _ = candlestick2_ohlc(ax, df.open_price, df.high,
                                  df.low , df.close_price,
                                  colorup='g', colordown='r', width=0.66, alpha=1)
      _ = plt.axis('off')
      image = trim(fig2img(fig2save))
      # Release the figure: pyplot keeps every figure alive until it is closed,
      # which would exhaust memory over thousands of samples
      plt.close(fig2save)
      image = image.resize((230, 230), Image.BILINEAR) 
      image_array = np.array(image)
      data.append(image_array)
      validCols[i] = 1


In [None]:
# Keep only the samples for which an image was generated (flag set to 1)
data_unlabeled = data_unlabeled[validCols == 1]

In [None]:
# Stack the generated image arrays into a single array
x_unlabeled = np.array(data)

In [None]:
# Add label: a column of -1, one entry per unlabeled image
y = np.ones((len(x_unlabeled),1))*(-1)

Algorithm implementation

In [None]:
# Copy structure
# NOTE: plain assignment binds the same array objects; no data is copied here
x_train_semi = x_train
x_test_semi = x_test
y_train_semi = y_train
y_test_semi = y_test

In [None]:
def re_train_model(x_train, y_train, x_test, y_test):
  '''
  This function retrains the transfer learning model developed before.
  The global `model` is compiled again (adam, categorical cross-entropy, accuracy
  metric) and fitted for 10 epochs with batches of 128, using the test data as
  validation data.
  # Args: xtrain, ytrain, xtest, ytest
  # Returns: the object returned by model.fit; the global model is trained in place
  '''
  # The loss is given by name so that this section does not depend on the
  # `losses` import made in the transfer-learning section
  model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
  return model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_test, y_test))

In [None]:
count = 0
# Self-training: label a pool of unlabeled images with the current model, add
# them to the training set, retrain, and repeat until nothing is left to label
while (len(x_unlabeled)>0):
    print('Still remaining', len(x_unlabeled), 'samples to label...')
    # Pool size: half of the labeled data (train + test) available so far,
    # at least 1 so that the loop always makes progress
    n_pool_to_label = max(int((len(x_train_semi) + len(x_test_semi))/2), 1)
    # Saturation of unlabeled data in the last iterations
    if(len(x_unlabeled) < n_pool_to_label):
        n_pool_to_label = len(x_unlabeled) 
    # Probability of the model
    y_pred = model.predict(x_unlabeled[:n_pool_to_label])
    # Hard pseudo-labels, one-hot encoded. num_classes is fixed to the width of
    # the existing labels: otherwise a pool predicted entirely as class 0 would
    # be encoded with a single column and could not be stacked below.
    y_pred = to_categorical(np.argmax(y_pred, axis=1), num_classes=y_train_semi.shape[1])
    x_train_semi = np.vstack([x_train_semi, x_unlabeled[:n_pool_to_label]])
    y_train_semi = np.vstack([y_train_semi, y_pred])
    # Remove current pool from unlabeled data
    x_unlabeled = x_unlabeled[n_pool_to_label:]
    # Re train the model 
    re_train_model(x_train_semi, y_train_semi, x_test_semi, y_test_semi)