<center><h2>Systematic Factor Evaluation</h2></center>

In [None]:
from IPython.display import HTML
# Emit a script plus a form button that shows/hides every notebook code-input
# area (`div.input`). `code_toggle` is also registered as the document-ready
# handler, and `code_show` starts as true, so the inputs get hidden on load.
# NOTE(review): the script relies on `$` (jQuery) being provided by the
# notebook front end -- confirm for the target environment.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to show/hide the raw code."></form>''')

In [None]:
import alphalens
import sys
import os
import re
import datetime
import getpass
import logging
import sklearn
from sklearn.base import clone
import ipywidgets as widgets
import pandas as pd
from datetime import datetime
import statsmodels.api as sm_api
from sklearn.preprocessing import StandardScaler

%matplotlib inline
%config IPCompleter.greedy=True
%autosave 0

import pandas as pd
import numpy as np
import plotly.graph_objs as go
from dslab.dsdata import MLData
from dslab.dsmodel import MLModel
from ipywidgets import *
from IPython.display import display, clear_output, Image, Javascript, SVG
from plotly.offline import init_notebook_mode, iplot
from plotly.tools import FigureFactory as FF       

# Initialize data and model object instances
# Both objects are module-level state shared by the widget callbacks defined
# in the later cells (they read and update `dsData` / `model_obj` directly).
dsData = MLData(regret=True)
model_obj = MLModel()

# for plotly
# Must run before any `iplot(...)` call so figures render inside the notebook.
init_notebook_mode(connected=True)

In [None]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

In [None]:
import functools
import scipy.interpolate
import itertools
import math
from scipy.optimize import minimize
from scipy.signal import fftconvolve
from scipy.stats import norm
import copy

In [None]:
# display settings

# widgets
# Layout presets shared by the widget rows/columns built throughout the notebook.
# `row_layout` is a ready-made Layout object; `col_layout` is a plain dict that
# callers expand with `Layout(**col_layout)`.
row_layout = Layout(display='flex', flex_flow='row', align_items='center')
col_layout = dict(display='flex', flex_flow='column', justify_content='space-between', width='auto')

def leftm(m=10):
    '''Return a four-value margin string with only the last (left) slot set.

    The first three slots are fixed at ``0px``; ``m`` is the pixel count
    written into the fourth slot (default 10).
    '''
    return '0px 0px 0px ' + format(m) + 'px'

# Sliders controlling how many columns / rows pandas prints.
max_col = IntSlider(value=50, max=100, width='600px')
max_row = IntSlider(value=100, max=250, width='600px')

# Labelled rows for the two sliders; they are only built here, the display
# call below is commented out so the sliders are not shown.
bs_row1 = Box([Label('Max Columns to Display:', layout=Layout(width='200px')),  max_col], layout=row_layout)
bs_row2 = Box([Label('Max Rows to Display:', layout=Layout(width='200px')),  max_row], layout=row_layout)
# display(Box([bs_row1, bs_row2], layout=Layout(**col_layout)))

# function
# Apply the sliders' initial values to the pandas display options right away.
pd.options.display.max_columns = max_col.value
pd.options.display.max_rows = max_row.value

def chg_setting(chg):
    '''Copy the current slider values into the pandas display options.

    Used as an observer callback for `max_row` and `max_col`; the change
    notification `chg` is not inspected, both options are refreshed each time.
    '''
    pd.set_option('display.max_rows', max_row.value)
    pd.set_option('display.max_columns', max_col.value)

# Re-apply the pandas display options whenever either slider's value changes.
max_col.observe(chg_setting, names='value')
max_row.observe(chg_setting, names='value')

## 1. Load Data

In [None]:
# load data
# widgets
# Path input, trigger button and progress bar for the data-loading step.
# The progress bar runs from 0 to 10; `loaddata` sets it to 0, 5 and 10.
filepath = Text(description='File Path:', value='fake_data.csv')
loaddata_button = Button(description="Load Data", button_style='success', margin=leftm(650))
prog = FloatProgress(value=0, min=0, max=10, step=1, description='Progress:',
                     margin=leftm(550), width='250px')
display(filepath, loaddata_button, prog)

# functions
def loaddata(b):
    '''Read the file named in the path box into `dsData` and print a preview.

    Click callback for the "Load Data" button; `b` (the clicked button) is
    not used. Side effects: advances the `prog` bar, clears the cell output,
    displays head / tail / summary statistics, and rebinds the globals
    `all_tickers` (unique values of the 'symbol' column) and `all_factors`
    (every column except 'date', 'sid', 'symbol' and 'Sector').
    '''
    global all_tickers, all_factors

    # Step 1: read the file.
    prog.value = 0
    prog.description = 'Start...'
    source_path = filepath.value
    dsData.read_data(source_path)

    # Step 2: keep a snapshot of the freshly loaded data.
    prog.value = 5
    prog.description = 'In Progress...'
    dsData.save_current()

    # Step 3: the bar is marked done before the categorical inference runs.
    prog.value = 10
    prog.description = 'Done!'
    dsData.infer_categorical()

    clear_output()
    previews = (
        ('Data Head', dsData.head),
        ('Data Tail', dsData.tail),
        ('Summary Statistics', dsData.get_summary),
    )
    for caption, build_preview in previews:
        print('\n', caption)
        display(build_preview())

    loaded = dsData.data
    all_tickers = list(loaded['symbol'].unique())
    non_factor_columns = ['date', 'sid', 'symbol', 'Sector']
    all_factors = [name for name in loaded.columns if name not in non_factor_columns]

# Run `loaddata` each time the "Load Data" button is clicked.
loaddata_button.on_click(loaddata)

## 2. Explore Data

In [None]:
# Select Graph

# widgets
# One toggle button per chart type; the tooltips are listed in the same order
# as the options. The option strings are also the keys of `graph_mapping`.
chart_select = ToggleButtons(
    options=['Coverage', 'Pie Chart', 'Histogram', 'Scatter Plot', 'Correlations'],
    tooltips=['Explore feature coverage', 'Visualize single categorical variable', 'Visualize single numerical variable',
             'Visualize pairwise distributions among a group of numerical variables',
             'Visualize pairwise correlations among a group of numerical variables'],
    value='Correlations', 
    margin=leftm()
)

display(HBox([Label('Graph Type: '), chart_select], layout=Layout(margin='0px 0px 30px 0px')))

# functions
# Currently shown chart panel. It starts as an empty HTML widget so that the
# first `switch_chart` call has an object it can close.
grph = HTML('')

def switch_chart(chg):
    '''Replace the displayed chart panel with the one for the selected graph type.

    Observer callback for `chart_select` (`chg` is unused). Clears the cell
    output, re-displays the graph-type selector, closes the previous panel
    held in the global `grph`, then builds and displays the new panel via
    `graph_mapping`.
    '''
    global grph
    clear_output()
    selector_row = HBox([Label('Graph Type: '), chart_select], layout=Layout(margin='0px 0px 30px 0px'))
    display(selector_row)
    grph.close()
    build_panel = graph_mapping[chart_select.value]
    grph = build_panel()
    display(grph)

# Rebuild the chart panel whenever a different graph type is selected.
chart_select.observe(switch_chart, names='value')

In [None]:
# Coverage check
# widgets
def make_coverage():
    '''Create the widget panel for the feature-coverage view.

    Returns a Box holding a centred header row and a row with the "Coverage"
    button.

    NOTE(review): no click handler is attached to the button here, so pressing
    it does nothing unless a handler is registered elsewhere.
    '''
    coverage_b = Button(description="Coverage", button_style='success', width='70px', height='30px',
                  margin=leftm(20))

    # The header used to read "Pie Chart - ..." (copied from make_pie); it now
    # names this panel, matching the tooltip of the 'Coverage' toggle button.
    row0 = Box([HTML('<h4>Coverage - Explore Feature Coverage</h4>')],
                layout=Layout(display='flex', flex_flow='row', align_items='center',
                              justify_content='center', width='90%'))
    row1 = Box([Label(layout=Layout(width='200px')), coverage_b], layout=row_layout)
    coverage_chart = Box([row0, row1], layout=Layout(**col_layout),
                    height='160px')
    return coverage_chart

In [None]:
# Pie chart
# widgets
def make_pie():
    '''Build the pie-chart control panel and return it as a Box.

    Creates (as globals, because `plot_pie` reads them) the filter text box,
    the column dropdown, the value-limit box, the "group as others" checkbox
    and the Plot button. Plotting is triggered by the button and by changing
    the selected column.
    '''
    global pie_filter, pie_select, pie_limit, group_less_freq, pieb

    def caption(text, width='200px', **extra):
        # Fixed-width label used at the start of each control row.
        return Label(text, layout=Layout(width=width), **extra)

    # The filter box is created (plot_pie reads it) but is not placed in the panel.
    pie_filter = Text(value='', width='500px')
    pie_select = Dropdown(options=dsData.get_header(sort=True), height='30px', width='200px',
                          margin=leftm())
    pie_limit = IntText(value=20, width='70px')
    group_less_freq = Checkbox(value=False)
    pieb = Button(description="Plot", button_style='success', width='70px', height='30px',
                  margin=leftm(20))

    header_row = Box([HTML('<h4>Pie Chart - Visualize Distributions of Categorical Variables</h4>')],
                     layout=Layout(display='flex', flex_flow='row', align_items='center',
                                   justify_content='center', width='90%'))
    plot_row = Box([caption('Pie Chart Plot: '), pieb], layout=row_layout)
    column_row = Box([caption('Select a Categorical Column: '), pie_select,
                      caption('Limit on # of Values: ', margin=leftm(20)), pie_limit],
                     layout=row_layout)
    grouping_row = Box([caption('Group Everything Below the Limit as "others": ', width='400px'),
                        group_less_freq],
                       layout=row_layout)
    pie_chart = Box([header_row, plot_row, column_row, grouping_row], layout=Layout(**col_layout),
                    height='160px')

    pie_select.observe(plot_pie, names='value')
    pieb.on_click(plot_pie)
    return pie_chart


# functions
def plot_pie(chg):
    '''Plot a pie chart of value frequencies for the selected column.

    Callback for the Plot button and for column changes (`chg` is unused).
    Clears the output, re-displays the graph-type selector, computes the
    frequency table with `ct_freq` (on the filtered data when a filter
    criterion is entered) and renders it with plotly.

    If applying the filter raises, the error is printed and the chart falls
    back to the unfiltered data.
    '''
    clear_output()
    display(HBox([Label('Graph Type: '), chart_select], layout=Layout(margin='0px 0px 30px 0px')))

    if pie_filter.value.strip() != '':
        try:
            # A former extra statement here built a categorical copy of `vc`,
            # called remove_unused_categories(True) on it and discarded the
            # result. It had no effect on `vc`, and on pandas versions without
            # the positional `inplace` argument it raised, which made the
            # except-branch silently drop the user's filter.
            vc = dsData.query_chain(pie_filter.value).ct_freq(pie_select.value, group_less_freq.value,
                                                           pie_limit.value)
        except Exception as e:
            print(e)
            vc = dsData.ct_freq(pie_select.value, group_less_freq.value, pie_limit.value)
    else:
        vc = dsData.ct_freq(pie_select.value, group_less_freq.value, pie_limit.value)
    trace = go.Pie(labels=list(vc.index), values=vc.values)
    # A plain list of traces, as plot_cor already uses, instead of go.Data.
    figp = go.Figure(data=[trace],
                     layout=dict(title='Break Down of {}'.format(pie_select.value.upper())))
    iplot(figp)

In [None]:
# Histogram

# widgets
def make_hist():
    '''Build the histogram control panel and return it as a Box.

    Creates (as globals, because `plot_hist` reads them) the filter text box,
    the Plot button, the numerical-column dropdown, the Min / Max bounds and
    the bin count. Plotting is triggered by the button and by changing the
    selected column.
    '''
    global hist_filter, histb, hist_select, hist_min, hist_max, hist_bins
    # The filter box is created (plot_hist reads it) but is not placed in the panel.
    hist_filter = Text(value='', width='500px')
    histb = Button(description="Plot", button_style='success', width='70px', height='30px',
                   margin=leftm(20))
    hist_select = Dropdown(options=dsData.get_header(['int', 'float'], sort=True), height='30px',
                           width='200px', margin=leftm())
    hist_min = FloatText(value=float('-Inf'), width='70px')
    hist_max = FloatText(value=float('Inf'), width='70px')
    hist_bins = IntText(value=10, width='70px')

    row0 = Box([HTML('<h4>Histogram - Visualize Distributions of Numerical Variables</h4>')],
                layout=Layout(display='flex', flex_flow='row', align_items='center',
                              justify_content='center', width='90%'))
    row1 = Box([Label('Histogram Plot: ', layout=Layout(width='200px')), histb], layout=row_layout)

    row2 = Box([Label('Select a Numerical Column: ', layout=Layout(width='200px')), hist_select], layout=row_layout)
    row3 = Box([Label('Min: ', layout=Layout(width='200px')), hist_min, Label('Max: ', margin=leftm(20)), hist_max,
                Label('# of Bins: ', margin=leftm(20)), hist_bins], layout=row_layout)

    hist_chart = Box([row0, row1, row2, row3], layout=Layout(**col_layout), height='160px')

    # Observe only the selected value (as make_pie does). Without `names`, the
    # callback fires for every trait change of the dropdown and the histogram
    # is redrawn several times per selection.
    hist_select.observe(plot_hist_new_col, names='value')
    histb.on_click(plot_hist)
    return hist_chart


# functions
def plot_hist_new_col(chg):
    '''Reset the Min / Max bounds to unbounded, then redraw the histogram.

    Used when a different column is selected, so that bounds chosen for the
    previous column do not carry over. `chg` is forwarded to `plot_hist`.
    '''
    hist_min.value, hist_max.value = float('-Inf'), float('Inf')
    plot_hist(chg)
    
def plot_hist(chg):
    '''Plot a percentage histogram of the selected numerical column.

    Callback for the Plot button (and, via `plot_hist_new_col`, for column
    changes); `chg` is unused. Values are kept only when strictly between the
    Min and Max boxes. Bounds still set to -Inf / +Inf are replaced by the
    rounded data minimum - 1 / maximum + 1 and written back to the boxes.

    Nothing is plotted, and a message is printed instead, when the column has
    no rows, when no value lies between the bounds, or when the bin count is
    not positive. Those cases would otherwise divide by zero or produce
    undefined bounds.
    '''
    clear_output()
    display(HBox([Label('Graph Type: '), chart_select], layout=Layout(margin='0px 0px 30px 0px')))
    if hist_filter.value.strip() != '':
        x = dsData.query(hist_filter.value)[hist_select.value]
    else:
        x = dsData.data[hist_select.value]
    n = len(x)
    if n == 0:
        print('No data available for {}.'.format(hist_select.value))
        return
    x = x[x > hist_min.value]
    x = x[x < hist_max.value]
    print('Percentage covered (after filtering): {:.2f}%'.format(len(x) / n * 100))
    if len(x) == 0:
        print('No values lie strictly between Min and Max; nothing to plot.')
        return
    if hist_bins.value <= 0:
        print('# of Bins must be a positive integer.')
        return
    if hist_min.value == float('-Inf'):
        hist_min.value = np.round(np.min(x), 2) - 1
    if hist_max.value == float('Inf'):
        hist_max.value = np.round(np.max(x), 2) + 1

    size = (hist_max.value - hist_min.value) / hist_bins.value
    trh = go.Histogram(x=x, histnorm='percent', marker=dict(color='rgb(0,0,100)'),
                       xbins=dict(start=hist_min.value - 0.5, size=size, end=hist_max.value + 0.5))

    layout = dict(bargap= 0.015, hovermode= 'x',
                  title='Histogram for {}'.format(hist_select.value.upper()),
                  yaxis= dict(title='Percentage (%)', autorange= True, showticklabels= True))
    # A plain list of traces, as plot_cor already uses, instead of go.Data.
    figh = go.Figure(data=[trh], layout=layout)
    iplot(figh)

In [None]:
# Scatter Plot
# widgets
def make_scatter():
    '''Build the scatter-plot-matrix control panel and return it as a Box.

    Creates (as globals, because `plot_scatter` reads them) the filter text
    box, the Plot button, the multi-select of numerical columns and the
    optional "Group by" categorical dropdown. The list height scales with
    the number of numerical columns, clamped to the range 100..300 px.
    '''
    global sp_filter, spb, sp_height, sp_select, sp_ccol

    numeric_columns = dsData.get_header(['int', 'float'], sort=True)
    # The filter box is created (plot_scatter reads it) but is not placed in the panel.
    sp_filter = Text(value='', width='500px')
    spb = Button(description="Plot", button_style='success', width='70px', height='30px',
                 margin=leftm(20))
    wanted_height = np.max([len(numeric_columns) * 15, 100])
    sp_height = np.min([300, wanted_height])
    sp_select = SelectMultiple(options=numeric_columns, margin=leftm(),
                               height='{}px'.format(sp_height))
    group_choices = [None] + dsData.get_header('category', sort=True)
    sp_ccol = Dropdown(options=group_choices, height='30px', width='200px', margin=leftm())

    header_row = Box([HTML('<h4>Scatter Plot - Visualize Pairwise Distributions among a Group of Numerical '
                           'Variables</h4>')],
                     layout=Layout(display='flex', flex_flow='row', align_items='center',
                                   justify_content='center', width='90%'))
    plot_row = Box([Label('Scatter Plot: ', layout=Layout(width='200px')), spb], layout=row_layout)
    columns_row = Box([Label("Select Columns: ", layout=Layout(width='200px')), sp_select,
                       Label("Group by: ", margin=leftm(30)), sp_ccol],
                      layout=row_layout)
    scatter_chart = Box([header_row, plot_row, columns_row], layout=Layout(**col_layout),
                        height='{}px'.format(sp_height + 110))
    spb.on_click(plot_scatter)
    return scatter_chart

# functions
def plot_scatter(chg):
    '''Plot a scatter-plot matrix of the selected numerical columns.

    Callback for the Plot button (`chg` is unused). Works on a copy of the
    data (filtered when a criterion is entered). When a "Group by" column is
    chosen, it is converted to plain objects and passed as the matrix index
    so points are grouped by its values.

    Prints a message and returns without plotting when no column is selected.
    '''
    clear_output()
    display(HBox([Label('Graph Type: '), chart_select], layout=Layout(margin='0px 0px 30px 0px')))

    if sp_filter.value.strip() != '':
        temp_data = dsData.query(sp_filter.value).copy(True)
    else:
        temp_data = dsData.data.copy(True)

    sel_cols = list(sp_select.value)
    if not sel_cols:
        print('Select at least one numerical column to plot.')
        return
    if sp_ccol.value is not None:
        # Converting to object dtype keeps only the values actually present, so
        # no separate remove_unused_categories step is needed. The earlier
        # positional-inplace call to it fails on pandas versions that dropped
        # the `inplace` argument.
        temp_data[sp_ccol.value] = temp_data[sp_ccol.value].astype('object')
        sel_cols.append(sp_ccol.value)

    temp = temp_data[sel_cols]

    fig = FF.create_scatterplotmatrix(temp, index=sp_ccol.value, diag='histogram', height=800,
                                      width=800)
    iplot(fig)

In [None]:
# Correlations
# widgets
def make_cor():
    '''Build the correlation-heatmap control panel and return it as a Box.

    Creates (as globals, because `plot_cor` reads them) the filter text box,
    the Plot button and the multi-select of numerical columns. The list
    height scales with the number of numerical columns, clamped to the range
    100..300 px.
    '''
    global cor_filter, corb, cor_height, cor_select

    numeric_columns = dsData.get_header(['int', 'float'], sort=True)
    # The filter box is created (plot_cor reads it) but is not placed in the panel.
    cor_filter = Text(value='', width='500px')
    corb = Button(description="Plot", button_style='success', width='70px', height='30px',
                  margin=leftm(20))
    wanted_height = np.max([len(numeric_columns) * 15, 100])
    cor_height = np.min([300, wanted_height])
    cor_select = SelectMultiple(options=numeric_columns, margin=leftm(),
                                height='{}px'.format(cor_height))

    header_row = Box([HTML('<h4>Correlations - Visualize Correlations among a Group of Numerical Variables</h4>')],
                     layout=Layout(display='flex', flex_flow='row', align_items='center',
                                   justify_content='center', width='90%'))
    plot_row = Box([Label('Correlation Plot: ', layout=Layout(width='200px')), corb], layout=row_layout)
    columns_row = Box([Label("Select Columns: ", layout=Layout(width='200px')), cor_select], layout=row_layout)
    cor_chart = Box([header_row, plot_row, columns_row], layout=Layout(**col_layout),
                    height='{}px'.format(cor_height + 100))
    corb.on_click(plot_cor)
    return cor_chart

# functions
def plot_cor(chg):
    '''Plot an annotated correlation heatmap of the selected numerical columns.

    Callback for the Plot button (`chg` is unused). Correlations are computed
    with `DataFrame.corr()` on a copy of the data (filtered when a criterion
    is entered) and rounded to two decimals. Each cell is labelled with its
    value.
    '''
    clear_output()
    display(HBox([Label('Graph Type: '), chart_select], layout=Layout(margin='0px 0px 30px 0px')))

    if cor_filter.value.strip() != '':
        temp_data = dsData.query(cor_filter.value).copy(True)
    else:
        temp_data = dsData.data.copy(True)

    sel_cols = list(cor_select.value)
    # The y axis lists the columns in reverse order, so the matrix rows are
    # flipped vertically to keep each row aligned with its y label.
    x, y = sel_cols, list(reversed(sel_cols))

    corr = temp_data[sel_cols].corr().round(2)
    z = np.flipud(corr.values)
    annotations = [
        dict(
            text=str(val),
            x=x[m], y=y[n],
            xref='x1', yref='y1',
            font=dict(color='black'),
            showarrow=False)
        for n, row in enumerate(z)
        for m, val in enumerate(row)
    ]

    colorscale = [[0, 'rgba(255,0,0,0.3)'], [0.5, 'rgba(255,255,0,0.3)'], [1, 'rgba(0,255,0,0.3)']]
    trace = go.Heatmap(x=x, y=y, z=z, zmin=-1, zmax=1, colorscale=colorscale, showscale=True)

    fig = go.Figure(data=[trace])
    fig['layout'].update(
        title="Correlations",
        annotations=annotations,
        xaxis=dict(ticks=''),
        yaxis=dict(ticks='', ticksuffix='  '),
        width=700,
        height=700,
        autosize=False
    )

    iplot(fig)

In [None]:
# graph_mapping = {'Pie Chart': make_pie, 'Histogram': make_hist, 'Box Plot': make_boxp,
#                  'Cross Tab': make_xtab, 'Scatter Plot': make_scatter, 'Correlations': make_cor}

# Maps each graph-type label of `chart_select` to the function that builds its
# control panel; `switch_chart` looks the builder up by the selected label.
graph_mapping = {'Coverage': make_coverage, 'Pie Chart': make_pie, 'Histogram': make_hist, 
                 'Scatter Plot': make_scatter, 'Correlations': make_cor}

## 3. Data Transformation

In [None]:
def _fill_with_mode(values):
    '''Fill missing entries with the most frequent value (first one on ties).

    Accepts a Series (one column of one date group) or a DataFrame (a whole
    date group). Returns the input unchanged when no mode exists, i.e. when
    every entry is missing.
    '''
    modes = values.mode()
    if modes.empty:
        return values
    # `.iloc[0]` yields a scalar (Series input) or a per-column Series
    # (DataFrame input). Passing the whole mode result to fillna would align
    # it on index labels 0..k-1 and leave almost every gap unfilled.
    return values.fillna(modes.iloc[0])


def impute_data(b):
    '''Impute missing values in `dsData.data`, date by date.

    Click callback for the "Data Imputation" button; `b` is unused.
    Works on a deep copy and stores the result back into `dsData.data`:

    * +/-inf is treated as missing;
    * numerical factor columns are filled with that date's mean;
    * categorical / object factor columns are filled with that date's mode;
    * rows that still contain a missing value afterwards are dropped.

    The 'sid' and 'date' columns are never imputed.
    '''
    df = dsData.data.copy(deep=True)
    df = df.replace([np.inf, -np.inf], np.nan)
    data_types = df.dtypes
    key_cols = ('sid', 'date')

    def is_categorical(col):
        return data_types[col].name in ('category', 'object')

    num_factor_cols = [j for j in df.columns if j not in key_cols and not is_categorical(j)]
    cate_factor_cols = [j for j in df.columns if j not in key_cols and is_categorical(j)]

    # Numerical columns: per-date mean.
    if num_factor_cols:
        num_cols = ['date'] + num_factor_cols
        df[num_factor_cols] = df[num_cols].groupby("date").transform(lambda x: x.fillna(x.mean()))
    # Categorical columns: per-date mode.
    if cate_factor_cols:
        cate_cols = ['date'] + cate_factor_cols
        df[cate_factor_cols] = df[cate_cols].groupby("date").transform(_fill_with_mode)
    df = df.dropna()  # anything still missing cannot be imputed: drop the row
    dsData.data = df.copy(deep=True)

In [None]:
def feature_extract(b):
    '''Placeholder click callback for the "Feature Extract" button.

    No feature extraction is performed yet; `b` is ignored and an empty
    tuple is returned.
    '''
    # Sketch of the intended logic (not active): for each ticker in
    # dsData.data['symbol'], wrap that ticker's rows in an MLData object and
    # call calculate_YoY / calculate_QoQ / calculate_QQYY for a feature such
    # as 'price_to_book'.
    nothing = tuple()
    return nothing

In [None]:
# Shared layout for the two data-transformation buttons.
layout_data_impute = widgets.Layout(width='auto', height='40px') #set width and height
feature_extraction = Button(description="Feature Extract (Default)", button_style='success', 
                         width='500px', margin=leftm(5), layout=layout_data_impute)
data_imputation = Button(description="Data Imputation (Default)", button_style='success', 
                         width='500px', margin=leftm(5), layout=layout_data_impute)
display(feature_extraction, data_imputation, height='120px', margin='10px 10px 10px 10px')
# Wire each button to its callback.
feature_extraction.on_click(feature_extract)
data_imputation.on_click(impute_data)

## 4. Single Factor Evaluation

In [None]:
def cal_hit(ts):
    '''Return the fraction of non-missing values in `ts` that are > 0.

    `ts` may be a pandas Series or a plain array (rolling windows can pass
    either, depending on the pandas version). Missing values are ignored.
    Returns NaN when there is no non-missing value, instead of dividing by
    zero.
    '''
    values = pd.Series(ts).dropna()
    total = len(values)
    if total == 0:
        return np.nan
    # After dropna every value is either > 0 or <= 0, so `total` equals the
    # original "same sign + different sign" denominator.
    return float((values > 0).sum()) / total

In [None]:
def run_single_factor_single_ticker(data):
    '''Evaluate the selected factor against the selected target for one ticker.

    `data` holds the rows of a single ticker and must contain the columns
    'symbol', 'date', the selected factor and the selected target.
    Displays the last rows, the factor/target correlation, the last
    sign-agreement products, a 60-observation rolling hit-rate plot and the
    hit rate over the whole history. An observation counts as a hit when
    factor * target > 0.

    Prints a message and returns early when there are no rows for the ticker
    or no observation where both factor and target are present. These cases
    would otherwise raise IndexError / ZeroDivisionError.
    '''
    if len(data) == 0:
        print("single ticker: no rows available for the selected ticker")
        return
    current_ticker = data['symbol'].unique()[0]
    print("single ticker: ", current_ticker)
    display(data.tail(5))
    compare_df = data[[factor_select.value, target_select.value]]
    print("Correlation:")
    display(compare_df.corr())
    print("Hit Rate (Same Sign):")
    # Positive product <=> factor and target have the same sign.
    check_sign = compare_df[factor_select.value] * compare_df[target_select.value]
    check_sign.index = data["date"]
    check_sign = check_sign.dropna()
    display(check_sign.tail(5))
    if len(check_sign) == 0:
        print("Over the history Hit Rate: not available (no overlapping observations)")
        return
    check_sign_mvg60 = check_sign.rolling(60).apply(cal_hit)
    check_sign_mvg60.plot(rot=45)
    same_sign_counts = int((check_sign > 0).sum())
    diff_sign_counts = int((check_sign <= 0).sum())
    hit_rate = same_sign_counts / (same_sign_counts + diff_sign_counts)
    print("Over the history Hit Rate: ", hit_rate)
    

In [None]:
def run_single_factor(b):
    '''Run the single-factor evaluation for the current dropdown selections.

    Click callback for the "Run_Evaluation" button; `b` is unused.
    When a specific ticker is selected, the evaluation is delegated to
    `run_single_factor_single_ticker` and an empty tuple is returned.
    Otherwise ('cross_tickers') the factor and target columns are cleaned
    with alphalens and the returns tear sheet, information tear sheet and
    quantile statistics table are produced.
    '''
    factor_name = factor_select.value
    target_name = target_select.value
    wanted_columns = ['date', 'sid', 'symbol', factor_name] + [target_name]
    al_data = dsData.data[wanted_columns].copy()

    chosen_ticker = ticker_select.value
    if chosen_ticker != 'cross_tickers':
        one_ticker = al_data[al_data['symbol'] == chosen_ticker]
        run_single_factor_single_ticker(one_ticker)
        return ()

    # Cross-sectional case: index by (date, sid) as alphalens expects.
    al_data = al_data.drop("symbol", axis=1).set_index(['date', 'sid'])
    factor_to_evaluate = al_data[factor_name]
    forward_returns = pd.DataFrame(al_data[target_name])

    data_for_alphalens = alphalens.utils.get_clean_factor(factor_to_evaluate, forward_returns, max_loss=0.9)

    flat = data_for_alphalens.reset_index()
    flat['date'] = pd.to_datetime(flat['date'])  # make sure the date column is datetime
    # Rename the second index column to 'asset' and the first data column to
    # '21D'; alphalens only accepts its standard column names.
    flat.columns = ['date', 'asset', '21D'] + list(flat.columns[3:])
    data_for_alphalens = flat.set_index(['date', 'asset'])

    alphalens.tears.create_returns_tear_sheet(data_for_alphalens)

    alphalens.tears.create_information_tear_sheet(
        factor_data=data_for_alphalens, group_neutral=False, by_group=False, set_context=False
    )

    alphalens.plotting.plot_quantile_statistics_table(data_for_alphalens)

In [None]:
def alpha_single(b):
    '''Show the controls for evaluating a single factor.

    Click callback for the "Individual_Evaluation" button; `b` is unused.
    Clears the output, re-displays that button, then creates (as globals,
    because `run_single_factor` reads them) the factor, target and ticker
    dropdowns plus the "Run_Evaluation" button.
    '''
    global factor_select, target_select, ticker_select, run_eval_button
    clear_output()
    display(individual_eval_button)

    factor_select = Dropdown(options=all_factors, height='30px', margin=leftm(), width='200px')
    target_select = Dropdown(options=all_factors, height='30px', margin=leftm(), width='200px')
    # 'cross_tickers' means "evaluate across all tickers" in run_single_factor.
    tickers_options = ['cross_tickers'] + all_tickers
    ticker_select = Dropdown(options=tickers_options, height='30px', margin=leftm(), width='200px')

    labelled_controls = (
        ('Independent Factor: ', factor_select),
        ('Target Variable: ', target_select),
        ('Ticker Only: ', ticker_select),
    )
    rows = [
        Box([Label(caption, layout=Layout(width='200px')), control], layout=row_layout)
        for caption, control in labelled_controls
    ]
    display(Box(rows, layout=Layout(**col_layout)))

    run_eval_button = Button(description="Run_Evaluation", button_style='success', margin=leftm(650))
    display(run_eval_button)
    run_eval_button.on_click(run_single_factor)


In [None]:
# Entry point of the single-factor section: clicking the button builds the
# factor / target / ticker controls via `alpha_single`.
individual_eval_button = Button(description="Individual_Evaluation", button_style='success', margin=leftm(650))
display(individual_eval_button)
individual_eval_button.on_click(alpha_single)

## 5. Build Model

In [None]:
# override ipythonwidgets build in function
def _get_min_max_value(min, max, value=None, step=None):
    """Return min, max, value given input values with possible None."""
    if value is None:
        if not max > min:
            raise ValueError('max must be greater than min: (min={0}, max={1})'.format(min, max))
        diff = max - min
        value = min + (diff / 2)
        # Ensure that value has the same type as diff
        if not isinstance(value, type(diff)):
            value = min + (diff // 2)
    elif min is None and max is None:
        if not value:
            t = type(value)
            min, max = (t(0), t(1))
        elif value > 0:
            min, max = (0, np.max([10*value, 10]))
        else:
            min, max = (np.min([10*value, -10]), 0)
    else:
        raise ValueError('unable to infer range, value from: ({0}, {1}, {2})'.format(min, max, value))
    if step is not None:
        # ensure value is on a step
        r = (value - min) % step
        value = value - r
    return min, max, value

# Monkey-patch: replace the helper inside the `interaction` module with the
# version defined above, so that widgets created afterwards through that
# module use its range inference.
# NOTE(review): `interaction` is not imported explicitly; presumably it comes
# from `from ipywidgets import *` -- confirm for the installed version.
interaction._get_min_max_value = _get_min_max_value

In [None]:
# Run single model
# Display name -> short model key. The keys are used to index `model_args`
# and are passed as `model_name` to `model_obj.define_regressor`.
model_options = {'LinearRegression': 'ols',
            'Ridge': 'ridge',
            'Lasso': 'lasso',
            'ElasticNet': 'enet',
            'BayesianRidge': 'bayes',
            'SVR': 'svr',
            'KNeighborsRegressor': 'knnrgr',
            'DecisionTreeRegressor': 'dtrgr',
            'AdaBoostRegressor': 'ab',
            'GradientBoostingRegressor': 'gbrgr',
            'RandomForestRegressor': 'rfrgr',
            'ExtraTreesRegressor': 'ert',
            'BaggingRegressor': 'bag'}

# Per-model parameter presets, keyed by the short model key. They are expanded
# into `interactive(set_params, **args)` to build the parameter widgets.
# NOTE(review): `make_single` also passes a preset straight to
# `define_regressor(**...)`, including list-valued entries such as 'kernel'
# or 'weights' -- confirm that define_regressor accepts lists there.
model_args = {
    'ols': {'fit_intercept': True, 'normalize': False, 'n_jobs': 1},
    'ridge': {'alpha': 1.0, 'fit_intercept': True, 'normalize': False},
    'lasso': {'alpha': 1.0, 'fit_intercept': True, 'normalize': False},
    'enet': {'alpha':1.0, 'l1_ratio': 0.5, 'fit_intercept': True, 'normalize': False},
    'bayes': {'alpha_1': 1e-06, 'alpha_2': 1e-06, 'lambda_1': 1e-06, 'lambda_2': 1e-06, 
              'compute_score': False, 'fit_intercept': True, 'normalize': False},
    'svr': {'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], 'degree': 3,
            'gamma': 'auto', 'coef0': 0.0, 'C': 1.0, 'epsilon': 0.1,'shrinking': True},
    'knnrgr': {'n_neighbors': 5, 'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'leaf_size': 30, 'p': 2, 'metric': 'minkowski', 'n_jobs': 1},
    'dtrgr': {'criterion': 'mse', 'splitter': 'best', 'max_depth': 5, 'min_samples_split': 10,
           'min_samples_leaf': 15, 'min_weight_fraction_leaf': 0.0},
    'ab': {'n_estimators': 50, 'learning_rate': 1.0, 'loss': 'linear'},
    'gbrgr': {'loss': 'ls', 'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 1.0,
           'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0,
           'max_depth': 3, 'alpha': 0.9},
    'rfrgr': {'n_estimators': 10, 'criterion': 'mse', 'max_depth': 20, 'min_samples_split': 2,
           'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 'auto',
           'bootstrap': True, 'oob_score': False, 'n_jobs': 1},
    'ert': {'n_estimators': 10, 'criterion': 'mse', 'max_depth': 20, 'min_samples_split': 2,
            'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 'auto',
            'bootstrap': False, 'oob_score': False, 'n_jobs': 1},
    'bag': {'n_estimators': 10, 'max_samples': 1.0, 'max_features': 1.0, 'bootstrap': True,
            'bootstrap_features': False, 'oob_score': False, 'n_jobs': 1}}

# Display name -> short metric key for the evaluation-metric selectors.
metric_options = {'R2 Score': 'r2', 'Mean Squared Error': 'mse', 'Mean Absolute Error': 'mae', 
                  'Median Absolute Error': 'meae', 'Explained Variance Score': 'evs'}

# widgets
# Module-level widgets shared by the single- and multiple-model panels.
model_dropcat = Checkbox(value=True, margin=leftm(), width='50px')
model_dropthrh = IntSlider(value=500, max=500, margin=leftm())

model_seed = IntText(value=0, width='150px', margin=leftm())
model_split = FloatSlider(value=0.7, min=0, max=1, step=0.01, width='600px', margin=leftm())
model_select = Dropdown(options=model_options, value='dtrgr', height='30px', margin=leftm(),
                        width='200px')
model_metric = SelectMultiple(options=metric_options, value=('r2',), height='90px', margin=leftm(),
                              width='200px')
model_prog = FloatProgress(value=0, min=0, max=10, step=1, description='Progress:',
                           margin=leftm(600), width='200px')
modelb = Button(description="Run Model", button_style='success', width='100px', margin=leftm(150))
model_save = Button(description="Save Results", button_style='warning', width='100px', margin=leftm(150))

def make_single():
    '''Build the single-model control panel and return it as a Box.

    Creates (as globals) the in-sample date-range slider, the target
    dropdown and the feature multi-select. Also defines the regressor for
    the currently selected model, using its parameter preset from
    `model_args`, and prints it.
    '''
    global model_target, model_features, selection_range_slider

    # Date-range slider: one option per calendar day between the earliest and
    # the latest value of the 'date' column, initially spanning everything.
    all_dates = list(dsData.data['date'].sort_values())
    start_date, end_date = pd.to_datetime(all_dates[0]), pd.to_datetime(all_dates[-1])
    options = [(day.strftime('%d%b%Y'), day) for day in pd.date_range(start_date, end_date, freq='D')]
    selection_range_slider = widgets.SelectionRangeSlider(
        options=options,
        index=(0, len(options) - 1),
        description='Dates',
        orientation='horizontal',
        layout={'width': '500px'}
    )

    model_obj.define_regressor(model_name=model_select.value, **model_args[model_select.value])
    print(model_obj.model)

    model_target = Dropdown(options=dsData.get_header(['float', 'int'], sort=True), height='30px',
                            margin=leftm(), width='200px')
    columns = dsData.get_header(['float', 'int', 'category', 'object'], sort=True)
    # Feature list height scales with the column count, clamped to 100..300 px.
    features_height = np.min([300, np.max([len(columns) * 12, 100])])
    model_features = SelectMultiple(options=columns, margin=leftm(),
                                    height='{}px'.format(features_height))

    def labelled(caption, *controls):
        # One panel row: fixed-width label followed by its control(s).
        return Box([Label(caption, layout=Layout(width='200px'))] + list(controls), layout=row_layout)

    rows = [
        labelled('Insample Date Range: ', selection_range_slider),
        labelled('Target Variable: ', model_target),
        labelled('Independent Variables: ', model_features),
        labelled('Evaluation Metric: ', model_metric),
        labelled('Select Model: ', model_select, model_save, modelb),
        Box([model_prog], layout=row_layout),
        Box([model_toggle_param], margin='0px 0px 20px 0px'),
    ]
    return Box(rows, layout=Layout(**col_layout), height='{}px'.format(300 + features_height))

# functions
def model_chg(chg):
    '''React to a model change: re-define the regressor and rebuild its parameter box.

    Observer callback for `model_select` (`chg` is unused). Closes the
    previous parameter box if one exists, defines the newly selected
    regressor, prints it, and builds a fresh parameter box from
    `model_args`. The box is displayed only when the parameter panel is
    currently toggled on.
    '''
    global model_param_displayed, w
    # `w` is only bound once a parameter box has been built, so look it up
    # defensively: changing the model before that would otherwise raise
    # NameError on `w.close()`.
    previous_box = globals().get('w')
    if previous_box is not None:
        previous_box.close()
    model_obj.define_regressor(model_name=model_select.value)
    print(model_obj.model)
    args = model_args[model_select.value]
    w = interactive(set_params, **args)
    w = modify_box_widget(w)
    if model_param_displayed:
        display(w)

def set_params(**args):
    '''Apply the given keyword parameters to the current model and print it.

    Used as the target of `interactive(...)`, so it receives the values of
    the parameter widgets as keyword arguments.
    '''
    current_model = model_obj.model
    current_model.set_params(**args)
    print(model_obj.model)

def modify_box_widget(w):
    '''Re-lay out the children of a parameter widget as labelled rows.

    Every child gets the default left margin. A child that has a
    `description` is wrapped in a row with a separate fixed-width label
    showing that description (the child's own description is blanked);
    other children are kept as they are. Returns a column Box whose height
    is 40 px per child of `w`.
    '''
    laid_out = []
    for control in w.children:
        control.margin = leftm()
        if not hasattr(control, 'description'):
            laid_out.append(control)
            continue
        caption = control.description
        control.description = ''
        laid_out.append(Box([Label('{}: '.format(caption), layout=Layout(width='200px')), control],
                            layout=row_layout))
    return Box(laid_out, layout=Layout(**col_layout), height='{}px'.format(40 * len(w.children)))

# Re-define the regressor and rebuild its parameter box when the model changes.
model_select.observe(model_chg, names='value')

In [None]:
# Run Multiple Model
# widgets
# Multi-select of models and single metric dropdown used by the
# multiple-model panel (`make_multiple`).
model_select2 = SelectMultiple(options=model_options, height='200px', margin=leftm(),
                               width='200px')

model_metric2 = Dropdown(options=metric_options, value='r2', height='30px', margin=leftm(),
                         width='200px')

# Button placed in both the single- and multiple-model panels.
model_toggle_param = Button(description="Toggle Model Settings", button_style='success',
                            width='170px')
    
def make_multiple():
    '''Build the form used to run and compare several models.

    Defines all regressors on ``model_obj`` and (re)creates the global
    ``model_target`` and ``model_features`` selectors from the current data
    header. Returns a column Box holding the target, feature, metric and model
    selectors, the run button, the progress bar and the settings toggle.
    '''
    global model_target, model_features

    def _labelled(caption, control):
        '''One form row: fixed-width caption followed by its control.'''
        return Box([Label(caption, layout=Layout(width='200px')), control], layout=row_layout)

    model_obj.define_regressor(model_name='run_all_regressors')
    model_target = Dropdown(options=dsData.get_header(['float', 'int'], sort=True), height='30px',
                            margin=leftm(), width='200px')
    candidates = dsData.get_header(['float', 'int', 'category', 'object'], sort=True)
    # 12px per column, clamped to the range [100, 300].
    features_height = min(300, max(len(candidates) * 12, 100))
    model_features = SelectMultiple(options=candidates, margin=leftm(),
                                    height='{}px'.format(features_height))
    form_rows = [
        _labelled('Target Variable: ', model_target),
        _labelled('Independent Variables: ', model_features),
        _labelled('Evaluation Metric: ', model_metric2),
        _labelled('Select Models: ', model_select2),
        Box([modelb], margin=leftm(500)),
        Box([model_prog], layout=row_layout),
        model_toggle_param,
    ]
    return Box(form_rows, layout=Layout(**col_layout), height='{}px'.format(450 + features_height))

# functions
# Whether the model parameter panel is currently shown.
model_param_displayed = False

def toggle_param(b):
    '''Show or hide the model parameter panel.

    When the panel is hidden, builds it for the active mode (single model or
    multiple models) and displays it; when it is shown, closes it. The
    ``model_param_displayed`` flag is flipped afterwards.

    b: the clicked button; not used.
    '''
    global model_param_displayed, w
    if not model_param_displayed:
        if mode_select.value == 'Run Single Model':
            w = modify_box_widget(interactive(set_params, **model_args[model_select.value]))
        else:
            w = get_multiple_params()
        display(w)
    else:
        w.close()
    model_param_displayed = not model_param_displayed

def get_multiple_params():
    '''Create the parameter panel for every model selected in multi-model mode.

    For each selected model key, builds an interactive parameter widget bound
    to ``set_params_model`` and puts it under a bold heading with the
    estimator's name.

    Returns a VBox alternating heading and parameter Box per model.
    '''
    rows = []
    for model in model_select2.value:
        estimator = model_obj.models[model]
        # Local name deliberately differs from the module-level `w` widget.
        params_box = modify_box_widget(
            interactive(set_params_model, model=fixed(model), **model_args[model]))
        # Raw string: '\w' and '\(' are invalid escapes in a normal string literal.
        matched = re.match(r'(\w+)\(', str(estimator))
        # Fall back to the class name if the repr is not of the form "Name(...)".
        name = matched.group(1) if matched else type(estimator).__name__
        rows.append(HTML('<b>{}</b>'.format(name), margin='20px 0px 10px 0px'))
        rows.append(params_box)
    return VBox(rows)

def set_params_model(model, **args):
    '''Apply keyword parameters to one entry of ``model_obj.models`` and print it.

    model: key of the model in ``model_obj.models``.
    '''
    regressor = model_obj.models[model]
    regressor.set_params(**args)
    print(regressor)

# Show/hide the parameter panel when the "Toggle Model Settings" button is clicked.
model_toggle_param.on_click(toggle_param)
  

def run_model(b):
    '''Fit the selected model(s) on the in-sample date range.

    The in-sample window is the inclusive range chosen on
    ``selection_range_slider``; all rows dated before or after it form the
    out-of-sample set. The split is stored in the globals ``XX_train``,
    ``yy_train``, ``XX_test`` and ``yy_test``.

    Single-model mode: for 'ols', 'ridge' and 'lasso' a statsmodels fit is
    stored in the global ``stats_model``; the model on ``model_obj`` is then
    fitted and its predictions and prediction errors are written to
    ``dsData.data`` as ``predicted_<target>`` and ``error_predicted_<target>``.

    Multi-model mode: runs ``model_obj.model_comparison`` for the selected
    models and displays the scores sorted in descending order.

    b: the clicked button; not used.
    Raises ValueError when no independent variable is selected.
    '''
    global XX_train, yy_train, XX_test, yy_test, stats_model

    if len(model_features.value) == 0:
        raise ValueError("Please select independent variables.")

    model_prog.value = 0
    model_prog.description = 'Start...'
    target = model_target.value
    features = list(np.setdiff1d(model_features.value, [target]))
    dsData.create_dummy_data(features)

    insample_start_date, insample_end_date = selection_range_slider.value[0], selection_range_slider.value[1]
    # NOTE: all_data is dsData.data itself, so the date conversion is applied in place.
    all_data = dsData.data
    all_data["date"] = pd.to_datetime(all_data["date"])
    dates = all_data["date"]

    # Vectorised masks instead of per-row Python comparisons.
    insample_data = all_data[(dates >= insample_start_date) & (dates <= insample_end_date)]
    # Out-of-sample keeps the original ordering: rows before the window, then rows after it.
    outsample_data = pd.concat([all_data[dates < insample_start_date],
                                all_data[dates > insample_end_date]])

    # Keep the selection order, but never let the target leak into the regressors
    # (`features` above already excludes it for the dummy data and fit_model).
    feature_cols = [col for col in model_features.value if col != target]
    XX_train, yy_train = insample_data[feature_cols], insample_data[target]
    XX_test, yy_test = outsample_data[feature_cols], outsample_data[target]

    if mode_select.value == 'Run Single Model':
        model_prog.value = 5
        model_prog.description = 'Progress.'

        selected = model_select.value
        if selected == 'ols':
            X = sm_api.add_constant(XX_train.copy(deep=True))
            stats_model = sm_api.OLS(yy_train.copy(deep=True), X).fit()
        elif selected in ('ridge', 'lasso'):
            # Equality test on the model key; the previous `value in 'ridge'`
            # was a substring test on the string 'ridge'.
            X, Y = XX_train.copy(deep=True), yy_train.copy(deep=True)
            scaler = StandardScaler(with_mean=False)
            X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
            # Elastic net with L1_wt=0 for ridge and L1_wt=1 for lasso.
            l1_weight = 1 if selected == 'lasso' else 0
            # NOTE(review): `penalty` is not defined in this part of the notebook -
            # confirm it is set before this callback runs.
            stats_model = sm_api.OLS(Y, X_scaled).fit_regularized(
                method='elastic_net', alpha=penalty, L1_wt=l1_weight)

        # The model on model_obj is fitted for every single-model choice,
        # including the ones that also got a statsmodels fit above.
        model_obj.fit_model(XX_train, yy_train, indep_cols=features)
        predicted = model_obj.predict(dsData.dummies)
        dsData.data["predicted_" + target] = predicted
        dsData.data["error_predicted_" + target] = predicted - dsData.data[target]
        model_prog.description = 'Done!'
        model_prog.value = 10
    else:
        model_prog.value = 5
        model_prog.description = 'Progress.'
        res = model_obj.model_comparison(XX_train, yy_train, XX_test, yy_test, features,
                                         list(model_select2.value))
        res = pd.Series(res).reset_index().round(4)
        res.columns = ['Model', model_metric2.value]
        res = res.sort_values(model_metric2.value, ascending=False)
        print('\nModel Performance')
        display(res)
        model_prog.description = 'Done!'
        model_prog.value = 10

def show_results(X_train, y_train, X_test, y_test):
    '''Display performance scores, model details and test-set predictions.

    Scores for the metrics selected in ``model_metric`` are computed on both
    sets and stored in the globals ``train_scores`` and ``test_scores``.
    Feature importances (global ``fi``) are shown when the model exposes them,
    the tree plot is shown for 'dtrgr', and coefficients (global ``coef``)
    plus the statsmodels summary are shown for 'ols'.
    '''
    global train_scores, test_scores, fi, coef

    def _score(features, target):
        '''Predict on ``features`` and tabulate the selected metrics against ``target``.'''
        predicted = model_obj.predict(features)
        raw_scores = model_obj.eval_model(ytrue=target, ypred=predicted,
                                          metric_list=model_metric.value)
        table = pd.DataFrame({'metric': model_metric.value, 'score': raw_scores}).round(4)
        return predicted, table

    # performance score
    print("performance score")
    print("On Training Set")
    _, train_scores = _score(X_train, y_train)
    display(train_scores)

    print("On Testing Set")
    pred_test, test_scores = _score(X_test, y_test)
    display(test_scores)

    if hasattr(model_obj.model, 'feature_importances_'):
        fi = model_obj.check_feature_importance(group=True).round(4)
        print('\nFeature Importance')
        display(fi)

    chosen = model_select.value
    if chosen == 'dtrgr':
        display(SVG(model_obj.decision_tree_plot()))

    if chosen == 'ols':
        coef = pd.DataFrame({'Feature': model_obj.dummy_indep_cols, 'Coef': model_obj.model.coef_})
        is_positive = coef['Coef'] > 0
        # Largest positive first; most negative first.
        strongest_positive = coef[is_positive].sort_values('Coef', ascending=False)
        strongest_negative = coef[~is_positive].sort_values('Coef')
        display('Positive Coefficients', strongest_positive)
        display('Negative Coefficients', strongest_negative)
        print(model_obj)
        print(model_obj.model)

        display(stats_model.summary())

    # make prediction
    print("Predictions on Test Data")
    test_inputs = dsData.data.loc[y_test.index, model_obj.indep_cols]
    display(pd.concat([pred_test, y_test, test_inputs], axis=1))

def save_results(b):
    '''Write the latest results shown by show_results() into the 'results' folder.

    Always writes the training and test scores. Feature importances are
    written only when the current model exposes ``feature_importances_`` (the
    same condition under which show_results() computes them), so models
    without importances no longer abort the save or write a stale table.
    Coefficients are written for 'ols' and the tree plot for 'dtrgr'.

    Any error (for example results that have not been computed yet) is printed
    rather than raised, as this runs as a button callback.

    b: the clicked button; not used.
    '''
    try:
        os.makedirs('results', exist_ok=True)
        train_scores.to_csv('results/train_scores.csv', index=False)
        test_scores.to_csv('results/test_scores.csv', index=False)
        if hasattr(model_obj.model, 'feature_importances_'):
            # File name previously had a comma in place of the dot before "csv".
            fi.to_csv('results/feature_importance.csv', index=False)
        if model_select.value == 'ols':
            coef.to_csv('results/coefficients.csv')
        elif model_select.value == 'dtrgr':
            png = model_obj.decision_tree_plot('png')
            with open('results/decision_tree_plot.png', 'wb') as f:
                f.write(png)
        print('Done!')
    except Exception as e:
        print(str(e))

# Wire the run button to model fitting and the save button to writing results to disk.
modelb.on_click(run_model)
model_save.on_click(save_results)

In [None]:
# Select Mode
# Toggle between the single-model form and the multi-model comparison form.
mode_select = ToggleButtons(options=['Run Single Model', 'Run Multiple Models'], value='Run Multiple Models', margin=leftm())

display(HBox([Label('Mode: '), mode_select], layout=Layout(margin='0px 0px 30px 0px')))

# Maps each mode label to the function that builds its form.
mode_mapping = {'Run Single Model': make_single, 'Run Multiple Models': make_multiple}
# Placeholders so that the first mode.close() / w.close() call has a widget to close.
mode = HTML('')
w = HTML('')

# General settings rows shown above the mode-specific form.
rows = []
rows.append(Box([Label('Drop Categorical Column if Too Many Distinct Values: ', layout=Layout(width='600px')), model_dropcat,
                 Label('Threshold: '), model_dropthrh], layout=row_layout))
rows.append(Box([Label('Random Seed: ', layout=Layout(width='200px')), model_seed], layout=row_layout))
rows.append(Box([Label('Proportion for Training: ', layout=Layout(width='200px')), model_split], layout=row_layout))

display(Box(rows, layout=Layout(**col_layout), height='120px', margin='0px 0px 10px 0px'))

def switch_mode(chg):
    '''Replace the displayed form with the one for the newly selected mode.

    Clears the cell output, closes the current form and displays the form
    built by the function registered for the selected mode in ``mode_mapping``.

    chg: change notification passed by ``observe``; not used.
    '''
    global mode
    clear_output()

    mode.close()
    build_form = mode_mapping[mode_select.value]
    mode = build_form()
    display(mode)

# Rebuild the form whenever the mode toggle changes.
mode_select.observe(switch_mode, names='value')

## 6. Model Result (only for single model)
- Model results on the test data set:

In [None]:
# Select Graph

# widgets
# Toggle between the prediction chart and the tabular prediction output.
Result_chart_select = ToggleButtons(
    options=['Visualize Prediction', 'Display Predictions'],
    tooltips=['Visualize Prediction outside training set, Confidence Interval and real values', 'Display Predictions'],
    value='Visualize Prediction', 
    margin=leftm()
)

# Show the selector; the callbacks re-display it after they clear the cell output.
display(HBox([Label('Model Result Type: ', layout=Layout(width='200px')), Result_chart_select], layout=Layout(margin='0px 0px 30px 0px')))

# functions
# Currently displayed result widget; may become None (see Result_switch_chart).
grph = HTML('')

def Result_switch_chart(chg):
    '''Switch the result view when the result-type toggle changes.

    Clears the cell output, re-displays the selector, closes the previous
    result widget if there is one, then calls the builder registered for the
    selected option in ``Result_graph_mapping`` and displays what it returns.

    chg: change notification passed by ``observe``; not used.
    '''
    global grph
    clear_output()
    selector_row = HBox([Label('Model Result Type: ', layout=Layout(width='200px')), Result_chart_select],
                        layout=Layout(margin='0px 0px 30px 0px'))
    display(selector_row)

    # A builder may return None, in which case there is nothing to close.
    if grph is not None:
        grph.close()
    build_view = Result_graph_mapping[Result_chart_select.value]
    grph = build_view()
    display(grph)

Result_chart_select.observe(Result_switch_chart, names='value')

In [None]:
# Confidence Interval chart
# widgets
def make_Confidence_Interval():
    '''Create the widgets for the confidence-interval chart.

    (Re)creates the global ``pie_select2`` dropdown and ``pieb2`` button and
    hooks both to ``plot_Confidence_Interval``. Returns a column Box holding a
    centred title and the Plot button.
    '''
    global pie_select2, pieb2

    pie_select2 = Dropdown(options=dsData.get_header(sort=True), height='30px', width='200px',
                           margin=leftm())
    pie_select2.observe(plot_Confidence_Interval, names='value')

    pieb2 = Button(description="Plot", button_style='success', width='70px', height='30px',
                   margin=leftm(20))
    pieb2.on_click(plot_Confidence_Interval)

    centred = Layout(display='flex', flex_flow='row', align_items='center',
                     justify_content='center', width='90%')
    title_row = Box([HTML('<h4> Visualize outsample prediction and confidence interval</h4>')],
                    layout=centred)
    button_row = Box([pieb2], layout=row_layout)
    return Box([title_row, button_row], layout=Layout(**col_layout), height='160px')

# functions
def gradientBoostingRegr_band(clf_outside, X_train, y_train, X_test, y_test, alpha=0.95):
    '''Point prediction plus an upper/lower quantile band from gradient boosting.

    Three clones of ``clf_outside`` are fitted on the training data: two with
    the quantile loss (at ``alpha`` and ``1 - alpha``) and one with the
    squared-error loss. ``clf_outside`` itself is not modified.

    clf_outside: gradient boosting regressor used as the template.
    X_train, y_train: training data.
    X_test: data to predict on.
    y_test: unused; kept so existing callers keep working.
    alpha: upper quantile of the band (default 0.95, i.e. a 5%-95% band).

    Returns (y_pred, y_upper, y_lower) for ``X_test``.
    '''
    def _fit_predict(**params):
        '''Fit a fresh clone with the given parameters and predict on X_test.'''
        estimator = clone(clf_outside)
        estimator.set_params(**params)
        estimator.fit(X_train, y_train)
        return estimator.predict(X_test)

    y_upper = _fit_predict(loss='quantile', alpha=alpha)
    y_lower = _fit_predict(loss='quantile', alpha=1.0 - alpha)
    try:
        # Current scikit-learn name of the least-squares loss.
        y_pred = _fit_predict(loss='squared_error')
    except ValueError:
        # Older scikit-learn releases only know the loss under its former name.
        y_pred = _fit_predict(loss='ls')
    return y_pred, y_upper, y_lower
    
def plot_Confidence_Interval(chg):
    '''Plot out-of-sample predictions with error bars against the real values.

    Only random forest and gradient boosting regressors are supported; for any
    other model the function returns after redrawing the selector. The x axis
    of every trace is the real test target.

    chg: event argument from ``observe`` / ``on_click``; not used.
    '''
    clear_output()
    selector_row = HBox([Label('Model Result Type: ', layout=Layout(width='200px')),
                         Result_chart_select], layout=Layout(margin='0px 0px 30px 0px'))
    display(selector_row)

    forest = model_obj.model
    forest_type = type(forest)
    supported_types = (sklearn.ensemble.RandomForestRegressor,
                       sklearn.ensemble.GradientBoostingRegressor)
    if forest_type not in supported_types:
        return None

    if forest_type == sklearn.ensemble.RandomForestRegressor:
        # Error bars are the square root of the variance from random_forest_error.
        y_hat = forest.predict(XX_test)
        yerr = np.sqrt(random_forest_error(forest, XX_train, XX_test))
        y_mid = y_hat
    else:
        # Error bars span from the midpoint of the quantile band to its upper bound.
        y_hat, y_upper, y_lower = gradientBoostingRegr_band(clf_outside=model_obj.model, X_train=XX_train,
                                                            y_train=yy_train, X_test=XX_test, y_test=yy_test)
        y_mid = 0.5 * (y_upper + y_lower)
        yerr = y_upper - y_mid

    def _markers(values, label, **extra):
        '''Marker trace of ``values`` plotted against the real test target.'''
        return go.Scatter(x=list(yy_test), y=list(values), mode='markers', name=label, **extra)

    traces = [
        _markers(y_mid, 'confidence_band',
                 error_y=dict(type='data', array=list(yerr), visible=True)),
        _markers(yy_test, 'real_value'),
        _markers(y_hat, 'prediction'),
    ]
    axes = go.Layout(xaxis=dict(title='ytest'), yaxis=dict(title='ypred'))
    iplot(go.Figure(data=traces, layout=axes))

In [None]:
def display_predictions_on_test():
    '''Show scores and test-set predictions for the last fitted model.

    Calls show_results() on the train/test split stored in the globals set by
    run_model(). Errors are printed instead of raised because this runs from a
    widget callback; a missing global (no model run yet) gets its own message.

    Returns None.
    '''
    try:
        show_results(X_train=XX_train, y_train=yy_train, X_test=XX_test, y_test=yy_test)
    except NameError as e:
        # The XX_*/yy_* globals only exist after run_model() has been executed.
        print('Run a model before displaying predictions ({})'.format(e))
    except Exception as e:
        print(e)


In [None]:
# Maps each option of Result_chart_select to the callable that builds its view;
# looked up and called by Result_switch_chart().
Result_graph_mapping = {'Visualize Prediction': make_Confidence_Interval, 'Display Predictions': display_predictions_on_test}

In [None]:
# def calculate_YoY(data, target):
#     '''expect self.data has columns date, sid, and the feature column to work on'''
#     df = data.copy()
#     df['date'] = pd.to_datetime(df['date'])
#     delta_1yr_left, delta_1yr_right = pd.to_timedelta(395, 'd'), pd.to_timedelta(365, 'd')
#     df["date_1yr_lag_left"], df["date_1yr_lag_right"] = (df['date'] - delta_1yr_left), (df['date'] - delta_1yr_right)

#     def each_row(x, target=target):
#         left_date, right_date = x['date_1yr_lag_left'], x['date_1yr_lag_right']
#         filter_df = df[(df['date'] > left_date) & (df['date'] < right_date)]
#         if filter_df.shape[0]<1:
#             return(np.nan)            
#         kpi_current, kpi_prev = x[target], filter_df.tail(1).squeeze()[target]
#         try:
#             return(kpi_current/kpi_prev - 1)
#         except:
#             return(np.nan)
#     result = df.apply(lambda x : each_row(x, target=target), axis=1)
#     return result

In [None]:
# ### temporary
# this_feature = 'transactions'

# data_with_features = dsData.data.copy()
# cols = [this_feature+'$YoY', this_feature+'$QoQ', this_feature+'$QQYY']
# for each_feature in cols:
#     data_with_features[each_feature] = np.nan
    
    
# univ_tickers = list(dsData.data['symbol'].unique())

# for this_ticker in univ_tickers:
#     data = dsData.data.copy()
#     data = data[data['symbol']==this_ticker]

#     ticker_MLdata = MLData()
#     ticker_MLdata.data = data

#     ticker_YoY = ticker_MLdata.calculate_YoY(this_feature)
#     ticker_QoQ = ticker_MLdata.calculate_QoQ(this_feature)
#     ticker_QQYY = ticker_MLdata.calculate_QQYY(this_feature)

#     data[this_feature+'$YoY'] = ticker_YoY
#     data[this_feature+'$QoQ'] = ticker_QoQ
#     data[this_feature+'$QQYY'] = ticker_QQYY

#     data_with_features.loc[data.index, cols] = data
    
# data_with_features.to_csv('fake_data.csv', index=None)    