**Author** Martín Gamboa

**Github** mmgamboa

**Date** January 10th, 2025

# Problem 1: Handling Outliers in Regression or Chi-Square Fitting

**Objective**. Explore different alternatives to determine whether an outlier should be considered or discarded when performing linear regression or chi-square fitting. Special attention is required for borderline cases where it is not evident if the point should be excluded.

**Requirements**

* Avoid using smoothing techniques.
* Effectiveness is not the priority; computational resource requirements for daily computation must be explicitly stated.

**Data**.
* QQQ and IWM (used as benchmarks).
* 2YM (likely referring to a 2-year metric or dataset).
* Use a logarithmic scale for computations.


In [2]:
# Get the data QQQ and IWM from Yahoo Finance
# Load specific packages
from importlib import reload
import time

import yfinance as yf

import numpy as np
import pandas as pd
import plotly.express as px

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output

# Import mymodule
import sys
sys.path.append('..')
from mymodule import *  

ModuleNotFoundError: No module named 'mymodule'

# Get Data

In [3]:
# Open question from the author: should this be 'Close' or 'Adj Close'?
param = 'Adj Close'

# Fetch two years of history for QQQ and IWM and keep only the selected price field.
data = yf.download(tickers=['QQQ', 'IWM'], period='2y')[param]

# Rebase each series on its first row so that both start at 1.
first_row = data.iloc[0]
data = data / first_row

[*********************100%***********************]  2 of 2 completed


In [5]:
# Visual sanity check of the rebased price series.
fig = px.line(data_frame=data, title='Normalized price of QQQ and IWM')
fig.show()

In [4]:
# One 'YYYY-MM-DD' string per row of the price frame, taken from its index.
dates = [stamp.strftime('%Y-%m-%d') for stamp in data.index]

Display the log-return difference and a scatter plot to show any possible trend between the two price series.

In [12]:
sys.path.append('..')
from src.features.build_features import compute_daily_return

# Despite its name, this frame holds one return column per ticker (QQQ and IWM),
# not their difference; later cells index it by ticker, so it is kept as-is.
log_returns_difference = compute_daily_return(data, data.index, ['QQQ', 'IWM'])

# Derive the spread that the figure title and y-axis label actually describe.
# Plotting the two-column frame directly drew two traces that were both forced
# to purple and therefore could not be told apart.
log_return_spread = log_returns_difference['QQQ'] - log_returns_difference['IWM']

# Single purple line for the QQQ - IWM spread
fig = px.line(x=log_return_spread.index,
              y=log_return_spread,
              title='Log Returns Difference (QQQ - IWM)')
# Dotted zero line: above it QQQ out-returned IWM on that row, below it the opposite
fig.add_hline(y=0, line_dash="dot", line_color="red")
# Axis labels
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Log Returns Difference')
# Set line color to purple
fig.update_traces(line_color='purple')

fig.show()

In [13]:
# Display the frame returned by compute_daily_return (one column per ticker).
log_returns_difference

Unnamed: 0_level_0,QQQ,IWM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-02-15,0.000000,0.000000
2023-02-16,-0.018943,-0.009611
2023-02-17,-0.007081,0.002748
2023-02-21,-0.023960,-0.029958
2023-02-22,0.000748,0.002717
...,...,...
2025-02-10,0.012032,0.004503
2025-02-11,-0.002384,-0.005831
2025-02-12,0.000587,-0.009259
2025-02-13,0.014283,0.011206


In [6]:
# Column used on each axis; these names are reused by the Dash callback further down.
xdata_label, ydata_label = 'IWM', 'QQQ'

# Scatter of one return series against the other, with each axis titled by its column.
fig = (
    px.scatter(x=log_returns_difference[xdata_label],
               y=log_returns_difference[ydata_label])
    .update_xaxes(title_text=xdata_label)
    .update_yaxes(title_text=ydata_label)
)
fig.show()

# Outlier detection in dataset

In [14]:
# Raw index values of the returns frame; passed to the date-lookup helper below.
full_indexes = log_returns_difference.index.values

# Try the lookup helper on a sample date.
# NOTE(review): find_closest_date is not defined in this notebook; presumably it
# comes from the star import of `mymodule` - confirm.
print(find_closest_date('2023-02-19', pd.to_datetime(full_indexes)))

# Map slider position -> date string for the range-slider marks.
date_indices = dict(enumerate(dates))

NameError: name 'find_closest_date' is not defined

In [8]:
# Initialize Dash app
app = Dash(__name__)

app.layout = html.Div([
    html.H1("Interactive Scatter Plot with Fitted Line"),
    
    dcc.Graph(id='scatter-plot'),
    
    html.Label("Threshold for Outlier Detection:"),
    dcc.Slider(id='threshold-slider', 
               min=1, 
               max=5, 
               step=0.5, 
               value=1.5,
               marks={i: str(i) for i in range(1, 6)}),
    html.Label("Select Initial Date:"),
    dcc.RangeSlider(id='date-range-slider', 
                    min=0, 
                    max=len(dates)-1, 
                    step=1, 
                    value=[0, len(dates)-1],
                    marks={i: date_indices[i] for i in range(0, len(dates), 30)}),
    html.Div([dcc.Dropdown(['std', 'iqr'], 
                            id='outlier-strategy', 
                            value='std',
                            ) ]),
    
])

@app.callback(
    Output('scatter-plot', 'figure'),
    Input('threshold-slider', 'value'),
    Input('date-range-slider', 'value'),
    Input('outlier-strategy', 'value'),
)
def update_plot(threshold, range_dates, outlier_strategy):
    """Rebuild the scatter figure whenever one of the three controls changes.

    Args:
        threshold: Value of the 'threshold-slider' control; forwarded unchanged
            to ``fit_adaptative_line``.
        range_dates: Two-element sequence from 'date-range-slider' holding the
            start and end positions into the module-level ``dates`` list.
        outlier_strategy: Value of the 'outlier-strategy' dropdown ('std' or
            'iqr'); ``None`` is treated as 'std'.

    Returns:
        A ``plotly.graph_objects.Figure`` with the raw points, the fit on all
        points, the fit returned by ``fit_adaptative_line`` and the points that
        were not accepted by it.

    NOTE(review): ``find_closest_date``, ``apply_filter_by_dates``, ``fit_line``,
    ``fit_adaptative_line`` and ``monitor_resources`` are not defined in this
    notebook; presumably they come from the star import of ``mymodule`` -
    confirm their contracts there.
    """
    # A cleared dropdown delivers None; fall back to the layout's initial value.
    if outlier_strategy is None:
        outlier_strategy = 'std'

    print('__________________________________________')
    t0 = time.time()

    # Convert the index once instead of once per lookup.
    index_as_dates = pd.to_datetime(full_indexes)
    # Convert slider positions to dates present in the returns index
    initial_date = find_closest_date(pd.to_datetime(dates[range_dates[0]]), index_as_dates)
    end_date = find_closest_date(pd.to_datetime(dates[range_dates[1]]), index_as_dates)

    # Filter data based on the selected date range
    filtered_data = apply_filter_by_dates(log_returns_difference, initial_date, end_date)
    print("Removing outliers with method: ", outlier_strategy)

    ## Fit raw data
    X = filtered_data[xdata_label].values.reshape(-1, 1)
    y = filtered_data[ydata_label].values

    x_pred, y_pred, reg_model = fit_line(X, y, nvals=100)
    # Compute residuals of the fit on all points
    residuals = y - reg_model.predict(X)

    # Fit line without outliers
    # Adaptive fitting. If an outlier is close enough to the line, it is considered an inlier
    x_pred_no_outliers, y_pred_no_outliers, accepted_idxs = fit_adaptative_line(
        X, y, residuals, initial_date, end_date, outlier_strategy, threshold)

    print(f"Time elapsed in preprocessed, outliers and fitting: {time.time() - t0}")

    print("Monitor resources pre-plotting", )
    monitor_resources()

    def _padded_range(values):
        # Widen [min, max] by 10% of each bound's magnitude to leave a margin.
        low, high = values.min(), values.max()
        return [low - np.abs(low) * 0.1, high + np.abs(high) * 0.1]

    # Create the figure
    fig = go.Figure()

    # Fit on all points. Drawn as a dashed green line so it reads as a line and
    # can be told apart from the red no-outlier fit (it used to be rendered as
    # markers in the same red).
    fig.add_trace(go.Scatter(x=x_pred, y=y_pred, mode='lines',
                             name='Fitted line', line=dict(color='green', dash='dash')))

    # Raw data points
    fig.add_trace(go.Scatter(x=X.flatten(), y=y, mode='markers',
                             name='Raw data', marker=dict(color='blue')))

    # Fitted line without outliers
    fig.add_trace(go.Scatter(x=x_pred_no_outliers, y=y_pred_no_outliers, mode='lines',
                             name='Fitted line (no outliers)', line=dict(color='red')))

    # Outliers: points whose entry in accepted_idxs is False, drawn over the raw data
    fig.add_trace(go.Scatter(x=X[~accepted_idxs].flatten(), y=y[~accepted_idxs], mode='markers',
                             name='Outliers', marker=dict(color='black')))

    # Axis titles and padded ranges (titles are set here only, not again in the layout)
    fig.update_xaxes(title_text=xdata_label, range=_padded_range(X))
    fig.update_yaxes(title_text=ydata_label, range=_padded_range(y))

    # Horizontal legend placed below the plot area
    fig.update_layout(legend=dict(orientation="h", x=0, y=-0.2))

    print("Monitor resources post-plotting", )
    monitor_resources()
    return fig

# Start the app inside the notebook. `run_server` is the legacy alias of `run`,
# and Dash selects the notebook display mode with `jupyter_mode`; the bare
# `mode` keyword belonged to the separate JupyterDash package.
app.run(jupyter_mode='inline', debug=True)


__________________________________________
Removing outliers with method:  std
Score: 0.373300226
Coef: 0.54 - 0.00099
Score: 0.473667256
Coef: 0.5 - 0.00143
Score: 0.469429429
Coef: 0.52 - 0.00135
Time elapsed in preprocessed, outliers and fitting: 0.02034783363342285
Monitor resources pre-plotting
CPU Usage: 12.6%
Memory Usage: 54.7% (6.37 GB used)
Monitor resources post-plotting
CPU Usage: 10.3%
Memory Usage: 54.7% (6.40 GB used)


In [9]:
# Resource requirements: deep memory footprint of the returns frame, reported in MB.
frame_bytes = log_returns_difference.memory_usage(deep=True).sum()
print(f"Pandas DataFrame used {frame_bytes/1024**2:4.3f} MB")
    

Pandas DataFrame used 0.036 MB
