<a href="https://colab.research.google.com/github/nounou176/research-quant/blob/main/QuantTradingAccelerator_Video3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Quant Trading Accelerator 🚀

Learn from zero, extremely fast => JIT (just-in-time) learning => Build, Test, Learn, Iterate 🚀

## Part 3: Vectorization

In this video we will build a data analysis library while learning about vectorization.

### Vector Algebra

In [None]:
import numpy as np

class Vector:
    """
    A lightweight vector wrapper around NumPy arrays, providing
    elementwise arithmetic operations, statistics, and operator overloads.

    Parameters
    ----------
    data : array_like or Vector
        Input data to initialize the vector. Can be a list, tuple, NumPy
        array, or another Vector (whose elements are copied).

    Attributes
    ----------
    data : np.ndarray
        The underlying NumPy array storing the vector elements.

    Examples
    --------
    >>> v = Vector([1, 2, 3])
    >>> v + 2
    Vector([3 4 5])
    >>> v * v
    Vector([1 4 9])
    >>> 2 ** v
    Vector([2 4 8])
    >>> float(v.mean())
    2.0
    """

    data: np.ndarray

    def __init__(self, data) -> None:
        """Initialize the vector with a copy of the given data."""
        # Unwrap another Vector so NumPy copies its array in one step instead
        # of walking it element by element through __len__/__getitem__.
        if isinstance(data, Vector):
            data = data.data
        self.data = np.array(data)

    # ---------------------------------------------------------------------
    # Basic arithmetic methods
    # ---------------------------------------------------------------------
    def add(self, y) -> np.ndarray:
        """
        Add a scalar, array-like object, or Vector to the vector.

        Parameters
        ----------
        y : scalar, array_like, or Vector
            The value(s) to add.

        Returns
        -------
        np.ndarray
            Elementwise sum.
        """
        return self.data + self._unwrap(y)

    def sub(self, y) -> np.ndarray:
        """Subtract a scalar, array-like object, or Vector; returns np.ndarray."""
        return self.data - self._unwrap(y)

    def mul(self, y) -> np.ndarray:
        """Multiply elementwise by a scalar, array-like object, or Vector; returns np.ndarray."""
        return self.data * self._unwrap(y)

    def div(self, y) -> np.ndarray:
        """Divide elementwise by a scalar, array-like object, or Vector; returns np.ndarray."""
        return self.data / self._unwrap(y)

    # ---------------------------------------------------------------------
    # Statistical methods
    # ---------------------------------------------------------------------
    def sum(self):
        """
        Return the sum of all elements in the vector.

        Returns
        -------
        float
            The sum of all vector elements (a NumPy scalar).
        """
        return np.sum(self.data)

    def mean(self):
        """
        Return the mean (average) of the vector elements.

        Returns
        -------
        float
            The mean of the vector elements (a NumPy scalar).
        """
        return np.mean(self.data)

    def var(self):
        """
        Return the variance of the vector elements.

        The squared deviations are divided by the number of elements
        (population variance, no degrees-of-freedom correction).

        Returns
        -------
        float
            The variance of the vector elements (a NumPy scalar).
        """
        mu = self.mean()
        return np.mean((self.data - mu) ** 2)

    def std(self):
        """
        Return the standard deviation of the vector elements.

        Returns
        -------
        float
            The square root of ``var()``.
        """
        return np.sqrt(self.var())

    def len(self):
        """
        Return the number of elements in the vector.

        Returns
        -------
        int
            Number of elements.
        """
        return len(self.data)

    # ---------------------------------------------------------------------
    # Operator overloads
    # ---------------------------------------------------------------------
    def __add__(self, other):
        """Implements self + other (elementwise addition)."""
        return Vector(self.data + self._to_array(other))

    def __sub__(self, other):
        """Implements self - other (elementwise subtraction)."""
        return Vector(self.data - self._to_array(other))

    def __mul__(self, other):
        """Implements self * other (elementwise multiplication)."""
        return Vector(self.data * self._to_array(other))

    def __truediv__(self, other):
        """Implements self / other (elementwise division)."""
        return Vector(self.data / self._to_array(other))

    def __radd__(self, other):
        """Implements other + self."""
        return self.__add__(other)

    def __rsub__(self, other):
        """Implements other - self."""
        return Vector(self._to_array(other) - self.data)

    def __rmul__(self, other):
        """Implements other * self."""
        return self.__mul__(other)

    def __rtruediv__(self, other):
        """Implements other / self."""
        return Vector(self._to_array(other) / self.data)

    def __pow__(self, power):
        """
        Implements self ** power (elementwise exponentiation).

        Parameters
        ----------
        power : float, int, array_like, or Vector
            Exponent(s) to which each element is raised.

        Returns
        -------
        Vector
            New vector with each element raised to the given power.
        """
        return Vector(self.data ** self._unwrap(power))

    def __rpow__(self, base):
        """Implements base ** self (each element used as an exponent)."""
        return Vector(self._to_array(base) ** self.data)

    def __neg__(self):
        """Implements -self (elementwise negation)."""
        return Vector(-self.data)

    # ---------------------------------------------------------------------
    # Accessors
    # ---------------------------------------------------------------------
    def __getitem__(self, index):
        """
        Allow element or slice access via v[index].

        Parameters
        ----------
        index : int or slice
            Index or slice object.

        Returns
        -------
        scalar or Vector
            A single element (if int index) or a new Vector (if slice).
        """
        result = self.data[index]
        # Anything that still comes back as an array (slices, masks, index
        # lists) is re-wrapped; single elements are returned bare.
        if isinstance(result, np.ndarray):
            return Vector(result)
        return result

    def __len__(self):
        """Return the number of elements in the vector (for len(v))."""
        return len(self.data)

    # ---------------------------------------------------------------------
    # Internal helpers and representation
    # ---------------------------------------------------------------------
    @staticmethod
    def _unwrap(x):
        """Return the underlying array of a Vector; pass anything else through untouched."""
        return x.data if isinstance(x, Vector) else x

    def _to_array(self, x):
        """
        Convert input to a NumPy array for arithmetic operations.

        Parameters
        ----------
        x : scalar, array_like, or Vector
            Input to convert.

        Returns
        -------
        np.ndarray
            NumPy array representation.
        """
        if isinstance(x, Vector):
            return x.data
        return np.array(x)

    def __repr__(self):
        """Return a string representation of the Vector."""
        return f"Vector({self.data})"


### Vector-Scalar Algebra

In [None]:
vec = Vector([1,2,3,])
vec

Vector([1 2 3])

In [None]:
vec + 1

Vector([2 3 4])

In [None]:
[1 + 1, 2 + 1, 3 + 1]

[2, 3, 4]

In [None]:
vec - 2

Vector([-1  0  1])

In [None]:
[1 - 2, 2 - 2, 3 - 2]

[-1, 0, 1]

In [None]:
vec * 2

Vector([2 4 6])

In [None]:
vec / 2

Vector([0.5 1.  1.5])

In [None]:
v = []
for e in [1,2,3]:
  v.append(e + 1)
v

[2, 3, 4]

### Power of Vectorization

In [None]:
n = 100000000
v1 = [1 for _ in range(n)]
v2 = Vector(v1)

In [None]:
%%time
y = []
for x in v1:
  y.append(x + 1)
y[-10:]

CPU times: user 8.3 s, sys: 512 ms, total: 8.81 s
Wall time: 8.96 s


[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

In [None]:
%%time
y = v2 + 1
y.data[-10:]

CPU times: user 432 ms, sys: 497 ms, total: 928 ms
Wall time: 931 ms


array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
8.3 / 0.432

19.212962962962965

Speed == SIMD

### Vector-Vector Algebra

In [None]:
x = Vector([1, 2, 3, 4])
y = Vector([1, -1, 2, -2])

In [None]:
x + y

Vector([2 1 5 2])

In [None]:
[1 + 1, 2 + (-1), 3 + 2, 4 + (-2)]

[2, 1, 5, 2]

In [None]:
x - y

Vector([0 3 1 6])

In [None]:
[1 - 1, 2 - (-1), 3 - 2, 4 - (-2)]

[0, 3, 1, 6]

In [None]:
x * y

Vector([ 1 -2  6 -8])

In [None]:
[1 * 1, 2 * -1, 3 * 2, 4 * (-2)]

[1, -2, 6, -8]

### Vectorized Statistics

In [None]:
log_returns = Vector([0.01, 0.015, 0.02, -0.01])
mu = log_returns.mean()

In [None]:
mu

np.float64(0.008749999999999999)

In [None]:
log_returns - mu

Vector([ 0.00125  0.00625  0.01125 -0.01875])

In [None]:
(log_returns - mu) ** 2

Vector([1.562500e-06 3.906250e-05 1.265625e-04 3.515625e-04])

In [None]:
((log_returns - mu) ** 2).mean()

np.float64(0.0001296875)

In [None]:
log_returns.var()

np.float64(0.0001296875)

In [None]:
np.sqrt(((log_returns - mu) ** 2).mean())

np.float64(0.011388041973930374)

In [None]:
np.sqrt(log_returns.var())

np.float64(0.011388041973930374)

In [None]:
log_returns.std()

np.float64(0.011388041973930374)

### Vectorized Sharpe Ratio

In [None]:
portfolio_log_returns = Vector([0.01, 0.01, 0.02, -0.01])

In [None]:
portfolio_log_returns.sum()

np.float64(0.03)

In [None]:
portfolio_log_returns.mean()

np.float64(0.0075)

In [None]:
portfolio_log_returns.std()

np.float64(0.010897247358851685)

In [None]:
portfolio_log_returns.mean() / portfolio_log_returns.std()

np.float64(0.6882472016116852)

In [None]:
portfolio_log_returns = Vector([-0.01, -0.01, -0.01, 0.06])

In [None]:
portfolio_log_returns.sum()

np.float64(0.03)

In [None]:
portfolio_log_returns.mean()

np.float64(0.0075)

In [None]:
portfolio_log_returns.std()

np.float64(0.03031088913245535)

In [None]:
portfolio_log_returns.mean() / portfolio_log_returns.std()

np.float64(0.24743582965269678)

### Build a Data Analysis Library

In [None]:
import numpy as np

class Column:
    """
    Column represents a single named column of data in a tabular dataset.

    Each Column has a name and a vector of data (wrapped in a Vector object).
    Supports length, sum, shift, division, and logarithm, and allows
    element-wise division by other Columns, Vectors, arrays, or scalars.
    """

    vec: 'Vector'

    def __init__(self, name, x):
        """
        Initialize a Column.

        Parameters
        ----------
        name : str
            The name of the column.
        x : list, np.ndarray, Vector, or Column
            The data for the column.
        """
        # Peel wrappers down to the raw array so the data is copied in one
        # NumPy call rather than iterated element by element.
        if isinstance(x, Column):
            x = x.vec
        if isinstance(x, Vector):
            x = x.data
        self.vec = Vector(x)
        self.name = name

    def len(self):
        """
        Return the number of elements in the column.

        Returns
        -------
        int
            Length of the column vector.
        """
        return len(self.vec)

    def __len__(self):
        """Return the number of elements in the column (for len(col))."""
        return len(self.vec)

    def sum(self):
        """
        Compute the sum of all elements in the column.

        Returns
        -------
        float
            Sum of the column's elements.
        """
        # Delegate to the Vector. np.sum(self.vec) would dispatch to
        # Vector.sum(axis=..., out=...), which accepts no arguments and
        # therefore raises TypeError.
        return self.vec.sum()

    def shift(self, n=1):
        """
        Shift the column data by n positions, filling vacated entries with NaN.

        The result always has the same length as the column.

        Parameters
        ----------
        n : int, optional
            Number of positions to shift (default is 1). Positive values shift
            downward (a lag), negative values shift upward (a lead), and 0
            returns the data unshifted. If ``abs(n)`` is at least the column
            length, every entry is NaN.

        Returns
        -------
        np.ndarray
            Shifted data as a NumPy array.
        """
        values = self.vec.data
        size = len(values)
        # Clamp so over-long shifts still yield exactly `size` entries.
        k = min(abs(n), size)
        pad = np.full(k, np.nan)
        if n >= 0:
            return np.concatenate((pad, values[:size - k]))
        return np.concatenate((values[k:], pad))

    def div(self, y) -> 'Vector':
        """
        Divide the column element-wise by another Column or array-like object.

        Parameters
        ----------
        y : Column, Vector, array-like, or scalar
            The divisor.

        Returns
        -------
        Vector
            Resulting element-wise division, wrapped in a Vector.
        """
        if isinstance(y, Column):
            y = y.vec
        return self.vec / y

    def log(self):
        """
        Compute the natural logarithm of each element in the column.

        Returns
        -------
        np.ndarray
            Element-wise natural logarithm of the column.
        """
        return np.log(self.vec.data)

    def __truediv__(self, other) -> 'Vector':
        """
        Enable the use of '/' operator for element-wise division.

        Parameters
        ----------
        other : Column, Vector, array-like, or scalar
            The divisor.

        Returns
        -------
        Vector
            Element-wise division result (same as ``div``).
        """
        return self.div(other)

    def __repr__(self):
        """
        Return a string representation of the column.

        Shows the first 10 elements of data and the column's name and length.

        Returns
        -------
        str
            Formatted string representation of the Column.
        """
        values = self.vec.data
        preview = ", ".join(map(str, values[:10]))  # show first 10 items
        if len(values) > 10:
            preview += ", ..."
        return f"Column(name='{self.name}', data=[{preview}], len={len(values)})"


In [None]:
col = Column('trade_pnl', [2.0, -1.0, 3.0, 1.5])

In [None]:
col

Column(name='trade_pnl', data=[2.0, -1.0, 3.0, 1.5], len=4)

In [None]:
class DataFrame:
    """
    DataFrame: a simple tabular data structure.

    Maintains a list of Column objects in self.cols, with at most one column
    per name when built through ``append`` / ``add_col``. Supports length,
    adding or replacing columns, selecting columns by name, and printing a
    formatted preview.
    """

    def __init__(self, cols):
        """
        Initialize the DataFrame.

        Parameters
        ----------
        cols : list
            A list of Column objects to initialize the DataFrame. The list is
            stored as given (not copied).
        """
        self.cols = cols

    def __len__(self):
        """
        Return the number of rows in the DataFrame.

        Returns
        -------
        int
            Number of rows, taken from the first column (all columns are
            assumed to have the same length). 0 if there are no columns.
        """
        if not self.cols:
            return 0
        return self.cols[0].len()

    def append(self, col):
        """
        Append a Column object, replacing any existing column of the same name.

        Parameters
        ----------
        col : Column
            The Column object to append. If a column with the same name is
            already present it is replaced in place, keeping its position.
        """
        for position, existing in enumerate(self.cols):
            if existing.name == col.name:
                self.cols[position] = col
                return
        self.cols.append(col)

    def add_col(self, name, col):
        """
        Create a Column from name and data, and add it to the DataFrame.

        Parameters
        ----------
        name : str
            Name of the new column.
        col : list or np.ndarray
            Data for the new column.
        """
        # Go through append() so an existing column of the same name is
        # replaced; a duplicate could never be reached by name lookup.
        self.append(Column(name, col))

    def _find(self, name):
        """Return the first column called `name`, or raise KeyError."""
        for col in self.cols:
            if col.name == name:
                return col
        raise KeyError(f"Column '{name}' not found.")

    def __getitem__(self, keys):
        """
        Select columns by name.

        Parameters
        ----------
        keys : str or list of str
            Column name(s) to select.

        Returns
        -------
        np.ndarray
            For a single name, the column's underlying 1D array. For a list
            of names, a 2D array with shape (n_rows, n_selected_columns).

        Raises
        ------
        KeyError
            If a requested column name does not exist.
        TypeError
            If `keys` is neither a string nor a list.
        """
        if isinstance(keys, str):
            return self._find(keys).vec.data
        if isinstance(keys, list):
            # Each selected column becomes one column of the result, so rows
            # are observations.
            return np.column_stack([self._find(key).vec.data for key in keys])
        raise TypeError("Key must be a string or list of strings.")

    def __repr__(self):
        """
        Return a string representation of the DataFrame.

        Displays the first 10 rows of the DataFrame as a formatted table
        with column headers. If the DataFrame has more than 10 rows,
        adds "..." at the end to indicate more data.

        Returns
        -------
        str
            Formatted string table of the DataFrame preview, or
            "Empty DataFrame" when there are no columns.
        """
        if not self.cols:
            return "Empty DataFrame"

        n_rows = len(self)
        preview_rows = min(n_rows, 10)

        # Render every previewed cell once; reused for widths and rows.
        cells = [
            [str(col.vec[i]) for i in range(preview_rows)] for col in self.cols
        ]

        # Each column is as wide as its header or its widest previewed value.
        col_widths = [
            max([len(col.name)] + [len(text) for text in col_cells])
            for col, col_cells in zip(self.cols, cells)
        ]

        header = " | ".join(
            col.name.ljust(width) for col, width in zip(self.cols, col_widths)
        )

        # Separator (column-aligned)
        separator = "-+-".join("-" * width for width in col_widths)

        rows = [
            " | ".join(
                col_cells[i].ljust(width)
                for col_cells, width in zip(cells, col_widths)
            )
            for i in range(preview_rows)
        ]

        table = "\n".join([header, separator] + rows)
        if n_rows > 10:
            table += "\n..."
        return table


In [None]:
from datetime import datetime, timedelta

time = Column('time', [datetime(2025, 10, 1) + timedelta(days=i+1) for i in range(7)])
price = Column('price', [10.0, 11.0, 12.0, 10.0, 13.0, 14.0, 15.0])

table = DataFrame([time, price])
table

time                | price
--------------------+------
2025-10-02 00:00:00 | 10.0 
2025-10-03 00:00:00 | 11.0 
2025-10-04 00:00:00 | 12.0 
2025-10-05 00:00:00 | 10.0 
2025-10-06 00:00:00 | 13.0 
2025-10-07 00:00:00 | 14.0 
2025-10-08 00:00:00 | 15.0 

In [None]:
price_lag_1 = price.shift()
price_lag_1

array([nan, 10., 11., 12., 10., 13., 14.])

In [None]:
table.append(Column('price_lag_1', price_lag_1))

In [None]:
table

time                | price | price_lag_1
--------------------+-------+------------
2025-10-02 00:00:00 | 10.0  | nan        
2025-10-03 00:00:00 | 11.0  | 10.0       
2025-10-04 00:00:00 | 12.0  | 11.0       
2025-10-05 00:00:00 | 10.0  | 12.0       
2025-10-06 00:00:00 | 13.0  | 10.0       
2025-10-07 00:00:00 | 14.0  | 13.0       
2025-10-08 00:00:00 | 15.0  | 14.0       

In [None]:
price.div(price_lag_1)

Vector([       nan 1.1        1.09090909 0.83333333 1.3        1.07692308
 1.07142857])

In [None]:
price / price_lag_1

Vector([       nan 1.1        1.09090909 0.83333333 1.3        1.07692308
 1.07142857])

In [None]:
ratio = price / price_lag_1
ratio_col = Column('ratio', ratio)
table.append(ratio_col)

In [None]:
table

time                | price | price_lag_1 | ratio             
--------------------+-------+-------------+-------------------
2025-10-02 00:00:00 | 10.0  | nan         | nan               
2025-10-03 00:00:00 | 11.0  | 10.0        | 1.1               
2025-10-04 00:00:00 | 12.0  | 11.0        | 1.0909090909090908
2025-10-05 00:00:00 | 10.0  | 12.0        | 0.8333333333333334
2025-10-06 00:00:00 | 13.0  | 10.0        | 1.3               
2025-10-07 00:00:00 | 14.0  | 13.0        | 1.0769230769230769
2025-10-08 00:00:00 | 15.0  | 14.0        | 1.0714285714285714

### Generate Auto-Regressive Log Returns

In [None]:
log_return = ratio_col.log()
log_return

array([        nan,  0.09531018,  0.08701138, -0.18232156,  0.26236426,
        0.07410797,  0.06899287])

In [None]:
log_return_col = Column('log_return', log_return)
table.append(log_return_col)
table

time                | price | price_lag_1 | ratio              | log_return         
--------------------+-------+-------------+--------------------+--------------------
2025-10-02 00:00:00 | 10.0  | nan         | nan                | nan                
2025-10-03 00:00:00 | 11.0  | 10.0        | 1.1                | 0.09531017980432493
2025-10-04 00:00:00 | 12.0  | 11.0        | 1.0909090909090908 | 0.0870113769896297 
2025-10-05 00:00:00 | 10.0  | 12.0        | 0.8333333333333334 | -0.1823215567939546
2025-10-06 00:00:00 | 13.0  | 10.0        | 1.3                | 0.26236426446749106
2025-10-07 00:00:00 | 14.0  | 13.0        | 1.0769230769230769 | 0.07410797215372183
2025-10-08 00:00:00 | 15.0  | 14.0        | 1.0714285714285714 | 0.06899287148695142

In [None]:
log_return_lag_1_col = Column('log_return_lag_1', log_return_col.shift())
table.append(log_return_lag_1_col)
table

time                | price | price_lag_1 | ratio              | log_return          | log_return_lag_1   
--------------------+-------+-------------+--------------------+---------------------+--------------------
2025-10-02 00:00:00 | 10.0  | nan         | nan                | nan                 | nan                
2025-10-03 00:00:00 | 11.0  | 10.0        | 1.1                | 0.09531017980432493 | nan                
2025-10-04 00:00:00 | 12.0  | 11.0        | 1.0909090909090908 | 0.0870113769896297  | 0.09531017980432493
2025-10-05 00:00:00 | 10.0  | 12.0        | 0.8333333333333334 | -0.1823215567939546 | 0.0870113769896297 
2025-10-06 00:00:00 | 13.0  | 10.0        | 1.3                | 0.26236426446749106 | -0.1823215567939546
2025-10-07 00:00:00 | 14.0  | 13.0        | 1.0769230769230769 | 0.07410797215372183 | 0.26236426446749106
2025-10-08 00:00:00 | 15.0  | 14.0        | 1.0714285714285714 | 0.06899287148695142 | 0.07410797215372183

### Matrices

### Column Order (Column Major)

In [None]:
x = [1,2,3,4]
y = [1,1,1,1]
matrix = np.array([x,y])

In [None]:
matrix

array([[1, 2, 3, 4],
       [1, 1, 1, 1]])

In [None]:
matrix[0]

array([1, 2, 3, 4])

In [None]:
matrix[1]

array([1, 1, 1, 1])

### Row Order (Row Major)

In [None]:
matrix = np.array([[1, 1], [2, 1], [3, 1], [4, 1]])

In [None]:
matrix[0]

array([1, 1])

In [None]:
matrix[1]

array([2, 1])

In [None]:
matrix[2]

array([3, 1])

In [None]:
matrix[3]

array([4, 1])

### Create Features

In [None]:
X = table[['log_return_lag_1']]

In [None]:
X

array([[        nan],
       [        nan],
       [ 0.09531018],
       [ 0.08701138],
       [-0.18232156],
       [ 0.26236426],
       [ 0.07410797]])

### Create Target

In [None]:
y = table['log_return']

In [None]:
y

array([        nan,  0.09531018,  0.08701138, -0.18232156,  0.26236426,
        0.07410797,  0.06899287])

### Exercise 1: Create Log Returns

In [None]:
cols = [
    Column('date',[datetime(2025, 10, 1)+timedelta(days=1+i) for i in range(10)]),
    Column('price', [10.0, 8.0, 11.0, 7.0, 9.0, 12.0, 8.0, 9.0, 7.0, 10.0])
]
df = DataFrame(cols)
df

date                | price
--------------------+------
2025-10-02 00:00:00 | 10.0 
2025-10-03 00:00:00 | 8.0  
2025-10-04 00:00:00 | 11.0 
2025-10-05 00:00:00 | 7.0  
2025-10-06 00:00:00 | 9.0  
2025-10-07 00:00:00 | 12.0 
2025-10-08 00:00:00 | 8.0  
2025-10-09 00:00:00 | 9.0  
2025-10-10 00:00:00 | 7.0  
2025-10-11 00:00:00 | 10.0 

In [None]:
# add log return column to table as a vectorised one-liner

In [None]:
# checks the last col (log return) is calculated correctly
np.allclose(
    df.cols[-1].vec.data,
    [np.nan, np.log(8.0/10.0), np.log(11.0/8.0), np.log(7.0/11.0),
     np.log(9.0/7.0), np.log(12.0/9.0), np.log(8.0/12.0),
     np.log(9.0/8.0), np.log(7.0/9.0), np.log(10.0/7.0)],
    equal_nan=True
)

False

### Exercise 2: Add Log Return Lag

In [None]:
#add log return lag column

In [None]:
np.allclose(
    df.cols[-1].vec.data,
    [np.nan, np.nan, np.log(8.0/10.0), np.log(11.0/8.0), np.log(7.0/11.0),
     np.log(9.0/7.0), np.log(12.0/9.0), np.log(8.0/12.0),
     np.log(9.0/8.0), np.log(7.0/9.0)],
    equal_nan=True
)

False

### AR(1) Model