<a href="https://colab.research.google.com/github/nickmdefelice/dataminingproject/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports

Note:
The code runs correctly with smaller values of N, but with larger values it often runs out of memory or crashes in Colab. N is the number of stocks taken from the training set. It defaults to 2000; if that does not work, change N to something smaller, such as 500 or less, and the models should still perform well. The testing sets are always processed in full.

In [None]:
# Number of stocks (DataFrames) kept from the training set; lower it if the session runs out of memory.
N = 2000

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import pickle
import gc
import warnings
warnings.filterwarnings('ignore')


**Task 1: Data Preparation**

Load training set

In [None]:
# Deserialize training_set.pkl into `training_set`.
# NOTE: pickle.load can execute arbitrary code while loading; only open pickle files from a trusted source.
with open('training_set.pkl', 'rb') as f:
    training_set = pickle.load(f)
# Per the project description, training_set is a list of 2,000 pandas DataFrames,
# each holding the historical trading data of a single stock.

In [None]:
# Keep only the first N stocks to limit memory use.
training_set = training_set[:N]



# From here on, 'training_set' refers to this N-stock subset.


Calculate the daily percentage change of the closing price for every stock and flatten the results into a single 1D array

In [None]:
# Daily percentage change of each stock's closing price, relative to the
# previous day's close, pooled over every stock into one flat 1-D float array.
# Each stock contributes a numpy array (its leading NaN is dropped), so the
# pooled data is stored compactly instead of as a Python list of boxed floats.
_per_stock_changes = [stock['Close'].pct_change().dropna().to_numpy() for stock in training_set]
if _per_stock_changes:
    flat_changes = np.concatenate(_per_stock_changes)
else:
    # np.concatenate rejects an empty list; keep an empty array in that case.
    flat_changes = np.array([], dtype=float)
del _per_stock_changes

Determine thresholds

In [None]:
# Split the pooled percentage changes into three equally sized levels
# (decrease / no change / increase): sort them and read off the values that
# sit one third and two thirds of the way through the sorted array.
sorted_changes = np.sort(flat_changes)
num_points = len(sorted_changes)
num_third = num_points // 3
threshold1, threshold2 = (sorted_changes[k * num_third] for k in (1, 2))

Divide the changes into 3 levels

In [None]:
# Count how many pooled percentage changes fall into each level.
# The comparisons run once over the whole array instead of once per element
# in a Python-level generator.
_changes = np.asarray(flat_changes, dtype=float)
num_decrease = int(np.count_nonzero(_changes < threshold1))
num_nochange = int(np.count_nonzero((_changes >= threshold1) & (_changes < threshold2)))
# Everything that is neither a decrease nor "no change" is an increase.
num_increase = num_points - num_decrease - num_nochange
del _changes

Results

In [None]:
# Report the two thresholds and how many data points landed in each level.
for caption, value in (
    ("Threshold 1: ", threshold1),
    ("Threshold 2: ", threshold2),
    ("Number of data points for increase: ", num_increase),
    ("Number of data points for no change: ", num_nochange),
    ("Number of data points for decrease: ", num_decrease),
):
    print(caption, value)

Threshold 1:  -0.001942987939540286
Threshold 2:  0.0019470602816240579
Number of data points for increase:  7338
Number of data points for no change:  7336
Number of data points for decrease:  7336


**Task 2: Feature Engineering**

Create features function

In [None]:
def create_features(df):
    """Add the numbered technical-indicator feature columns to an OHLCV frame.

    The columns are written onto ``df`` itself (the frame is modified in
    place) and the same object is returned, so callers may either use the
    return value or rely on the mutation.

    Parameters
    ----------
    df : pandas.DataFrame
        Trading history of one stock. The columns 'Open', 'High', 'Low',
        'Close' and 'Volume' are read. Rows are assumed to be in
        chronological order (TODO confirm against the data source).

    Returns
    -------
    pandas.DataFrame
        ``df`` with the feature columns added. Rolling and shifted
        indicators are NaN during their warm-up period and ratios can be
        +/-inf when a denominator is zero; nothing is imputed here.

    Notes
    -----
    'Mode_Close' and 'Sharpe_Ratio' are whole-series statistics repeated on
    every row, and 'Chikou_Span' holds the close 26 rows *ahead*; all three
    therefore carry information from later rows.
    """
    high = df['High']
    low = df['Low']
    close = df['Close']
    volume = df['Volume']

    # Building blocks shared by several indicators.
    prev_close = close.shift(1)
    pct_change = close.pct_change()
    high_low = high - low
    typical_price = (high + low + close) / 3
    spans = (5, 10, 20, 50, 100)

    # Feature 1: Daily Close Percentage Change
    df['PCT_CHG'] = pct_change

    # Features 2-6: Simple Moving Averages 5/10/20/50/100
    for span in spans:
        df[f'MA_{span}'] = close.rolling(window=span).mean()

    # Feature 7: Moving Average Convergence Divergence (MACD)
    df['MACD'] = (close.ewm(span=12, adjust=False).mean()
                  - close.ewm(span=26, adjust=False).mean())

    # Feature 8: Relative Strength Index (RSI), 14-row simple averages
    delta = close.diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()
    rsi = 100 - (100 / (1 + avg_gain / avg_loss))
    df['RSI'] = rsi

    # Feature 9: Money Flow Index (MFI)
    money_flow = typical_price * volume
    positive_flow = money_flow.where(pct_change > 0, 0).rolling(window=14).sum()
    negative_flow = money_flow.where(pct_change < 0, 0).rolling(window=14).sum()
    df['MFI'] = 100 - (100 / (1 + (positive_flow / negative_flow)))

    # Features 10-13: intraday price differences
    df['H-L'] = high_low
    df['H-C'] = high - close
    df['L-C'] = low - close
    df['Close-Open'] = close - df['Open']

    # Features 14-17: natural logs (a zero input yields -inf)
    df['log_high'] = np.log(high)
    df['log_low'] = np.log(low)
    df['log_close'] = np.log(close)
    df['log_volume'] = np.log(volume)

    # Features 18 and 19: Bollinger Bands (20-row mean +/- 2 standard deviations)
    sma_20 = close.rolling(window=20).mean()
    std_20 = close.rolling(window=20).std()
    df['BB_upper'] = sma_20 + (2 * std_20)
    df['BB_lower'] = sma_20 - (2 * std_20)

    # Feature 20: Standard Deviation of Close (20 rows)
    df['Std_Close'] = std_20

    # Feature 21: Mode of Close over the whole series (NaN if Close has no values)
    close_modes = close.mode()
    df['Mode_Close'] = close_modes.iloc[0] if not close_modes.empty else np.nan

    # Feature 22: Range of Close (same values as 'H-L')
    df['Range_Close'] = high_low

    # Feature 23: Sharpe Ratio (annualised with 252 periods, whole series, no risk-free rate)
    annualized_mean = pct_change.mean() * 252
    annualized_std = pct_change.std() * np.sqrt(252)
    df['Sharpe_Ratio'] = annualized_mean / annualized_std

    # Feature 24: Volume Percent Change
    df['Volume_Change'] = volume.pct_change()

    # Feature 25: Rate of Change (ROC) over 10 rows
    roc_10 = close.pct_change(periods=10)
    df['ROC'] = roc_10

    # Feature 26: On-Balance Volume (OBV)
    # NOTE(review): rows whose close is unchanged (and the first row) subtract
    # volume here; the textbook OBV leaves the total unchanged on such rows.
    df['OBV'] = np.where(close > prev_close, volume, -volume).cumsum()

    # Feature 27: Average True Range (ATR), 14-row mean of the true range.
    # True range = max(H-L, |H-prevC|, |L-prevC|); on the first row only H-L exists.
    true_range = pd.concat(
        [high_low, (high - prev_close).abs(), (low - prev_close).abs()], axis=1
    ).max(axis=1)
    df['ATR'] = true_range.rolling(window=14).mean()

    # Feature 28: Chaikin Money Flow (CMF)
    money_flow_volume = ((close - low) - (high - close)) / high_low * volume
    df['CMF'] = money_flow_volume.rolling(window=20).sum() / volume.rolling(window=20).sum()

    # Feature 29: Price Rate of Change (PROC) (same values as 'ROC')
    df['PROC'] = roc_10

    # Features 30-34: Exponential Moving Averages 5/10/20/50/100
    for span in spans:
        df[f'EMA_{span}'] = close.ewm(span=span).mean()

    # Feature 35: Price-Volume Trend (PVT)
    # NOTE(review): this is the per-row term; the textbook PVT is its running sum.
    df['PVT'] = ((close - prev_close) / prev_close) * volume

    # Feature 36: Accumulation/Distribution Line (ADL)
    df['ADL'] = money_flow_volume.cumsum()

    # Feature 37: Mass Index (MI)
    # NOTE(review): divides the range by an EMA of (High - previous Low), which
    # differs from the usual EMA/EMA-of-EMA definition; kept as written.
    mi_window = 9
    range_ratio = high_low / (high - low.shift(1)).ewm(span=mi_window, adjust=False).mean()
    df['MI'] = range_ratio.rolling(window=mi_window).sum()

    # Feature 38: Detrended Price Oscillator (DPO), base window 20
    dpo_half = 20 // 2
    df['DPO'] = close - close.rolling(window=dpo_half + 1).mean().shift(dpo_half)

    # Feature 39: Ulcer Index (UI)
    ui_window = 14
    drawdown_sq = ((close / close.rolling(window=ui_window).max()) - 1) ** 2
    df['UI'] = np.sqrt(drawdown_sq.rolling(window=ui_window).mean()) * 100

    # Features 40 and 41: Williams %R and Stochastic Oscillator.
    # Both compare each close with the highest high / lowest low of the
    # 14 rows ending at that row.
    highest_high = high.rolling(window=14).max()
    lowest_low = low.rolling(window=14).min()
    hl_span = highest_high - lowest_low
    df['WilliamsR'] = (highest_high - close) / hl_span * -100
    df['StochasticOscillator'] = (close - lowest_low) / hl_span * 100

    # Feature 42: Commodity Channel Index (CCI), evaluated per row over 20 rows
    cci_window = 20
    tp_mean = typical_price.rolling(window=cci_window).mean()
    tp_mean_dev = typical_price.rolling(window=cci_window).apply(
        lambda w: np.abs(w - w.mean()).mean(), raw=True
    )
    cci = (typical_price - tp_mean) / (0.015 * tp_mean_dev)
    # A zero mean deviation would divide by zero; CCI is set to 0 on those rows.
    df['CCI'] = cci.where(tp_mean_dev != 0, 0.0)

    # Feature 43: Average Directional Index (ADX), 14-row Wilder smoothing
    adx_window = 14
    up_move = high.diff()
    down_move = -low.diff()
    plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0.0)
    minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0.0)

    def wilder(series):
        # Wilder's smoothing is an EMA with alpha = 1 / window.
        return series.ewm(alpha=1 / adx_window, adjust=False, min_periods=adx_window).mean()

    smoothed_tr = wilder(true_range)
    plus_di = 100 * wilder(plus_dm) / smoothed_tr
    minus_di = 100 * wilder(minus_dm) / smoothed_tr
    dx = 100 * (plus_di - minus_di).abs() / (plus_di + minus_di)
    df['ADX'] = wilder(dx)

    # Feature 44: Rate of Change Ratio (ROCR), 10 rows, in percent
    close_10_ago = close.shift(10)
    df['ROCR'] = ((close - close_10_ago) / close_10_ago) * 100

    # Features 45-51: Pivot Point with three resistance and three support levels
    pivot = typical_price
    df['Pivot Point (PP)'] = pivot
    df['Resistance 1 (R1)'] = (2 * pivot) - low
    df['Resistance 2 (R2)'] = pivot + high_low
    df['Resistance 3 (R3)'] = high + 2 * (pivot - low)
    df['Support 1 (S1)'] = (2 * pivot) - high
    df['Support 2 (S2)'] = pivot - high_low
    df['Support 3 (S3)'] = low - 2 * (high - pivot)

    # Feature 52: Ease of Movement (EOM)
    distance_moved = (high + low) / 2 - (high.shift(1) + low.shift(1)) / 2
    box_ratio = (volume / 100000000) / high_low
    df['EOM'] = distance_moved / box_ratio

    # Feature 53: Volume Weighted Average Price (VWAP), cumulative from the first row
    df['VWAP'] = (typical_price * volume).cumsum() / volume.cumsum()

    # Feature 54: Chande Momentum Oscillator (CMO), same 14-row gains/losses as RSI
    df['CMO'] = ((avg_gain - avg_loss) / (avg_gain + avg_loss)) * 100

    # Feature 55: Accumulative Swing Index (ASI), running total of a swing term.
    # NOTE(review): the swing formula is the notebook's own simplified variant,
    # not Wilder's canonical swing index; it is kept unchanged, only vectorized.
    hc = (high - prev_close).abs()
    lc = (low - prev_close).abs()
    range_dominates = (high_low > hc) & (high_low > lc)
    swing_scale = (0.1 * (high_low - 0.5 * hc + 0.25 * lc) / (0.001 * true_range)).where(
        range_dominates, 0.0
    )
    swing_index = swing_scale * 16 * (close - prev_close) / true_range
    # Undefined rows (the first row, or a zero true range) contribute nothing.
    df['ASI'] = swing_index.fillna(0.0).cumsum()

    # Features 56-60: Double Exponential Moving Averages 5/10/20/50/100
    for span in spans:
        ema = close.ewm(span=span).mean()
        df[f'DEMA_{span}'] = 2 * ema - ema.ewm(span=span).mean()

    # Features 61 and 62: Keltner Channels (close +/- 2 x 20-row mean of High-Low)
    mean_range_20 = high_low.rolling(window=20).mean()
    df['Keltner_Upper'] = close + 2 * mean_range_20
    df['Keltner_Lower'] = close - 2 * mean_range_20

    # Features 63 and 64: Chandelier Exit Long/Short 22 (3 x highest ATR in 22 rows)
    atr_peak = df['ATR'].rolling(window=22).max()
    df['ChandelierExitLong22'] = high - (3 * atr_peak)
    df['ChandelierExitShort22'] = low + (3 * atr_peak)

    # Features 65 and 66: Moving Average Envelope, 5% around the 20-row SMA
    envelope_width = 0.05
    df['MAE5%Upper'] = sma_20 * (1 + envelope_width)
    df['MAE5%Lower'] = sma_20 * (1 - envelope_width)

    # Feature 67: Triangular Moving Average 30
    # NOTE(review): a 30-row sum divided by 15.5 is roughly twice the price
    # level; a triangular MA is normally an SMA of an SMA. Kept as written.
    df['TMA_30'] = close.rolling(window=30).sum() / ((30 + 1) / 2)

    # Features 68-72: Triple Exponential Moving Averages 5/10/20/50/100
    for span in spans:
        ema1 = close.ewm(span=span, adjust=False).mean()
        ema2 = ema1.ewm(span=span, adjust=False).mean()
        ema3 = ema2.ewm(span=span, adjust=False).mean()
        df[f'TEMA_{span}'] = 3 * ema1 - 3 * ema2 + ema3

    # Feature 73: Average Price (AVP) (same values as the pivot point)
    df['avp'] = typical_price

    # Feature 74: Average Daily Range (ADR), 20-row mean of High-Low
    df['adr'] = mean_range_20

    # Feature 75: Historical Volatility (20-row std of daily returns)
    df['hv'] = pct_change.rolling(window=20).std()

    # Feature 76: High-Low Range Ratio
    df['hl_range_ratio'] = high_low / close

    # Feature 77: Typical Price
    df['tp'] = typical_price

    # Feature 78: Volume-Weighted Moving Average (20 rows)
    df['vwap'] = (typical_price * volume).rolling(window=20).sum() / volume.rolling(window=20).sum()

    # Feature 79: Daily Return (same values as 'PCT_CHG')
    df['daily_return'] = pct_change

    # Feature 80: Average Daily Return, the running mean of all returns so far.
    # (A fixed window as long as the frame always contains the leading NaN
    # return and would therefore be NaN on every row.)
    df['avg_daily_return'] = pct_change.expanding().mean()

    # Feature 81: Rolling Volatility (same values as 'Std_Close')
    df['Rolling Volatility'] = std_20

    # Feature 82: Price Momentum (daily return in percent)
    df['Price Momentum'] = pct_change * 100

    # Feature 83: Cumulative Returns relative to the first close, in percent
    df['Cumulative Returns'] = (close / close.iloc[0] - 1) * 100

    # Features 84 and 85: MA_5 / MA_10 crossover signals (+1 bullish, -1 bearish)
    ma_fast, ma_slow = df['MA_5'], df['MA_10']
    df['Bullish Signal'] = np.where(
        (ma_fast > ma_slow) & (ma_fast.shift(1) <= ma_slow.shift(1)), 1, 0)
    df['Bearish Signal'] = np.where(
        (ma_fast < ma_slow) & (ma_fast.shift(1) >= ma_slow.shift(1)), -1, 0)

    # Features 86 and 87: RSI turning points (+1 after a local low, -1 after a local high)
    df['Bullish Divergence'] = np.where((rsi > rsi.shift(1)) & (rsi.shift(1) < rsi.shift(2)), 1, 0)
    df['Bearish Divergence'] = np.where((rsi < rsi.shift(1)) & (rsi.shift(1) > rsi.shift(2)), -1, 0)

    # Feature 88: Price to 50-row Moving Average Ratio
    df['Price_MA_Ratio'] = close / close.rolling(window=50).mean()

    # Features 89 and 90: Donchian Channel High/Low (20 rows)
    df['Donchian_Channel_High'] = high.rolling(window=20).max()
    df['Donchian_Channel_Low'] = low.rolling(window=20).min()

    # Features 91-95: Ichimoku lines
    df['Tenkan_Sen'] = (high.rolling(window=9).max() + low.rolling(window=9).min()) / 2
    df['Kijun_Sen'] = (high.rolling(window=26).max() + low.rolling(window=26).min()) / 2
    df['Senkou_Span_A'] = ((df['Tenkan_Sen'] + df['Kijun_Sen']) / 2).shift(26)
    df['Senkou_Span_B'] = ((high.rolling(window=52).max() + low.rolling(window=52).min()) / 2).shift(26)
    # NOTE(review): shift(-26) copies the close from 26 rows later into each row.
    df['Chikou_Span'] = close.shift(-26)

    # Features 96-100: Fibonacci retracement levels of the daily range
    for column, ratio in (
        ('Fib_23.6', 0.236),
        ('Fib_38.2', 0.382),
        ('Fib_50.0', 0.5),
        ('Fib_61.8', 0.618),
        ('Fib_76.4', 0.764),
    ):
        df[column] = high - (high_low * ratio)

    return df

Apply function to all dataframes

In [None]:
# create_features writes the new columns onto each DataFrame in place, so the
# list entries do not need to be reassigned.
for stock_df in training_set:
    create_features(stock_df)

Print last 5 data points for first stock dataframe

In [None]:
# Show every column when printing, then display the last 5 rows of the first stock.
pd.options.display.max_columns = None
first_stock = training_set[0]
print(first_stock.iloc[-5:])

**Task 3: Training Predictive Model**

Data Preparation

Load and preprocess the data set

Label percent change

In [None]:

def label_pct_change_vectorized(pct_change, threshold1, threshold2):
    """Label percentage changes as decrease (-1), no change (0) or increase (1).

    Parameters
    ----------
    pct_change : array-like or scalar
        Percentage change value(s).
    threshold1, threshold2 : float
        Lower and upper cut points. Values below ``threshold1`` are labelled
        -1, values in ``[threshold1, threshold2)`` are labelled 0 and values
        at or above ``threshold2`` are labelled 1.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as the input. Inputs that are NaN
        match none of the three ranges and are labelled NaN.
    """
    values = np.asarray(pct_change, dtype=float)
    # Start from NaN so entries that match no range (NaN inputs) have a
    # defined label rather than whatever an uninitialised buffer contains.
    labels = np.full(values.shape, np.nan, dtype=float)
    labels[values >= threshold2] = 1
    labels[(values >= threshold1) & (values < threshold2)] = 0
    labels[values < threshold1] = -1
    return labels


In [None]:

# Label every stock's daily percentage change (the 'PCT_CHG' feature column)
# with the thresholds computed in Task 1; one label array per stock.
# NOTE(review): the labels come from the same day's PCT_CHG, which also stays
# in the frame as a feature - confirm this is the intended prediction target.
targets = [
    label_pct_change_vectorized(stock['PCT_CHG'].to_numpy(), threshold1, threshold2)
    for stock in training_set
]



In [None]:
# Attach the labels to each stock and split that stock 80/20 into train/test rows.
# NOTE(review): train_test_split shuffles rows by default, so neighbouring days
# of one stock can land on both sides of the split - confirm this is intended.
train_set, test_set = [], []
for stock, target in zip(training_set, targets):
    stock['Target'] = target
    train_rows, test_rows = train_test_split(stock, test_size=0.2, random_state=42)
    train_set.append(train_rows)
    test_set.append(test_rows)

In [None]:
# Stack the per-stock pieces into one training frame and one testing frame,
# then separate the 'Target' column from the feature columns.
_train_all = pd.concat(train_set, ignore_index=True)
X_train = _train_all.drop(columns=['Target'])
y_train = _train_all['Target']

_test_all = pd.concat(test_set, ignore_index=True)
X_test = _test_all.drop(columns=['Target'])
y_test = _test_all['Target']

del _train_all, _test_all



In [None]:
# Treat +/-inf (produced by divisions by zero in the features) as missing.
_infinities = [np.inf, -np.inf]
X_train = X_train.replace(_infinities, np.nan)
X_test = X_test.replace(_infinities, np.nan)

# Fill missing values with the column means learned from the training data only.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)


In [None]:
# Standardize the features; the scaler is fitted on the training data only
# and then applied to both sets.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



In [None]:
# Candidate classifiers for Task 3 (default settings plus one variation each).
models = [
    # Linear baseline
    LogisticRegression(max_iter=1000),
    # Random forests
    RandomForestClassifier(),
    RandomForestClassifier(n_estimators=50),
    # Multi-layer perceptrons
    MLPClassifier(),
    MLPClassifier(hidden_layer_sizes=(50,)),
    # Nearest neighbours
    KNeighborsClassifier(n_neighbors=3),
    KNeighborsClassifier(n_neighbors=5),
    # Boosted and extremely randomized trees
    GradientBoostingClassifier(),
    ExtraTreesClassifier(),
    ExtraTreesClassifier(n_estimators=50),
]

# One metrics dictionary per trained model is appended here.
results = list()



In [None]:
# Map a value onto a class index using the two thresholds.
def to_categorical(y, threshold_1, threshold_2):
    """Return 0 ("decrease"), 1 ("no big change") or 2 ("increase") for ``y``.

    ``y`` below ``threshold_1`` gives 0, ``y`` below ``threshold_2`` gives 1,
    and anything else (including values that compare false, such as NaN) gives 2.
    """
    if y < threshold_1:
        return 0  # "decrease"
    return 1 if y < threshold_2 else 2  # "no big change" / "increase"

# Class indices (0/1/2) for the training and testing targets.
# The -1/0/1 labels fall on the matching side of the two thresholds, so
# to_categorical maps them to 0/1/2 respectively.
_as_class = lambda value: to_categorical(value, threshold1, threshold2)
y_train_categorical = np.array(list(map(_as_class, y_train)))
y_test_categorical = np.array(list(map(_as_class, y_test)))


In [None]:
# Train every candidate model and record its metrics on the training and
# validation (held-out) sets.
for model in models:
    model_label = model.__class__.__name__
    print(f"Training {model_label}")
    model.fit(X_train, y_train_categorical)
    print(f"Finished training {model_label}")

    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_test)

    # "Positive" predictions are the ones for class 2, which to_categorical
    # uses for "increase" (class 1 is "no big change").
    results.append({
        'model': model,
        'train_accuracy': accuracy_score(y_train_categorical, y_train_pred),
        'train_precision': precision_score(y_train_categorical, y_train_pred, average='weighted'),
        'train_positive_pct': (y_train_pred == 2).sum() / len(y_train_pred),
        'val_accuracy': accuracy_score(y_test_categorical, y_val_pred),
        'val_precision': precision_score(y_test_categorical, y_val_pred, average='weighted'),
        'val_positive_pct': (y_val_pred == 2).sum() / len(y_val_pred),
    })



Training LogisticRegression
Finished training LogisticRegression
Training RandomForestClassifier
Finished training RandomForestClassifier
Training RandomForestClassifier
Finished training RandomForestClassifier
Training MLPClassifier
Finished training MLPClassifier
Training MLPClassifier
Finished training MLPClassifier
Training KNeighborsClassifier
Finished training KNeighborsClassifier
Training KNeighborsClassifier
Finished training KNeighborsClassifier
Training GradientBoostingClassifier
Finished training GradientBoostingClassifier
Training ExtraTreesClassifier
Finished training ExtraTreesClassifier
Training ExtraTreesClassifier
Finished training ExtraTreesClassifier


In [None]:
# Print one summary per trained model: training metrics, then validation metrics.
for entry in results:
    train_line = (f"  Train: Accuracy={entry['train_accuracy']:.4f}, "
                  f"Precision={entry['train_precision']:.4f}, "
                  f"Positive Pct={entry['train_positive_pct']:.4f}")
    val_line = (f"  Validation: Accuracy={entry['val_accuracy']:.4f}, "
                f"Precision={entry['val_precision']:.4f}, "
                f"Positive Pct={entry['val_positive_pct']:.4f}")
    print(f"{entry['model']}:\n{train_line}\n{val_line}\n")

LogisticRegression(max_iter=1000):
  Train: Accuracy=0.9935, Precision=0.9935, Positive Pct=0.3383
  Validation: Accuracy=0.9932, Precision=0.9932, Positive Pct=0.3288

RandomForestClassifier():
  Train: Accuracy=1.0000, Precision=1.0000, Positive Pct=0.3353
  Validation: Accuracy=1.0000, Precision=1.0000, Positive Pct=0.3270

RandomForestClassifier(n_estimators=50):
  Train: Accuracy=1.0000, Precision=1.0000, Positive Pct=0.3353
  Validation: Accuracy=0.9998, Precision=0.9998, Positive Pct=0.3268

MLPClassifier():
  Train: Accuracy=0.9977, Precision=0.9977, Positive Pct=0.3376
  Validation: Accuracy=0.9873, Precision=0.9875, Positive Pct=0.3333

MLPClassifier(hidden_layer_sizes=(50,)):
  Train: Accuracy=0.9997, Precision=0.9997, Positive Pct=0.3354
  Validation: Accuracy=0.9914, Precision=0.9914, Positive Pct=0.3274

KNeighborsClassifier(n_neighbors=3):
  Train: Accuracy=0.8522, Precision=0.8540, Positive Pct=0.3344
  Validation: Accuracy=0.6964, Precision=0.7002, Positive Pct=0.3401


Task 4: Feature Selection

In [None]:
# (label, estimator) pairs evaluated on the selected features.
# Each label states the configuration actually used, because it is printed
# next to that model's results.
models = [
    ('LogisticRegression', LogisticRegression(max_iter=1000)),
    ('RandomForestClassifier', RandomForestClassifier()),
    ('RandomForestClassifier_200', RandomForestClassifier(n_estimators=200)),
    ('MLPClassifier', MLPClassifier()),
    ('MLPClassifier_100x100', MLPClassifier(hidden_layer_sizes=(100, 100))),
    ('KNeighborsClassifier_3', KNeighborsClassifier(n_neighbors=3)),
    ('KNeighborsClassifier_5', KNeighborsClassifier(n_neighbors=5)),
    ('GradientBoostingClassifier', GradientBoostingClassifier()),
    ('ExtraTreesClassifier', ExtraTreesClassifier()),
    ('ExtraTreesClassifier_200', ExtraTreesClassifier(n_estimators=200)),
]


In [None]:
# Recursive feature elimination: repeatedly fit a logistic regression and
# drop one feature per step until 30 features remain.
n_features_to_select = 30
estimator = LogisticRegression(max_iter=1000)
selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
selector.fit(X_train, y_train_categorical)

# Boolean mask over the feature columns: True for the features that were kept.
selected_features = selector.get_support()

# Keep only the selected columns in both sets.
X_train_selected, X_test_selected = (
    matrix[:, selected_features] for matrix in (X_train, X_test)
)


In [None]:
# Retrain every model on the selected features and report its metrics.
for model_name, model in models:
    model.fit(X_train_selected, y_train_categorical)

    y_pred_train = model.predict(X_train_selected)
    y_pred_test = model.predict(X_test_selected)

    accuracy_train = accuracy_score(y_train_categorical, y_pred_train)
    precision_train = precision_score(y_train_categorical, y_pred_train, average='weighted')
    # "Positive" predictions are the ones for class 2, which to_categorical
    # uses for "increase" (class 1 is "no big change").
    positive_pct_train = (y_pred_train == 2).sum() / len(y_pred_train)

    accuracy_test = accuracy_score(y_test_categorical, y_pred_test)
    precision_test = precision_score(y_test_categorical, y_pred_test, average='weighted')
    positive_pct_test = (y_pred_test == 2).sum() / len(y_pred_test)

    print(f"{model_name}:")
    print(f"  Train: Accuracy={accuracy_train:.4f}, Precision={precision_train:.4f}, Positive Pct={positive_pct_train:.4f}")
    print(f"  Validation: Accuracy={accuracy_test:.4f}, Precision={precision_test:.4f}, Positive Pct={positive_pct_test:.4f}")
    print()


Task 5

Voting

In [None]:
# Pair each of the ten Task 4 estimators (in list order) with the short alias
# it carries inside the voting ensemble.
selected_estimators = [
    (alias, fitted_pair[1])
    for alias, fitted_pair in zip(
        ('lr', 'rf', 'rf_50', 'mlp', 'mlp_50', 'knn_3', 'knn_5', 'gbc', 'etc', 'etc_50'),
        models,
    )
]

Blending

Adaboosting

In [None]:
class _BlendingClassifier:
    """Blend base models through a meta-model trained on their predicted labels.

    ``fit`` trains every base model on the given features and then trains the
    meta-model on the column-stacked base predictions for those same rows.
    ``predict`` takes the original feature matrix, so the blended model can be
    used wherever the other ensembles are used.
    """

    def __init__(self, base_models, meta_model):
        self.base_models = list(base_models)
        self.meta_model = meta_model

    def _meta_features(self, X):
        # One column per base model, holding that model's predicted class.
        return np.column_stack([base.predict(X) for base in self.base_models])

    def fit(self, X, y):
        for base in self.base_models:
            base.fit(X, y)
        self.meta_model.fit(self._meta_features(X), y)
        return self

    def predict(self, X):
        return self.meta_model.predict(self._meta_features(X))


ensemble_methods = {
    "Voting": VotingClassifier(estimators=selected_estimators, voting='soft'),
    "Blending": _BlendingClassifier(
        [estimator for _, estimator in models],
        LogisticRegression(max_iter=1000),
    ),
    # The weak learner is passed positionally because its keyword name differs
    # between scikit-learn releases (base_estimator was renamed to estimator).
    "AdaBoosting": AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100),
}

# Name of the ensemble with the highest test precision. `best_model` is the
# name read by the final evaluation; `best_method` mirrors it.
best_method = None
best_model = None
# Start below the smallest possible precision so a winner is always recorded.
best_precision = -1.0
evaluation_results = {}

for method_name, clf in ensemble_methods.items():
    # Every ensemble is trained on the full scaled feature matrix.
    clf.fit(X_train, y_train_categorical)

    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    train_accuracy = accuracy_score(y_train_categorical, y_train_pred)
    train_precision = precision_score(y_train_categorical, y_train_pred, average='weighted')
    # "Positive" predictions are the ones for class 2, which to_categorical
    # uses for "increase" (class 1 is "no big change").
    train_positive_pct = (y_train_pred == 2).sum() / len(y_train_pred)

    test_accuracy = accuracy_score(y_test_categorical, y_test_pred)
    test_precision = precision_score(y_test_categorical, y_test_pred, average='weighted')
    test_positive_pct = (y_test_pred == 2).sum() / len(y_test_pred)

    evaluation_results[method_name] = {
        'train_accuracy': train_accuracy,
        'train_precision': train_precision,
        'train_positive_pct': train_positive_pct,
        'test_accuracy': test_accuracy,
        'test_precision': test_precision,
        'test_positive_pct': test_positive_pct
    }

    if test_precision > best_precision:
        best_precision = test_precision
        best_model = method_name
        best_method = method_name

    print(f"{method_name} Classifier:")
    print(f"  Train: Accuracy={train_accuracy:.4f}, Precision={train_precision:.4f}, Positive Pct={train_positive_pct:.4f}")
    print(f"  Test: Accuracy={test_accuracy:.4f}, Precision={test_precision:.4f}, Positive Pct={test_positive_pct:.4f}\n")


Task 6

Load testing sets

In [None]:
# Deserialize the three pickled testing sets.
# NOTE: pickle.load can execute arbitrary code while loading; only open pickle files from a trusted source.
_loaded_sets = []
for _pickle_name in ('testing_set1.pkl', 'testing_set2.pkl', 'testing_set3.pkl'):
    with open(_pickle_name, 'rb') as f:
        _loaded_sets.append(pickle.load(f))

testing_set1, testing_set2, testing_set3 = _loaded_sets
del _loaded_sets

In [None]:
# Generator that prepares one stock at a time, so only a single stock's
# feature matrix is held at once.
def process_testing_set(testing_set, create_features, label_pct_change_vectorized, threshold1, threshold2, imputer, scaler):
    """Yield ``(X_testing, y_testing)`` for every stock in ``testing_set``.

    Parameters
    ----------
    testing_set : iterable of pandas.DataFrame
        One frame per stock; each is passed to ``create_features`` and gets
        a 'Target' column assigned.
    create_features : callable
        Feature builder; its result must contain a 'PCT_CHG' column.
    label_pct_change_vectorized : callable
        Called once per stock with the whole 'PCT_CHG' array and the two
        thresholds.
    threshold1, threshold2 : float
        Cut points forwarded to the labelling function and to ``to_categorical``.
    imputer, scaler : fitted transformers
        Applied, in that order, through their ``transform`` methods.

    Yields
    ------
    tuple
        The transformed feature matrix and a numpy array holding the value
        ``to_categorical`` returns for each label.
    """
    for stock in testing_set:
        stock_df = create_features(stock)

        # Label the whole column in one call rather than one value at a time.
        raw_labels = label_pct_change_vectorized(stock_df['PCT_CHG'].to_numpy(), threshold1, threshold2)
        stock_df['Target'] = raw_labels

        # Same preparation as the training features: inf -> NaN, impute, scale.
        X_testing = stock_df.drop(columns=['Target']).replace([np.inf, -np.inf], np.nan)
        X_testing = imputer.transform(X_testing)
        X_testing = scaler.transform(X_testing)
        y_testing = np.array([to_categorical(y, threshold1, threshold2) for y in raw_labels])

        yield X_testing, y_testing

In [None]:
# Evaluate the model on each stock separately and report the per-set averages.
def evaluate_test_sets(model, testing_sets, create_features, label_pct_change_vectorized, threshold1, threshold2, imputer, scaler):
    """Print mean accuracy, precision and positive percentage per testing set.

    Parameters
    ----------
    model : fitted classifier
        Only its ``predict`` method is used.
    testing_sets : iterable
        Each element is one testing set (an iterable of per-stock frames).
    create_features, label_pct_change_vectorized, threshold1, threshold2, imputer, scaler
        Forwarded unchanged to ``process_testing_set``.

    Returns
    -------
    None
        The metrics are printed: one block per testing set, each value being
        the unweighted mean of the per-stock scores.
    """
    for i, testing_set in enumerate(testing_sets):
        accuracies = []
        precisions = []
        positive_pcts = []

        for X_testing, y_testing in process_testing_set(testing_set, create_features, label_pct_change_vectorized, threshold1, threshold2, imputer, scaler):
            y_pred = model.predict(X_testing)
            accuracies.append(accuracy_score(y_testing, y_pred))
            precisions.append(precision_score(y_testing, y_pred, average='weighted'))
            # "Positive" predictions are the ones for class 2, which
            # to_categorical uses for "increase" (class 1 is "no big change").
            positive_pcts.append((y_pred == 2).sum() / len(y_pred))

        print(f"Testing set {i + 1}:")
        print(f"Accuracy={np.mean(accuracies):.4f}, Precision={np.mean(precisions):.4f}, Positive Pct={np.mean(positive_pcts):.4f}\n")

        # Collect garbage left over from the per-stock matrices before the next set.
        gc.collect()

In [None]:
# Evaluate the best-scoring ensemble on all three testing sets.
final_model = ensemble_methods[best_model]
all_testing_sets = [testing_set1, testing_set2, testing_set3]
evaluate_test_sets(
    final_model,
    all_testing_sets,
    create_features,
    label_pct_change_vectorized,
    threshold1,
    threshold2,
    imputer,
    scaler,
)