In [1]:
import pandas as pd
import numpy as np
from functions import dataloader, add_features, data_split, scale
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt


In [2]:
df = dataloader()
df = add_features(df)

  monthly_return = df['Close'].resample('M').ffill().pct_change().shift(1)
  monthly_return = monthly_return.fillna(method='bfill')


In [3]:
features = [
    'MA5', 'MA10', 'MA20', 'Momentum_1d', 
    'Momentum_5d', 'Monthly_Return', 'Open', 'High', 'Low', 'Close'
]


In [4]:
X = df[features]
y = df['target']

In [5]:
X = X.to_numpy()
y = y.to_numpy()

In [6]:
X_train, X_test, y_train, y_test = data_split(X,y)

In [7]:
X_train.shape

(1802, 10)

In [8]:
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled=scale(X_train, X_test, y_train, y_test)

In [9]:
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)


In [10]:
y_pred = model.predict(X_test_scaled)

In [11]:
print(y_pred)

[0.89375137 0.89143278 0.87756734 0.88142417 0.92728694 0.88672237
 0.84716377 0.8552366  0.83164244 0.79142217 0.79857752 0.82475014
 0.79782667 0.81155641 0.83173875 0.79218501 0.81788151 0.81640739
 0.81068833 0.70846643 0.71451954 0.70319477 0.70469161 0.72550803
 0.73295374 0.76574601 0.70299378 0.69768522 0.63464093 0.6436908
 0.58951403 0.58148227 0.61686946 0.57902502 0.60872394 0.56755004
 0.56895675 0.527747   0.53840025 0.50123327 0.52553531 0.5638669
 0.60785681 0.60789673 0.59580833 0.623192   0.56704497 0.57529566
 0.57504526 0.58412785 0.57755897 0.55690477 0.51670471 0.52760086
 0.55665915 0.50835231 0.51952851 0.56772044 0.56850746 0.56390933
 0.59088336 0.59106252 0.56170013 0.54899585 0.53911279 0.54589808
 0.55615865 0.55564949 0.58601929 0.602651   0.56278397 0.55993049
 0.56940309 0.57217395 0.57820688 0.5794754  0.58908853 0.59605744
 0.65380862 0.65868885 0.65004367 0.62548507 0.66352436 0.67903097
 0.71772    0.72168654 0.72892054 0.7457738  0.74934264 0.698175

In [None]:
# y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

In [12]:
mse = mean_squared_error(y_test_scaled, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_scaled, y_pred)
r2 = r2_score(y_test_scaled, y_pred)

In [14]:
print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Squared Error: 0.0005
Root Mean Squared Error: 0.0212
Mean Absolute Error: 0.0157
R² Score: 0.9714


In [15]:
from sklearn.ensemble import RandomForestRegressor


def train_random_forest():

        


    X_train, X_test, y_train, y_test=data_split(X,y)
        


    X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled=scale(X_train, X_test, y_train, y_test)
        

        
    # Train Random Forest model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train_scaled, y_train_scaled)
        
    # Make predictions (on scaled data)
    y_pred= rf_model.predict(X_test_scaled)
        

        
        # Evaluate model
    mse = mean_squared_error(y_test_scaled, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test_scaled, y_pred)
    r2 = r2_score(y_test_scaled, y_pred)
        
    results = {
            'MSE': mse,
            'RMSE': rmse,
            'MAE': mae,
            'R²': r2
        }
        

        
    return rf_model, results
        


# Run the model
rf_model, results = train_random_forest()

# Print results
if results:
    print("\nRandom Forest Model Performance:")
    for metric, value in results.items():
        print(f"{metric}: {value:.4f}")


Random Forest Model Performance:
MSE: 0.0008
RMSE: 0.0291
MAE: 0.0222
R²: 0.9461


In [16]:
import xgboost as xgb

def train_xgboost():
    try:
        # Load and prepare data
        df = dataloader()
        df = add_features(df)
        

        

        


        
        # Train XGBoost model
        xgb_model = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42
        )
        xgb_model.fit(X_train_scaled, y_train_scaled)
        
        # Make predictions (on scaled data)
        y_pred = xgb_model.predict(X_test_scaled)
        

        
        # Evaluate model
        mse = mean_squared_error(y_test_scaled, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test_scaled, y_pred)
        r2 = r2_score(y_test_scaled, y_pred)
        
        results = {
            'MSE': mse,
            'RMSE': rmse,
            'MAE': mae,
            'R²': r2
        }
        

        

        


        
        return xgb_model, results
        
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None, None

# Run the model
xgb_model, results = train_xgboost()

# Print results
if results:
    print("\nXGBoost Model Performance:")
    for metric, value in results.items():
        print(f"{metric}: {value:.4f}")


XGBoost Model Performance:
MSE: 0.0008
RMSE: 0.0279
MAE: 0.0217
R²: 0.9505


  monthly_return = df['Close'].resample('M').ffill().pct_change().shift(1)
  monthly_return = monthly_return.fillna(method='bfill')


In [17]:
from sklearn.tree import DecisionTreeRegressor


def train_decision_tree():
    
    # Load and prepare data
    df = dataloader()
    df = add_features(df)
    

    

    

    
    # Train Decision Tree model with limited depth to prevent overfitting
    dt_model = DecisionTreeRegressor(
        max_depth=5,  # Limit tree depth to prevent overfitting
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42
    )
    dt_model.fit(X_train_scaled, y_train_scaled)
    
    # Make predictions (on scaled data)
    y_pred = dt_model.predict(X_test_scaled)
    

    
    # Evaluate model
    mse = mean_squared_error(y_test_scaled, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test_scaled, y_pred)
    r2 = r2_score(y_test_scaled, y_pred)
    
    results = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R²': r2
    }
    

    


    
    return dt_model, results
        


# Run the model
dt_model, results = train_decision_tree()

# Print results
if results:
    print("\nDecision Tree Model Performance:")
    for metric, value in results.items():
        print(f"{metric}: {value:.4f}")


Decision Tree Model Performance:
MSE: 0.0015
RMSE: 0.0392
MAE: 0.0290
R²: 0.9024


  monthly_return = df['Close'].resample('M').ffill().pct_change().shift(1)
  monthly_return = monthly_return.fillna(method='bfill')
