In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Load the dataset (ensure that it's not too large for memory)
file_path = '/content/tesla_stock_data_final_cleaneddata(noduplciates_nomissingvalues).csv'  # Adjust path if needed
df = pd.read_csv(file_path)

# Optimize the data types of columns to reduce memory usage
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['year'] = df['timestamp'].dt.year.astype('int16')
df['month'] = df['timestamp'].dt.month.astype('int8')
df['day_of_week'] = df['timestamp'].dt.dayofweek.astype('int8')
df['week_of_year'] = df['timestamp'].dt.isocalendar().week.astype('int8')

# Use 'float32' or 'int32' where applicable to save memory
df['close'] = df['close'].astype('float32')
df['high'] = df['high'].astype('float32')
df['low'] = df['low'].astype('float32')
df['open'] = df['open'].astype('float32')

# Feature Engineering: Keep it simple with lag features and rolling stats
# 1. Lag Features (7, 14, and 30 days)
df['high_lag_7'] = df['high'].shift(7)
df['low_lag_7'] = df['low'].shift(7)
df['close_lag_7'] = df['close'].shift(7)

df['high_lag_14'] = df['high'].shift(14)
df['low_lag_14'] = df['low'].shift(14)
df['close_lag_14'] = df['close'].shift(14)

df['high_lag_30'] = df['high'].shift(30)
df['low_lag_30'] = df['low'].shift(30)
df['close_lag_30'] = df['close'].shift(30)

# 2. Rolling Features (30-day rolling mean and std)
df['close_rolling_mean_30'] = df['close'].rolling(window=30).mean().astype('float32')
df['high_rolling_mean_30'] = df['high'].rolling(window=30).mean().astype('float32')
df['low_rolling_mean_30'] = df['low'].rolling(window=30).mean().astype('float32')

df['close_rolling_std_30'] = df['close'].rolling(window=30).std().astype('float32')
df['high_rolling_std_30'] = df['high'].rolling(window=30).std().astype('float32')
df['low_rolling_std_30'] = df['low'].rolling(window=30).std().astype('float32')

# 3. Price Differences
df['price_diff'] = (df['high'] - df['low']).astype('float32')
df['close_open_diff'] = (df['close'] - df['open']).astype('float32')

# 4. Moving Averages (50 and 200 days)
df['close_50ma'] = df['close'].rolling(window=50).mean().astype('float32')
df['close_200ma'] = df['close'].rolling(window=200).mean().astype('float32')

# Drop rows with NaN values (created by rolling and lagging)
df = df.dropna()

# Prepare Features and Target Variables
X = df[['year', 'month', 'day_of_week', 'week_of_year',
        'close_rolling_mean_30', 'high_rolling_mean_30', 'low_rolling_mean_30',
        'close_rolling_std_30', 'high_rolling_std_30', 'low_rolling_std_30',
        'price_diff', 'close_open_diff', 'close_50ma', 'close_200ma',
        'high_lag_7', 'low_lag_7', 'close_lag_7',
        'high_lag_14', 'low_lag_14', 'close_lag_14',
        'high_lag_30', 'low_lag_30', 'close_lag_30']]

# Target Variables (high and low prices)
y_high = df['high']
y_low = df['low']



In [None]:
import time
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Split the Data into Training and Testing Sets (80-20 split)
X_train, X_test, y_train_high, y_test_high, y_train_low, y_test_low = train_test_split(
    X, y_high, y_low, test_size=0.2, shuffle=False
)

# Define the desired batch size (adjustable)
batch_size = 50000  # You can change this value to control the batch size

# Calculate number of batches
num_batches = len(X_train) // batch_size + (1 if len(X_train) % batch_size != 0 else 0)

# Create batches manually
X_train_batches = [X_train[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)]
y_train_high_batches = [y_train_high[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)]
y_train_low_batches = [y_train_low[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)]

# Initialize the models
model_high = RandomForestRegressor(n_estimators=50, random_state=42)
model_low = RandomForestRegressor(n_estimators=50, random_state=42)

# Initialize variables to track total training time
total_training_time_high = 0
total_training_time_low = 0

# Measure training time for High and Low Price models in batches
for batch_num in range(num_batches):
    # Get the current batch of data
    X_batch = X_train_batches[batch_num]
    y_batch_high = y_train_high_batches[batch_num]
    y_batch_low = y_train_low_batches[batch_num]

    # Calculate the range of samples for the current batch
    start_index = batch_num * len(X_batch)
    end_index = start_index + len(X_batch) - 1

    # Display the sample range being trained on
    print(f"Training on batch from sample {start_index} to {end_index} (Batch size: {len(X_batch)} samples)")

    # Measure training time for High Price Prediction model
    start_time_high = time.time()
    model_high.fit(X_batch, y_batch_high)
    end_time_high = time.time()
    training_time_high = end_time_high - start_time_high
    total_training_time_high += training_time_high
    print(f"Training time for high price model (Batch): {training_time_high:.2f} seconds")

    # Measure training time for Low Price Prediction model
    start_time_low = time.time()
    model_low.fit(X_batch, y_batch_low)
    end_time_low = time.time()
    training_time_low = end_time_low - start_time_low
    total_training_time_low += training_time_low
    print(f"Training time for low price model (Batch): {training_time_low:.2f} seconds")

# Output the total training time after all batches are processed
print(f"Total training time for High Price model: {total_training_time_high:.2f} seconds")
print(f"Total training time for Low Price model: {total_training_time_low:.2f} seconds")

# Evaluate the models on the Test Set after all batches are trained
y_pred_high = model_high.predict(X_test)
y_pred_low = model_low.predict(X_test)

# Calculate Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared (R²) for both predictions
mae_high = mean_absolute_error(y_test_high, y_pred_high)
mae_low = mean_absolute_error(y_test_low, y_pred_low)

mse_high = mean_squared_error(y_test_high, y_pred_high)
mse_low = mean_squared_error(y_test_low, y_pred_low)

r2_high = r2_score(y_test_high, y_pred_high)
r2_low = r2_score(y_test_low, y_pred_low)

# Print the evaluation metrics after all batches have been trained
print(f'Mean Absolute Error for High Price Prediction: {mae_high:.4f}')
print(f'Mean Absolute Error for Low Price Prediction: {mae_low:.4f}')
print(f'Mean Squared Error for High Price Prediction: {mse_high:.4f}')
print(f'Mean Squared Error for Low Price Prediction: {mse_low:.4f}')
print(f'R-squared for High Price Prediction: {r2_high:.4f}')
print(f'R-squared for Low Price Prediction: {r2_low:.4f}')


Training on batch from sample 0 to 49999 (Batch size: 50000 samples)
Training time for high price model (Batch): 78.37 seconds
Training time for low price model (Batch): 77.76 seconds
Training on batch from sample 50000 to 99999 (Batch size: 50000 samples)
Training time for high price model (Batch): 80.12 seconds
Training time for low price model (Batch): 79.86 seconds
Training on batch from sample 100000 to 149999 (Batch size: 50000 samples)
Training time for high price model (Batch): 75.74 seconds
Training time for low price model (Batch): 76.13 seconds
Training on batch from sample 150000 to 199999 (Batch size: 50000 samples)
Training time for high price model (Batch): 75.04 seconds
Training time for low price model (Batch): 74.17 seconds
Training on batch from sample 200000 to 249999 (Batch size: 50000 samples)
Training time for high price model (Batch): 71.67 seconds
Training time for low price model (Batch): 72.44 seconds
Training on batch from sample 250000 to 299999 (Batch size

In [5]:
import time
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint

# Split the Data into Training and Testing Sets (80-20 split)
X_train, X_test, y_train_high, y_test_high, y_train_low, y_test_low = train_test_split(
    X, y_high, y_low, test_size=0.2, shuffle=False
)

# Define the Random Forest Hyperparameter Grid (Optimized)
param_dist = {
    'n_estimators': randint(100, 120),  # Smaller range for number of trees
    'max_depth': randint(5, 10),         # Limit maximum depth
    'min_samples_split': randint(2, 6),  # Reduced range
    'min_samples_leaf': randint(1, 6),   # Reduced range
    'max_features': ['sqrt', 'log2'],    # Keep as is
    'bootstrap': [True, False]            # Keep as is
}

# Initialize the models
model_high = RandomForestRegressor(random_state=42, n_jobs=-1)
model_low = RandomForestRegressor(random_state=42, n_jobs=-1)

# Initialize RandomizedSearchCV for hyperparameter tuning with fewer iterations and cross-validation folds
random_search_high = RandomizedSearchCV(model_high, param_dist, n_iter=3, cv=2, random_state=42, n_jobs=-1, verbose=2)
random_search_low = RandomizedSearchCV(model_low, param_dist, n_iter=3, cv=2, random_state=42, n_jobs=-1, verbose=2)

# Define the desired batch size
batch_size = 10000  # Reduced batch size

# Calculate the number of batches
num_batches = len(X_train) // batch_size + (1 if len(X_train) % batch_size != 0 else 0)

# Create batches manually
X_train_batches = [X_train[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)]
y_train_high_batches = [y_train_high[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)]
y_train_low_batches = [y_train_low[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)]

# Initialize variables to track total training time
total_training_time_high = 0
total_training_time_low = 0

# Track predictions for evaluation
y_pred_high = []
y_pred_low = []

# Measure and train the models on batches
for batch_num in range(num_batches):
    # Get the current batch of data
    X_batch = X_train_batches[batch_num]
    y_batch_high = y_train_high_batches[batch_num]
    y_batch_low = y_train_low_batches[batch_num]

    # Calculate the range of samples for the current batch
    start_index = batch_num * len(X_batch)
    end_index = start_index + len(X_batch) - 1

    # Display the sample range being trained on
    print(f"Training on batch from sample {start_index} to {end_index} (Batch size: {len(X_batch)} samples)")

    # Fit the RandomizedSearchCV on the batch for High Price Prediction
    start_time_high = time.time()
    random_search_high.fit(X_batch, y_batch_high)
    end_time_high = time.time()
    training_time_high = end_time_high - start_time_high
    total_training_time_high += training_time_high
    print(f"Training time for high price model (Batch): {training_time_high:.2f} seconds")

    # Fit the RandomizedSearchCV on the batch for Low Price Prediction
    start_time_low = time.time()
    random_search_low.fit(X_batch, y_batch_low)
    end_time_low = time.time()
    training_time_low = end_time_low - start_time_low
    total_training_time_low += training_time_low
    print(f"Training time for low price model (Batch): {training_time_low:.2f} seconds")

# Output the total training time
print(f"Total training time for High Price model: {total_training_time_high:.2f} seconds")
print(f"Total training time for Low Price model: {total_training_time_low:.2f} seconds")

# Output the best hyperparameters for both models
print("Best hyperparameters for High Price Model:")
print(random_search_high.best_params_)

print("Best hyperparameters for Low Price Model:")
print(random_search_low.best_params_)

# Use the best models found by RandomizedSearchCV
best_model_high = random_search_high.best_estimator_
best_model_low = random_search_low.best_estimator_

# Make predictions for the entire test set after training is complete
y_pred_high = best_model_high.predict(X_test)
y_pred_low = best_model_low.predict(X_test)

# Calculate Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared (R²) for both predictions
mae_high = mean_absolute_error(y_test_high, y_pred_high)
mae_low = mean_absolute_error(y_test_low, y_pred_low)

mse_high = mean_squared_error(y_test_high, y_pred_high)
mse_low = mean_squared_error(y_test_low, y_pred_low)

r2_high = r2_score(y_test_high, y_pred_high)
r2_low = r2_score(y_test_low, y_pred_low)



Training on batch from sample 0 to 9999 (Batch size: 10000 samples)
Fitting 2 folds for each of 3 candidates, totalling 6 fits
Training time for high price model (Batch): 19.22 seconds
Fitting 2 folds for each of 3 candidates, totalling 6 fits
Training time for low price model (Batch): 15.08 seconds
Training on batch from sample 10000 to 19999 (Batch size: 10000 samples)
Fitting 2 folds for each of 3 candidates, totalling 6 fits
Training time for high price model (Batch): 9.39 seconds
Fitting 2 folds for each of 3 candidates, totalling 6 fits
Training time for low price model (Batch): 10.05 seconds
Training on batch from sample 20000 to 29999 (Batch size: 10000 samples)
Fitting 2 folds for each of 3 candidates, totalling 6 fits
Training time for high price model (Batch): 10.72 seconds
Fitting 2 folds for each of 3 candidates, totalling 6 fits
Training time for low price model (Batch): 8.53 seconds
Training on batch from sample 30000 to 39999 (Batch size: 10000 samples)
Fitting 2 folds 

In [8]:
# Print all the metrics in one statement
print(f'Mean Absolute Error for High Price Prediction: {mae_high:.4f}')
print(f'Mean Absolute Error for Low Price Prediction: {mae_low:.4f}')
print(f'Mean Squared Error for High Price Prediction: {mse_high:.4f}')
print(f'Mean Squared Error for Low Price Prediction: {mse_low:.4f}')
print(f'R-squared for High Price Prediction: {r2_high:.4f}')
print(f'R-squared for Low Price Prediction: {r2_low:.4f}')


Mean Absolute Error for High Price Prediction: 5.4532
Mean Absolute Error for Low Price Prediction: 4.1234
Mean Squared Error for High Price Prediction: 29.6782
Mean Squared Error for Low Price Prediction: 17.2231
R-squared for High Price Prediction: 0.8214
R-squared for Low Price Prediction: 0.8743
