In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Load the time series data
data = pd.read_excel('rea_holdings_share_prices.xls',header=1)

In [3]:
# Defining different lag periods to experiment with
lag_periods = [1, 3, 6, 9, 12]  # Monthly lag features for different periods

In [4]:
# Initializing dictionaries to store results
model_results = {'Random Forest': [], 'Gradient Boosting': [], 'Linear Regression': []}

In [5]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close
0,2015-01-02,702.5,702.5,702.5,702.5
1,2015-01-05,697.5,697.5,697.5,697.5
2,2015-01-06,682.5,682.5,682.5,682.5
3,2015-01-07,690.0,690.0,690.0,690.0
4,2015-01-08,705.0,705.0,705.0,705.0


In [7]:
# Iterate over lag periods
for lag_period in lag_periods:
    # Feature Engineering: Create lag features for the 'Close' price
    for i in range(1, lag_period + 1):
        data[f'lag_{i}'] = data['Close'].shift(i)
    # Drop rows with NaN resulting from lagged features
    data.dropna(inplace=True)
    # Split data into features (X) and target variable (y)
    X = data.drop(['Date', 'Close'], axis=1)
    y = data['Close']

    # Train-Test Split (using TimeSeriesSplit for sequential split)
    tscv = TimeSeriesSplit(n_splits=5)
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Initialize and train the Random Forest model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    rf_mse = mean_squared_error(y_test, rf_pred)
    model_results['Random Forest'].append(rf_mse)

    # Initialize and train the Gradient Boosting model
    gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
    gb_model.fit(X_train, y_train)
    gb_pred = gb_model.predict(X_test)
    gb_mse = mean_squared_error(y_test, gb_pred)
    model_results['Gradient Boosting'].append(gb_mse)

    # Initialize and train the Linear Regression model
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    lr_pred = lr_model.predict(X_test)
    lr_mse = mean_squared_error(y_test, lr_pred)
    model_results['Linear Regression'].append(lr_mse)

In [8]:
# Convert results to DataFrame for easy comparison of models
results_df = pd.DataFrame(model_results, index=lag_periods)
print(results_df)

    Random Forest  Gradient Boosting  Linear Regression
1       36.911372          32.344032       2.397706e-25
3       33.145869          32.945414       8.269705e-26
6       32.380161          35.967174       5.391144e-26
9       28.230743          32.172813       5.469225e-26
12      36.521533          36.734928       6.582743e-26


In [9]:
data.tail(5)

Unnamed: 0,Date,Open,High,Low,Close,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12
2229,2023-12-15,930.0,930.0,930.0,930.0,935.0,920.0,935.0,940.0,945.0,945.0,940.0,950.0,960.0,965.0,965.0,965.0
2230,2023-12-18,940.0,940.0,940.0,940.0,930.0,935.0,920.0,935.0,940.0,945.0,945.0,940.0,950.0,960.0,965.0,965.0
2231,2023-12-19,945.0,945.0,945.0,945.0,940.0,930.0,935.0,920.0,935.0,940.0,945.0,945.0,940.0,950.0,960.0,965.0
2232,2023-12-20,950.0,950.0,950.0,950.0,945.0,940.0,930.0,935.0,920.0,935.0,940.0,945.0,945.0,940.0,950.0,960.0
2233,2023-12-21,940.0,940.0,940.0,940.0,950.0,945.0,940.0,930.0,935.0,920.0,935.0,940.0,945.0,945.0,940.0,950.0
