In [35]:
import pandas as pd

# Read the merged price/feature table and parse its dates so they can be compared.
combined_df = pd.read_csv("data/combined_data_and_features.csv")
combined_df["date"] = pd.to_datetime(combined_df["date"])

# Period boundaries; every bound is inclusive.
pre_covid_end = pd.Timestamp("2019-12-31")
covid_start = pd.Timestamp("2020-01-01")
covid_end = pd.Timestamp("2020-09-17")  # last date available in the data

# Partition the rows by date.
pre_covid_df = combined_df.loc[combined_df["date"].le(pre_covid_end)]
covid_df = combined_df.loc[combined_df["date"].between(covid_start, covid_end)]

# No post-COVID frame is built: the data stops at covid_end, so it would be empty.

# Persist each period so the later cells can load them independently.
pre_covid_df.to_csv("data/pre_covid_features.csv", index=False)
covid_df.to_csv("data/covid_features.csv", index=False)

print(f"Pre-COVID data shape: {pre_covid_df.shape}")
print(f"COVID data shape: {covid_df.shape}")
print("Data split by adjusted time periods and saved to CSV files.")




Pre-COVID data shape: (128, 159)
COVID data shape: (22, 159)
Data split by adjusted time periods and saved to CSV files.


In [39]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

def perform_pca_for_period(file_path, period_name):
    """Run PCA on one period's feature file; save the components and a variance plot.

    The numeric columns of the CSV at ``file_path`` (minus the raw price/volume
    columns listed below) are mean-imputed, standardised and reduced with PCA,
    keeping as many components as are needed to explain 95% of the variance.

    Side effects:
        * writes ``data/pca_features_{period_name}.csv`` holding the ``company``
          and ``date`` columns plus one column per principal component;
        * writes ``explained_variance_{period_name}.png`` to the working directory;
        * prints progress and summary lines.

    Args:
        file_path: Path of the CSV to read. It must contain ``company`` and
            ``date`` columns.
        period_name: Label used in printed messages and in output file names.

    Returns:
        None. Returns early, writing nothing, when the file has no rows or no
        usable numeric data remains after preprocessing.
    """
    print(f"\n--- Performing PCA for {period_name} period ---")
    df = pd.read_csv(file_path)

    if df.empty:
        print(f"No data for {period_name} period. Skipping PCA.")
        return

    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    # NOTE(review): this match is case-sensitive, so columns named e.g. "Close"
    # or "Open" are NOT excluded and end up in the PCA input. Confirm whether
    # that is intended; analyze_feature_importance_for_period repeats the same
    # selection and must be changed together with this one.
    exclude_cols = [col for col in ["open", "high", "low", "close", "adj_close", "volume"] if col in numerical_cols]
    features_for_pca = [col for col in numerical_cols if col not in exclude_cols]

    # Drop columns that are entirely NaN, mean-impute the rest, then drop any
    # row that still has a NaN (not expected once imputation has run).
    X = df[features_for_pca].dropna(axis=1, how="all")
    X = X.fillna(X.mean())
    X = X.dropna()

    if X.empty:
        print(f"No valid numerical data for {period_name} period after preprocessing. Skipping PCA.")
        return

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # A float n_components asks PCA for the smallest number of components whose
    # cumulative explained variance reaches that fraction.
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    print(f"Number of components to retain 95% variance for {period_name}: {pca.n_components_}")
    print(f"Cumulative explained variance for {period_name}: {np.sum(pca.explained_variance_ratio_)}")

    # Reuse X's index for the component frame: pd.concat(axis=1) aligns on index
    # labels, so a fresh RangeIndex would misalign rows (and pad with NaN) if
    # any row had been dropped above.
    component_names = [f"principal_component_{i+1}" for i in range(pca.n_components_)]
    pca_df = pd.DataFrame(data=X_pca, columns=component_names, index=X.index)
    pca_df = pd.concat([df.loc[X.index, ["company", "date"]], pca_df], axis=1)
    pca_path = f"data/pca_features_{period_name}.csv"
    pca_df.to_csv(pca_path, index=False)

    # Plot against 1..n so the x axis really is the number of components
    # (plotting the bare array would start the axis at 0).
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
    ax.set_xlabel("Number of Components")
    ax.set_ylabel("Cumulative Explained Variance")
    ax.set_title(f"Explained Variance by Number of Components ({period_name})")
    ax.grid(True)
    plot_path = f"explained_variance_{period_name}.png"
    fig.savefig(plot_path)
    plt.close(fig)

    # Report the paths actually written (the CSV lives under data/).
    print(f"PCA analysis complete for {period_name}. PCA features saved to {pca_path}")
    print(f"Explained variance plot saved to {plot_path}")

# Run the PCA step once per period that has data.
perform_pca_for_period(file_path="data/pre_covid_features.csv", period_name="pre_covid")
perform_pca_for_period(file_path="data/covid_features.csv", period_name="covid")
# There is no post-COVID run: that period is empty in the current data.





--- Performing PCA for pre_covid period ---
Number of components to retain 95% variance for pre_covid: 37
Cumulative explained variance for pre_covid: 0.9520716743187168
PCA analysis complete for pre_covid. PCA features saved to pca_features_pre_covid.csv
Explained variance plot saved to explained_variance_pre_covid.png

--- Performing PCA for covid period ---
Number of components to retain 95% variance for covid: 14
Cumulative explained variance for covid: 0.957323171447637
PCA analysis complete for covid. PCA features saved to pca_features_covid.csv
Explained variance plot saved to explained_variance_covid.png


In [40]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


def analyze_feature_importance_for_period(pca_file_path, combined_file_path, period_name):
    """Rank PCA components by Random Forest importance and show their top loadings.

    Steps:
        1. Join the saved PCA components with the ``close`` column of the
           period's feature file on ``company`` and ``date``.
        2. Fit a ``RandomForestRegressor`` predicting ``close`` from the
           components on an 80/20 split and print the ten most important
           components (plus the held-out R^2 when it can be computed).
        3. Re-fit the PCA on the period's feature file and, for up to the first
           five components, print the ten original features with the largest
           absolute loadings.

    Args:
        pca_file_path: CSV with ``company``, ``date`` and the component columns.
        combined_file_path: CSV of the period's features; must contain
            ``company``, ``date`` and ``close``.
        period_name: Label used in printed messages.

    Returns:
        None. Everything is reported through ``print``. Returns early when an
        input file has no rows, when fewer than two rows survive the merge, or
        when no usable numeric data remains for the PCA re-fit.
    """
    print(f"\n--- Analyzing Feature Importance for {period_name} period ---")
    pca_df = pd.read_csv(pca_file_path)
    combined_df = pd.read_csv(combined_file_path)

    if pca_df.empty or combined_df.empty:
        print(f"No data for {period_name} period. Skipping feature importance analysis.")
        return

    # Both date columns must share a dtype for the merge keys to match.
    pca_df["date"] = pd.to_datetime(pca_df["date"])
    combined_df["date"] = pd.to_datetime(combined_df["date"])

    # Attach the target ('close') to the PCA components.
    merged_data = pd.merge(pca_df, combined_df[["company", "date", "close"]], on=["company", "date"], how="inner")

    # With fewer than two rows the 80/20 split below would leave an empty
    # training set and raise, so stop with a clear message instead.
    if len(merged_data) < 2:
        print(f"Not enough matching rows for {period_name} period after merging. Skipping feature importance analysis.")
        return

    # Features are the component columns; the target is the close price.
    X = merged_data.drop(columns=["company", "date", "close"])
    y = merged_data["close"]

    # Mean-impute any NaNs left in the components.
    X = X.fillna(X.mean())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_model.fit(X_train, y_train)

    # Importances are only meaningful if the model generalises, so report the
    # held-out fit. R^2 is undefined for a single sample, hence the guard.
    if len(X_test) >= 2:
        print(f"Held-out R^2 for {period_name}: {rf_model.score(X_test, y_test):.3f}")

    features_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    print(f"\n--- Top Feature Importances from Random Forest (PCA Components) for {period_name} ---")
    print(features_df.head(10))  # top 10 PCA component importances

    # To relate components back to the original factors, rebuild the PCA using
    # the same preprocessing that produced the saved components.
    numerical_cols = combined_df.select_dtypes(include=np.number).columns.tolist()
    # NOTE(review): this match is case-sensitive, so columns named e.g. "Close"
    # or "Open" are NOT excluded and can appear among the loadings below; the
    # target may therefore leak into the components. Confirm the column names;
    # any change here must be mirrored in perform_pca_for_period.
    exclude_cols = [col for col in ["open", "high", "low", "close", "adj_close", "volume"] if col in numerical_cols]
    features_for_pca = [col for col in numerical_cols if col not in exclude_cols]

    X_original = combined_df[features_for_pca].dropna(axis=1, how="all")
    X_original = X_original.fillna(X_original.mean())
    X_original = X_original.dropna()  # drop rows with any remaining NaNs

    if X_original.empty:
        print(f"No valid original numerical data for {period_name} period after preprocessing. Cannot analyze original feature contributions.")
        return

    scaler = StandardScaler()
    X_scaled_original = scaler.fit_transform(X_original)

    pca = PCA(n_components=0.95)
    pca.fit(X_scaled_original)

    # The loadings describe the saved components only if this re-fit matches
    # the PCA that produced them; a differing component count means it does not.
    if pca.n_components_ != X.shape[1]:
        print(f"Warning: re-fitted PCA has {pca.n_components_} components but {pca_file_path} has {X.shape[1]}; the loadings below may not match the saved components.")

    print(f"\n--- Top contributing original features to the first few Principal Components for {period_name} ---")
    num_components_to_show = min(5, pca.n_components_)
    for i in range(num_components_to_show):
        loadings = pd.Series(pca.components_[i], index=X_original.columns)
        # Rank by magnitude: the sign of a loading only gives its direction.
        sorted_loadings = loadings.abs().sort_values(ascending=False)
        print(f"\nPrincipal Component {i+1} (Explained Variance: {pca.explained_variance_ratio_[i]:.2f}):")
        print(sorted_loadings.head(10))  # top 10 contributing features

# Run the feature-importance analysis once per period that has data.
analyze_feature_importance_for_period(
    pca_file_path="data/pca_features_pre_covid.csv",
    combined_file_path="data/pre_covid_features.csv",
    period_name="pre_covid",
)
analyze_feature_importance_for_period(
    pca_file_path="data/pca_features_covid.csv",
    combined_file_path="data/covid_features.csv",
    period_name="covid",
)
# There is no post-COVID run: that period is not available in the current data.





--- Analyzing Feature Importance for pre_covid period ---

--- Top Feature Importances from Random Forest (PCA Components) for pre_covid ---
                   Feature  Importance
7    principal_component_8    0.247749
6    principal_component_7    0.171121
0    principal_component_1    0.052269
10  principal_component_11    0.052153
1    principal_component_2    0.047491
4    principal_component_5    0.044097
27  principal_component_28    0.041488
5    principal_component_6    0.035059
17  principal_component_18    0.026956
3    principal_component_4    0.026472

--- Top contributing original features to the first few Principal Components for pre_covid ---

Principal Component 1 (Explained Variance: 0.26):
Close                  0.154729
Low                    0.154689
High                   0.154424
Open                   0.154213
trend_ichimoku_conv    0.153609
momentum_kama          0.153310
volatility_kcl         0.153283
trend_ichimoku_a       0.153120
trend_ema_fast         0.1