In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def _report_regression_metrics(label, y_true, y_pred):
    """Print and return RMSE and R2 for one model's predictions.

    Parameters
    ----------
    label : str
        Prefix used in the printed lines (e.g. "Decision Tree").
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(rmse, r2)`` as computed by ``mean_squared_error`` / ``r2_score``.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{label} RMSE: {rmse:.2f}")
    print(f"{label} R2 Score: {r2:.2f}")
    return rmse, r2


# Load the PCA feature table and the combined table that carries the
# "close" column used as the regression target.
pca_df = pd.read_csv("data/pca_features.csv")
combined_df = pd.read_csv("data/combined_data_and_features.csv")

# Parse "date" on both sides so the merge keys compare as datetimes.
pca_df["date"] = pd.to_datetime(pca_df["date"])
combined_df["date"] = pd.to_datetime(combined_df["date"])

# Attach the target to each (company, date) feature row; rows without a
# match on either side are dropped by the inner join.
merged_data = pd.merge(pca_df, combined_df[["company", "date", "close"]], on=["company", "date"], how="inner")

X = merged_data.drop(columns=["company", "date", "close"])
y = merged_data["close"]

# Split BEFORE imputing: filling with means computed over the full dataset
# would let held-out rows influence the training features. The split itself
# is unchanged (same random_state, same number of rows).
# NOTE(review): rows carry a "date" key, so a random split mixes earlier and
# later observations -- consider a chronological split if this is a forecast.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
# Keep X itself free of NaNs for later cells, using the same training-only statistics.
X = X.fillna(train_means)

print("Decision Tree Regressor")
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

y_pred_dt = dt_model.predict(X_test)
rmse_dt, r2_dt = _report_regression_metrics("Decision Tree", y_test, y_pred_dt)

print("\nRandom Forest Regressor")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)  # n_jobs=-1 uses all available cores
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
rmse_rf, r2_rf = _report_regression_metrics("Random Forest", y_test, y_pred_rf)

print("Model training complete.")





--- Decision Tree Regressor ---
Decision Tree RMSE: 349.37
Decision Tree R2 Score: 0.63

--- Random Forest Regressor ---
Random Forest RMSE: 352.51
Random Forest R2 Score: 0.62
Model training complete.


In [None]:


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Rank the model inputs (columns of X) by the importance the fitted
# random forest assigned to each, most important first.
feature_importances = rf_model.feature_importances_
features_df = pd.DataFrame(
    {"Feature": X.columns, "Importance": feature_importances}
).sort_values(by="Importance", ascending=False)

print("\nFeature Importances from Random Forest -- PCA Components")
print(features_df)

# Select the numeric columns of the combined table, leaving out the
# price/volume columns listed below when they are present.
numerical_cols = combined_df.select_dtypes(include=np.number).columns.tolist()
exclude_cols = [
    col
    for col in ["open", "high", "low", "close", "adj_close", "volume"]
    if col in numerical_cols
]
features_for_pca = [col for col in numerical_cols if col not in exclude_cols]

# Drop columns that are entirely missing, then fill the remaining gaps
# with each column's mean.
X_original = combined_df[features_for_pca].dropna(axis=1, how="all")
X_original = X_original.fillna(X_original.mean())

# Standardise, then fit a PCA that keeps enough components to explain
# 95% of the variance.
# NOTE(review): this PCA is refit here; whether it matches the components
# stored in data/pca_features.csv is not visible from this cell -- confirm.
scaler = StandardScaler()
X_scaled_original = scaler.fit_transform(X_original)
pca = PCA(n_components=0.95).fit(X_scaled_original)

print("\n--- Top contributing original features to the first few Principal Components ---")
num_components_to_show = min(5, pca.n_components_)
for i, component in enumerate(pca.components_[:num_components_to_show]):
    # Label each loading with its source column, then rank by magnitude
    # (the sign of the loading is discarded by abs()).
    loadings = pd.Series(component, index=X_original.columns)
    sorted_loadings = loadings.abs().sort_values(ascending=False)
    print(f"\nPrincipal Component {i+1} (Explained Variance: {pca.explained_variance_ratio_[i]:.2f}):")
    print(sorted_loadings.head(10))  # Show top 10 contributing features





--- Feature Importances from Random Forest (PCA Components) ---
                   Feature  Importance
7    principal_component_8    0.301159
5    principal_component_6    0.193196
9   principal_component_10    0.091175
0    principal_component_1    0.050520
3    principal_component_4    0.049471
8    principal_component_9    0.045738
11  principal_component_12    0.026915
22  principal_component_23    0.025409
31  principal_component_32    0.022996
2    principal_component_3    0.016270
15  principal_component_16    0.013168
4    principal_component_5    0.012232
12  principal_component_13    0.011283
33  principal_component_34    0.010898
26  principal_component_27    0.010822
10  principal_component_11    0.010241
6    principal_component_7    0.009589
38  principal_component_39    0.009213
21  principal_component_22    0.008261
17  principal_component_18    0.007995
24  principal_component_25    0.007642
1    principal_component_2    0.006780
13  principal_component_14    0.006549