In [64]:
# Imports
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import numpy as np
from cuml.svm import SVR as cumlSVR
# from sklearn.ensemble import RandomForestRegressor
from cuml.ensemble import RandomForestRegressor
import cudf

In [80]:
# Read the first 200,000 rows of the final dataset CSV into a pandas DataFrame
final_dataset_path = "data/final_dataset.csv"
final_dataset = pd.read_csv(filepath_or_buffer=final_dataset_path, nrows=200000)

In [81]:
# Separate the model inputs from the regression target by column position
final_features = final_dataset.iloc[:, slice(1, 418)]  # positions 1..417; column 0 is left out
final_output = final_dataset.iloc[:, [421]]  # list indexer keeps a one-column DataFrame

In [45]:
def number_of_components(df, variance_threshold=0.95):
    """
    Determines the minimum number of principal components required to retain a
    target fraction of the variance in the data (95% by default).

    A full PCA (all components) is fitted on ``df`` and the cumulative explained
    variance ratio is scanned for the first component count that reaches the target.

    Parameters:
    df (DataFrame): Scaled Input Features where each row is a sample and each column is a feature.
        No scaling is performed inside this function.
    variance_threshold (float): Fraction of the total variance to retain, in (0, 1].
        Defaults to 0.95.

    Returns:
    tuple: A tuple containing:
        - n_components (int): The number of principal components required to reach
          ``variance_threshold``. If the threshold is never reached (e.g. because of
          floating-point round-off when asking for 1.0), all components are returned.
        - cumulative_explained_variance (float): The actual cumulative explained variance
          achieved with the selected number of components.

    Raises:
    ValueError: If ``variance_threshold`` is not within (0, 1].
    """
    if not 0 < variance_threshold <= 1:
        raise ValueError(
            f"variance_threshold must be in the interval (0, 1], got {variance_threshold!r}"
        )

    pca = PCA()
    pca.fit(df)

    # Running total of the per-component explained variance ratios
    cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)

    reached = cumulative_explained_variance >= variance_threshold
    if reached.any():
        # argmax on a boolean mask gives the index of the first True
        n_components = int(np.argmax(reached)) + 1
    else:
        # argmax would return 0 for an all-False mask and silently report a single
        # component; fall back to keeping every component instead.
        n_components = len(cumulative_explained_variance)

    return (n_components, float(cumulative_explained_variance[n_components - 1]))

In [46]:
def pca_transform(df, n_components):
    """
    Applies PCA to reduce the dimensionality of a DataFrame.

    The PCA is fitted on ``df`` and then used to transform that same ``df``.

    Parameters:
    df (DataFrame): Scaled Input Features. No scaling is performed inside this function.
    n_components (int): The number of components to keep (e.g. the count required to
        retain 95% of variance). The value is passed straight to ``PCA``.

    Returns:
    DataFrame: Returns a pd.DataFrame with the PCA transformation. Columns are named
        "PC1", "PC2", ... and, when ``df`` is a DataFrame, its row index is preserved so
        the result stays label-aligned with other frames derived from the same data.
    """
    pca = PCA(n_components=n_components)
    pca.fit(df)
    df_pca = pca.transform(df)

    # Name columns from the width actually produced, so the labels stay correct even
    # when n_components is not a plain integer count.
    columns = [f"PC{i+1}" for i in range(df_pca.shape[1])]

    # Carry over the caller's row labels; non-DataFrame input gets the default index.
    index = df.index if isinstance(df, pd.DataFrame) else None

    return pd.DataFrame(df_pca, columns=columns, index=index)

In [7]:
# Visualise the pairwise correlations between the input features of the final data
sns.heatmap(data=final_features.corr())

In [8]:
# Work out how many principal components the final data needs, and the variance they retain
n_components, cumulative_explained_variance = number_of_components(final_features)

# Report both values
print(f"Number of components to retain 95% of variance: {n_components}")
print(f"Cumulative explained variance for the selected components: {cumulative_explained_variance}")

In [9]:
# Project the features onto the selected principal components, then plot their correlations
final_pca = pca_transform(df=final_features, n_components=n_components)
sns.heatmap(data=final_pca.corr())

In [73]:
# Random forest regression on the PCA-reduced features.
# `n_estimators` looks like a list of candidate forest sizes for a sweep that is
# currently disabled (see the commented-out loop); only a 200-tree forest is trained.
n_estimators = [100, 150, 200]
mses = []
r2s = []
# for n_estimator in n_estimators:
# NOTE(review): this overwrites any n_components value set by an earlier cell.
n_components = 300
# NOTE(review): the PCA is fitted on every row before the train/test split, so the
# held-out rows influence the projection — confirm this is intended.
final_pca = pca_transform(final_features, n_components)
X_train, X_test, y_train, y_test = train_test_split(final_pca, final_output, test_size=0.2, random_state=42)
rf_regressor = RandomForestRegressor(n_estimators=200, random_state=42)
rf_regressor.fit(X_train, y_train)
y_pred = rf_regressor.predict(X_test)
# Record both metrics so `r2s` is populated alongside `mses`
mses.append(mean_squared_error(y_test, y_pred))
r2s.append(r2_score(y_test, y_pred))

  return func(**kwargs)
  ret = func(*args, **kwargs)


In [6]:
# Candidate PCA dimensionalities to evaluate; extend the list to experiment with more
components = [100]
mses = []
r2s = []
for n_components in components:
    # Reduce the features to the current number of principal components
    final_pca = pca_transform(final_features, n_components)
    X_train, X_test, y_train, y_test = train_test_split(
        final_pca,
        final_output,
        test_size=0.2,
        random_state=42,
    )
    # cuML support vector regressor with an RBF kernel. The LinearSVR speed-up only
    # applies to a linear kernel, so it would be relevant only if the kernel is changed.
    svr_model = cumlSVR(kernel="rbf", C=1.0, epsilon=0.1)
    svr_model.fit(X_train, y_train)
    y_pred = svr_model.predict(X_test)
    # Store the hold-out error and R^2 for this component count
    mses.append(mean_squared_error(y_true=y_test, y_pred=y_pred))
    r2s.append(r2_score(y_true=y_test, y_pred=y_pred))