In [4]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load feature matrix with summary statistics.
# The location can be overridden through the FEATURE_MATRIX_PATH environment variable so
# the notebook is runnable on machines other than the one it was written on. When the
# variable is unset, the original hard-coded location is used, so existing runs behave
# exactly as before.
FEATURE_MATRIX_PATH = os.environ.get(
    "FEATURE_MATRIX_PATH",
    "C:\\Users\\parshav\\Desktop\\EdTech_Final\\feature_matrix.csv",
)
feature_matrix = pd.read_csv(FEATURE_MATRIX_PATH)

# Separate features and target columns: a column is a target when its name starts
# with 'TARGET'; every other column is treated as a feature.
feature_cols = [col for col in feature_matrix.columns if not col.startswith('TARGET')]
target_cols = [col for col in feature_matrix.columns if col.startswith('TARGET')]

X = feature_matrix[feature_cols]
y = feature_matrix[target_cols]

# Clean the feature table: treat +/-inf as missing, then discard every column that
# has no observed value left, because a column mean cannot be computed for it.
X = X.replace([np.inf, -np.inf], np.nan).dropna(axis=1, how='all')

# Labels of the columns that survived cleaning; they are re-attached to the
# transformed values when the scaled DataFrame is built below.
imputed_cols = X.columns

# Fill the remaining gaps with the mean of the respective column.
# NOTE(review): the imputer and the scaler below are fitted on every row; if a
# train/test split happens in a later cell, confirm that this is intended.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
X_imputed = imputer.transform(X)

# Standardize each feature (z-score normalization).
scaler = StandardScaler()
X_scaled = scaler.fit(X_imputed).transform(X_imputed)

# Wrap the scaled values in a DataFrame that carries the feature names again.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=imputed_cols)

# Remove low-variance features.
# The selector is fitted on the imputed, *unscaled* values. After z-scoring, every
# non-constant column has variance 1, so a threshold applied to X_scaled_df could only
# ever remove exactly-constant columns and the 1e-5 cut-off would have no effect.
# Scaling is done column by column, so the surviving columns can simply be picked out
# of the already-scaled table.
low_var_thresh = 1e-5
selector_var = VarianceThreshold(threshold=low_var_thresh)
selector_var.fit(X_imputed)

# Boolean mask over the columns of X_imputed / X_scaled_df (same order): True = keep.
keep_mask = selector_var.get_support()
cols_low_var = X_scaled_df.columns[keep_mask]

# Scaled values of the retained features, as a DataFrame and as a plain array.
df_low_var = X_scaled_df.loc[:, keep_mask].copy()
X_low_var = df_low_var.to_numpy()

# Prune redundant features: a column is discarded when its absolute correlation with
# any column positioned before it in the table exceeds 0.95.
corr_matrix = df_low_var.corr().abs()

# Keep only the entries strictly above the diagonal (everything else becomes NaN), so
# each pair is inspected once and a column is never compared with itself.
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(above_diagonal)

# Column-wise reduction; NaN entries compare as False and therefore never trigger a drop.
exceeds_limit = (upper > 0.95).any(axis=0)
to_drop = exceeds_limit[exceeds_limit].index.tolist()
df_final = df_low_var.drop(columns=to_drop)

print(f"Dropped {len(cols_low_var) - len(df_final.columns)} features due to high correlation")
print(f"Final features shape: {df_final.shape}")

# Write the selected features and the target columns to the working directory.
# Neither file carries the row index, so rows in the two files correspond by position.
for table, csv_name in ((df_final, 'features_preprocessed.csv'), (y, 'targets.csv')):
    table.to_csv(csv_name, index=False)

print("Preprocessing complete and saved to CSV.")


Dropped 153 features due to high correlation
Final features shape: (41, 363)
Preprocessing complete and saved to CSV.
