In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler



In [None]:
# Pull the label column out, and keep every remaining column as the feature matrix
y = df['target']
X = df.drop(columns='target')

# Standardize the features; the fitted scaler is kept so the same
# transformation can be reapplied later
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
# Fit an unrestricted PCA on the standardized features.
# n_components=None is the default and retains every component, so the
# explained-variance profile of the full decomposition can be inspected.
pca = PCA(n_components=None)
X_pca = pca.fit_transform(X_scaled)


In [None]:
# Plot cumulative explained variance against the number of components kept.
# plt.plot(y) on its own would place the points at x = 0..n-1, which does not
# match the "Number of Components" axis (the first component would sit at 0),
# so the x-values are given explicitly as 1..n.
cumulative_curve = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_curve) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_curve, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by PCA Components')
plt.grid(True)
plt.show()


In [None]:
# Keep only the two leading principal components and attach the labels.
# y.values is a plain array, so the labels are matched to rows by position.
leading_components = pd.DataFrame(X_pca[:, :2], columns=['PC1', 'PC2'])
pca_df = leading_components.assign(target=y.values)

# 2-D scatter of the projection, coloured by target class
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='target', palette='Set1', ax=ax)
ax.set_title('PCA Projection (First 2 Components)')
plt.show()


In [None]:
# Smallest number of components whose cumulative explained variance
# reaches at least 95%
cumulative_variance = pca.explained_variance_ratio_.cumsum()
reaches_95 = cumulative_variance >= 0.95
# argmax on a boolean array returns the position of the first True;
# adding 1 turns that zero-based position into a component count
n_components_95 = reaches_95.argmax() + 1
print(f'Number of components explaining 95% variance: {n_components_95}')


In [None]:
# Refit PCA on the standardized features, this time keeping 12 components.
# NOTE(review): 12 is presumably the count printed by the 95%-variance cell — confirm
pca_12 = PCA(n_components=12)
X_pca_12 = pca_12.fit_transform(X_scaled)

# Report the dimensions of the reduced matrix as a sanity check
reduced_shape = X_pca_12.shape
print("Shape after PCA:", reduced_shape)


In [None]:
# Create a DataFrame with the reduced principal components.
# The column names are derived from the width of X_pca_12 instead of a
# hard-coded 12, so they always match the number of components actually
# present and the DataFrame constructor cannot fail on a count mismatch.
n_reduced = X_pca_12.shape[1]
pca_columns = [f'PC{i}' for i in range(1, n_reduced + 1)]
pca_df_12 = pd.DataFrame(X_pca_12, columns=pca_columns)
# y.values is a plain array, so labels are attached by row position
pca_df_12['target'] = y.values

# Show first 5 rows
print(pca_df_12.head())


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the original (unscaled, non-PCA) features;
# fit() returns the estimator itself, so construction and fitting are chained.
# random_state is fixed so the importances are reproducible.
model_rf = RandomForestClassifier(random_state=42).fit(X, y)

# Per-feature importance scores, in the same order as the feature columns
features = X.columns
importances = model_rf.feature_importances_

# Horizontal bar chart: one bar per feature, coloured via hue with the
# redundant legend suppressed
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=importances,
    y=features,
    hue=features,
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_title('Feature Importance from Random Forest')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()
