In [25]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
import numpy as np

In [10]:
# Initialize Faker
faker = Faker()

# Generate synthetic dataset
n_samples = 1000
data = {
    'Customer_ID': [faker.uuid4() for _ in range(n_samples)],
    'Age': np.random.randint(18, 70, size=n_samples),
    'Gender': np.random.choice(['Male', 'Female', 'Other'], size=n_samples),
    'Annual_Income': np.random.uniform(15000, 120000, size=n_samples),
    'Credit_Score': np.random.uniform(300, 850, size=n_samples),
    'Transaction_Amount': np.random.uniform(10, 5000, size=n_samples),
    'Purchase_Frequency': np.random.randint(1, 20, size=n_samples),
    'Region': np.random.choice(['North', 'South', 'East', 'West'], size=n_samples),
    'Customer_Satisfaction': np.random.choice([1, 2, 3, 4, 5], size=n_samples),
    'Join_Date': [faker.date_this_decade() for _ in range(n_samples)]
}

df = pd.DataFrame(data)

# Introduce inconsistencies
df.loc[random.sample(range(n_samples), 50), 'Age'] = np.nan  # Missing values
df.loc[random.sample(range(n_samples), 30), 'Annual_Income'] = np.nan
df.loc[random.sample(range(n_samples), 5), 'Transaction_Amount'] = 10000  # Outliers
df = pd.concat([df, df.iloc[0:10]])  # Duplicates

# Save raw dataset
df.to_csv('synthetic_customer_data.csv', index=False)
print("Dataset generated and saved as 'synthetic_customer_data.csv'")


Dataset generated and saved as 'synthetic_customer_data.csv'


In [16]:
# Load dataset
df_raw = pd.read_csv('synthetic_customer_data.csv')

# Handle missing values
imputer = SimpleImputer(strategy='mean')
df_raw['Age'] = imputer.fit_transform(df_raw[['Age']])
df_raw['Annual_Income'] = imputer.fit_transform(df_raw[['Annual_Income']])

# Drop duplicates
df_cleaned = df_raw.drop_duplicates().reset_index(drop=True)

# Handle outliers in Transaction_Amount
df_cleaned.loc[df_cleaned['Transaction_Amount'] > 5000, 'Transaction_Amount'] = df_cleaned['Transaction_Amount'].median()

# Encode categorical variables
categorical_features = ['Gender', 'Region']
df_encoded = pd.get_dummies(df_cleaned, columns=categorical_features, drop_first=True)

# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['Age', 'Annual_Income', 'Credit_Score', 'Transaction_Amount', 'Purchase_Frequency']
df_encoded[numerical_features] = scaler.fit_transform(df_encoded[numerical_features].copy())

# Discretize Age into bins
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
df_encoded['Age_Binned'] = discretizer.fit_transform(df_encoded[['Age']].copy())

# Dimensionality reduction using PCA
pca = PCA(n_components=2)
pca_features = pca.fit_transform(df_encoded[numerical_features].copy())
df_encoded.loc[:, 'PCA_1'] = pca_features[:, 0]
df_encoded.loc[:, 'PCA_2'] = pca_features[:, 1]

# Save preprocessed dataset
df_encoded.to_csv('preprocessed_customer_data.csv', index=False)
print("Preprocessing completed and saved as 'preprocessed_customer_data.csv'")


Preprocessing completed and saved as 'preprocessed_customer_data.csv'


In [23]:
# Load datasets
df_raw = pd.read_csv('synthetic_customer_data.csv')
df_preprocessed = pd.read_csv('preprocessed_customer_data.csv')

# Minimal preprocessing for raw data
df_raw['Age'] = df_raw['Age'].fillna(df_raw['Age'].mean())
df_raw['Annual_Income'] = df_raw['Annual_Income'].fillna(df_raw['Annual_Income'].mean())
df_raw = pd.get_dummies(df_raw, columns=['Gender', 'Region'], drop_first=True)

# Raw data: Features and target
X_raw = df_raw.drop(['Customer_ID', 'Transaction_Amount', 'Join_Date'], axis=1)
y_raw = df_raw['Transaction_Amount']
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42)
model_raw = RandomForestRegressor(random_state=42)
model_raw.fit(X_train_raw, y_train_raw)
y_pred_raw = model_raw.predict(X_test_raw)

# Metrics for raw data
rmse_raw = np.sqrt(mean_squared_error(y_test_raw, y_pred_raw))
r2_raw = r2_score(y_test_raw, y_pred_raw)

# Preprocessed data: Features and target
X_preprocessed = df_preprocessed.drop(['Customer_ID', 'Transaction_Amount', 'Join_Date'], axis=1)
y_preprocessed = df_preprocessed['Transaction_Amount']
X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_preprocessed, y_preprocessed, test_size=0.2, random_state=42)
model_preprocessed = RandomForestRegressor(random_state=42)
model_preprocessed.fit(X_train_pre, y_train_pre)
y_pred_pre = model_preprocessed.predict(X_test_pre)

# Metrics for preprocessed data
rmse_preprocessed = np.sqrt(mean_squared_error(y_test_pre, y_pred_pre))
r2_preprocessed = r2_score(y_test_pre, y_pred_pre)

# Print results
print(f"RMSE on Raw Data: {rmse_raw}")
print(f"R-squared on Raw Data: {r2_raw}")
print(f"RMSE on Preprocessed Data: {rmse_preprocessed}")
print(f"R-squared on Preprocessed Data: {r2_preprocessed}")

RMSE on Raw Data: 1476.6117937212434
R-squared on Raw Data: -0.056488930992956154
RMSE on Preprocessed Data: 0.9382753739804405
R-squared on Preprocessed Data: 0.046337407293424504


In [24]:
# Streamlit Dashboard
st.title("Impact of Preprocessing on Model Performance")

# Display Metrics
st.header("Model Performance Comparison")
st.write(f"RMSE on Raw Data: {rmse_raw}")
st.write(f"R-squared on Raw Data: {r2_raw}")
st.write(f"RMSE on Preprocessed Data: {rmse_preprocessed}")
st.write(f"R-squared on Preprocessed Data: {r2_preprocessed}")

# RMSE and R-squared Bar Chart
st.header("Performance Metrics Comparison")
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# RMSE Bar Chart
ax[0].bar(['Raw Data', 'Preprocessed Data'], [rmse_raw, rmse_preprocessed], color=['red', 'green'])
ax[0].set_title("RMSE Comparison")
ax[0].set_ylabel("RMSE")

# R-squared Bar Chart
ax[1].bar(['Raw Data', 'Preprocessed Data'], [r2_raw, r2_preprocessed], color=['blue', 'orange'])
ax[1].set_title("R-squared Comparison")
ax[1].set_ylabel("R-squared")

st.pyplot(fig)


NameError: name 'st' is not defined