Use the SMOTE technique to generate synthetic data for the given dataset and evaluate model performance.

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
from imblearn.over_sampling import SMOTE

In [3]:
# Bring the dataset into the Colab session through the upload widget.
from google.colab import files

# Prompts for a file; the returned mapping is keyed by the uploaded file name.
uploaded = files.upload()

Saving exp5_50_Startups.csv to exp5_50_Startups.csv


In [4]:
# Read the uploaded CSV file into a DataFrame.
# `uploaded` is keyed by file name; only the first uploaded file is used.
if not uploaded:
    # A cancelled upload leaves nothing to read; fail with a clear message
    # instead of an opaque IndexError.
    raise ValueError("No file was uploaded; re-run the upload cell and choose a CSV file.")
file_name = next(iter(uploaded.keys()))
df = pd.read_csv(file_name)

In [5]:
# Preview the first rows of the raw data under a heading.
print("Original dataset:", df.head(), sep="\n")

Original dataset:
   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94


In [6]:
# One-hot encode the categorical 'State' column. drop_first=True omits the
# first category, which is then represented by all remaining dummies being False.
df_encoded = pd.get_dummies(data=df, columns=['State'], drop_first=True)

# Preview the encoded frame under a heading.
print("\nEncoded dataset:", df_encoded.head(), sep="\n")


Encoded dataset:
   R&D Spend  Administration  Marketing Spend     Profit  State_Florida  \
0  165349.20       136897.80        471784.10  192261.83          False   
1  162597.70       151377.59        443898.53  191792.06          False   
2  153441.51       101145.55        407934.54  191050.39           True   
3  144372.41       118671.85        383199.62  182901.99          False   
4  142107.34        91391.77        366168.42  166187.94           True   

   State_New York  
0            True  
1           False  
2           False  
3            True  
4           False  


In [7]:
# SMOTE resamples by class label, so derive a two-class target from Profit:
# 1 when Profit is strictly above the median, 0 otherwise.
median_profit = df_encoded['Profit'].median()
df_encoded['Profit_Binary'] = df_encoded['Profit'].gt(median_profit).astype(int)

print("\nClass distribution (High vs Low Profit):")
print(df_encoded['Profit_Binary'].value_counts())

# Targets: the binary label for classification and the raw Profit for the
# regression comparison. Features are every remaining column.
y = df_encoded['Profit_Binary']
y_reg = df_encoded['Profit']
X = df_encoded.drop(columns=['Profit', 'Profit_Binary'])

# One shared 80/20 split keeps features and both targets row-aligned.
X_train, X_test, y_train, y_test, y_reg_train, y_reg_test = train_test_split(
    X, y, y_reg, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Class distribution (High vs Low Profit):
Profit_Binary
1    25
0    25
Name: count, dtype: int64


In [8]:
# Baseline regression on the untouched training split (no resampling).
print("\n--- Regression Model (Original Data) ---")
reg_model = LinearRegression().fit(X_train_scaled, y_reg_train)
y_reg_pred = reg_model.predict(X_test_scaled)

print(f"Mean Squared Error: {mean_squared_error(y_reg_test, y_reg_pred):.2f}")
print(f"R² Score: {r2_score(y_reg_test, y_reg_pred):.2f}")

# Baseline classifier fitted on the training split as-is, before any SMOTE
# resampling is applied.
print("\n--- Classification Model (Before SMOTE) ---")
model_before = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Score the held-out rows and report per-class metrics.
y_pred_before = model_before.predict(X_test_scaled)
print("\nClassification Report (Before SMOTE):",
      classification_report(y_test, y_pred_before),
      sep="\n")


--- Regression Model (Original Data) ---
Mean Squared Error: 82010363.05
R² Score: 0.90

--- Classification Model (Before SMOTE) ---

Classification Report (Before SMOTE):
              precision    recall  f1-score   support

           0       0.88      1.00      0.93         7
           1       1.00      0.67      0.80         3

    accuracy                           0.90        10
   macro avg       0.94      0.83      0.87        10
weighted avg       0.91      0.90      0.89        10



In [9]:
# Oversample the training split with SMOTE so both classes end up with the
# same number of rows. Only training data is resampled; the test split is
# left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Report how many rows each class has after resampling.
print("\nClass distribution after SMOTE:")
unique, counts = np.unique(y_train_smote, return_counts=True)
print({label: count for label, count in zip(unique, counts)})


Class distribution after SMOTE:
{np.int64(0): np.int64(22), np.int64(1): np.int64(22)}


In [10]:
# Fit the same classifier configuration on the SMOTE-resampled training data.
print("\n--- Classification Model (After SMOTE) ---")
model_after = LogisticRegression(random_state=42).fit(X_train_smote, y_train_smote)

# Evaluate on the original (non-resampled) test split so the report is
# directly comparable with the pre-SMOTE one.
y_pred_after = model_after.predict(X_test_scaled)
print("\nClassification Report (After SMOTE):",
      classification_report(y_test, y_pred_after),
      sep="\n")


--- Classification Model (After SMOTE) ---

Classification Report (After SMOTE):
              precision    recall  f1-score   support

           0       0.88      1.00      0.93         7
           1       1.00      0.67      0.80         3

    accuracy                           0.90        10
   macro avg       0.94      0.83      0.87        10
weighted avg       0.91      0.90      0.89        10



In [11]:
# Build regression targets for the SMOTE-resampled training set.
# SMOTE only yields feature rows and class labels, so each resampled row
# borrows the Profit of its nearest original training row. This is a
# simplifying heuristic, not a true target for the synthetic rows.
from sklearn.neighbors import NearestNeighbors

# Search in the standardized feature space, i.e. the space SMOTE resampled in.
# In raw units the large-magnitude spend columns dominate the Euclidean
# distance and the State dummy columns are effectively ignored.
knn = NearestNeighbors(n_neighbors=1)
knn.fit(X_train_scaled)

# With n_neighbors=1, `indices` holds one column: the position (within the
# training split) of the closest original row for every resampled row.
distances, indices = knn.kneighbors(X_train_smote)

# Positional lookup on the underlying array, so the shuffled pandas index of
# y_reg_train does not matter.
y_reg_train_smote = y_reg_train.to_numpy()[indices.ravel()]

# Train regression model on SMOTE-enhanced data and score it on the same
# untouched test split used for the baseline regression.
reg_model_smote = LinearRegression()
reg_model_smote.fit(X_train_smote, y_reg_train_smote)
y_reg_pred_smote = reg_model_smote.predict(X_test_scaled)

print(f"Mean Squared Error (after SMOTE): {mean_squared_error(y_reg_test, y_reg_pred_smote):.2f}")
print(f"R² Score (after SMOTE): {r2_score(y_reg_test, y_reg_pred_smote):.2f}")

Mean Squared Error (after SMOTE): 98899735.88
R² Score (after SMOTE): 0.88


In [12]:
# # Visualize the results
# plt.figure(figsize=(12, 8))

# # Plot original data distribution
# plt.subplot(2, 2, 1)
# sns.scatterplot(x=range(len(y_train)), y=y_train)
# plt.title('Original Class Distribution (Training Set)')
# plt.xlabel('Sample Index')
# plt.ylabel('Class (0=Low Profit, 1=High Profit)')

# # Plot SMOTE-enhanced data distribution
# plt.subplot(2, 2, 2)
# sns.scatterplot(x=range(len(y_train_smote)), y=y_train_smote)
# plt.title('SMOTE-Enhanced Class Distribution (Training Set)')
# plt.xlabel('Sample Index')
# plt.ylabel('Class (0=Low Profit, 1=High Profit)')

# # Plot regression predictions before SMOTE
# plt.subplot(2, 2, 3)
# plt.scatter(y_reg_test, y_reg_pred, color='blue', label='Original')
# plt.plot([y_reg_test.min(), y_reg_test.max()], [y_reg_test.min(), y_reg_test.max()], 'k--')
# plt.title('Actual vs Predicted Profit (Before SMOTE)')
# plt.xlabel('Actual Profit')
# plt.ylabel('Predicted Profit')

# # Plot regression predictions after SMOTE
# plt.subplot(2, 2, 4)
# plt.scatter(y_reg_test, y_reg_pred_smote, color='red', label='SMOTE-Enhanced')
# plt.plot([y_reg_test.min(), y_reg_test.max()], [y_reg_test.min(), y_reg_test.max()], 'k--')
# plt.title('Actual vs Predicted Profit (After SMOTE)')
# plt.xlabel('Actual Profit')
# plt.ylabel('Predicted Profit')

# plt.tight_layout()
# plt.show()

# # Feature importance comparison
# plt.figure(figsize=(10, 6))
# features = X.columns

# # For classification
# importance_before = abs(model_before.coef_[0])
# importance_after = abs(model_after.coef_[0])

# # Plot feature importance for classification
# plt.bar(features, importance_before, alpha=0.5, label='Before SMOTE')
# plt.bar(features, importance_after, alpha=0.5, label='After SMOTE')
# plt.xlabel('Features')
# plt.ylabel('Importance (Logistic Regression Coefficients)')
# plt.title('Feature Importance Before and After SMOTE (Classification)')
# plt.legend()
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

# # Feature importance for regression
# plt.figure(figsize=(10, 6))
# reg_importance_before = abs(reg_model.coef_)
# reg_importance_after = abs(reg_model_smote.coef_)

# plt.bar(features, reg_importance_before, alpha=0.5, label='Before SMOTE')
# plt.bar(features, reg_importance_after, alpha=0.5, label='After SMOTE')
# plt.xlabel('Features')
# plt.ylabel('Importance (Linear Regression Coefficients)')
# plt.title('Feature Importance Before and After SMOTE (Regression)')
# plt.legend()
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

# Side-by-side recap of the test-set metrics before and after SMOTE.
print(
    "\n--- SMOTE Impact Summary ---",
    f"Classification Accuracy (Before SMOTE): {accuracy_score(y_test, y_pred_before):.4f}",
    f"Classification Accuracy (After SMOTE): {accuracy_score(y_test, y_pred_after):.4f}",
    f"Regression R² (Before SMOTE): {r2_score(y_reg_test, y_reg_pred):.4f}",
    f"Regression R² (After SMOTE): {r2_score(y_reg_test, y_reg_pred_smote):.4f}",
    sep="\n",
)


--- SMOTE Impact Summary ---
Classification Accuracy (Before SMOTE): 0.9000
Classification Accuracy (After SMOTE): 0.9000
Regression R² (Before SMOTE): 0.8987
Regression R² (After SMOTE): 0.8779
