In [1]:
import pandas as pd
import numpy as np

# Machine Learning and Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Model Evaluation
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# 1. Load the data
df = pd.read_csv("superstore.csv")


In [2]:
# ----------------------------------------------------------------------------
# 2. Basic Data Inspection & Cleaning
# ----------------------------------------------------------------------------

# (a) Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# If there are many missing values, consider your imputation strategy carefully.
# For simplicity, we'll drop rows with any NA:
df.dropna(inplace=True)

# (b) Remove duplicates if they exist
df.drop_duplicates(inplace=True)

Missing values per column:
Category          0
City              0
Country           0
Customer.ID       0
Customer.Name     0
Discount          0
Market            0
记录数               0
Order.Date        0
Order.ID          0
Order.Priority    0
Product.ID        0
Product.Name      0
Profit            0
Quantity          0
Region            0
Row.ID            0
Sales             0
Segment           0
Ship.Date         0
Ship.Mode         0
Shipping.Cost     0
State             0
Sub.Category      0
Year              0
Market2           0
weeknum           0
dtype: int64


In [3]:
# ----------------------------------------------------------------------------
# 3. Select Columns & Feature Engineering
# ----------------------------------------------------------------------------

# We'll choose 'Segment' as the target. Adjust if you prefer a different target.
target_col = "Segment"
if target_col not in df.columns:
    raise ValueError(f"Column '{target_col}' not found in the dataset.")

# Remove columns that won't help the model or are basically unique IDs.
# Adjust this list as necessary for your analysis.
cols_to_drop = [
    "Row ID", "Order ID", "Order Date", "Ship Date",
    "Customer ID", "Customer Name", "Product ID", 
    "Product Name", "Postal Code"
]
for col in cols_to_drop:
    if col in df.columns:
        df.drop(col, axis=1, inplace=True)

# (a) Separate features (X) and target (y)
y = df[target_col]
X = df.drop(target_col, axis=1)

# (b) Identify which columns are categorical vs. numeric
categorical_cols = X.select_dtypes(include=["object"]).columns

# (c) Encode the categorical features with label encoding
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# (d) Encode the target if it is categorical (Segment is categorical)
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(y)

In [4]:
# ----------------------------------------------------------------------------
# 4. Train/Test Split
# ----------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,          # 20% of the data for testing
    random_state=42,        # for reproducibility
    stratify=y              # helps preserve class distribution
)


In [5]:
# ----------------------------------------------------------------------------
# 5. Model Training: Random Forest Classifier
# ----------------------------------------------------------------------------
rf_classifier = RandomForestClassifier(
    n_estimators=100,       # number of trees in the forest
    random_state=42
)
rf_classifier.fit(X_train, y_train)

In [6]:
# ----------------------------------------------------------------------------
# 6. Predictions & Evaluation
# ----------------------------------------------------------------------------
y_pred = rf_classifier.predict(X_test)

# (a) Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# (b) Classification Report (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=y_encoder.classes_))

# (c) Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



Confusion Matrix:
[[5182  110   11]
 [ 797 2283    6]
 [ 600   89 1180]]

Classification Report:
              precision    recall  f1-score   support

    Consumer       0.79      0.98      0.87      5303
   Corporate       0.92      0.74      0.82      3086
 Home Office       0.99      0.63      0.77      1869

    accuracy                           0.84     10258
   macro avg       0.90      0.78      0.82     10258
weighted avg       0.86      0.84      0.84     10258

Accuracy: 0.8427568726847339
