In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# -------------------------------
# 1. Load Data
# -------------------------------
# UCI Horse Colic files: whitespace-separated, no header row, and '?' marks
# a missing value.
train_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.data"
test_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.test"

# Column labels, applied positionally because the files carry no header.
names = [
    "surgery", "age", "hospital_number", "rectal_temperature", "pulse", "respiratory_rate",
    "temperature_extremities", "peripheral_pulse", "mucous_membranes", "capillary_refill_time",
    "pain", "peristalsis", "abdominal_distension", "nasogastric_tube", "nasogastric_reflux",
    "nasogastric_reflux_ph", "feces", "abdomen", "packed_cell_volume", "total_protein",
    "abdomcentesis_appearance", "abdomcentesis_total_protein", "outcome", "surgical_lesion",
    "lesion_1", "lesion_2", "lesion_3", "cp_data"
]

# `sep=r"\s+"` splits on any run of whitespace. It replaces the deprecated
# `delim_whitespace=True`, which raises a FutureWarning on every call.
df_train = pd.read_csv(train_url, sep=r"\s+", names=names, na_values='?')
df_test = pd.read_csv(test_url, sep=r"\s+", names=names, na_values='?')

# Stack both files into one frame with a fresh 0..n-1 index.
df = pd.concat([df_train, df_test], ignore_index=True)

# -------------------------------
# 2. Drop unneeded columns
# 3. Clean target column
# -------------------------------
# Built as one non-mutating chain: each step returns a new frame, so the target
# is never assigned into a filtered slice (no SettingWithCopyWarning) and
# nothing is modified in place.
df = (
    df
    # Identifier and lesion-code columns are not used as features.
    .drop(columns=["hospital_number", "lesion_1", "lesion_2", "lesion_3", "cp_data"])
    # Rows without a target value cannot be used.
    .dropna(subset=["surgical_lesion"])
    # Recode the target to 1/0. Any value other than 1 or 2 is absent from the
    # mapping and therefore becomes NaN.
    # NOTE(review): presumably 1 = yes and 2 = no in the raw data; confirm
    # against the data set description.
    .assign(surgical_lesion=lambda frame: frame["surgical_lesion"].map({1: 1, 2: 0}))
)

# -------------------------------
# 4. Separate X, y
# -------------------------------
# X holds every remaining column except the target; y is the recoded target.
X = df.drop(columns="surgical_lesion")
y = df["surgical_lesion"]

# -------------------------------
# 5. Detect column types
# -------------------------------
# Every numeric dtype (not only int64/float64) goes to the numeric branch;
# whatever is left goes to the categorical branch.
# NOTE(review): detection is by dtype only, so columns holding integer category
# codes are treated as numeric and will be median-imputed and standardised.
# Confirm that this is intended for the coded columns in this data set.
numeric_feats = X.select_dtypes(include="number").columns.tolist()
categorical_feats = [c for c in X.columns if c not in numeric_feats]

# No special case is needed when nothing is categorical: the comprehension
# above already yields an empty list.

# -------------------------------
# 6. Build pipeline
# -------------------------------
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen at fit time are ignored.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Route each column group through its own sub-pipeline. The output places the
# "num" columns first, followed by the "cat" columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_feats),
        ("cat", categorical_transformer, categorical_feats)
    ]
)

# -------------------------------
# 7. Fit and transform
# -------------------------------
X_proc = preprocessor.fit_transform(X)

# The one-hot column names live on the fitted encoder; without categorical
# inputs there is no encoder output to name.
if not categorical_feats:
    cat_cols = []
else:
    encoder = preprocessor.named_transformers_["cat"].named_steps["encoder"]
    cat_cols = encoder.get_feature_names_out(categorical_feats)

# Final column order: numeric names first, then the encoded names.
all_cols = [*numeric_feats, *cat_cols]

# -------------------------------
# 8. Build DataFrame and Save
# -------------------------------
X_proc = np.array(X_proc)
# Attach the target positionally (raw values, not index-aligned) as the last column.
df_processed = pd.DataFrame(X_proc, columns=all_cols).assign(
    surgical_lesion=y.values
)

df_processed.to_csv("horse_colic_preprocessed.csv", index=False)
print("✅ Clean dataset created and saved as 'horse_colic_preprocessed.csv'")
print("Shape:", df_processed.shape)
print(df_processed.head())


  df_train = pd.read_csv(train_url, delim_whitespace=True, names=names, na_values='?')
  df_test = pd.read_csv(test_url, delim_whitespace=True, names=names, na_values='?')


✅ Clean dataset created and saved as 'horse_colic_preprocessed.csv'
Shape: (368, 23)
    surgery       age  rectal_temperature     pulse  respiratory_rate  \
0  1.192079 -0.286972            0.580749 -0.147068         -0.128183   
1 -0.838870 -0.286972            1.673525  0.662354         -0.632014   
2  1.192079 -0.286972            0.268528 -1.103657         -0.380099   
3 -0.838870  3.484660            1.517414  3.458537          3.398639   
4  1.192079 -0.286972           -1.292581  1.251024          0.312670   

   temperature_extremities  peripheral_pulse  mucous_membranes  \
0                 0.543602          1.245342          0.097113   
1                 0.543602         -0.738579          0.771410   
2                -1.508146         -0.738579          0.097113   
3                 1.569476         -0.738579          2.120004   
4                 0.543602         -0.738579          2.120004   

   capillary_refill_time      pain  ...  nasogastric_reflux  \
0               