In [73]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score, classification_report
import numpy as np
from scipy.stats import zscore

In [12]:
df = sns.load_dataset("iris")
#indx = df['species'].unique()
#df['target'] = [list(indx).index(i) for i in df['species']]
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [26]:
def cleaning_pipeline(df, missing_threshold=0.4, zscore_threshold=2.57):
    """
    Cleans a DataFrame by:
    1. Dropping columns with missing values > missing_threshold
    2. Imputing remaining missing values with median
    3. Removing rows with Z-score outliers (absolute Z > zscore_threshold)

    Parameters:
        df (pd.DataFrame): The input DataFrame
        missing_threshold (float): Max allowed missing % per column (0.0 - 1.0)
        zscore_threshold (float): Threshold for Z-score based outlier removal

    Returns:
        pd.DataFrame: Cleaned DataFrame
    """
    df = df.copy()

    # Step 1: Drop columns with too many missing values
    df = df.loc[:, df.isnull().mean() <= missing_threshold]

    # Step 2: Impute remaining missing values using median
    df = df.fillna(df.median(numeric_only=True))

    # Step 3: Remove outliers using Z-score
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    z_scores = np.abs(zscore(df[numeric_cols]))

    # Keep rows where all z-scores are below threshold
    df = df[(z_scores < zscore_threshold).all(axis=1)]

    return df

In [27]:
df_cleaned = cleaning_pipeline(df)

In [28]:
df_cleaned.shape

(148, 5)

In [29]:
X = df_cleaned.drop(columns='petal_width')
y = df_cleaned['petal_width']

# Identify categorical and numerical columns
cat_features = ['species']
num_features = [col for col in X.columns if col != 'species']

In [33]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), cat_features),
        ('num', StandardScaler(), num_features)
    ],
    remainder='drop'  # Drop anything not explicitly transformed
)

xgb_pipe = Pipeline([
    ('pre', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42))
])

In [41]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [2, 3, 4],
    'regressor__learning_rate': [0.01, 0.1, 0.3]
}

# Step 5: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Grid Search
grid = GridSearchCV(xgb_pipe, param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

In [42]:
# Step 7: Evaluation
print("Best Parameters:", grid.best_params_)
y_pred = grid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", round(mse, 4))

Best Parameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__n_estimators': 50}
Test MSE: 0.0324


In [43]:
X_train

Unnamed: 0,sepal_length,sepal_width,petal_length,species
138,6.0,3.0,4.8,virginica
47,4.6,3.2,1.4,setosa
121,5.6,2.8,4.9,virginica
28,5.2,3.4,1.4,setosa
4,5.0,3.6,1.4,setosa
...,...,...,...,...
73,6.1,2.8,4.7,versicolor
108,6.7,2.5,5.8,virginica
14,5.8,4.0,1.2,setosa
94,5.6,2.7,4.2,versicolor


In [None]:
## XGB Classifier

In [62]:
df_classifier = df.copy()
df_classifier

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [63]:
df_classifier['petal_width_label'] = df_classifier.groupby('species')['petal_width'] \
.transform(lambda x: pd.cut(x, bins=3, labels=['s', 'm', 'l']))
df_classifier = df_classifier.drop('petal_width', axis=1)

In [64]:
df_classifier

Unnamed: 0,sepal_length,sepal_width,petal_length,species,petal_width_label
0,5.1,3.5,1.4,setosa,s
1,4.9,3.0,1.4,setosa,s
2,4.7,3.2,1.3,setosa,s
3,4.6,3.1,1.5,setosa,s
4,5.0,3.6,1.4,setosa,s
...,...,...,...,...,...
145,6.7,3.0,5.2,virginica,l
146,6.3,2.5,5.0,virginica,m
147,6.5,3.0,5.2,virginica,m
148,6.2,3.4,5.4,virginica,l


In [65]:
df_cleaned_classifier = cleaning_pipeline(df_classifier)

In [66]:
X = df_cleaned_classifier.drop(columns='petal_width_label')
y = df_cleaned_classifier['petal_width_label']

le = LabelEncoder()
y = le.fit_transform(y)

In [67]:
# Identify categorical and numerical columns
cat_features = ['species']
num_features = [col for col in X.columns if col != 'species']

In [70]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), cat_features),
        ('num', StandardScaler(), num_features)
    ],
    remainder='drop'  # Drop anything not explicitly transformed
)

xgb_clf_pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', XGBClassifier(objective='multi:softprob', eval_metric = 'mlogloss', random_state=42))
])

In [80]:
# Step 5: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Grid search
param_grid = {
    'clf__n_estimators': [50, 100,150],
    'clf__max_depth': [3, 4,5],
    'clf__learning_rate': [0.01,0.1,0.2,0.3]
}
grid = GridSearchCV(xgb_clf_pipe, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [82]:
print("Best Params:", grid.best_params_)
y_pred = grid.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Best Params: {'clf__learning_rate': 0.01, 'clf__max_depth': 3, 'clf__n_estimators': 150}

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.64      0.69      0.67        13
           2       0.69      0.85      0.76        13

    accuracy                           0.67        30
   macro avg       0.44      0.51      0.48        30
weighted avg       0.58      0.67      0.62        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
