In [85]:
# Importing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor


# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# Load the training data from the CSV next to the notebook.
df = pd.read_csv("./grain-training.csv")

In [86]:
# Render the loaded frame as the cell output to eyeball the columns and labels.
df

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,11366,423.114990,171.906647,85.579300,0.867278,11599,0.611404,Osmancik
1,16523,531.892029,224.995422,94.417702,0.907689,16911,0.577041,Cammeo
2,11088,418.208008,172.027420,82.935669,0.876112,11284,0.624993,Osmancik
3,14528,475.447998,192.198563,97.417427,0.862029,14795,0.629490,Cammeo
4,8990,389.377014,157.749603,73.919182,0.883418,9297,0.625261,Osmancik
...,...,...,...,...,...,...,...,...
3043,14078,478.470001,203.645462,88.560310,0.900491,14280,0.744395,Cammeo
3044,17246,540.541992,225.988861,98.573151,0.899857,17704,0.573929,Cammeo
3045,11070,419.403015,173.575043,82.154213,0.880898,11266,0.600586,Osmancik
3046,11747,452.127014,194.494858,78.744461,0.914376,11935,0.542637,Cammeo


In [87]:
def preprocess(df):
    """Encode the target and build standardised pairwise-interaction features.

    Steps, in order:
      1. Map the ``Class`` labels to integers (``Osmancik`` -> 1,
         ``Cammeo`` -> 2). Any other value becomes NaN.
      2. Add ``Aspect_Ratio`` (major / minor axis length) and
         ``Shape_Irregularity`` (convex area - area).
      3. Expand the feature columns with degree-2, interaction-only
         polynomial terms (no bias column).
      4. Standardise every resulting feature column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Class``, ``Major_Axis_Length``, ``Minor_Axis_Length``,
        ``Convex_Area`` and ``Area``. Every column other than ``Class``,
        ``feature_x`` and ``Unnamed: 0`` is fed to the polynomial expansion
        and therefore has to be numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame sharing ``df``'s index: ``Class`` first, followed by the
        scaled original and interaction features. The input frame is left
        untouched.

    Notes
    -----
    The scaler is fitted on all rows passed in. If this is called before a
    train/test split, the test rows contribute to the scaling statistics.
    """
    target_mapping = {"Osmancik": 1, "Cammeo": 2}

    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()
    df["Class"] = df["Class"].map(target_mapping)

    df["Aspect_Ratio"] = df["Major_Axis_Length"] / df["Minor_Axis_Length"]
    df["Shape_Irregularity"] = df["Convex_Area"] - df["Area"]

    # Exclude the target, the optional categorical column, and the
    # "Unnamed: 0" column that read_csv creates from a saved index: a row
    # counter is not a feature and must not enter the interaction terms.
    numeric_features = df.drop(
        ["Class", "feature_x", "Unnamed: 0"], axis=1, errors="ignore"
    )

    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    numeric_features_poly = poly.fit_transform(numeric_features)
    poly_feature_names = poly.get_feature_names_out(numeric_features.columns)

    scaled_features = StandardScaler().fit_transform(numeric_features_poly)

    # Reuse the input index: the join below aligns on index labels, so a
    # default RangeIndex here would misalign rows for any frame whose index
    # is not 0..n-1 (e.g. after filtering or shuffling).
    df_poly = pd.DataFrame(
        scaled_features, columns=poly_feature_names, index=df.index
    )

    # Re-attach the encoded target in front of the engineered features.
    return df[["Class"]].join(df_poly)

In [88]:
# Rebinds `df` to the engineered frame. preprocess() maps the string labels in
# "Class" through a dict, so running this cell a second time without first
# re-reading the CSV would map the already-encoded integers to NaN.
df = preprocess(df)

In [89]:
# Separate the predictors from the target; `y` stays a one-column DataFrame.
y = df[["Class"]]
X = df.drop(columns="Class")

# Hold out 30% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [90]:
# Hyper-parameters used for the gradient-boosted regressor.
best_params = {"learning_rate": 0.05, "max_depth": 3, "n_estimators": 100}

# The dictionary keys match the estimator's keyword names, so unpack them directly.
model = XGBRegressor(objective="reg:squarederror", random_state=10, **best_params)

# Fit on the training split; the one-column target frame is flattened to 1-D.
model.fit(X_train, y_train.values.ravel())

# Score the held-out split.
predictions = model.predict(X_test)
r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # RMSE is the square root of the mean squared error

print(f"\n\nRMSE: {rmse}")
print(f"\n\nR^2: {r2}")



RMSE: 0.240170738787175


R^2: 0.7645930271051617


In [91]:
# # Re-defining pipeline with the necessary steps including PolynomialFeatures and StandardScaler
# pipeline = Pipeline(steps=[
#     ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
#     ('scaler', StandardScaler()),
#     ('model', XGBRegressor(objective='reg:squarederror', random_state=10))
# ])

# # Setting up the grid search parameters
# param_grid = {
#     'model__n_estimators': [100, 200],
#     'model__learning_rate': [0.01, 0.05, 0.1],
#     'model__max_depth': [3, 4, 5]
# }

# # Performing grid search with cross-validation
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)

# # Fitting the model
# grid_search.fit(X_train, y_train.values.ravel())

# # Extracting the best model
# best_model = grid_search.best_estimator_

# # Cross-validated R^2 score
# cv_r2_scores = cross_val_score(best_model, X, y.values.ravel(), cv=5, scoring='r2')
# cv_r2_mean = np.mean(cv_r2_scores)

# # Predictions and evaluation
# predictions = best_model.predict(X_test)

# # Calculate RMSE
# mse = mean_squared_error(y_test, predictions)
# rmse = np.sqrt(mse)

# # R^2 score
# r2 = r2_score(y_test, predictions)

# print(f"Best Model Parameters: {grid_search.best_params_}")
# print(f"\nCross-Validated Mean R^2: {cv_r2_mean}")
# print(f"\nRMSE: {rmse}")
# print(f"\nR^2: {r2}")