In [2]:
import pandas as pd
df = pd.read_csv('abalone.csv')
abalone = df

abalone.head()


Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def preprocess_data(abalone):
    sex_encoder = LabelEncoder()
    abalone['Sex'] = sex_encoder.fit_transform(abalone['Sex'])  # Encoding categorical variable 'Sex'
    # 1 is infant, 2 is male, 0 is female

    X = abalone.drop(columns=['Diameter'])  
    y = abalone['Diameter'] 

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    numerical_features = ['Length', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight']

    numerical_transformer = StandardScaler()
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features)
        ], remainder='passthrough')
    
    
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    return X_train_preprocessed, X_test_preprocessed, y_train, y_test


X_train_preprocessed, X_test_preprocessed, y_train, y_test = preprocess_data(abalone)

abalone


Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,2,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,2,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,0,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,2,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,1,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,2,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,2,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [5]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor 
from sklearn.metrics import mean_squared_error

def tune_xgboost(X_train, y_train):  
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [1,2],
        'learning_rate': [0.01, 0.1],
        'gamma': [0, 0.1],
        'subsample': [0.6, 0.8],
        'colsample_bytree': [0.6, 0.8]
    }
    xgb_regressor = XGBRegressor(random_state=42) 
    grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train) 
    print("Best Parameters:", grid_search.best_params_) 
    return grid_search.best_estimator_ 

X_train_preprocessed, X_test_preprocessed, y_train, y_test = preprocess_data(abalone) 
best_xgboost_model = tune_xgboost(X_train_preprocessed, y_train) 
xgboost_predictions = best_xgboost_model.predict(X_test_preprocessed) 
mse_xgboost = mean_squared_error(y_test, xgboost_predictions) 
print("Mean Squared Error for XGBoost:", mse_xgboost)

Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200, 'subsample': 0.8}
Mean Squared Error for XGBoost: 0.0002447119408237649


# Quiz Question
everything above is me re-inserting my old data model from the last assignment 

In [7]:

from sklearn.decomposition import PCA
import numpy as np

pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train_preprocessed)
X_test_pca = pca.transform(X_test_preprocessed)


best_xgboost_model_pca = tune_xgboost(X_train_pca, y_train)


xgboost_predictions_pca = best_xgboost_model_pca.predict(X_test_pca)


test_rmse_pca = np.sqrt(mean_squared_error(y_test, xgboost_predictions_pca))
print("RMSE for testing data with PCA:", test_rmse_pca)


Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200, 'subsample': 0.6}
RMSE for testing data with PCA: 0.018897098846556992
