In [2]:
import pandas as pd
df = pd.read_csv('abalone.csv')
abalone = df

abalone.head()


Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


# Q1: What is inductive reasoning? Deductive reasoning? Give an example of each, different from the examples given in class.

* Inductive reasoning is when you make generalizations based on specific observations. So if someone goes to a college campus and sees lots of students studying in the library they might come to the generalization that all students study in the library
* Deductive reasoning is when you make a specific conclusion based off general knowledge. It is a way to come to logical conclusions with minimal information. So if you know that all mammals are warm-blooded, and that a dog is a mammal, then you would use deductive reasoning to come to the conclusion that a dog is warm blooded. 

# Q2: Preprocess your dataset

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def preprocess_data(abalone):
    sex_encoder = LabelEncoder()
    abalone['Sex'] = sex_encoder.fit_transform(abalone['Sex'])  # Encoding categorical variable 'Sex'
    # 1 is infant, 2 is male, 0 is female

    X = abalone.drop(columns=['Diameter'])  
    y = abalone['Diameter'] 

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    numerical_features = ['Length', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight']

    numerical_transformer = StandardScaler()
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features)
        ], remainder='passthrough')
    
    
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    return X_train_preprocessed, X_test_preprocessed, y_train, y_test


X_train_preprocessed, X_test_preprocessed, y_train, y_test = preprocess_data(abalone)

abalone


Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,2,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,2,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,0,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,2,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,1,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,2,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,2,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


# Q3 Create and Tune a Decision Tree Model

In [4]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error 

def tune_decision_tree(X_train, y_train):
    param_grid = {
        'max_depth': [3,5,7,10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    } 

    dt_regressor = DecisionTreeRegressor(random_state=42) 
    grid_search = GridSearchCV(estimator=dt_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train) 

    print("Best Parameters:", grid_search.best_params_) 
    return grid_search.best_estimator_

X_train_preprocessed, X_test_preprocessed, y_train, y_test = preprocess_data(abalone) 
best_decision_tree_model = tune_decision_tree(X_train_preprocessed, y_train) 
best_decision_tree_model = tune_decision_tree(X_train_preprocessed, y_train) 
best_decision_tree_model.fit(X_train_preprocessed, y_train) 
decision_tree_prediction = best_decision_tree_model.predict(X_test_preprocessed) 

mse_decision_tree = mean_squared_error(y_test, decision_tree_prediction) 
print("Mean Squared Error for Abalone Decision Tree:", mse_decision_tree) 
   

Best Parameters: {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best Parameters: {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2}
Mean Squared Error for Abalone Decision Tree: 0.0002880811510598701


Explanation: 
* I started by creating a parameter grid that had different values for the max depth, min samples, and min samples leaf. Then I used GridSearchCV to perform cross-validation. Squared Error is the most important part of Decision Tree models as it tells us how well it's doing so I implemented that as well as to keep going until I got a lower squared error. 

# Q4 Create a Random Forest Model and tune it to the best of your abilities 

In [5]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import GridSearchCV

def tune_random_forest(X_train, y_train, param_grid):
    rf_regressor = RandomForestRegressor(random_state=42) 
    grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=5, scoring= 'neg_mean_squared_error')
    grid_search.fit(X_train, y_train) 

    print("Best Parameters:", grid_search.best_params_)
    return grid_search.best_estimator_

param_grid_rf = {
    'n_estimators': [50,100],
    'max_depth': [3, 5],
    'min_samples_split': [2,3],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt']
}


X_train_preprocessed, X_test_preprocessed, y_train, y_test = preprocess_data(abalone) 
best_random_forest_model = tune_random_forest(X_train_preprocessed, y_train, param_grid_rf) 
random_forest_predictions = best_random_forest_model.predict(X_test_preprocessed) 

mse_random_forest = mean_squared_error(y_test, random_forest_predictions) 
print("Mean Squared Error for Random Forest:", mse_random_forest) 
            

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/jupyterlab/4.1.2/libexec/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/Cellar/jupyterlab/4.1.2/libexec/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/opt/homebrew/Cellar/jupyterlab/4.1.2/libexec/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/homebrew/Cellar/jupyterlab/4.1.2/libexec/lib/py

Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
Mean Squared Error for Random Forest: 0.00027291787747169027


 The thing I spent the most time tuning was the param grid for this model and I'm still not sure if its at the best place possible. Then I used GridSearchCV to find the best combo of hyperparameters. Then, used the mean squared error to ensure the model is working correctly. I still ran into a few errors but I troubleshooted and I cant figure out how to fix them. 

# Q5  Create an xgboost model tuned to the best of your abilities. Explain how you tuned it.

In [24]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [6]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor 

def tune_xgboost(X_train, y_train):  
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [1,2],
        'learning_rate': [0.01, 0.1],
        'gamma': [0, 0.1],
        'subsample': [0.6, 0.8],
        'colsample_bytree': [0.6, 0.8]
    }
    xgb_regressor = XGBRegressor(random_state=42) 
    grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train) 
    print("Best Parameters:", grid_search.best_params_) 
    return grid_search.best_estimator_ 

X_train_preprocessed, X_test_preprocessed, y_train, y_test = preprocess_data(abalone) 
best_xgboost_model = tune_xgboost(X_train_preprocessed, y_train) 
xgboost_predictions = best_xgboost_model.predict(X_test_preprocessed) 
mse_xgboost = mean_squared_error(y_test, xgboost_predictions) 
print("Mean Squared Error for XGBoost:", mse_xgboost)

        


Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200, 'subsample': 0.8}
Mean Squared Error for XGBoost: 0.0002447119408237649


Starting with making the param grid like the other models, again I needed to mess with the dimensions of the param grid so that my model could run faster. Also did cross validation through use of GridSearchCV to find the best combination of hyperparameters. Then I used mean squared error as a way to score how well the model runs.  

In [7]:
from sklearn.metrics import mean_squared_error
import numpy as np

train_predictions = best_xgboost_model.predict(X_train_preprocessed)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
print("RMSE for training data:", train_rmse)

test_predictions = best_xgboost_model.predict(X_test_preprocessed)
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
print("RMSE for testing data:", test_rmse)


RMSE for training data: 0.013397145905904375
RMSE for testing data: 0.01564327142332335
