In [5]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.linear_model import Lasso, Ridge, ElasticNet
import time
import pickle
import matplotlib.pyplot as plt
 

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Registry of every candidate regressor, keyed by a short identifier.
# Each value is a fresh, unfitted scikit-learn estimator.
allm = {
    # Linear baselines
    "linear": LinearRegression(),
    "bayesian": BayesianRidge(compute_score=True),
    # Single decision trees of increasing depth
    "dt5": DecisionTreeRegressor(max_depth=5),
    "dt10": DecisionTreeRegressor(max_depth=10),
    "dt15": DecisionTreeRegressor(max_depth=15),
    # Tree ensembles
    # NOTE(review): the key "rf5" suggests a depth limit of 5, but no max_depth is set — confirm intent.
    "rf5": RandomForestRegressor(n_estimators=200),
    "ada": AdaBoostRegressor(n_estimators=200),
    "gb2005": GradientBoostingRegressor(n_estimators=200, max_depth=5),
    # NOTE(review): the key "gb2008" suggests 200 estimators, but 500 are used — confirm intent.
    "gb2008": GradientBoostingRegressor(n_estimators=500, max_depth=8),
    # Multi-layer perceptrons; the key encodes the hidden layer sizes
    "mlp200": MLPRegressor(hidden_layer_sizes=(200,), max_iter=5000),
    "mlp400": MLPRegressor(hidden_layer_sizes=(400,), max_iter=5000),
    "mlp200200": MLPRegressor(hidden_layer_sizes=(200, 200), max_iter=5000),
    "mlp20050": MLPRegressor(hidden_layer_sizes=(200, 50), max_iter=5000),
    # Gaussian process with a linear (dot-product) kernel plus a white-noise term
    "gpr": GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel()),
    # Regularised linear models
    "lasso": Lasso(alpha=0.1),
    "ridge": Ridge(alpha=0.1),
    "elastic": ElasticNet(alpha=0.1, l1_ratio=0.5),
}

In [None]:
def custom_tts(X, Y, train_size = 1000):
    """
    Custom train/test split that spreads the training rows evenly over the
    sorted range of drag values, so both extremes of drag are represented.

    The rows of X and Y are joined on their index, sorted by the "drag"
    column, and `train_size` evenly spaced positions (always including the
    smallest and largest drag row when train_size >= 2) are taken as the
    training set. All remaining rows form the test set.

    NOTE: a later cell defines another `custom_tts(data, test_prop)`; when both
    cells are run, that later definition shadows this one.

    Input:
        X: DataFrame of input features.
        Y: Series named "drag", or DataFrame containing a "drag" column,
           sharing X's index.
        train_size: requested number of training rows; capped at the number
           of available rows.
    Output: X_train, X_test, Y_train, Y_test (rows ordered by ascending drag;
        Y parts have the same type as Y).
    """
    total_df = X.join(Y).sort_values(by="drag")
    n_rows = len(total_df)
    n_train = min(train_size, n_rows)

    if n_rows:
        # np.unique guards against repeated positions produced by integer truncation.
        select = np.unique(np.linspace(0, n_rows - 1, n_train, dtype=int))
    else:
        select = np.array([], dtype=int)

    # Positional boolean mask: True for training rows, False for test rows.
    in_train = np.zeros(n_rows, dtype=bool)
    in_train[select] = True
    train_df = total_df[in_train]
    test_df = total_df[~in_train]

    x_cols = list(X.columns)
    # Hand the target back in the same form it was given (Series vs DataFrame).
    y_cols = Y.name if isinstance(Y, pd.Series) else list(Y.columns)
    return train_df[x_cols], test_df[x_cols], train_df[y_cols], test_df[y_cols]

In [None]:
def custom_tts(data, test_prop=0.1):
    """
    Stratified train/test split on a continuous drag target.

    NOTE: this definition shares its name with the `custom_tts(X, Y, train_size)`
    defined in an earlier cell and shadows it once this cell is run.

    Input:
        data: sequence where data[0] holds the input values and data[1] the
            target values. If the targets are a DataFrame, its "drag" column
            is used for stratification (falling back to the first column when
            there is no "drag" column); otherwise the targets are treated as
            one-dimensional.
        test_prop: fraction of the samples (between 0 and 1) to place in the
            test set.
    Output: X_train, X_test, y_train, y_test

    Steps:
    1. ranks the samples by drag and cuts the ranks into up to 10 equally
       populated bins
    2. uses the bin labels as strata so every drag range is represented in
       both the train and the test set
    3. splits with a fixed random_state (20) so the split is reproducible
    """
    input_vals, target_vals = data[0], data[1]

    if isinstance(target_vals, pd.DataFrame):
        if "drag" in target_vals.columns:
            drag = target_vals["drag"]
        else:
            drag = target_vals.iloc[:, 0]
    else:
        drag = pd.Series(np.asarray(target_vals).ravel())

    n_samples = len(drag)
    # Stratification needs at least one sample per bin on each side of the
    # split, so the bin count is capped by both the test and the train size.
    n_bins = min(10, int(n_samples * test_prop), int(n_samples * (1 - test_prop)))

    strata = None
    if n_bins >= 2:
        # Ranking with method="first" breaks ties, giving equally sized bins
        # even when many drag values are identical.
        strata = pd.qcut(drag.rank(method="first"), q=n_bins, labels=False).to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(
        input_vals,
        target_vals,
        test_size=test_prop,
        random_state=20,
        stratify=strata,
    )
    return X_train, X_test, y_train, y_test

In [None]:
# Rearrange and rename the columns of the aero dataset.
# The mapping below goes from the original CSV column name to the short name
# used in out_df; its order is the column order of out_df. Keeping each
# original/short pair on one line avoids the two lists drifting out of sync.
AERO_COLUMN_RENAMES = {
    "HandleX@Mainsketch": "hx",
    "HandleY@Mainsketch": "hy",
    "SaddleX+.2@Mainsketch": "sx",
    "SaddleY@Mainsketch": "sy",
    "CrankLength@Mainsketch": "cl",
    "LowerLeg@Mainsketch": "ll",
    "UpperLeg@Mainsketch": "ul",
    "TorsoLength@Mainsketch": "tl",
    "Arm@Mainsketch": "al",
    "Neckandhead@Mainsketch": "head",
    "TorsoWidth@Widthdef": "sw",
    "Back Vertical Height": "back vertical height",
    "Head Height": "head height",
    "Theigh Width": "thigh width",  # "Theigh" is the spelling used in the CSV header
    "legarea": "leg area",
    "Frontal Surface Area": "frontal surface area",
}

in_df = pd.read_csv("aero.csv")
# Select the columns in the target order, then rename them in one step;
# rename returns a new frame, so in_df is left untouched.
out_df = in_df[list(AERO_COLUMN_RENAMES)].rename(columns=AERO_COLUMN_RENAMES)

In [1]:
def eval_score(num_tests, data, method, train_size):
    """
    Repeatedly fit one kind of model on random train/test splits and score it.

    For each of the `num_tests` repetitions a new shuffled split is drawn, a
    fresh model is fitted on the training part and scored on the test part
    with `model.score` (R^2 for scikit-learn regressors). A summary line with
    the average score and elapsed time is printed at the end.

    Input:
        num_tests: number of repetitions; must be at least 1.
        data: indexable where data[0] holds the input values and data[1] the
            target values.
        method: one of "bayesian", "linear", "dt5", "dt8", "rf", "ada", "gb",
            "mlp1", "mlp2", "gpr".
        train_size: forwarded to train_test_split as `train_size`.
    Output: (avg_score, max_score, max_model) where max_model is the fitted
        model that achieved max_score.
    Raises:
        ValueError: if `method` is not recognised or `num_tests` < 1.
    """
    # Factories instead of instances: every repetition needs its own unfitted
    # model, because the best one is kept and returned.
    model_factories = {
        "bayesian": lambda: BayesianRidge(compute_score=True),
        "linear": lambda: LinearRegression(),
        "dt5": lambda: DecisionTreeRegressor(max_depth=5),
        "dt8": lambda: DecisionTreeRegressor(max_depth=8),
        "rf": lambda: RandomForestRegressor(n_estimators=200),
        "ada": lambda: AdaBoostRegressor(n_estimators=200),
        "gb": lambda: GradientBoostingRegressor(n_estimators=200, max_depth=3),
        "mlp1": lambda: MLPRegressor(hidden_layer_sizes=(200,), max_iter=5000),
        "mlp2": lambda: MLPRegressor(hidden_layer_sizes=(200, 200), max_iter=5000),
        "gpr": lambda: GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel()),
    }
    # Fail fast, before any data is split or any model is trained.
    if method not in model_factories:
        raise ValueError("Invalid method")
    if num_tests < 1:
        raise ValueError("num_tests must be at least 1")
    make_model = model_factories[method]

    start_time = time.time()
    input_vals = data[0]
    target_vals = data[1]
    total_score = 0.0
    max_model = None
    # R^2 has no lower bound, so start from -inf to guarantee that the first
    # repetition always registers as the best so far.
    max_score = float("-inf")
    for _ in range(num_tests):
        X_train, X_test, y_train, y_test = train_test_split(input_vals, target_vals, train_size=train_size, shuffle=True)
        model = make_model()
        model.fit(X_train, y_train)
        cur_score = model.score(X_test, y_test)
        total_score += cur_score
        if cur_score > max_score:
            max_score = cur_score
            max_model = model
    avg_score = total_score / num_tests
    print(f"Method: {method}, Score: {avg_score}, Time: {time.time() - start_time}")
    return avg_score, max_score, max_model



def test_methods(input_vals, target_vals, train_size = 1000):
    data = [input_vals, target_vals]
    arow=pd.DataFrame({"Linear Regression": eval_score(100,data,"linear",train_size)},index=[len(data[0])])
    arow=arow.join(pd.DataFrame({"Bayesian Ridge Regression": eval_score(100,data,"bayesian",train_size)},index=[len(data[0])]))
    arow=arow.join(pd.DataFrame({"Gaussian Process Regression": eval_score(10,data,"gpr",train_size)},index=[len(data[0])]))
    arow=arow.join(pd.DataFrame({"Depth 5 Decision Tree": eval_score(100,data,"dt5",train_size)},index=[len(data[0])]))
    arow=arow.join(pd.DataFrame({"Depth 8 Decision Tree": eval_score(100,data,"dt8",train_size)},index=[len(data[0])]))
    arow=arow.join(pd.DataFrame({"Random Forest": eval_score(10,data,"rf",train_size)},index=[len(data[0])]))
    arow=arow.join(pd.DataFrame({"AdaBoost": eval_score(10,data,"ada",train_size)},index=[len(data[0])]))
    arow=arow.join(pd.DataFrame({"Gradient Boosting Regressor": eval_score(10,data,"gb",train_size)},index=[len(data[0])]))
    arow=arow.join(pd.DataFrame({"3-layer Neural Net": eval_score(5,data,"mlp1",train_size)},index=[len(data[0])]))
    arow=arow.join(pd.DataFrame({"4-layer Neural Net": eval_score(5,data,"mlp2",train_size)},index=[len(data[0])]))

    return arow

In [None]:
# Method keys understood by eval_score that train_methods iterates over.
# NOTE(review): "gpr" is accepted by eval_score but is not listed here — confirm whether that is intentional.
all_model_names = ["linear", "bayesian", "dt5", "dt8", "rf", "ada", "gb", "mlp1", "mlp2"]
def train_methods(data, train_size, num_tests=100):
    """
    Runs eval_score for every name in all_model_names on the given data.

    Input:
        data: forwarded unchanged to eval_score.
        train_size: forwarded unchanged to eval_score.
        num_tests: number of repetitions per model (default 100, the value
            previously hard-coded); lower it for quicker runs.
    Output: dictionary mapping each model name to the tuple
        (avg_score, max_score, model) returned by eval_score.
    """
    return {
        model_name: tuple(eval_score(num_tests, data, model_name, train_size))
        for model_name in all_model_names
    }

In [None]:
# Load the two CSV files used by this notebook.
# NOTE(review): both are absolute paths on one user's machine; the notebook
# will not run elsewhere without editing them.
bike_vector_df = pd.read_csv(
    "/Users/noahwiley/Documents/Bike UROP/MeasureML-main/Frame Datasets/aero_data_augmented_id_cut.csv"
)
target_drag_df = pd.read_csv(
    "/Users/noahwiley/Downloads/Aero/targetdf.csv"
)


In [3]:
import pickle

# Restore the previously saved object from disk into model_dict.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load files that come from a trusted source. The stored object
# presumably needs scikit-learn to be importable — confirm.
with open(
    "/Users/noahwiley/Documents/Bike UROP/MeasureML-main/Frame Datasets/Model Tests/trained_models.pkl",
    "rb",
) as model_file:
    model_dict = pickle.load(model_file)

ModuleNotFoundError: No module named 'sklearn'