In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import pickle
import joblib
import time
import os

In [11]:
# Code-generation helper: prints one `<name>_df = pd.read_csv(...)` line per CSV
# in scotus/, followed by a dict literal mapping each name to its DataFrame
# variable, so the snippets can be pasted into the loading cells.
#
# The directory is listed once so both snippets always cover the same files, and
# the suffix check includes the dot so that only real ".csv" files match (the
# 4-character slice below then strips exactly that extension).
# Order follows os.listdir, which is arbitrary rather than sorted.
csv_files = [f for f in os.listdir("scotus") if f.endswith(".csv")]

for line in [f'{f[:-4]}_df = pd.read_csv("scotus/{f}")' for f in csv_files]:
    print(line)

print("{")
for line in [f'\t"{f[:-4]}": {f[:-4]}_df,' for f in csv_files]:
    print(line)
print("}")

bigram_df = pd.read_csv("scotus/bigram.csv")
t5_scotus_embeddings_df = pd.read_csv("scotus/t5_scotus_embeddings.csv")
paraphrase_scotus_embeddings_df = pd.read_csv("scotus/paraphrase_scotus_embeddings.csv")
smoothed_unigram_df = pd.read_csv("scotus/smoothed_unigram.csv")
unigram_df = pd.read_csv("scotus/unigram.csv")
tfidf_df = pd.read_csv("scotus/tfidf.csv")
{
	"bigram": bigram_df,
	"t5_scotus_embeddings": t5_scotus_embeddings_df,
	"paraphrase_scotus_embeddings": paraphrase_scotus_embeddings_df,
	"smoothed_unigram": smoothed_unigram_df,
	"unigram": unigram_df,
	"tfidf": tfidf_df,
}


In [9]:
# Read each SCOTUS CSV into its own DataFrame. The paths are consumed in the
# order listed, so every target name on the left lines up with the file at the
# same position on the right.
(
    bigram_df,
    t5_scotus_embeddings_df,
    paraphrase_scotus_embeddings_df,
    smoothed_unigram_df,
    unigram_df,
    tfidf_df,
) = map(
    pd.read_csv,
    (
        "scotus/bigram.csv",
        "scotus/t5_scotus_embeddings.csv",
        "scotus/paraphrase_scotus_embeddings.csv",
        "scotus/smoothed_unigram.csv",
        "scotus/unigram.csv",
        "scotus/tfidf.csv",
    ),
)

In [12]:
# Registry of the loaded DataFrames, keyed by the stem of the CSV each one was
# read from. Despite the name, the values are DataFrames, not fitted estimators.
models = dict(
    bigram=bigram_df,
    t5_scotus_embeddings=t5_scotus_embeddings_df,
    paraphrase_scotus_embeddings=paraphrase_scotus_embeddings_df,
    smoothed_unigram=smoothed_unigram_df,
    unigram=unigram_df,
    tfidf=tfidf_df,
)

In [13]:
def train_and_score_gbt(X, y, model_name, dataset_name="scotus", random_state=None):
    """Train a gradient-boosted regressor, score it on a held-out split, and save the results.

    The data is split with ``train_test_split`` (default split sizes), a
    ``GradientBoostingRegressor`` is fitted on the training part, and the
    test-set predictions are scored. A residual plot, the fitted model and the
    score dict are written under ``results/<dataset_name>/<model_name>/``.

    Parameters
    ----------
    X : features accepted by ``train_test_split`` / ``GradientBoostingRegressor.fit``
        (presumably a DataFrame or 2-D array -- confirm against the caller).
    y : regression targets aligned with ``X``.
    model_name : str
        Label stored in the scores, used in the plot title and as the output
        sub-directory name.
    dataset_name : str, default "scotus"
        Name of the directory under ``results/`` that holds this run's output.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the split
        unseeded, so scores vary between runs; pass an int for reproducibility.

    Returns
    -------
    dict
        Keys ``'model_name'``, ``'r2'``, ``'mea'`` (mean absolute error; the
        key spelling is kept as-is for anything already reading it) and
        ``'rmse'``.
    """
    # 1. Split Data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    # 2. Train GBT Model
    clf = GradientBoostingRegressor(n_estimators=100, learning_rate=1.0,
        max_depth=5, random_state=0)
    clf.fit(X_train, y_train)

    # 3. Get Predictions and Residuals
    y_pred = clf.predict(X_test)
    residuals = y_test - y_pred

    # 4. Score
    # sklearn metrics take (y_true, y_pred). The order matters for r2_score,
    # which normalises by the variance of its first argument.
    scores = {
        'model_name': model_name,
        'r2': r2_score(y_test, y_pred),
        'mea': mean_absolute_error(y_test, y_pred),
        'rmse': root_mean_squared_error(y_test, y_pred)
    }

    # 5. Plot
    # makedirs(exist_ok=True) creates missing parents and tolerates re-runs.
    output_dir = f"results/{dataset_name}/{model_name}"
    os.makedirs(output_dir, exist_ok=True)

    fig, ax = plt.subplots()
    ax.scatter(y_pred, residuals)
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Residuals')
    ax.set_title(f'Residual Plot: {model_name}')
    ax.axhline(y=0, color='r', linestyle='--')
    fig.savefig(f"{output_dir}/residuals.jpg")
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    # 6. Save Model and Scores
    joblib.dump(clf, f"{output_dir}/model_object.pkl")
    joblib.dump(scores, f"{output_dir}/model_scores.pkl")

    return scores