In [1]:
import os
import pandas as pd
from joblib import dump
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
parent_dir = os.path.dirname(os.getcwd())

# Train data preparation

In [3]:
# Initialize directories
models_dir = os.path.join(parent_dir, "Trained Models")
os.makedirs(models_dir, exist_ok=True)
data_dir = os.path.join(parent_dir, "Data")

In [4]:
# Load train data set
train_df = pd.read_csv(os.path.join(data_dir, "train.csv"))
train_df.head()

Unnamed: 0,id,k,arc,tail_in_degree,tail_out_degree,head_in_degree,head_out_degree,d_s,d_t,C_r,F_r
0,1,1,"(0,1)",1,5,1,2,0.0,1.0,0.75,21.126761
1,1,1,"(0,2)",1,5,1,2,0.0,1.0,0.9,22.535211
2,1,1,"(0,3)",1,5,1,2,0.0,1.0,0.75,21.126761
3,1,1,"(0,4)",1,5,1,2,0.0,1.0,0.75,12.676056
4,1,1,"(0,5)",1,5,1,2,0.0,1.0,0.85,22.535211


In [5]:
# Drop the first three columns (id, k, and arc)
train = train_df.drop(columns=["id", "k", "arc"])

In [6]:
# Split data into 75% train and 25% test
rs = ShuffleSplit(n_splits=1, test_size=.25, random_state=0)
for train_index, test_index in rs.split(train):
    train_set = train.loc[train_index]
    test_set = train.loc[test_index]
print(f"Sizes: data set {train.shape}, train set {train_set.shape}, test set {test_set.shape}")

Sizes: data set (2016, 8), train set (1512, 8), test set (504, 8)


In [7]:
# Determine the predictors and target
X_train = train_set.drop(columns=["F_r"])
y_train = train_set["F_r"]
X_test = test_set.drop(columns=["F_r"])
y_test = test_set["F_r"]

# Decision tree model

In [8]:
# Use random search to find the best hyperparameters for the decision tree
param_dist = {"max_depth": range(1, 21)}
tree = DecisionTreeRegressor()
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5, n_iter=20, random_state=0)
tree_cv.fit(X_train, y_train)
print(f"Best hyperparameters: {tree_cv.best_params_['max_depth']}")

Best hyperparameters: 4


In [9]:
# Construct the decision tree model
dt = DecisionTreeRegressor(random_state=0, max_depth=tree_cv.best_params_['max_depth'])
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"Decision Tree R2: {r2}")

Decision Tree R2: 0.7306780230126633


In [10]:
# Save the decision tree model
model_path = os.path.join(models_dir, "dt.joblib")
dump(dt, model_path)

['c:\\Users\\mashr\\OneDrive - University of Pittsburgh\\GitHub\\IE2164\\Trained Models\\dt.joblib']

# Random forest model

In [11]:
# Use random search to find the best hyperparameters for the random forest
param_dist = {"max_depth": range(1, 21)}
rf = RandomForestRegressor()
rf_cv = RandomizedSearchCV(rf, param_dist, cv=5, n_iter=20, random_state=0)
rf_cv.fit(X_train, y_train)
print(f"Best hyperparameters: {rf_cv.best_params_['max_depth']}")

Best hyperparameters: 4


In [12]:
# Construct the random forest model
rf = RandomForestRegressor(random_state=0, max_depth=rf_cv.best_params_['max_depth'])
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"Random Forest R2: {r2}")

Random Forest R2: 0.7475821662831181


In [13]:
# Save the random forest model
model_path = os.path.join(models_dir, "rf.joblib")
dump(rf, model_path)

['c:\\Users\\mashr\\OneDrive - University of Pittsburgh\\GitHub\\IE2164\\Trained Models\\rf.joblib']