In [None]:
# run if needed!
!pip install --upgrade requests pandas

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-2.2.3


In [46]:
# imports
import io
import requests
import pandas as pd
import numpy as np
import sklearn as sk
import warnings
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from google.colab import files

In [47]:
def categorize(X, ind_list, cols_replace):
  """
  Categorizes non-numerical data for regression!
  ------------------------------------------------------------------
  Input(s):
  - X (dataframe; all input data)
  - ind_list (list; contains indices where conditions apply for each category)
  - cols_replace (list; the names of columns to be replaced by dummies

  Output(s):
  - X (dataframe; input data with categorized "dummy" columns)
  """
  cats = np.empty(len(X.index), dtype=object)
  for i, t in enumerate(ind_list):
    cats[t] = 'type ' + str(1 + i)
  dummies = pd.get_dummies(cats)
  X = pd.concat([X, dummies],axis=1)
  X = X.drop(cols_replace, axis=1)
  return X

In [48]:
def load_data_from_github(csv_url):
    """
    Loads summary.csv from GitHub and returns X and y train/test splits!
    ------------------------------------------------------------------
    Input(s):
    - csv_url (string; user-input URL of GitHub link to summary.csv)

    Output(s):
    - df (dataframe; complete summary.csv in a dataframe)
    - X_train (dataframe; training split of inputs)
    - X_test (dataframe; testing split of inputs)
    - y_train (series; training split of outputs)
    - y_test (dataframe; testing split of outputs)
    """
    # load summary dataset directly from the GitHub URL
    df = pd.read_csv(csv_url)
    # display the dataframe (uncomment to enable)
    # display(df)
    # obtain outputs (number of cycles until 80% SoH)
    y = df['num_cycles_op']
    # obtain all x-values (temperature, charging/discharging rate, min/max SoC)
    X = df.filter(['avg_age_temp', 'avg_age_chg_rate', 'avg_age_dischg_rate', 'SOC Window Min', 'SOC Window Max'])
    # parse min and max SOC into categories
    # 0-100% = type 1
    # 10-100% = type 2
    # 10-90% = type 3
    type_1_ind = X.loc[(X['SOC Window Min'] == 0) & (X['SOC Window Max'] == 100)].index
    type_2_ind = X.loc[(X['SOC Window Min'] == 10) & (X['SOC Window Max'] == 100)].index
    type_3_ind = X.loc[(X['SOC Window Min'] == 10) & (X['SOC Window Max'] == 90)].index
    X = categorize(X, [type_1_ind,type_2_ind,type_3_ind], ['SOC Window Min', 'SOC Window Max'])
    # obtain training/testing split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    return X_train, X_test, y_train, y_test, X, y

In [49]:
def permute_hyperparams(est, X_train, y_train):
  # n_estimators
  n_est = np.linspace(10,200,20,dtype=int)
  n_est = np.append(n_est,None)
  # max_depth
  max_depth = np.linspace(5,100,20,dtype=int)
  # min_samples_split
  min_samples_split = np.linspace(2,10,5,dtype=int)
  # min_samples_leaf
  min_samples_leaf = np.linspace(1,10,10,dtype=int)
  # max_features
  max_features = ['sqrt','log2',None]
  grid = {'n_estimators':n_est,'max_depth':max_depth,'min_samples_split':min_samples_split,
          'min_samples_leaf':min_samples_leaf,'max_features':max_features}
  r_search = RandomizedSearchCV(estimator = est, param_distributions = grid, n_iter = 100, cv = 5, verbose = 1, random_state = 0, n_jobs = -1)
  r_search.fit(X_train, y_train)
  return r_search.best_params_

In [50]:
def eval_params(r_forest_fit, X_test, y_test):
  pred_y = r_forest_fit.predict(X_test)
  return (r2_score(y_test, pred_y), mean_squared_error(y_test, pred_y))

In [51]:
def permute_splits(r_forest_fit, X, y):
  """
  Iterate through 100 random splits to evaluate accuracy!
  ------------------------------------------------------------------
  """
  r2_vals = np.zeros(100)
  # iterate 100 times
  for i in range(0,99):
    # make random 80/20 splits
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    iter_y = r_forest_fit.predict(X_test)
    r2_vals[i] = r2_score(y_test, iter_y)
  return (r2_vals.mean())

In [52]:
def predict(r_forest_fit, mse):
  print("Here we can use the model to predict EOL with hypothetical conditions:")
  # prompt for input values, allowing user-to-model interaction
  try:
      avg_age_temp = float(input("Enter aging temperature (numerical): "))
      avg_age_chg_rate = float(input("Enter charge C-rate (numerical, larger than 0): "))
      avg_age_dischg_rate = float(input("Enter discharge C-rate (numerical, larger than 0): "))
      SoC_win = str(input("Enter SoC window ('type 1' = 0-100%; 'type 2' = 10-100%; 'type 3' = 10-90%): "))
  # this exception is included in case the user passes a non-value input into the function
  except ValueError:
      print("Please enter the appropriate variable type!")
      return

  type_1 = 0
  type_2 = 0
  type_3 = 0

  if SoC_win == 'type 1':
    type_1 = 1
  elif SoC_win == 'type 2':
    type_2 = 1
  elif SoC_win == 'type 3':
    type_3 = 1

  input_df = pd.DataFrame([[avg_age_temp, avg_age_chg_rate, avg_age_dischg_rate, type_1, type_2, type_3]],
                          columns=['avg_age_temp', 'avg_age_chg_rate', 'avg_age_dischg_rate', 'type 1', 'type 2', 'type 3'])
  # This is where we predict the output
  predicted_y = r_forest_fit.predict(input_df)
  print("Predicted Number of Cycles until EOL: ", predicted_y[0])
  print("Model's RMSE: ", mse**0.5)
  print(f"Prediction could be off by += {mse**0.5} cycles.")

In [53]:
def main():
    """
    Main function to call everything else!
    ------------------------------------------------------------------
    """
    # ignore warnings
    warnings.filterwarnings("ignore")

    # user input: summary.csv URL on GitHub
    csv_url = input("Paste the raw GitHub URL for summary.csv: ").strip()

    # obtain splits
    X_train, X_test, y_train, y_test, X, y = load_data_from_github(csv_url)

    # initiate regression model
    r_forest = RandomForestRegressor()
    r_forest.fit(X_train, y_train)
    pred_y_unadj = r_forest.predict(X_test)
    unadjusted_r2 = r2_score(y_test, pred_y_unadj)
    unadjusted_mse = mean_squared_error(y_test, pred_y_unadj)
    print('R2 score with default regression parameters: ', unadjusted_r2)
    print('MSE score with default regression parameters: ', unadjusted_mse)

    # perform parameter testing
    hyperparams = permute_hyperparams(r_forest, X_train, y_train)
    print(hyperparams)

    # evaluate fit on test set
    r_forest_fit = RandomForestRegressor(**hyperparams)
    r_forest_fit.fit(X_train, y_train)
    r2_adj, mse_adj = eval_params(r_forest_fit, X_test, y_test)
    print('R2 score after RandomizedSearchCV: ', r2_adj)
    print('MSE score after RandomizedSearchCV: ', mse_adj)

    if r2_adj >= unadjusted_r2:
      model = r_forest_fit
      reported_mse = mse_adj
    else:
      model = r_forest
      reported_mse = unadjusted_mse

    # evaluate average fit on random train/test splits
    r2_ran_splits = permute_splits(model, X, y)
    print ('R2 score across 100 random splits: ', r2_ran_splits)

    predict(model, reported_mse)

if __name__ == "__main__":
    main()

Paste the raw GitHub URL for summary.csv: https://raw.githubusercontent.com/pinkfig/best-chemeng-DS-ML-project/refs/heads/main/summary.csv?token=GHSAT0AAAAAAC72Z7RQ6V26SOOLNT4SEXOQZ6U4AVQ
R2 score with default regression parameters:  0.96019696991129
MSE score with default regression parameters:  0.96019696991129
Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'n_estimators': 160, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 85}
R2 score after RandomizedSearchCV:  0.9481172595323265
MSE score after RandomizedSearchCV:  9254.264823012873
R2 score across 100 random splits:  0.9583520981952413
Here we can use the model to predict EOL with hypothetical conditions:
Enter aging temperature (numerical): 30
Enter charge C-rate (numerical, larger than 0): 1
Enter discharge C-rate (numerical, larger than 0): 1
Enter SoC window ('type 1' = 0-100%; 'type 2' = 10-100%; 'type 3' = 10-90%): type 3
Predicted Number of Cycles until EOL:  1292.3465238