In [None]:
#INSTALLS
!pip install --upgrade requests pandas

In [None]:
#FILE IMPORTS
import pandas as pd
import math
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
from google.colab import files

In [None]:
def load_data_from_github(csv_url):
    """
    Load the summary CSV into a pandas DataFrame and split it into features
    and target.

    The SOC window columns are renamed ('SOC Window Min' -> 'min_SOC',
    'SOC Window Max' -> 'max_SOC') before the features are selected.

    Parameters
    ----------
    csv_url : str or file-like
        Anything ``pandas.read_csv`` accepts. In this file, ``main`` passes
        the raw GitHub URL typed in by the user.

    Returns
    -------
    df : pandas.DataFrame
        The full table, with the SOC window columns renamed.
    X : pandas.DataFrame
        The five feature columns, in the order avg_age_temp,
        avg_age_chg_rate, avg_age_dischg_rate, min_SOC, max_SOC.
    y : pandas.Series
        The 'num_cycles_op' column (number of cycles until 80% SoH).

    Raises
    ------
    KeyError
        If the target column or any feature column is absent after renaming.
    """
    import pandas as pd

    feature_cols = ['avg_age_temp', 'avg_age_chg_rate', 'avg_age_dischg_rate',
                    'min_SOC', 'max_SOC']
    target_col = 'num_cycles_op'

    # Load summary dataset directly from the given location
    df = pd.read_csv(csv_url)
    df = df.rename(columns={'SOC Window Min': 'min_SOC', 'SOC Window Max': 'max_SOC'})

    # Fail loudly on a malformed file. DataFrame.filter would silently drop
    # missing feature columns, leaving a model trained on fewer inputs than
    # the rest of the workflow expects.
    missing = [col for col in feature_cols + [target_col] if col not in df.columns]
    if missing:
        raise KeyError(f"CSV is missing required column(s): {missing}")

    # Obtain outputs (number of cycles until 80% SoH)
    y = df[target_col]
    # Obtain all x-values (temperature, charging/discharging rate, min/max SoC)
    X = df[feature_cols]
    return df, X, y

def evaluate_model(X, y, n_splits=99, test_size=0.2, random_state=None):
    """
    Fit a RandomForestRegressor on repeated random train/test splits.

    Prints the average R2 score over all splits, then the R2 score and MSE of
    the model fitted on the last split. Repeating the split shows how much
    the score depends on which rows land in the test set.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : pandas.Series
        Target values aligned with ``X``.
    n_splits : int, default 99
        Number of random splits to evaluate; must be at least 1.
    test_size : float or int, default 0.2
        Passed through to ``train_test_split``.
    random_state : int or None, default None
        Seed for reproducible splits and forests. ``None`` keeps the original
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, df_test, r_forest, final_mse)``
        for the last split. ``df_test`` is ``X_test`` plus the columns
        'num_cycles_op' (true values) and 'pred_y' (predictions).

    Raises
    ------
    ValueError
        If ``n_splits`` is less than 1.
    """
    # Validate before importing the ML stack so bad arguments fail fast.
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import r2_score, mean_squared_error
    import numpy as np

    # One shared generator gives each iteration a different split and forest
    # while the whole run stays reproducible for a fixed seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    r2_scores = []  # R2 score of every split

    for _ in range(n_splits):
        # Obtain training/testing split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=rng)
        r_forest = RandomForestRegressor(random_state=rng)
        r_forest.fit(X_train, y_train)
        pred_y = r_forest.predict(X_test)
        r2_scores.append(r2_score(y_test, pred_y))

    print(f"Average R2 over {n_splits} splits:", np.average(r2_scores))

    # The model from the last iteration is the final model. It was already
    # fitted on the last training split, so its predictions and score are
    # reused instead of refitting and predicting again.
    df_test = X_test.assign(num_cycles_op=y_test, pred_y=pred_y)
    final_r2 = r2_scores[-1]
    final_mse = mean_squared_error(y_test, pred_y)
    print("Final Split R2 score with RandomForestRegressor:", final_r2)
    print("Final Split MSE on Test Set:", final_mse)

    # Return last split data, the final model, and the final MSE for further use
    return X_train, X_test, y_train, y_test, df_test, r_forest, final_mse

def parameter_testing(X_train, X_test, y_train, y_test, df_test, n_iter=100, cv=5):
    """
    Tune a RandomForestRegressor with RandomizedSearchCV and report its R2.

    The search is fitted on the training split, the refitted best estimator
    predicts the test split, and the resulting R2 score is printed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test features.
    y_train : pandas.Series
        Training targets.
    y_test : pandas.Series
        Not used here; the true test targets are read from
        ``df_test['num_cycles_op']``. Kept so the signature matches callers.
    df_test : pandas.DataFrame
        Test frame that contains a 'num_cycles_op' column. The caller's frame
        is not modified, because ``assign`` returns a new DataFrame.
    n_iter : int, default 100
        Number of parameter settings sampled by the search.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    sklearn.ensemble.RandomForestRegressor
        The best estimator found, refitted on the whole training split.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.metrics import r2_score
    import numpy as np

    print("Parameter Testing in Progress. Please wait ~1 minute for completion...")

    # Candidate values for each hyperparameter
    grid = {
        'n_estimators': np.linspace(10, 200, 20, dtype=int),      # 10, 20, ..., 200
        'max_depth': np.linspace(5, 100, 20, dtype=int),          # 5, 10, ..., 100
        'min_samples_split': np.linspace(2, 10, 5, dtype=int),    # 2, 4, ..., 10
        'min_samples_leaf': np.linspace(1, 10, 10, dtype=int),    # 1, 2, ..., 10
        'max_features': ['sqrt', 'log2'],
    }

    r_search = RandomizedSearchCV(estimator=RandomForestRegressor(),
                                  param_distributions=grid,
                                  n_iter=n_iter,
                                  cv=cv,
                                  verbose=0,
                                  random_state=10,  # fixes which settings are sampled
                                  n_jobs=-1)
    r_search.fit(X_train, y_train)

    # Score the refitted best estimator on the held-out test split
    df_test = df_test.assign(pred_y=r_search.predict(X_test))
    search_r2 = r2_score(df_test['num_cycles_op'], df_test['pred_y'])
    print("R2 score after RandomizedSearchCV:", search_r2)

    # Return the tuned model so the search result is not thrown away
    return r_search.best_estimator_

def predict_hypothetical(model, mse):
    """
    Prompt for hypothetical cycling conditions and print the model's prediction.

    The user is asked for one value per feature (avg_age_temp,
    avg_age_chg_rate, avg_age_dischg_rate, min_SOC, max_SOC). The values are
    passed to ``model.predict`` as a one-row DataFrame. The RMSE derived from
    ``mse`` is printed as a rough guide to how far off the prediction may be.
    This function is for interactive experimentation, not for evaluation.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(DataFrame)``.
    mse : float
        Mean squared error of the model; its square root is shown as RMSE.

    Returns
    -------
    The first element of ``model.predict(...)``, or ``None`` if any input
    could not be parsed as a number.
    """
    import math
    import pandas as pd

    feature_names = ['avg_age_temp', 'avg_age_chg_rate', 'avg_age_dischg_rate',
                     'min_SOC', 'max_SOC']

    print("Here we can use the model to predict EOL with hypothetical conditions:")
    # Prompt for input values, allowing user-to-model interaction
    try:
        values = [float(input(f"Enter {name}: ")) for name in feature_names]
    except ValueError:
        # Raised by float() when the user types something non-numeric.
        print("Please enter numerical values, other cases will not work.")
        return None

    # The column names must match the ones the model was trained on.
    input_df = pd.DataFrame([values], columns=feature_names)

    # This is where we predict the output
    predicted_y = model.predict(input_df)
    rmse = math.sqrt(mse)
    print("Predicted Number of Cycles until EOL:", predicted_y[0])
    print("Model's RMSE:", rmse)
    print(f"Prediction could be off by +/- {rmse} cycles.")
    return predicted_y[0]

def main():
    """
    Run the whole workflow interactively.

    Asks for the CSV location, loads the data, evaluates the random forest
    over repeated splits, runs the hyperparameter search, and finally lets
    the user request a prediction for hypothetical conditions. Returns early,
    without loading anything, when no URL is entered.
    """
    # Warnings are suppressed because they made the output very messy during
    # testing. Delete the two lines below to see them again.
    import warnings
    warnings.filterwarnings("ignore")

    # Ask the user to paste the raw GitHub URL for summary.csv
    csv_url = input("Paste the raw GitHub URL for summary.csv: ").strip()
    if not csv_url:
        # An empty path would only produce an opaque read_csv error.
        print("No URL was entered; nothing to load.")
        return

    _, X, y = load_data_from_github(csv_url)

    # Evaluate model with multiple train/test splits and get final model
    X_train, X_test, y_train, y_test, df_test, final_model, final_mse = evaluate_model(X, y)

    # Perform parameter testing
    parameter_testing(X_train, X_test, y_train, y_test, df_test)

    # Predict a y value using user inputs
    predict_hypothetical(final_model, final_mse)

# Start the interactive workflow only when this code is executed directly
# (as a script or in a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()
