In [1]:
import os

import joblib
import lakefs_client
import neptune
import pandas as pd
import sklearn
import tensorflow as tf
from lakefs_client import models
from lakefs_client.client import LakeFSClient
from neptune.integrations.tensorflow_keras import NeptuneCallback
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

#import neptune.integrations.sklearn as npt_utils
#from neptune.utils import stringify_unsupported


In [2]:
######################################## lakeFS Data Versioning
# lakeFS credentials and endpoint.
# Secrets are read from the environment instead of being written into the
# notebook; a missing variable fails fast with a KeyError. Any key pair that
# was ever committed in this file must be considered leaked and rotated.
configuration = lakefs_client.Configuration()
configuration.username = os.environ['LAKECTL_CREDENTIALS_ACCESS_KEY_ID']
configuration.password = os.environ['LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY']
# The endpoint is not a secret, so the previous value stays as the default.
configuration.host = os.environ.get(
    'LAKECTL_SERVER_ENDPOINT_URL',
    'https://cute-cat-6fs7iz.us-east-1.lakefscloud.io',
)

# Variables from lakeFS
repo = 'sample-repo'
lake_branch = 'tracking'
commit_ids = ['336e684405152053bb607a31c4240a2490747fee60620e745f76cb6c74523cb7', # UNCLEANED (v1)
              '7edf3e4296bf352581032b687ec1ff4a892a8d056936c83e16f537744b188f55'] # CLEANED (V2)

client = LakeFSClient(configuration)

# Fetch the dataset at a pinned commit so the notebook always reads the same bytes.
lakefs_data = client.objects.get_object(
    repository=repo,
    ref=commit_ids[1], # Notes: 0 - UNCLEANED (V1), 1 - CLEANED (V2)
    path='data/athletes.csv')

df = pd.read_csv(lakefs_data)

In [3]:
# Preview the first two rows of the frame loaded from 'data/athletes.csv'
df.head(2)

Unnamed: 0.1,Unnamed: 0,region,gender,age,height,weight,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong
0,21,Southern California,Male,30.0,71.0,200.0,235.0,175.0,385.0,315.0,I eat whatever is convenient|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 1x a week|I ty...,1-2 years|
1,22,Africa,Male,28.0,70.0,176.0,187.0,134.0,335.0,254.0,I eat 1-3 full cheat meals per week|,I have no athletic background besides CrossFit|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 1x a week|,2-4 years|


In [4]:
# Data Processing
def data_process(dataset):
    """Build the regression target and split the features into train/test sets.

    Parameters
    ----------
    dataset : pandas.DataFrame, or any input accepted by ``pd.read_csv``
        Athlete records with the ``candj``, ``snatch``, ``deadlift``,
        ``backsq``, ``age``, ``height`` and ``weight`` columns. Anything that
        is not already a DataFrame is loaded with ``pd.read_csv`` first.

    Returns
    -------
    x_train, x_test, y_train, y_test
        80/20 split from ``train_test_split`` with ``random_state=42``.

    Notes
    -----
    When a DataFrame is passed, the ``total_lift`` column is added to it in
    place. The linear-regression cell later in this notebook reads
    ``df['total_lift']`` and depends on that side effect.
    """
    if not isinstance(dataset, pd.DataFrame):
        dataset = pd.read_csv(dataset)

    # Target: sum of the four lifts; a missing lift counts as 0.
    dataset['total_lift'] = (
        dataset['candj'].fillna(0)
        + dataset['snatch'].fillna(0)
        + dataset['deadlift'].fillna(0)
        + dataset['backsq'].fillna(0)
    )

    numcs = ['age','height','weight']

    # Adjust variables here - used in Pipeline below
    x = dataset[numcs].fillna(0) # NAs in numeric columns, fill 0 if any
    y = dataset['total_lift']

    # train test split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = data_process(df)

In [5]:
# Train Random Forest
# random_state pins the forest's internal randomness so that re-running the
# notebook reproduces the same model and the same error value.
parameters = {'n_estimators': 100, "max_depth": 7, "min_samples_split": 5, "random_state": 42}
rfr = RandomForestRegressor(**parameters)
rfr.fit(x_train, y_train)

In [None]:
# Serialize the fitted random forest to 'rfr_model.pkl' in the working directory
joblib.dump(value=rfr, filename='rfr_model.pkl')

In [None]:
# Mean absolute error of the random forest on the held-out split
y_pred = rfr.predict(x_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# No api_token is passed: Neptune then falls back to the NEPTUNE_API_TOKEN
# environment variable. Tokens must not be pasted into the notebook; a token
# that was ever committed here should be revoked.
run = neptune.init_run(
    project="mlops-uchicago/mlops-hw2",
    name = "RandomForest Model"
)
model_version = neptune.init_model_version(
    model="MLOP-RF",
    project="mlops-uchicago/mlops-hw2",
)

model_version["model"].upload('rfr_model.pkl')
# NOTE(review): the value stored under "validation/acc" is the mean absolute
# error computed above, not an accuracy; the key is kept unchanged here.
model_version["validation/acc"] = mae
# Flush the pending upload and close the model-version connection.
model_version.stop()

# NOTE(review): this tracks a local path while `df` was read from lakeFS;
# confirm that 'data/athletes.csv' exists relative to the notebook.
run["dataset_version"].track_files('data/athletes.csv')

In [None]:
# Close the random-forest Neptune run started in the previous cell
run.stop()

## Linear Regression Section

In [None]:
features = ['age', 'height', 'weight']  # Adjust this list as needed

# Split the data into features (X) and the target variable (y).
# LinearRegression rejects NaN inputs, so missing numeric values are filled
# with 0 -- the same policy data_process() applies to these columns.
x = df[features].fillna(0)
y = df['total_lift']  # column added to df by the earlier data_process(df) call

# train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Fit an ordinary least-squares model; fit() returns the estimator itself
lr = LinearRegression().fit(x_train, y_train)

# Score the held-out split with mean absolute error
y_pred = lr.predict(x_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Serialize the fitted linear regression to 'lr_model.pkl' in the working directory
joblib.dump(value=lr, filename='lr_model.pkl')

In [None]:
# Neptune run + model version for the linear regression.
# No api_token is passed, so Neptune uses the NEPTUNE_API_TOKEN environment variable.
run = neptune.init_run(
    project="mlops-uchicago/mlops-hw2",
    name = "LinearRegression Model"
)
model_version = neptune.init_model_version(
    model="MLOP-LR",
    project="mlops-uchicago/mlops-hw2"
)

model_version["model"].upload('lr_model.pkl')
# NOTE(review): the value stored under "validation/acc" is the mean absolute
# error of the linear regression, not an accuracy; the key is kept unchanged.
model_version["validation/acc"] = mae
# Flush the pending upload and close the model-version connection.
model_version.stop()

# NOTE(review): this tracks a local path while `df` was read from lakeFS;
# confirm that 'data/athletes.csv' exists relative to the notebook.
run["dataset_version"].track_files('data/athletes.csv')

In [None]:
# Close the linear-regression Neptune run started in the previous cell
run.stop()

## Neural Network

In [6]:
import tensorflow as tf

In [None]:
# Training hyperparameters for the neural network
epochs, batch_size = 15, 64

# Architecture: two ReLU hidden layers feeding a single linear output unit.
n_features = x_train.shape[1]
layer_stack = [
    tf.keras.layers.Input(shape=(n_features,)),      # one input per feature column
    tf.keras.layers.Dense(64, activation='relu'),    # hidden layer 1
    tf.keras.layers.Dense(32, activation='relu'),    # hidden layer 2
    tf.keras.layers.Dense(1),                        # regression output
]
model = tf.keras.models.Sequential(layer_stack)

model.summary()

In [None]:
# Neptune run for the neural network.
# No api_token is passed, so Neptune uses the NEPTUNE_API_TOKEN environment variable.
run = neptune.init_run(
    project="mlops-uchicago/mlops-hw2",
    name = "NeuralNetwork Model"
)

# Log the hyperparameter variables themselves so the recorded values cannot
# drift from the ones defined in the model-definition cell.
run['parameters'] = {'activation': 'ReLu',
                     'Layers': 3,
                     'epochs': epochs,
                     'batch_size': batch_size
                    }

# NOTE(review): nothing is written to this model version in the cells shown here.
model_version = neptune.init_model_version(
    model="MLOP-NN",
    project="mlops-uchicago/mlops-hw2"
)

model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
from neptune.integrations.tensorflow_keras import NeptuneCallback

# Stream per-epoch Keras metrics to the Neptune run under the 'metrics' namespace.
neptune_clbk = NeptuneCallback(run=run, base_namespace='metrics')

# batch_size is passed explicitly: without it Keras uses its own default and
# the value declared (and logged to Neptune) above would not be the one used.
history = model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_test, y_test),
    verbose=2,
    callbacks=[neptune_clbk],
)

# Pipeline

In [38]:
# Register a new version under the Neptune model "MLOP-PIPELINE".
# This call creates a model *version*, not the model itself, so each execution
# of the cell adds another version (see the version URL printed below).
# NOTE(review): `model_nept` is not referenced by any other cell shown here and
# is never stopped -- confirm whether this cell is still needed.
model_nept = neptune.init_model_version(
    model="MLOP-PIPELINE",
    project="mlops-uchicago/mlops-hw2"
)

https://app.neptune.ai/mlops-uchicago/mlops-hw2/m/MLOP-PIPELINE/v/MLOP-PIPELINE-9


In [13]:
# Data Processing step
def data_process(dataset):
    """Build the regression target and split the features into train/test sets.

    Parameters
    ----------
    dataset : pandas.DataFrame, or any input accepted by ``pd.read_csv``
        Athlete records with the ``candj``, ``snatch``, ``deadlift``,
        ``backsq``, ``age``, ``height`` and ``weight`` columns. Anything that
        is not already a DataFrame (for example the object returned by the
        lakeFS client) is loaded with ``pd.read_csv`` first.

    Returns
    -------
    x_train, x_test, y_train, y_test
        80/20 split from ``train_test_split`` with ``random_state=42``.

    Notes
    -----
    When a DataFrame is passed, the ``total_lift`` column is added to it in
    place.
    """
    if not isinstance(dataset, pd.DataFrame):
        dataset = pd.read_csv(dataset)

    # Target: sum of the four lifts; a missing lift counts as 0.
    dataset['total_lift'] = (
        dataset['candj'].fillna(0)
        + dataset['snatch'].fillna(0)
        + dataset['deadlift'].fillna(0)
        + dataset['backsq'].fillna(0)
    )

    numcs = ['age','height','weight']

    # Adjust variables here - used in Pipeline below
    x = dataset[numcs].fillna(0) # NAs in numeric columns, fill 0 if any
    y = dataset['total_lift']

    # train test split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    return x_train, x_test, y_train, y_test


In [34]:
# Modeling step track emissions
from codecarbon import track_emissions

@track_emissions
def modeling(run, user_optimizer, user_loss, user_epochs, output_file,
             train_data=None, validation_data=None):
    """Build, train and save the feed-forward regression network.

    Parameters
    ----------
    run
        Neptune run handed to ``NeptuneCallback``; training metrics are logged
        under its ``metrics`` namespace.
    user_optimizer, user_loss
        Forwarded to ``model.compile``.
    user_epochs
        Number of epochs forwarded to ``model.fit``.
    output_file
        Path the trained model is written to with ``tf.keras.models.save_model``.
    train_data : tuple ``(x, y)``, optional
        Training features and target. Defaults to the module-level
        ``x_train`` / ``y_train``.
    validation_data : tuple ``(x, y)``, optional
        Validation features and target. Defaults to the module-level
        ``x_test`` / ``y_test``.

    Returns
    -------
    (model, history)
        The trained Keras model and the object returned by ``model.fit``.

    Notes
    -----
    A second copy of the model is always saved to ``'my_nn_model'``.
    """
    # Fall back to the notebook-level splits so existing callers keep working.
    if train_data is None:
        train_data = (x_train, y_train)
    if validation_data is None:
        validation_data = (x_test, y_test)
    train_x, train_y = train_data

    # Adjust the architecture ahead of pipeline run
    model = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=(train_x.shape[1],)),   # Input layer for the number of features
        tf.keras.layers.Dense(64, activation='relu'),       # Hidden layer 1
        tf.keras.layers.Dense(32, activation='relu'),       # Hidden layer 2
        tf.keras.layers.Dense(16, activation='sigmoid'),    # Hidden layer 3
        tf.keras.layers.Dense(8, activation='sigmoid'),     # Hidden layer 4
        tf.keras.layers.Dense(1)                            # Output layer (for regression)
    ])

    # Compile
    model.compile(optimizer=user_optimizer,
                  loss=user_loss)

    # Train
    neptune_clbk = NeptuneCallback(run=run, base_namespace='metrics')
    history = model.fit(train_x, train_y, epochs=user_epochs, validation_data=validation_data,
                        verbose=2, callbacks=[neptune_clbk])

    # Save
    tf.keras.models.save_model(model, output_file, overwrite=True, include_optimizer=True, save_format=None,
        signatures=None, options=None, save_traces=True)
    model.save('my_nn_model')

    return model, history

In [35]:
def pipeline(run, mod, dataset_version=1, user_epochs=10, user_optimizer='adam', user_loss='mean_absolute_error', output_file='nn_mlops.h5'):
    """Run the end-to-end flow: fetch a dataset version, process, train, log.

    Parameters
    ----------
    run
        Neptune run that receives metrics, the saved model and the emissions
        file. It is stopped before this function returns, even on failure.
    mod
        Neptune model key under which a new model version is registered.
    dataset_version
        Index into ``commit_ids`` (0 - UNCLEANED (V1), 1 - CLEANED (V2)).
    user_epochs, user_optimizer, user_loss, output_file
        Forwarded to ``modeling``.

    Notes
    -----
    ``modeling`` takes its training data from the module-level ``x_train``,
    ``x_test``, ``y_train`` and ``y_test``; this function rebinds those names
    to the splits of the requested dataset version.
    """
    # Rebind the notebook-level splits so modeling() trains on this version.
    global x_train, x_test, y_train, y_test

    model_version = None
    try:
        # Get the dataset version
        raw_object = client.objects.get_object(
            repository=repo,
            ref=commit_ids[dataset_version], # Notes: 0 - UNCLEANED (V1), 1 - CLEANED (V2)
            path='data/athletes.csv')
        # The client returns a stream, not a table: parse it before processing.
        dataset = pd.read_csv(raw_object)

        # Processing step, Update function to add features
        x_train, x_test, y_train, y_test = data_process(dataset)

        # Track a defined model
        model_version = neptune.init_model_version(
            project="mlops-uchicago/mlops-hw2",
            model=mod,
        )

        # modeling step, Update to change model
        modeling(run=run,
                 user_optimizer=user_optimizer,
                 user_loss=user_loss,
                 user_epochs=user_epochs,
                 output_file=output_file)

        run['my_model/saved_model'].upload(output_file)   # capture the saved model weights and upload
        run['my_model/emissions'].upload('emissions.csv') # track carbon emissions for each run
        # NOTE(review): the modeling() call above does not forward a batch size,
        # so the 64 recorded here is not enforced during training -- confirm.
        run['parameters'] = {'activation': 'ReLu and Sigmoid',        # Capture model params and store in Neptune
                             'Layers': 4,
                             'epochs': user_epochs,
                             'batch_size': 64
                            }
    finally:
        # Always release the Neptune connections, including after an error.
        if model_version is not None:
            model_version.stop()
        run.stop()

In [36]:
# define the run object
# No api_token is passed: Neptune then falls back to the NEPTUNE_API_TOKEN
# environment variable. Tokens must not be pasted into the notebook; a token
# that was ever committed here should be revoked.
run = neptune.init_run(
    project="mlops-uchicago/mlops-hw2",
)

# Run the pipeline
pipeline(run=run,
         mod="MLOP-PIPELINE",
         dataset_version=1,
         user_epochs=12,
         user_optimizer='adam',
         user_loss='mean_squared_error',
         output_file='nn_mlops.h5')

https://app.neptune.ai/mlops-uchicago/mlops-hw2/e/MLOP-38
https://app.neptune.ai/mlops-uchicago/mlops-hw2/m/MLOP-PIPELINE/v/MLOP-PIPELINE-7


[codecarbon INFO @ 16:55:18] [setup] RAM Tracking...
[codecarbon INFO @ 16:55:18] [setup] GPU Tracking...
[codecarbon INFO @ 16:55:18] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 16:55:18] [setup] CPU Tracking...
[codecarbon INFO @ 16:55:20] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz
[codecarbon INFO @ 16:55:20] >>> Tracker's metadata:
[codecarbon INFO @ 16:55:20]   Platform system: Windows-10-10.0.22621-SP0
[codecarbon INFO @ 16:55:20]   Python version: 3.10.0
[codecarbon INFO @ 16:55:20]   CodeCarbon version: 2.3.1
[codecarbon INFO @ 16:55:20]   Available RAM : 31.919 GB
[codecarbon INFO @ 16:55:20]   CPU count: 12
[codecarbon INFO @ 16:55:20]   CPU model: Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz
[codecarbon INFO @ 16:55:20]   GPU count: 1
[codecarbon INFO @ 16:55:20]   GPU model: 1 x NVIDIA GeForce RTX 2080


Epoch 1/12
751/751 - 2s - loss: 1100496.3750 - val_loss: 1086715.2500 - 2s/epoch - 3ms/step
Epoch 2/12
751/751 - 1s - loss: 1085401.2500 - val_loss: 1072675.6250 - 1s/epoch - 2ms/step
Epoch 3/12
751/751 - 1s - loss: 1071622.7500 - val_loss: 1059143.2500 - 1s/epoch - 2ms/step
Epoch 4/12
751/751 - 2s - loss: 1058163.0000 - val_loss: 1045814.8125 - 2s/epoch - 3ms/step
Epoch 5/12
751/751 - 1s - loss: 1044872.3125 - val_loss: 1032630.1875 - 1s/epoch - 2ms/step
Epoch 6/12
751/751 - 2s - loss: 1031711.8750 - val_loss: 1019564.3125 - 2s/epoch - 2ms/step
Epoch 7/12
751/751 - 2s - loss: 1018661.4375 - val_loss: 1006598.5625 - 2s/epoch - 2ms/step
Epoch 8/12
751/751 - 1s - loss: 1005711.8125 - val_loss: 993736.0625 - 1s/epoch - 2ms/step
Epoch 9/12
751/751 - 1s - loss: 992860.0625 - val_loss: 980966.0000 - 1s/epoch - 2ms/step
Epoch 10/12


[codecarbon INFO @ 16:55:35] Energy consumed for RAM : 0.000050 kWh. RAM Power : 11.969547271728516 W
[codecarbon INFO @ 16:55:35] Energy consumed for all GPUs : 0.000096 kWh. Total GPU Power : 23.074830019094836 W
[codecarbon INFO @ 16:55:35] Energy consumed for all CPUs : 0.000198 kWh. Total CPU Power : 47.5 W
[codecarbon INFO @ 16:55:35] 0.000345 kWh of electricity used since the beginning.


751/751 - 2s - loss: 980100.6875 - val_loss: 968290.8125 - 2s/epoch - 2ms/step
Epoch 11/12
751/751 - 2s - loss: 967436.6250 - val_loss: 955709.8750 - 2s/epoch - 2ms/step
Epoch 12/12
751/751 - 2s - loss: 954866.1875 - val_loss: 943220.4375 - 2s/epoch - 2ms/step


  tf.keras.models.save_model(model, output_file, overwrite=True, include_optimizer=True, save_format=None,


INFO:tensorflow:Assets written to: my_nn_model\assets


INFO:tensorflow:Assets written to: my_nn_model\assets
[codecarbon INFO @ 16:55:41] 
Graceful stopping: collecting and writing information.
Please wait a few seconds...
[codecarbon INFO @ 16:55:41] Energy consumed for RAM : 0.000067 kWh. RAM Power : 11.969547271728516 W
[codecarbon INFO @ 16:55:41] Energy consumed for all GPUs : 0.000133 kWh. Total GPU Power : 26.05856418111212 W
[codecarbon INFO @ 16:55:41] Energy consumed for all CPUs : 0.000266 kWh. Total CPU Power : 47.5 W
[codecarbon INFO @ 16:55:41] 0.000466 kWh of electricity used since the beginning.
[codecarbon INFO @ 16:55:41] Done!



Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 12 operations to synchronize with Neptune. Do not kill this process.
All 12 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/mlops-uchicago/mlops-hw2/e/MLOP-38/metadata
