In [3]:
# Let's build a regression model to predict sepal length from sepal width, using iris.csv

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split (and every metric computed from it) is
# the same on every Restart & Run All instead of changing run to run.
RANDOM_STATE = 42

# Load the data
iris = pd.read_csv('iris.csv')

# Univariate regression: a single feature column and a single target column.
# The double brackets keep X as a DataFrame (2-D), y is a Series (1-D).
X = iris[['sepal_width']]
y = iris['sepal_length']

# Hold out part of the rows for testing; the split proportions are left at
# train_test_split's defaults, only the shuffling is pinned by the seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

# Create a linear regression model and fit it on the training partition only
model = LinearRegression()
model.fit(X_train, y_train)

# Last expression of the cell: display the fitted estimator
model

  from pandas.core import (


In [5]:
# Report the number of rows in the training and test partitions,
# training count first, one count per line.
print(len(X_train), len(X_test), sep='\n')

112
38


In [5]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict sepal length for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true held-out values:
# mean squared error (lower is better) and R^2 (share of variance explained).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Mean Squared Error:', mse)
print('R^2:', r2)

Mean Squared Error: 0.655536071343486
R^2: 0.00017524291267712044


In [7]:
# Visualize the relationship between sepal width and sepal length with the regression line in red, using plotly
import plotly.express as px

# Scatter of every observation in the data set
fig = px.scatter(iris, x='sepal_width', y='sepal_length', title='Sepal Width vs Sepal Length')

# Model predictions on the test rows, drawn as a line. The colour is set
# explicitly: without it plotly uses its default colour, which is the same one
# the scatter points get, so the line would not be red as intended.
regression_line = px.line(
    x=X_test['sepal_width'],
    y=y_pred,
    color_discrete_sequence=['red'],
)
fig.add_traces(regression_line.data)

fig.show()

# Let's build various models using H2O

In [6]:
# Connect to an H2O server, starting a local one if none is already running
# (the cell output shows it probing http://localhost:54321 first).
# max_mem_size='2G' is the memory limit requested for that server.
import h2o
h2o.init(max_mem_size='2G')

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "18.0.2.1" 2022-08-18; OpenJDK Runtime Environment Homebrew (build 18.0.2.1+0); OpenJDK 64-Bit Server VM Homebrew (build 18.0.2.1+0, mixed mode, sharing)
  Starting server from /Users/pgmenon/opt/anaconda3/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/vp/j3tfbs8x34n30v4fnpskn4dc0000gn/T/tmprel43q1_
  JVM stdout: /var/folders/vp/j3tfbs8x34n30v4fnpskn4dc0000gn/T/tmprel43q1_/h2o_pgmenon_started_from_python.out
  JVM stderr: /var/folders/vp/j3tfbs8x34n30v4fnpskn4dc0000gn/T/tmprel43q1_/h2o_pgmenon_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.3
H2O_cluster_version_age:,2 years !!!
H2O_cluster_name:,H2O_from_python_pgmenon_q83je9
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


## H2O Flow command for running AutoML after importing and exploring the data interactively

runAutoML {"input_spec":{"training_frame":"train75","response_column":"species","validation_frame":"test25","blending_frame":"iris.hex","leaderboard_frame":"iris.hex","ignored_columns":[],"sort_metric":"AUTO"},"build_control":{"project_name":"irisClassifySpecies01","distribution":"AUTO","nfolds":5,"balance_classes":false,"stopping_criteria":{"seed":-1,"max_models":0,"max_runtime_secs":360,"max_runtime_secs_per_model":0,"stopping_rounds":3,"stopping_metric":"AUTO","stopping_tolerance":-1},"keep_cross_validation_predictions":true,"keep_cross_validation_models":true,"keep_cross_validation_fold_assignment":false,"export_checkpoints_dir":"/Users/pgmenon/Documents/CMU/PGSS2024/CMU_PGSS_2024/L4-792024/models_species/"},"build_models":{"exclude_algos":["DeepLearning","StackedEnsemble"],"exploitation_ratio":-1,"monotone_constraints":[]}}, 'exec'

In [8]:
# Remove all objects currently stored on the H2O cluster so the
# frame uploaded in the next cell starts from a clean state.
h2o.remove_all()

In [9]:
# Build a copy of the pandas `iris` frame with an extra binary target column
# "Setosa": 1 where species is "Iris-setosa", 0 for every other row.
# `assign` returns a new DataFrame, so `iris` itself is not modified.
newdf = iris.assign(
    Setosa=iris['species'].eq('Iris-setosa').astype('int64')
)

# Upload the frame to the H2O cluster under a fixed key so it can be found by name
newdf_h2o = h2o.H2OFrame(newdf, destination_frame="iris_setosaBinary.hex")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


# H2O Flow code for binary classification of Setosa
runAutoML {"build_control":{"project_name":"classifySetosa01","distribution":"AUTO","nfolds":5,"balance_classes":false,"stopping_criteria":{"seed":-1,"max_models":0,"max_runtime_secs":360,"max_runtime_secs_per_model":0,"stopping_rounds":3,"stopping_metric":"AUTO","stopping_tolerance":-1},"keep_cross_validation_predictions":true,"keep_cross_validation_models":true,"keep_cross_validation_fold_assignment":false,"export_checkpoints_dir":"/Users/pgmenon/Documents/CMU/PGSS2024/CMU_PGSS_2024/L4-792024/models_setosaBinary/"},"input_spec":{"training_frame":"frame_0.750","response_column":"Setosa","validation_frame":"frame_0.250","blending_frame":"iris_setosaBinary.hex","leaderboard_frame":"iris_setosaBinary.hex","ignored_columns":["species"],"sort_metric":"AUTO"},"build_models":{"exclude_algos":["DeepLearning","StackedEnsemble"],"exploitation_ratio":-1,"monotone_constraints":[]}}, 'exec'

In [11]:
# Load the model picked as best from the Setosa AutoML run
# (XGBoost_grid_1_AutoML_2_20240709_165710_model_116) from the checkpoint export directory.
import os

# Directory the models were exported to. The default is the original absolute
# path on the author's machine; set the SETOSA_MODEL_DIR environment variable
# to point somewhere else when running the notebook on another machine.
SETOSA_MODEL_DIR = os.environ.get(
    "SETOSA_MODEL_DIR",
    "/Users/pgmenon/Documents/CMU/PGSS2024/CMU_PGSS_2024/L4-792024/models_setosaBinary",
)
SETOSA_MODEL_NAME = "XGBoost_grid_1_AutoML_2_20240709_165710_model_116"

myModel_SetosaBinary = h2o.load_model(os.path.join(SETOSA_MODEL_DIR, SETOSA_MODEL_NAME))

# Last expression of the cell: display the model summary and metrics
myModel_SetosaBinary

Model Details
H2OXGBoostEstimator :  XGBoost
Model Key:  XGBoost_grid_1_AutoML_2_20240709_165710_model_116


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees
0,,30.0




ModelMetricsBinomial: xgboost
** Reported on train data. **

MSE: 0.012651764014634794
RMSE: 0.11248006052023085
LogLoss: 0.10280085461591056
Mean Per-Class Error: 0.0
AUC: 1.0
AUCPR: 1.0
Gini: 1.0

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.8206549882888794: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,78.0,0.0,0.0,(0.0/78.0)
1,1,0.0,28.0,0.0,(0.0/28.0)
2,Total,78.0,28.0,0.0,(0.0/106.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.820655,1.0,0.0
1,max f2,0.820655,1.0,0.0
2,max f0point5,0.820655,1.0,0.0
3,max accuracy,0.820655,1.0,0.0
4,max precision,0.820655,1.0,0.0
5,max recall,0.820655,1.0,0.0
6,max specificity,0.820655,1.0,0.0
7,max absolute_mcc,0.820655,1.0,0.0
8,max min_per_class_accuracy,0.820655,1.0,0.0
9,max mean_per_class_accuracy,0.820655,1.0,0.0



Gains/Lift Table: Avg response rate: 26.42 %, avg score: 26.51 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.264151,0.820655,3.785714,3.785714,1.0,0.820655,1.0,0.820655,1.0,1.0,278.571429,278.571429,1.0
1,2,0.301887,0.164846,0.0,3.3125,0.0,0.198219,0.875,0.74285,0.0,1.0,-100.0,231.25,0.948718
2,3,1.0,0.051424,0.0,1.0,0.0,0.058507,0.264151,0.265101,0.0,1.0,-100.0,0.0,0.0




ModelMetricsBinomial: xgboost
** Reported on validation data. **

MSE: 0.0200841836513038
RMSE: 0.14171867784912404
LogLoss: 0.13942511625242535
Mean Per-Class Error: 0.0
AUC: 1.0
AUCPR: 1.0
Gini: 1.0

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.8206549882888794: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,22.0,0.0,0.0,(0.0/22.0)
1,1,0.0,22.0,0.0,(0.0/22.0)
2,Total,22.0,22.0,0.0,(0.0/44.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.820655,1.0,0.0
1,max f2,0.820655,1.0,0.0
2,max f0point5,0.820655,1.0,0.0
3,max accuracy,0.820655,1.0,0.0
4,max precision,0.820655,1.0,0.0
5,max recall,0.820655,1.0,0.0
6,max specificity,0.820655,1.0,0.0
7,max absolute_mcc,0.820655,1.0,0.0
8,max min_per_class_accuracy,0.820655,1.0,0.0
9,max mean_per_class_accuracy,0.820655,1.0,0.0



Gains/Lift Table: Avg response rate: 50.00 %, avg score: 44.87 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.5,0.820655,2.0,2.0,1.0,0.820655,1.0,0.820655,1.0,1.0,100.0,100.0,1.0
1,2,0.5,0.509437,0.0,2.0,0.0,0.0,1.0,0.820655,0.0,1.0,-100.0,100.0,1.0
2,3,0.636364,0.117462,0.0,1.571429,0.0,0.144381,0.785714,0.675739,0.0,1.0,-100.0,57.142857,0.727273
3,4,1.0,0.051424,0.0,1.0,0.0,0.051424,0.5,0.448716,0.0,1.0,-100.0,0.0,0.0




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
0,,2024-07-09 16:58:16,50.966 sec,0.0,0.5,0.693147,0.5,0.264151,1.0,0.735849,0.5,0.693147,0.5,0.5,1.0,0.5
1,,2024-07-09 16:58:16,50.980 sec,5.0,0.124812,0.12908,1.0,1.0,3.785714,0.0,0.134835,0.140878,1.0,1.0,2.0,0.0
2,,2024-07-09 16:58:16,50.998 sec,10.0,0.110179,0.103344,1.0,1.0,3.785714,0.0,0.135392,0.134703,1.0,1.0,2.0,0.0
3,,2024-07-09 16:58:16,51.011 sec,15.0,0.111986,0.10283,1.0,1.0,3.785714,0.0,0.14054,0.138546,1.0,1.0,2.0,0.0
4,,2024-07-09 16:58:16,51.026 sec,20.0,0.112396,0.102804,1.0,1.0,3.785714,0.0,0.141522,0.139278,1.0,1.0,2.0,0.0
5,,2024-07-09 16:58:16,51.041 sec,25.0,0.112468,0.102801,1.0,1.0,3.785714,0.0,0.14169,0.139404,1.0,1.0,2.0,0.0
6,,2024-07-09 16:58:16,51.055 sec,30.0,0.11248,0.102801,1.0,1.0,3.785714,0.0,0.141719,0.139425,1.0,1.0,2.0,0.0



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,petal_width,96.738678,1.0,0.527253
1,petal_length,86.737923,0.896621,0.472747




In [12]:
# Predict the probability of the species being "Iris-setosa" using the model.
# The returned frame has the predicted class in `predict` and the class
# probabilities in `p0` / `p1` (p1 = probability of Setosa == 1).
# NOTE(review): newdf_h2o is the full uploaded data set, which presumably
# includes the rows the model was trained on - confirm before treating these
# predictions as a held-out evaluation.
predictions = myModel_SetosaBinary.predict(newdf_h2o)
predictions

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%


predict,p0,p1
1,0.179345,0.820655
1,0.179345,0.820655
1,0.179345,0.820655
1,0.179345,0.820655
1,0.179345,0.820655
1,0.179345,0.820655
1,0.179345,0.820655
1,0.179345,0.820655
1,0.179345,0.820655
1,0.179345,0.820655




In [13]:
# Shut down the H2O cluster this session is connected to.
# Uses the cluster handle because the module-level h2o.shutdown() is deprecated
# and emits a warning on every call.
h2o.cluster().shutdown()

H2O session _sid_baf1 closed.


  h2o.shutdown()
