In [3]:
# Let's build a regression model to predict sepal length from sepal width, using iris.csv

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the data
iris = pd.read_csv('iris.csv')

# Split the data into X and y [Univariate regression]
# X is selected with double brackets so it stays a one-column DataFrame
# (scikit-learn estimators expect a 2-D feature matrix); y is a plain Series.
X = iris[['sepal_width']]
y = iris['sepal_length']

# Split the data into training and testing sets.
# test_size=0.25 is scikit-learn's default, written out so the split ratio is visible.
# random_state pins the shuffle: without it every kernel restart produces a
# different split and therefore different MSE / R^2 values further down.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Create a linear regression model and fit it on the training rows only
model = LinearRegression()
model.fit(X_train, y_train)

# Last expression of the cell: display the fitted estimator
model

  from pandas.core import (


In [5]:
# Report the row count of each split, one per line: training first, then testing
print(len(X_train), len(X_test), sep='\n')

112
38


In [5]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict sepal length for the held-out rows using the fitted model
y_pred = model.predict(X_test)

# Score the predictions against the true held-out values
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('Mean Squared Error:', mse)
print('R^2:', r2_score(y_true=y_test, y_pred=y_pred))

Mean Squared Error: 0.655536071343486
R^2: 0.00017524291267712044


In [7]:
# Visualize the relationship between sepal width and sepal length with the regression line in red, using plotly
import plotly.express as px

# Scatter of every row in the dataset
fig = px.scatter(iris, x='sepal_width', y='sepal_length', title='Sepal Width vs Sepal Length')

# Pair each test-set width with its prediction and sort by width, so the line
# is drawn left to right rather than in the shuffled order of the test split.
fit_line = X_test.assign(predicted=y_pred).sort_values('sepal_width')

# color_discrete_sequence makes the line red, as promised above; without it
# Plotly Express falls back to its default trace colour.
fig.add_traces(
    px.line(
        fit_line,
        x='sepal_width',
        y='predicted',
        color_discrete_sequence=['red'],
    ).data
)

fig.show()

# Let's build various models using H2O

In [6]:
# Connect to an H2O instance on localhost, starting a local server when none is
# already running (see the log below), with a 2 GB memory limit (max_mem_size).
import h2o
h2o.init(max_mem_size='2G')

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "18.0.2.1" 2022-08-18; OpenJDK Runtime Environment Homebrew (build 18.0.2.1+0); OpenJDK 64-Bit Server VM Homebrew (build 18.0.2.1+0, mixed mode, sharing)
  Starting server from /Users/pgmenon/opt/anaconda3/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/vp/j3tfbs8x34n30v4fnpskn4dc0000gn/T/tmprel43q1_
  JVM stdout: /var/folders/vp/j3tfbs8x34n30v4fnpskn4dc0000gn/T/tmprel43q1_/h2o_pgmenon_started_from_python.out
  JVM stderr: /var/folders/vp/j3tfbs8x34n30v4fnpskn4dc0000gn/T/tmprel43q1_/h2o_pgmenon_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.3
H2O_cluster_version_age:,2 years !!!
H2O_cluster_name:,H2O_from_python_pgmenon_q83je9
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


## H2O Flow command for running AutoML after importing and exploring the data interactively

runAutoML {"input_spec":{"training_frame":"train75","response_column":"species","validation_frame":"test25","blending_frame":"iris.hex","leaderboard_frame":"iris.hex","ignored_columns":[],"sort_metric":"AUTO"},"build_control":{"project_name":"irisClassifySpecies01","distribution":"AUTO","nfolds":5,"balance_classes":false,"stopping_criteria":{"seed":-1,"max_models":0,"max_runtime_secs":360,"max_runtime_secs_per_model":0,"stopping_rounds":3,"stopping_metric":"AUTO","stopping_tolerance":-1},"keep_cross_validation_predictions":true,"keep_cross_validation_models":true,"keep_cross_validation_fold_assignment":false,"export_checkpoints_dir":"/Users/pgmenon/Documents/CMU/PGSS2024/CMU_PGSS_2024/L4-792024/models_species/"},"build_models":{"exclude_algos":["DeepLearning","StackedEnsemble"],"exploitation_ratio":-1,"monotone_constraints":[]}}, 'exec'

In [8]:
# Clear the H2O cluster before uploading new frames.
# NOTE(review): h2o.remove_all() is expected to delete every object on the connected
# cluster, including frames and models created in Flow - confirm nothing there is still needed.
h2o.remove_all()

In [9]:
# Create a new dataframe from iris (a pandas dataframe) with a column called "Setosa",
# which is 1 if the species is "Iris-setosa" and 0 otherwise
newdf = iris.copy()
# Vectorized comparison instead of a per-row lambda; the boolean mask is cast to 0/1 integers.
# NOTE(review): confirm the species labels in iris.csv are spelled 'Iris-setosa'
# (some copies of the dataset use 'setosa'); a mismatch would make this column all zeros.
newdf['Setosa'] = (newdf['species'] == 'Iris-setosa').astype(int)

# Send this dataframe to h2o.
# Setosa is declared as "enum" (categorical) so that H2O treats it as a class label
# for binary classification; a numeric 0/1 response would be modelled as regression.
# Setting the type at upload time keeps the frame under its destination_frame key.
newdf_h2o = h2o.H2OFrame(
    newdf,
    destination_frame="iris_setosaBinary.hex",
    column_types={'Setosa': 'enum'},
)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


# H2O Flow code for binary classification of Setosa
runAutoML {"build_control":{"project_name":"classifySetosa01","distribution":"AUTO","nfolds":5,"balance_classes":false,"stopping_criteria":{"seed":-1,"max_models":0,"max_runtime_secs":360,"max_runtime_secs_per_model":0,"stopping_rounds":3,"stopping_metric":"AUTO","stopping_tolerance":-1},"keep_cross_validation_predictions":true,"keep_cross_validation_models":true,"keep_cross_validation_fold_assignment":false,"export_checkpoints_dir":"/Users/pgmenon/Documents/CMU/PGSS2024/CMU_PGSS_2024/L4-792024/models_setosaBinary/"},"input_spec":{"training_frame":"frame_0.750","response_column":"Setosa","validation_frame":"frame_0.250","blending_frame":"iris_setosaBinary.hex","leaderboard_frame":"iris_setosaBinary.hex","ignored_columns":["species"],"sort_metric":"AUTO"},"build_models":{"exclude_algos":["DeepLearning","StackedEnsemble"],"exploitation_ratio":-1,"monotone_constraints":[]}}, 'exec'