In [1]:
# %pip install pandas 
# %pip install tensorflow
# %pip install scikit-learn

In [2]:
%pip install pandas numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic data
n_samples = 1000
n_features = 286

# Features with some correlation to the target
X = np.random.randn(n_samples, n_features)
# Introduce NaNs in some features (random 10% missing values)


# Generate target variable with correlation to the first few features
# Adding noise for realistic correlation
y = 5 * X[:, 0] + 2 * X[:, 1] - 3 * X[:, 2] + np.random.randn(n_samples) * 0.5
nan_mask = np.random.rand(n_samples, n_features) < 0.1
X[nan_mask] = np.nan
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert X_train and X_test to Pandas DataFrames to simulate your setup
train_x = pd.DataFrame(X_train)
test_x = pd.DataFrame(X_test)

train_y = y_train
test_y = y_test

# Verify the number of NaN values
print("Number of NaNs in training data:", np.isnan(train_x).sum().sum())

Number of NaNs in training data: 22911


In [2]:


import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

# Create synthetic data
np.random.seed(42)
num_samples = 10000
num_features = 286

# Generate random data for train_x (normally distributed)
train_x = np.random.randn(num_samples, num_features)

# Generate weights for each feature to introduce correlation between train_x and train_y
weights = np.random.randn(num_features)

# Generate train_y as a linear combination of train_x with some random noise
train_y = np.dot(train_x, weights) + np.random.randn(num_samples) * 0.5  # Adding noise

# Introduce NaNs into the dataset
train_x[::10] = np.nan  # Every 10th row will have NaN values

# Convert to pandas DataFrames for consistency with the original code
train_x = pd.DataFrame(train_x, columns=[f'feature_{i}' for i in range(num_features)])
train_y = pd.DataFrame(train_y, columns=['target'])

# Convert to PyTorch tensors
X_tensor = torch.tensor(train_x.values, dtype=torch.float32)
y_tensor = torch.tensor(train_y.values, dtype=torch.float32).squeeze()

# Replace NaNs with 0 for the forward passx
X_tensor = torch.nan_to_num(X_tensor, nan=0.0)

# Define the model
class CustomRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(CustomRegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(p=0.3)
       
    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# Initialize model, optimizer, and loss function
model = CustomRegressionModel(input_dim=num_features)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Training loop
num_epochs = 100
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    for i in range(0, X_tensor.size(0), batch_size):
        batch_X = X_tensor[i:i+batch_size]
        batch_y = y_tensor[i:i+batch_size]
       
        # Forward pass
        predictions = model(batch_X)
        loss = criterion(predictions.squeeze(), batch_y)
       
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
   
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 23.9231
Epoch [20/100], Loss: 7.6703
Epoch [30/100], Loss: 10.1160
Epoch [40/100], Loss: 9.9451
Epoch [50/100], Loss: 4.0704
Epoch [60/100], Loss: 10.6786
Epoch [70/100], Loss: 3.5784
Epoch [80/100], Loss: 4.0220
Epoch [90/100], Loss: 2.6562
Epoch [100/100], Loss: 4.9489


In [5]:
import h2o
from h2o.estimators import H2ODeepLearningEstimator, H2OAutoML
import numpy as np
import pandas as pd

# Initialize H2O cluster
h2o.init()

# Create synthetic data
np.random.seed(42)
num_samples = 10000
num_features = 286

# Generate random data for train_x (normally distributed)
train_x = np.random.randn(num_samples, num_features)

# Generate weights for each feature to introduce correlation between train_x and train_y
weights = np.random.randn(num_features)

# Generate train_y as a linear combination of train_x with some random noise
train_y = np.dot(train_x, weights) + np.random.randn(num_samples) * 0.5  # Adding noise

# Introduce NaNs into the dataset
train_x[::10] = np.nan  # Every 10th row will have NaN values

# Convert to pandas DataFrames for consistency
train_x = pd.DataFrame(train_x, columns=[f'feature_{i}' for i in range(num_features)])
train_y = pd.DataFrame(train_y, columns=['target'])

# Convert the pandas DataFrame to an H2OFrame
train_x_h2o = h2o.H2OFrame(train_x)
train_y_h2o = h2o.H2OFrame(train_y)

# Combine train_x and train_y for H2O training
train_h2o = train_x_h2o.cbind(train_y_h2o)

# Define features and target columns
features = train_x_h2o.columns
target = 'target'

# Initialize and train H2O Deep Learning model
model = H2ODeepLearningEstimator(
    hidden=[256, 128, 64],
    epochs=100,
    activation="RectifierWithDropout",
    input_dropout_ratio=0.2,
    hidden_dropout_ratios=[0.3, 0.3, 0.3],
    l1=1e-5,
    l2=1e-5
)

# Train the model
model.train(x=features, y=target, training_frame=train_h2o)

# Print the model performance
performance = model.model_performance()
print(performance)

# Alternatively, you could use AutoML to search for the best model
# aml = H2OAutoML(max_runtime_secs=600, project_name="regression")
# aml.train(x=features, y=target, training_frame=train_h2o)

# performance = aml.leader.model_performance()
# print(performance)

ModuleNotFoundError: No module named 'h2o'

In [4]:
import h2o
from h2o.estimators import H2ODeepLearningEstimator#, H2OAutoML
import numpy as np
import pandas as pd

# Initialize H2O cluster
h2o.init()

# Create synthetic data
np.random.seed(42)
num_samples = 10000
num_features = 286

# Generate random data for train_x (normally distributed)
train_x = np.random.randn(num_samples, num_features)

# Generate weights for each feature to introduce correlation between train_x and train_y
weights = np.random.randn(num_features)

# Generate train_y as a linear combination of train_x with some random noise
train_y = np.dot(train_x, weights) + np.random.randn(num_samples) * 0.5  # Adding noise

# Introduce NaNs into the dataset
train_x[::10] = np.nan  # Every 10th row will have NaN values

# Convert to pandas DataFrames for consistency
train_x = pd.DataFrame(train_x, columns=[f'feature_{i}' for i in range(num_features)])
train_y = pd.DataFrame(train_y, columns=['target'])

# Convert the pandas DataFrame to an H2OFrame
train_x_h2o = h2o.H2OFrame(train_x)
train_y_h2o = h2o.H2OFrame(train_y)

# Combine train_x and train_y for H2O training
train_h2o = train_x_h2o.cbind(train_y_h2o)

# Define features and target columns
features = train_x_h2o.columns
target = 'target'

# Initialize and train H2O Deep Learning model
model = H2ODeepLearningEstimator(
    hidden=[256, 128, 64],
    epochs=100,
    activation="RectifierWithDropout",
    input_dropout_ratio=0.2,
    hidden_dropout_ratios=[0.3, 0.3, 0.3],
    l1=1e-5,
    l2=1e-5
)

# Train the model
model.train(x=features, y=target, training_frame=train_h2o)

# Print the model performance
performance = model.model_performance()
print(performance)



Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,21 mins 20 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,25 days
H2O_cluster_name:,mattsalomon
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.540 Gb
H2O_cluster_total_cores:,10
H2O_cluster_allowed_cores:,10


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%
ModelMetricsRegression: deeplearning
** Reported on train data. **

MSE: 60.897356463844424
RMSE: 7.803675830263865
MAE: 5.738057477935067
RMSLE: NaN
Mean Residual Deviance: 60.897356463844424


In [48]:
test_x = np.random.randn(num_samples, num_features)
test_y = np.dot(test_x, weights) + np.random.randn(num_samples) * 0.5  # Adding noise
test_x[::10] = np.nan
test_x_h2o = h2o.H2OFrame(test_x, column_names=[f'feature_{i}' for i in range(num_features)])
test_y_h2o = h2o.H2OFrame(test_y,column_names=['target'])

# Combine train_x and train_y for H2O training
test_h2o = test_x_h2o.cbind(test_y_h2o)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [49]:
train_x.shape, test_x.shape

((10000, 286), (10000, 286))

In [50]:
# Make predictions
train_pred = model.predict(train_h2o)
test_pred = model.predict(test_h2o)

deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%


In [51]:
pd.DataFrame(data={'y_pred_train':train_pred.as_data_frame().values.flatten(), 'y_true_train':train_y.values.flatten()})




Unnamed: 0,y_pred_train,y_true_train
0,-0.253800,-15.076742
1,4.444131,8.620448
2,18.115235,28.172118
3,22.196200,34.670782
4,-2.412051,-4.697201
...,...,...
9995,-12.861229,-20.868444
9996,-7.598448,-13.297913
9997,-22.173945,-33.682206
9998,7.497384,12.164308


In [52]:
# # Make predictions
# test_h2o = h2o.H2OFrame(test_x)
# predictions = model.predict(test_h2o)

# # View the predictions
# print(predictions.head())

In [7]:
from h2o.automl import H2OAutoML
# Alternatively, you could use AutoML to search for the best model
aml = H2OAutoML(max_runtime_secs=600, project_name="regression")
aml.train(x=features, y=target, training_frame=train_h2o)

performance = aml.leader.model_performance()
print(performance)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
ModelMetricsRegressionGLM: glm
** Reported on train data. **

MSE: 28.00739141134721
RMSE: 5.292200998766695
MAE: 1.6973435567053836
RMSLE: NaN
Mean Residual Deviance: 28.00739141134721
R^2: 0.8983448524443512
Null degrees of freedom: 9999
Residual degrees of freedom: 9713
Null deviance: 2755137.549332193
Residual deviance: 280073.9141134721
AIC: 62279.45520724916


In [53]:
aml.leaderboard

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
GLM_1_AutoML_1_20240923_211230,5.29464,28.0332,1.71254,,28.0332
StackedEnsemble_AllModels_2_AutoML_1_20240923_211230,5.29509,28.038,1.71367,,28.038
StackedEnsemble_BestOfFamily_1_AutoML_1_20240923_211230,5.29512,28.0383,1.71415,,28.0383
StackedEnsemble_BestOfFamily_2_AutoML_1_20240923_211230,5.29513,28.0384,1.71412,,28.0384
StackedEnsemble_AllModels_1_AutoML_1_20240923_211230,5.29523,28.0395,1.71372,,28.0395
StackedEnsemble_AllModels_3_AutoML_1_20240923_211230,5.29554,28.0428,1.71425,,28.0428
StackedEnsemble_BestOfFamily_3_AutoML_1_20240923_211230,5.29639,28.0518,1.71507,,28.0518
DeepLearning_1_AutoML_1_20240923_211230,5.47443,29.9694,2.44844,,29.9694
DeepLearning_grid_1_AutoML_1_20240923_211230_model_1,9.09619,82.7406,6.89205,,82.7406
GBM_1_AutoML_1_20240923_211230,10.8924,118.643,8.5309,,118.643


In [54]:
perf = aml.leader.model_performance(test_h2o)

In [61]:
perf.rmse(), perf.mae(), perf.r2()

(5.318055544950004, 1.690125741843353, 0.8982046659221945)

In [62]:
train_pred = aml.predict(train_h2o)

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [63]:
pd.DataFrame(data={'y_pred_train':train_pred.as_data_frame().values.flatten(), 'y_true_train':train_y.values.flatten()}).describe()




Unnamed: 0,y_pred_train,y_true_train
count,10000.0,10000.0
mean,0.068778,0.068778
std,15.7277,16.599437
min,-63.873488,-64.967615
25%,-9.638754,-11.136724
50%,0.068778,0.373693
75%,9.699891,11.108575
max,66.888468,67.440638


In [64]:
test_pred = aml.predict(test_h2o)

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [66]:
pd.DataFrame(data={'y_pred_test':test_pred.as_data_frame().values.flatten(), 'y_true_test':test_y}).describe()




Unnamed: 0,y_pred_test,y_true_test
count,10000.0,10000.0
mean,0.201457,0.287887
std,15.797434,16.669042
min,-60.303436,-60.717713
25%,-9.54172,-10.779105
50%,0.068778,0.248595
75%,9.894278,11.415916
max,60.771906,61.402047
