In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the California housing dataset
data = fetch_california_housing()
X = data.data
y = data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert the numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# Define the neural network model
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize the model, loss function, and optimizer
input_size = X_train.shape[1]
model = Net(input_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if (epoch+1)%10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Save the trained model
torch.save(model.state_dict(), 'model.pth')

Epoch [10/100], Loss: 4.6511
Epoch [20/100], Loss: 3.8282
Epoch [30/100], Loss: 2.9218
Epoch [40/100], Loss: 2.0633
Epoch [50/100], Loss: 1.4666
Epoch [60/100], Loss: 1.1218
Epoch [70/100], Loss: 0.9067
Epoch [80/100], Loss: 0.7900
Epoch [90/100], Loss: 0.7369
Epoch [100/100], Loss: 0.7055


In [14]:
X

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [15]:
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [2]:
import pandas as pd

pdf = pd.DataFrame(X_test, columns=data.feature_names)
pdf

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,-0.741019,1.614971,-0.295713,-0.177953,-0.717724,-0.075766,1.044339,-1.359761
1,-0.163541,-0.368623,0.003247,0.011572,-0.437714,-0.010153,1.020922,-0.820054
2,-0.523548,0.504158,-0.384646,-0.084150,-0.248707,-0.010234,1.016238,-1.324780
3,-0.926180,0.266127,-0.840016,-0.386927,-0.244332,0.014403,-0.730692,0.724106
4,-1.011848,0.583502,-0.606825,-0.146700,-0.008073,0.150447,-1.372326,1.223835
...,...,...,...,...,...,...,...,...
4123,1.186209,-0.447966,0.381138,-0.194598,0.234311,-0.007470,-0.941447,0.943987
4124,-0.097091,1.853002,-0.448035,-0.169439,-0.225081,-0.174395,1.006871,-1.434720
4125,0.463735,0.583502,-0.095625,-0.513243,-0.601345,-0.115546,0.800799,-1.194850
4126,-0.523548,0.504158,-0.357973,0.085547,-0.228581,-0.000968,-0.819677,0.684128


In [3]:
from pyspark.sql.types import *

# Spark is somehow auto-converting Pandas float32 to DoubleType(), so forcing FloatType()
schema = StructType([
StructField("MedInc",FloatType(),True),
StructField("HouseAge",FloatType(),True),
StructField("AveRooms",FloatType(),True),
StructField("AveBedrms",FloatType(),True),
StructField("Population",FloatType(),True),
StructField("AveOccup",FloatType(),True),
StructField("Latitude",FloatType(),True),
StructField("Longitude",FloatType(),True)
])

df = spark.createDataFrame(pdf, schema=schema)
df.show(truncate=12)

                                                                                

+------------+------------+------------+------------+------------+------------+-----------+----------+
|      MedInc|    HouseAge|    AveRooms|   AveBedrms|  Population|    AveOccup|   Latitude| Longitude|
+------------+------------+------------+------------+------------+------------+-----------+----------+
| -0.74101853|   1.6149706| -0.29571253| -0.17795317|  -0.7177242| -0.07576603|  1.0443388|-1.3597609|
|  -0.1635414|  -0.3686225|0.0032469928|  0.01157152| -0.43771398|-0.010152793|  1.0209215|-0.8200542|
| -0.52354825|  0.50415844| -0.38464627| -0.08415038| -0.24870707|-0.010233884|  1.0162381|-1.3247799|
|  -0.9261799|  0.26612726|  -0.8400165| -0.38692707| -0.24433191| 0.014403454| -0.7306918| 0.7241065|
|  -1.0118484|   0.5835022|  -0.6068254| -0.14670016|-0.008073272|  0.15044716| -1.3723255| 1.2238349|
|  -0.4511803|    1.218252|  -0.5674745| -0.04344595|   -0.383462| -0.19109002|  1.0583892|-1.3547635|
| -0.77940184| 0.107439816|  0.04742335| -0.18705839|  -0.6188456| 0.0098

In [16]:
def predict_batch_fn():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    loaded_model = torch.load('model.pth')
    
    def predict(inputs):
        torch_inputs = torch.from_numpy(inputs).to(device)
        outputs = model(torch_inputs) # .flatten()
        return outputs.detach().numpy()

    return predict

In [17]:
from pyspark.ml.functions import predict_batch_udf

classify = predict_batch_udf(predict_batch_fn,
                             return_type=FloatType(),
                             input_tensor_shapes=[[8]],
                             batch_size=50)

In [18]:
columns = df.columns
columns

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [19]:
import pyspark.sql.functions as F

preds = df.withColumn("preds", classify(F.struct(*columns)))
preds.show()

[Stage 3:>                                                          (0 + 1) / 1]

+------------+------------+------------+------------+------------+------------+-----------+----------+---------+
|      MedInc|    HouseAge|    AveRooms|   AveBedrms|  Population|    AveOccup|   Latitude| Longitude|    preds|
+------------+------------+------------+------------+------------+------------+-----------+----------+---------+
| -0.74101853|   1.6149706| -0.29571253| -0.17795317|  -0.7177242| -0.07576603|  1.0443388|-1.3597609|1.9241145|
|  -0.1635414|  -0.3686225|0.0032469928|  0.01157152| -0.43771398|-0.010152793|  1.0209215|-0.8200542|1.3536556|
| -0.52354825|  0.50415844| -0.38464627| -0.08415038| -0.24870707|-0.010233884|  1.0162381|-1.3247799|1.5152398|
|  -0.9261799|  0.26612726|  -0.8400165| -0.38692707| -0.24433191| 0.014403454| -0.7306918| 0.7241065|1.5755394|
|  -1.0118484|   0.5835022|  -0.6068254| -0.14670016|-0.008073272|  0.15044716| -1.3723255| 1.2238349|1.7656959|
|  -0.4511803|    1.218252|  -0.5674745| -0.04344595|   -0.383462| -0.19109002|  1.0583892|-1.35

Using cpu device
                                                                                

In [11]:
results.show()

AttributeError: 'list' object has no attribute 'show'

In [20]:
X_test

array([[-7.41018555e-01,  1.61497054e+00, -2.95712538e-01, ...,
        -7.57660305e-02,  1.04433881e+00, -1.35976084e+00],
       [-1.63541400e-01, -3.68622515e-01,  3.24699276e-03, ...,
        -1.01527922e-02,  1.02092152e+00, -8.20054198e-01],
       [-5.23548241e-01,  5.04158431e-01, -3.84646265e-01, ...,
        -1.02338842e-02,  1.01623806e+00, -1.32477985e+00],
       ...,
       [ 4.63734519e-01,  5.83502153e-01, -9.56254733e-02, ...,
        -1.15546226e-01,  8.00798993e-01, -1.19485048e+00],
       [-5.23548241e-01,  5.04158431e-01, -3.57973122e-01, ...,
        -9.67512299e-04, -8.19677481e-01,  6.84128202e-01],
       [ 1.13781687e-01, -6.85997405e-01,  6.87490896e-02, ...,
         2.28090968e-01, -1.37232553e+00,  1.24382398e+00]])

In [21]:
y_test

array([0.938, 1.308, 1.438, ..., 2.947, 1.783, 1.35 ])

In [25]:
predictions = [x['preds'] for x in preds.select('preds').collect()]

Using cpu device                                                  (0 + 10) / 10]
Using cpu device
Using cpu device
Using cpu device
Using cpu device
Using cpu device
Using cpu device
Using cpu device
Using cpu device
Using cpu device
                                                                                

In [26]:
from sklearn.metrics import mean_absolute_error

# Compute MAE
mae = mean_absolute_error(y_test, predictions)

print("Mean Absolute Error on test set:", mae)

Mean Absolute Error on test set: 0.600892072953799


In [28]:
type(y_test)

numpy.ndarray

In [29]:
torch.tensor(y_test)

tensor([0.9380, 1.3080, 1.4380,  ..., 2.9470, 1.7830, 1.3500],
       dtype=torch.float64)

In [30]:
torch.tensor(predictions)

tensor([1.9241, 1.3537, 1.5152,  ..., 2.5636, 1.6112, 1.9115])

In [31]:
criterion(torch.tensor(y_test), torch.tensor(predictions))

tensor(0.6946, dtype=torch.float64)

In [36]:
import torch
import torch.nn as nn

# Define the neural network model using nn.Sequential
input_size = 8  # Number of features in the California housing dataset
test_model = nn.Sequential(
    nn.Linear(input_size, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1)
)

# Load the trained model's state if available
# Uncomment the following lines if you have a saved model state
# model.load_state_dict(torch.load('model.pth'))
# model.eval()

# Create an arbitrary input tensor with explicit feature values
# Let's define explicit values for the 8 features:
# - MedInc: Median income in block group
# - HouseAge: Median house age in block group
# - AveRooms: Average number of rooms per household
# - AveBedrms: Average number of bedrooms per household
# - Population: Block group population
# - AveOccup: Average number of household members
# - Latitude: Latitude coordinate
# - Longitude: Longitude coordinate

# For demonstration, we'll use the following values (made-up for this example):
input_tensor = torch.tensor([
    5.0,      # MedInc
    30.0,     # HouseAge
    6.0,      # AveRooms
    1.0,      # AveBedrms
    1000.0,   # Population
    3.0,      # AveOccup
    34.05,    # Latitude (e.g., Los Angeles)
    -118.25   # Longitude (e.g., Los Angeles)
], dtype=torch.float32)


# Perform a forward pass through the model
output = test_model(input_tensor.unsqueeze(0))
output

tensor([[55.3222]], grad_fn=<AddmmBackward0>)