In [4]:
OPENAI_API_KEY = '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'

In [5]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from transformers import BertTokenizer, BertModel
import torch
import openai
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# URL of the JSON dataset
url = "https://data.cdc.gov/resource/hfr9-rurv.json?$limit=285000"

# Fetch the JSON data
response = requests.get(url)
data_json = response.json()

# Convert JSON data to DataFrame
data = pd.json_normalize(data_json)

In [6]:
# Drop rows with missing target values
data = data.dropna(subset=['data_value'])

In [7]:
# Keep a copy of the original 'question' column for later use
questions = data['question'].copy()

In [8]:
data = data.drop(columns=['rowid', 'geolocation.type', 'geolocation.coordinates', 'data_value_unit', 'datavaluetypeid', 'data_value_type', 'data_value_footnote_symbol', 'data_value_footnote', 'classid', 'topicid', 'questionid', 'stratificationcategoryid1', 'stratificationid1', 'stratificationcategoryid2', 'stratificationid2'])

In [9]:
data = data.drop(columns=['locationabbr', 'locationdesc'])

In [10]:
data.columns

Index(['yearstart', 'yearend', 'datasource', 'class', 'topic', 'question',
       'stratificationcategory1', 'stratification1', 'stratificationcategory2',
       'stratification2', 'locationid', ':@computed_region_skr5_azej',
       ':@computed_region_hjsp_umg2', 'data_value', 'data_value_alt',
       'low_confidence_limit', 'high_confidence_limit'],
      dtype='object')

In [11]:
# Encode categorical variables
data_encoded = pd.get_dummies(data, columns=['datasource', 'class', 'topic', 'question', 'stratificationcategory1', 'stratification1', 'stratificationcategory2', 'stratification2'])

# Normalize numerical columns
scaler = StandardScaler()
data_encoded[['yearstart', 'yearend', 'data_value']] = scaler.fit_transform(data_encoded[['yearstart', 'yearend', 'data_value']])


In [12]:
# Separate features and target variable
X = data_encoded.drop(columns=['data_value'])
y = data_encoded['data_value']

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Identify columns with non-numeric data
non_numeric_columns = X_train.select_dtypes(include=['object']).columns

In [14]:
# One-hot encode remaining non-numeric columns in the entire dataset
X_train_encoded = pd.get_dummies(X_train, columns=non_numeric_columns)
X_test_encoded = pd.get_dummies(X_test, columns=non_numeric_columns)

# Ensure both train and test sets have the same columns after one-hot encoding
X_train_blm, X_test_blm = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

In [15]:
# Initialize and train the baseline model
baseline_model = RandomForestRegressor(random_state=42)
baseline_model.fit(X_train_blm, y_train)

# Save the model
joblib.dump(baseline_model, 'baseline_model_no_geo.pkl')

['baseline_model_no_geo.pkl']

In [16]:
# Make predictions
y_pred = baseline_model.predict(X_test_blm)

In [17]:
# Evaluate the baseline model

# Mean Squared Error (MSE)
baseline_mse = mean_squared_error(y_test, y_pred)
print(f'Baseline Model MSE (no geolocation): {baseline_mse}')

# Mean Absolute Error (MAE)
baseline_mae = mean_absolute_error(y_test, y_pred)
print(f'Baseline Model MAE (no geolocation): {baseline_mae}')

# R-squared (Coefficient of Determination)
baseline_r2 = r2_score(y_test, y_pred)
print(f'Baseline Model R-squared (no geolocation): {baseline_r2}')

# Explained Variance Score
baseline_evs = explained_variance_score(y_test, y_pred)
print(f'Baseline Model EVS (no geolocation): {baseline_evs}')

Baseline Model MSE (no geolocation): 0.023929710661168356
Baseline Model MAE (no geolocation): 0.08680289587763589
Baseline Model R-squared (no geolocation): 0.9761880389063137
Baseline Model EVS (no geolocation): 0.9761880393042391


# BERT

In [18]:
# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')




In [19]:
# Tokenize the text data
text_data = data['question'].values
inputs = tokenizer(text_data.tolist(), return_tensors='pt', padding=True, truncation=True, max_length=128)


In [20]:
# Function to generate BERT embeddings in batches
def generate_bert_embeddings_in_batches(model, tokenizer, texts, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts.tolist(), return_tensors='pt', padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # [CLS] token representation
            embeddings.append(batch_embeddings)
    return np.vstack(embeddings)


In [21]:
# Generate BERT embeddings in batches
bert_embeddings = generate_bert_embeddings_in_batches(bert_model, tokenizer, text_data, batch_size=32)

np.save('bert_embeddings_no_geo.npy', bert_embeddings)

In [22]:
# Add BERT embeddings to the numerical and categorical features
X_combined = np.hstack((X.values, bert_embeddings))

In [23]:
# Split the combined data into training and testing sets
X_train_combined, X_test_combined, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)


In [24]:
# Initialize and train the model
model_with_bert = RandomForestRegressor(random_state=42)
model_with_bert.fit(X_train_combined, y_train)

# Save the model
joblib.dump(model_with_bert, 'model_with_bert_no_geo.pkl')

In [None]:
# Make predictions
y_pred_with_bert = model_with_bert.predict(X_test_combined)

In [None]:
# Evaluate the model

# Mean Squared Error (MSE)
bert_mse = mean_squared_error(y_test, y_pred_with_bert)
print(f'BERT Model MSE (no geolocation): {bert_mse}')

# Mean Absolute Error (MAE)
bert_mae = mean_absolute_error(y_test, y_pred_with_bert)
print(f'BERT Model MAE (no geolocation): {bert_mae}')

# R-squared (Coefficient of Determination)
bert_r2 = r2_score(y_test, y_pred_with_bert)
print(f'BERT Model R-squared (no geolocation): {bert_r2}')

# Explained Variance Score
bert_evs = explained_variance_score(y_test, y_pred_with_bert)
print(f'BERT Model EVS (no geolocation): {bert_evs}')

# GPT-4

In [14]:
openai.api_key = OPENAI_API_KEY

In [15]:
# Function to generate embeddings using GPT-4
def get_gpt4_embeddings(text):
    response = openai.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )
    return response.data[0].embedding


In [16]:
# Apply GPT-4 embeddings to relevant text columns
data['gpt4_embedding'] = data.apply(
    lambda row: get_gpt4_embeddings(
        f"{row['question']} {row['class']} {row['topic']} {row['stratification1']} {row['stratification2']}"
    ), axis=1
)

In [17]:
# Convert the list of embeddings to a numpy array for model input
gpt4_embeddings = np.array(data['gpt4_embedding'].tolist())

np.save('gpt4_embeddings_no_geo.npy', gpt4_embeddings)

In [18]:
# # Add GPT-4 embeddings to the numerical and categorical features
X_gpt4 = np.hstack((X.values, gpt4_embeddings))


In [19]:
# Re-split the data into training and testing sets
X_train_gpt4, X_test_gpt4, y_train, y_test = train_test_split(X_gpt4, y, test_size=0.2, random_state=42)


In [20]:
# Initialize and train the RandomForestRegressor with GPT-4 embeddings
model_with_gpt4 = RandomForestRegressor(random_state=42)
model_with_gpt4.fit(X_train_gpt4, y_train)

# Save the model
joblib.dump(model_with_gpt4, 'model_with_gpt4_no_geo.pkl')

['model_with_gpt4_no_geo.pkl']

In [21]:
# Make predictions
y_pred_with_gpt4 = model_with_gpt4.predict(X_test_gpt4)


In [22]:
# Evaluate the model
# Mean Squared Error (MSE)
gpt4_mse = mean_squared_error(y_test, y_pred_with_gpt4)
print(f'GPT-4 Model MSE (no geolocation): {gpt4_mse}')

# Mean Absolute Error (MAE)
gpt4_mae = mean_absolute_error(y_test, y_pred_with_gpt4)
print(f'GPT-4 Model MAE (no geolocation): {gpt4_mae}')

# R-squared (Coefficient of Determination)
gpt4_r2 = r2_score(y_test, y_pred_with_gpt4)
print(f'GPT-4 Model R-squared (no geolocation): {gpt4_r2}')

# Explained Variance Score
gpt4_evs = explained_variance_score(y_test, y_pred_with_gpt4)
print(f'GPT-4 Model EVS (no geolocation): {gpt4_evs}')

GPT-4 Model MSE (no geolocation): 1.1409982786002147e-09
GPT-4 Model MAE (no geolocation): 6.387050115009729e-07
GPT-4 Model R-squared (no geolocation): 0.999999998865823
GPT-4 Model EVS (no geolocation): 0.999999998865836
