In [2]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from transformers import BertTokenizer, BertModel
import torch
import openai


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# URL of the JSON dataset
url = "https://data.cdc.gov/resource/hfr9-rurv.json"

# Fetch the JSON data
response = requests.get(url)
data_json = response.json()

# Convert JSON data to DataFrame
data = pd.json_normalize(data_json)

In [None]:
data.head()

In [201]:
data.describe

<bound method NDFrame.describe of     yearstart yearend locationabbr  locationdesc datasource           class  \
3        2022    2022           MD      Maryland      BRFSS   Mental Health   
4        2022    2022           WI     Wisconsin      BRFSS   Mental Health   
6        2022    2022           OK      Oklahoma      BRFSS   Mental Health   
7        2022    2022           PA  Pennsylvania      BRFSS   Mental Health   
8        2022    2022           AZ       Arizona      BRFSS  Overall Health   
..        ...     ...          ...           ...        ...             ...   
994      2022    2022           TN     Tennessee      BRFSS  Overall Health   
995      2022    2022           CO      Colorado      BRFSS      Caregiving   
997      2022    2022           OK      Oklahoma      BRFSS  Overall Health   
998      2022    2022           OK      Oklahoma      BRFSS      Caregiving   
999      2022    2022           OR        Oregon      BRFSS      Caregiving   

                 

In [202]:
data.columns

Index(['yearstart', 'yearend', 'locationabbr', 'locationdesc', 'datasource',
       'class', 'topic', 'question', 'stratificationcategory1',
       'stratification1', 'stratificationcategory2', 'stratification2',
       'locationid', ':@computed_region_skr5_azej',
       ':@computed_region_hjsp_umg2', 'data_value', 'data_value_alt',
       'low_confidence_limit', 'high_confidence_limit', 'latitude',
       'longitude', 'gpt4_embedding'],
      dtype='object')

In [203]:
data.isnull().sum()

yearstart                        0
yearend                          0
locationabbr                     0
locationdesc                     0
datasource                       0
class                            0
topic                            0
question                         0
stratificationcategory1          0
stratification1                  0
stratificationcategory2        134
stratification2                134
locationid                       0
:@computed_region_skr5_azej    155
:@computed_region_hjsp_umg2    166
data_value                       0
data_value_alt                   0
low_confidence_limit             0
high_confidence_limit            0
latitude                       134
longitude                      134
gpt4_embedding                   0
dtype: int64

In [4]:
# Drop rows with missing target values
data = data.dropna(subset=['data_value'])

In [5]:
# Keep a copy of the original 'question' column for later use
questions = data['question'].copy()

In [6]:
data = data.drop(columns=['rowid', 'geolocation.type', 'geolocation.coordinates', 'data_value_unit', 'datavaluetypeid', 'data_value_type', 'data_value_footnote_symbol', 'data_value_footnote', 'classid', 'topicid', 'questionid', 'stratificationcategoryid1', 'stratificationid1', 'stratificationcategoryid2', 'stratificationid2'])

In [7]:
data = data.drop(columns=['locationabbr', 'locationdesc'])

In [8]:
# Encode categorical variables
data_encoded = pd.get_dummies(data, columns=['datasource', 'class', 'topic', 'question', 'stratificationcategory1', 'stratification1', 'stratificationcategory2', 'stratification2'])

# Normalize numerical columns
scaler = StandardScaler()
data_encoded[['yearstart', 'yearend', 'data_value']] = scaler.fit_transform(data_encoded[['yearstart', 'yearend', 'data_value']])


In [9]:
# Separate features and target variable
X = data_encoded.drop(columns=['data_value'])
y = data_encoded['data_value']

# base line model

In [167]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [168]:
# Identify columns with non-numeric data
non_numeric_columns = X_train.select_dtypes(include=['object']).columns

In [169]:
# One-hot encode remaining non-numeric columns in the entire dataset
X_train_encoded = pd.get_dummies(X_train, columns=non_numeric_columns)
X_test_encoded = pd.get_dummies(X_test, columns=non_numeric_columns)

# Ensure both train and test sets have the same columns after one-hot encoding
X_train_blm, X_test_blm = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

In [170]:
# Initialize and train the baseline model
baseline_model = RandomForestRegressor(random_state=42)
baseline_model.fit(X_train_blm, y_train)

In [None]:
# Make predictions
y_pred = baseline_model.predict(X_test_blm)

In [198]:
# Evaluate the baseline model

# Mean Squared Error (MSE)
baseline_mse = mean_squared_error(y_test, y_pred)
print(f'Baseline Model MSE: {baseline_mse}')

# Mean Absolute Error (MAE)
baseline_mae = mean_absolute_error(y_test, y_pred)
print(f'Baseline Model MAE: {baseline_mae}')

# R-squared (Coefficient of Determination)
baseline_r2 = r2_score(y_test, y_pred)
print(f'Baseline Model R-squared: {baseline_r2}')

# Explained Variance Score
baseline_evs = explained_variance_score(y_test, y_pred)
print(f'Baseline Model EVS: {baseline_evs}')

Baseline Model MSE: 0.07170820056482627
Baseline Model MAE: 0.19462481631024336
Baseline Model R-squared: 0.9263512660930154
Baseline Model EVS: 0.9265357304549517


# BERT

In [None]:
# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')


In [173]:
# Tokenize the text data
text_data = data['question'].values
inputs = tokenizer(text_data.tolist(), return_tensors='pt', padding=True, truncation=True, max_length=128)


In [174]:
# Generate BERT embeddings
with torch.no_grad():
    outputs = bert_model(**inputs)
    bert_embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # [CLS] token representation


In [175]:
# Add BERT embeddings to the numerical and categorical features
X_combined = np.hstack((X.values, bert_embeddings))

In [176]:
# Split the combined data into training and testing sets
X_train_combined, X_test_combined, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)


In [177]:
# Initialize and train the model
model_with_bert = RandomForestRegressor(random_state=42)
model_with_bert.fit(X_train_combined, y_train)


In [None]:
# Make predictions
y_pred_with_bert = model_with_bert.predict(X_test_combined)

In [199]:
# Evaluate the model

# Mean Squared Error (MSE)
bert_mse = mean_squared_error(y_test, y_pred_with_bert)
print(f'BERT Model MSE: {bert_mse}')

# Mean Absolute Error (MAE)
bert_mae = mean_absolute_error(y_test, y_pred_with_bert)
print(f'BERT Model MAE: {bert_mae}')

# R-squared (Coefficient of Determination)
bert_r2 = r2_score(y_test, y_pred_with_bert)
print(f'BERT Model R-squared: {bert_r2}')

# Explained Variance Score
bert_evs = explained_variance_score(y_test, y_pred_with_bert)
print(f'BERT Model EVS: {bert_evs}')

BERT Model MSE: 8.08023381572357e-05
BERT Model MAE: 0.005608467688321193
BERT Model R-squared: 0.9999170110272587
BERT Model EVS: 0.9999178385093092


# GPT4

In [10]:
openai.api_key = OPENAI_API_KEY

In [11]:
# Function to generate embeddings using GPT-4
def get_gpt4_embeddings(text):
    response = openai.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )
    return response.data[0].embedding


In [12]:
# Apply GPT-4 embeddings to relevant text columns
data['gpt4_embedding'] = data.apply(
    lambda row: get_gpt4_embeddings(
        f"{row['question']} {row['class']} {row['topic']} {row['stratification1']} {row['stratification2']}"
    ), axis=1
)

In [13]:
# Convert the list of embeddings to a numpy array for model input
gpt4_embeddings = np.array(data['gpt4_embedding'].tolist())

In [14]:
# # Add GPT-4 embeddings to the numerical and categorical features
X_gpt4 = np.hstack((X.values, gpt4_embeddings))


In [15]:
# Re-split the data into training and testing sets
X_train_gpt4, X_test_gpt4, y_train, y_test = train_test_split(X_gpt4, y, test_size=0.2, random_state=42)


In [16]:
# Initialize and train the RandomForestRegressor with GPT-4 embeddings
model_with_gpt4 = RandomForestRegressor(random_state=42)
model_with_gpt4.fit(X_train_gpt4, y_train)


In [17]:
# Make predictions
y_pred_with_gpt4 = model_with_gpt4.predict(X_test_gpt4)


In [18]:
# Evaluate the model
# Mean Squared Error (MSE)
gpt4_mse = mean_squared_error(y_test, y_pred_with_gpt4)
print(f'GPT-4 Model MSE: {gpt4_mse}')

# Mean Absolute Error (MAE)
gpt4_mae = mean_absolute_error(y_test, y_pred_with_gpt4)
print(f'GPT-4 Model MAE: {gpt4_mae}')

# R-squared (Coefficient of Determination)
gpt4_r2 = r2_score(y_test, y_pred_with_gpt4)
print(f'GPT-4 Model R-squared: {gpt4_r2}')

# Explained Variance Score
gpt4_evs = explained_variance_score(y_test, y_pred_with_gpt4)
print(f'GPT-4 Model EVS: {gpt4_evs}')

GPT-4 Model MSE: 0.00018665442775470692
GPT-4 Model MAE: 0.008291597267866217
GPT-4 Model R-squared: 0.9998082944185743
GPT-4 Model EVS: 0.999809566158777
