In [7]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Function to ensure data is numeric
def ensure_numeric(X):
    """Coerce every column of ``X`` to a numeric dtype, in place.

    Columns whose dtype equals ``'bool'`` are cast with ``astype(int)``.
    Every other column is passed through ``pd.to_numeric`` with
    ``errors='coerce'``, so entries that cannot be parsed become NaN.

    The frame passed in is modified and the same object is returned.
    """
    for name in X.columns:
        series = X[name]
        is_bool = series.dtype == 'bool'
        X[name] = series.astype(int) if is_bool else pd.to_numeric(series, errors='coerce')
    return X

# Function to calculate VIF for a given set of variables
def calculate_vif(data, variables):
    """Compute the variance inflation factor (VIF) of each selected column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains the columns named in ``variables``.
    variables : list of str
        Column labels to include in the design matrix.

    Returns
    -------
    pandas.DataFrame
        One row per design-matrix column, with the label in ``'Variable'``
        and the factor in ``'VIF'``. The design matrix is the selected
        columns after ``sm.add_constant`` has been applied to them.

    Notes
    -----
    ``data`` itself is not modified; all cleaning is done on a copy.
    Missing and infinite entries are imputed with the column median.
    """
    X = ensure_numeric(data[variables].copy())
    X = sm.add_constant(X)

    # Turn +/-inf into NaN *before* taking medians. Filling NaN first (as the
    # previous version did) let infinities take part in the median and then
    # computed a second median over already-imputed values, so the fill value
    # was not the median of the genuinely observed finite data.
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())

    # Extract the ndarray once instead of once per column.
    design = X.values
    return pd.DataFrame({
        'Variable': X.columns,
        'VIF': [variance_inflation_factor(design, i) for i in range(design.shape[1])],
    })
    


# Predictor groups: one list of column names per thematic category.

sociodemographic_variables = [
    'minorpct',
    'femalepct',
    'rpl_theme1',
    'rpl_theme2',
    'rpl_theme3',
    'rpl_theme4',
    'mediahhinc',
    'TractSNAP_per_capita',
    'TractHUNV_per_capita',
    # 'rpl_themes' is currently commented out of this group
]

human_behavior_variables = [
    'BINGE_CrudePrev',
    'SLEEP_CrudePrev',
    'CHOLSCREEN_CrudePrev',
    'DEPRESSION_CrudePrev',
    'COREM_CrudePrev',
    'LPA_CrudePrev',
    'workers_public_transport_45_min',
    'workers_walking_15_min_to_work',
    'Number of drivers_per_capita',
]

environment_variables = [
    'environmental_justice_index',
    'distance_to_park_within_1_mile',
    'grocery_per_capita',
    'convenienc_per_capita',
    'fvmarket_per_capita',
    'warehousec_per_capita',
    'fastfood_per_capita',
    'INTDENSITY',
    'Number of Marta Stops_per_capita(%)',
    'med_dist',
]

# Read the dataset from disk
data = pd.read_csv("D:\\project-Geo\\data\\Database_one_standardized_final.csv")

# One VIF table per category: announce, compute, then print
print("\nCalculating VIF for Sociodemographic Variables")
vif_sociodemographic = calculate_vif(data, sociodemographic_variables)
print(vif_sociodemographic)

print("\nCalculating VIF for Human Behavior Variables")
vif_human_behavior = calculate_vif(data, human_behavior_variables)
print(vif_human_behavior)

print("\nCalculating VIF for Environment Variables")
vif_built_environment = calculate_vif(data, environment_variables)
print(vif_built_environment)





Calculating VIF for Sociodemographic Variables
               Variable         VIF
0                 const  136.171255
1              minorpct    3.015293
2             femalepct    1.259725
3            rpl_theme1    4.075219
4            rpl_theme2    1.912702
5            rpl_theme3    2.407605
6            rpl_theme4    2.350843
7            mediahhinc    3.104207
8  TractSNAP_per_capita    5.204663
9  TractHUNV_per_capita    3.200476

Calculating VIF for Human Behavior Variables
                          Variable          VIF
0                            const  3978.023556
1                  BINGE_CrudePrev     4.829334
2                  SLEEP_CrudePrev     5.308950
3             CHOLSCREEN_CrudePrev     4.096629
4             DEPRESSION_CrudePrev     1.886533
5                  COREM_CrudePrev     6.467711
6                    LPA_CrudePrev    11.769741
7  workers_public_transport_45_min     1.407863
8   workers_walking_15_min_to_work     1.400317
9     Number of drivers_per_ca

In [6]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Function to ensure data is numeric
def ensure_numeric(X):
    """Coerce every column of ``X`` to a numeric dtype, in place.

    Columns whose dtype equals ``'bool'`` are cast with ``astype(int)``.
    Every other column is passed through ``pd.to_numeric`` with
    ``errors='coerce'``, so entries that cannot be parsed become NaN.

    The frame passed in is modified and the same object is returned.
    """
    for name in X.columns:
        series = X[name]
        is_bool = series.dtype == 'bool'
        X[name] = series.astype(int) if is_bool else pd.to_numeric(series, errors='coerce')
    return X

# Function to calculate VIF for a given set of variables
def calculate_vif(data, variables):
    """Compute the variance inflation factor (VIF) of each selected column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains the columns named in ``variables``.
    variables : list of str
        Column labels to include in the design matrix.

    Returns
    -------
    pandas.DataFrame
        One row per design-matrix column, with the label in ``'Variable'``
        and the factor in ``'VIF'``. The design matrix is the selected
        columns after ``sm.add_constant`` has been applied to them.

    Notes
    -----
    ``data`` itself is not modified; all cleaning is done on a copy.
    Missing and infinite entries are imputed with the column median.
    """
    X = ensure_numeric(data[variables].copy())
    X = sm.add_constant(X)

    # Turn +/-inf into NaN *before* taking medians. Filling NaN first (as the
    # previous version did) let infinities take part in the median and then
    # computed a second median over already-imputed values, so the fill value
    # was not the median of the genuinely observed finite data.
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())

    # Extract the ndarray once instead of once per column.
    design = X.values
    return pd.DataFrame({
        'Variable': X.columns,
        'VIF': [variance_inflation_factor(design, i) for i in range(design.shape[1])],
    })

# Predictor groups: one list of column names per thematic category.
sociodemographic_variables = [
    'minorpct',
    'femalepct',
    'rpl_theme1',
    'rpl_theme2',
    'rpl_theme3',
    'rpl_theme4',
    'mediahhinc',
    'TractSNAP_per_capita',
    'TractHUNV_per_capita',
]

human_behavior_variables = [
    'BINGE_CrudePrev',
    'SLEEP_CrudePrev',
    'CHOLSCREEN_CrudePrev',
    'DEPRESSION_CrudePrev',
    'COREM_CrudePrev',
    'LPA_CrudePrev',
    'workers_public_transport_45_min',
    'workers_walking_15_min_to_work',
    'Number of drivers_per_capita',
]

environment_variables = [
    'environmental_justice_index',
    'distance_to_park_within_1_mile',
    'grocery_per_capita',
    'convenienc_per_capita',
    'fvmarket_per_capita',
    'warehousec_per_capita',
    'fastfood_per_capita',
    'INTDENSITY',
    'Number of Marta Stops_per_capita(%)',
    'med_dist',
]

# Single flat list holding the three groups in the order above
all_variables = [
    *sociodemographic_variables,
    *human_behavior_variables,
    *environment_variables,
]

# Read the dataset from disk
data = pd.read_csv("D:\\project-Geo\\data\\Database_one_standardized_final.csv")

# VIF with every predictor in one design matrix: announce, compute, print
print("\nCalculating VIF for All Variables")
all_vif = calculate_vif(data, all_variables)
print(all_vif)



Calculating VIF for All Variables
                               Variable          VIF
0                                 const  7229.259249
1                              minorpct     9.200921
2                             femalepct     1.978818
3                            rpl_theme1     5.706344
4                            rpl_theme2     2.629354
5                            rpl_theme3     4.241102
6                            rpl_theme4     3.319255
7                            mediahhinc     4.954895
8                  TractSNAP_per_capita     8.250195
9                  TractHUNV_per_capita     6.104014
10                      BINGE_CrudePrev     6.559023
11                      SLEEP_CrudePrev    12.656646
12                 CHOLSCREEN_CrudePrev     9.335879
13                 DEPRESSION_CrudePrev     7.238166
14                      COREM_CrudePrev    10.743433
15                        LPA_CrudePrev    18.897656
16      workers_public_transport_45_min     1.696558
17       wo