In [None]:
import pandas as pd
import numpy as np

In [None]:
### Make a State-Level Mean Imputation Function
def impute_mean(df, col_to_impute):
  """Fill missing values in one column using state-level means.

  A missing entry is replaced by the mean of the same column over all rows
  that share its ``StateName``. Entries that are still missing afterwards
  (the state has no observed value, or ``StateName`` itself is missing) are
  replaced by the mean of the column after the state-level fill.

  If the observed values have a minimum of 0 and a maximum of 1, the column
  is treated as binary: state means are rounded to a whole number and states
  without any observed value are filled with 0.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``StateName`` column and the column to impute. The
      frame is not modified.
  col_to_impute : str
      Label of the column to impute; passed through ``str()`` before lookup.

  Returns
  -------
  pandas.Series
      The imputed column, named ``col_to_impute``. Numeric columns come back
      as float64. The Series carries the same index as ``df`` so that
      ``df[col] = impute_mean(df, col)`` aligns row by row even when ``df``
      does not have a default RangeIndex.
  """
  col_to_impute = str(col_to_impute)
  values = df[col_to_impute]

  # Test to see if column is a binary variable (max/min skip missing values)
  is_binary = values.max() == 1 and values.min() == 0

  # One mean per state; rows whose StateName is missing form no group.
  state_means = values.groupby(df['StateName']).mean()

  # If the column is binary, round to 1 or 0, and fill with 0 if null.
  # Rounding is half-to-even, so a state mean of exactly 0.5 becomes 0.
  # The result is reassigned instead of using fillna(inplace=True) on a
  # column selection, which silently does nothing under copy-on-write.
  if is_binary:
    state_means = state_means.round().fillna(0)

  # Broadcast each row's state mean back onto the rows of df. Mapping (rather
  # than merging) keeps the original row order and index intact.
  state_fill = df['StateName'].map(state_means)

  # Fill NAs with State-Level Means
  state_filled = np.where(values.isnull(), state_fill, values)
  imputed = pd.Series(state_filled, index=df.index, name=col_to_impute)

  # If any NAs are leftover, fill them with the mean of the column
  return imputed.fillna(imputed.mean())

In [None]:
# Load the five cleaned input tables. Every file is read with its first
# column as the frame's index (index_col=0). The order of the paths below
# matches the order of the names on the left-hand side.
honey_df, aphis_df, temp_df, urb_df, aq_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../04 - Data/Final Data/honey_neonic_cleaned.csv',
        '../04 - Data/Final Data/aphis_clean.csv',
        '../04 - Data/Final Data/temperature.csv',
        '../04 - Data/Final Data/urb_by_state-year.csv',
        '../04 - Data/Final Data/aq_features.csv',
    )
)

In [None]:
# Build the combined table in a single, re-runnable step.
#
# Starting from the honey data, each auxiliary table is left-joined on state
# and year (how='left' keeps every row of the left-hand frame), and the
# columns listed in each drop() are removed right after the join that
# precedes it. Always starting from `honey_df`, rather than merging into an
# already-merged `merged_df`, means re-running this cell cannot pile
# duplicate, suffixed columns onto a frame that was merged before.
df = (
    honey_df
    # Temperature: joined on the right-hand keys 'state'/'year'.
    .merge(right=temp_df, left_on=['StateName', 'year'], right_on=['state', 'year'], how='left')
    .drop(columns=['state', 'code'])
    # APHIS: same key names as the temperature table.
    .merge(right=aphis_df, left_on=['StateName', 'year'], right_on=['state', 'year'], how='left')
    .drop(columns=['state', 'code'])
    # Urbanisation by state-year: only the duplicate state key is dropped.
    .merge(right=urb_df, left_on=['StateName', 'year'], right_on=['state', 'year'], how='left')
    .drop(columns=['state'])
    # Air-quality features: keys are capitalised ('State'/'Year'), so both
    # right-hand key columns survive the join and are dropped afterwards.
    .merge(right=aq_df, left_on=['StateName', 'year'], right_on=['State', 'Year'], how='left')
    .drop(columns=['State', 'Year'])
)
# `df` is kept as a separate object from `merged_df`, as before.
merged_df = df.copy()

In [None]:
# Columns queued for imputation: dtype float64 or int64, and fewer than 825
# non-null values.
# NOTE(review): 825 is presumably the row count of merged_df, which would
# make this "numeric columns with at least one missing value" - confirm, and
# consider len(merged_df) instead of the hard-coded number.
cols_to_impute = [
    name
    for name in merged_df.columns
    if merged_df[str(name)].dtype in ('float64', 'int64')
    and merged_df[str(name)].count() < 825
]

In [None]:
# Impute on a copy so that merged_df keeps its original missing values.
imputed_df = merged_df.copy()
# Replace each selected column with the Series returned by impute_mean.
for col in cols_to_impute:
  imputed_df[col] = impute_mean(imputed_df, col)

In [None]:
# Dummy-encode the imputed data. With no `columns` argument, pd.get_dummies
# converts the object/string/category columns into indicator columns and
# passes the remaining columns through unchanged.
final_data_dummies = pd.get_dummies(imputed_df)

In [None]:
# Same encoding, but drop_first=True removes the first level of each encoded
# column, leaving k-1 indicator columns for a variable with k levels.
final_data_dummies_minus1_col = pd.get_dummies(imputed_df, drop_first=True)

In [None]:
# Write the fully dummy-encoded table to CSV; index=False omits the row index.
final_data_dummies.to_csv('../04 - Data/Final Data/final_data_dummies.csv', index=False)

In [None]:
# Write the drop_first dummy-encoded table to CSV; index=False omits the row index.
final_data_dummies_minus1_col.to_csv(
    '../04 - Data/Final Data/final_data_dummies_minus1_col.csv', index=False)

In [None]:
# Write the imputed table (before dummy encoding) to CSV; index=False omits the row index.
imputed_df.to_csv('../04 - Data/Final Data/final_data.csv', index=False)