In [3]:
import pandas as pd
import geopandas as gpd
import numpy as np
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [2]:
# Locations of the two cleaned input tables.
population_csv = '../artifacts/data/cleaned/population_groups.csv'
newsfacts_csv = '../artifacts/data/cleaned/pew_newsfacts.csv'

df_population = pd.read_csv(population_csv)
# Malformed lines in the news-facts file are dropped instead of raising.
df_newsfacts = pd.read_csv(newsfacts_csv, on_bad_lines='skip')

In [4]:
# Convert each education-level percentage into a head count of the 2022
# population estimate, rounded up to a whole person.
total_pop = df_population['POP_ESTIMATE_2022']
education_share = {
    # Two percentage columns are combined into this single group.
    'Pop_High_Less': (df_population['Percent_High_Less'] + df_population['Percent_High']) / 100,
    'Pop_some_Collage': df_population['Percent_Some_collage'] / 100,
    'Pop_Collage_Plus': df_population['Percent_Collage_Plus'] / 100,
}

for count_col, share in education_share.items():
    # Binary floats can land a hair above an exact product
    # (100 * 0.07 == 7.000000000000001), which a bare ceil would turn into one
    # person too many. Rounding to 6 decimals first removes that noise while
    # genuine fractions are still rounded up.
    df_population[count_col] = np.ceil((total_pop * share).round(6))

In [5]:
def get_percentage(df, category, column):
    """Return the share stored in ``column`` for the row whose 'Category' equals ``category``.

    The cell may hold a percentage string such as ``'45%'`` (surrounding
    whitespace is tolerated) or a plain number such as ``45``; either way the
    value is read as percentage points and returned as a fraction (``0.45``).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Category' column and one column per measure.
    category : str
        Value to match against ``df['Category']``; the first match is used.
    column : str
        Name of the column to read the percentage from.

    Returns
    -------
    float
        The percentage divided by 100.

    Raises
    ------
    IndexError
        If no row has the requested category.
    ValueError
        If the cell is missing or cannot be parsed as a number.
    """
    matches = df.loc[df['Category'] == category, column]
    if matches.empty:
        # Same exception type a bare ``.iloc[0]`` raises on an empty selection,
        # but with a message that names what was being looked up.
        raise IndexError(f"no row with Category == {category!r} in column {column!r}")

    raw = matches.iloc[0]
    if isinstance(raw, str):
        # Drop whitespace first so a trailing space cannot shield the '%' sign.
        raw = raw.strip().strip('%').strip()
    elif pd.isna(raw):
        raise ValueError(f"missing value for Category {category!r}, column {column!r}")
    return float(raw) / 100

# Education levels as labelled in df_newsfacts['Category'], paired with the
# df_population column that holds the head count of that group.
categories = [
    ('High school or less', 'Pop_High_Less'),
    ('Some college', 'Pop_some_Collage'),
    ('College+', 'Pop_Collage_Plus')
]

# News platforms; each name is read as a column of df_newsfacts.
media_types = ['Television', 'Radio', 'Print', 'Digital devices']

# For every (education level, platform) pair, scale the group's head count by
# the platform share and store it as '<platform>_<population column>'.
for category, pop_col in categories:
    for media in media_types:
        percentage = get_percentage(df_newsfacts, category, media)

        # Binary floats can land a hair above an exact product
        # (100 * 0.07 == 7.000000000000001), which a bare ceil would turn into
        # 8. Rounding to 6 decimals first removes that noise while genuine
        # fractions are still rounded up to the next whole person.
        estimate = (df_population[pop_col] * percentage).round(6)

        new_col_name = f"{media}_{pop_col}"
        df_population[new_col_name] = np.ceil(estimate).astype(int)

In [6]:
# Per platform: total the three education-group audiences, then express that
# total as a percentage of the 2022 population estimate, to two decimals.
population_2022 = df_population['POP_ESTIMATE_2022']

for media in media_types:
    high_less = df_population[f'{media}_Pop_High_Less']
    some_college = df_population[f'{media}_Pop_some_Collage']
    college_plus = df_population[f'{media}_Pop_Collage_Plus']
    df_population[f'{media}_Demand_Pop'] = high_less.add(some_college).add(college_plus)

    # Divide first, then scale to percent, then round.
    demand_share = df_population[f'{media}_Demand_Pop'].div(population_2022).mul(100)
    df_population[f'{media}_Demand_Percent'] = demand_share.round(2)

In [7]:
# Persist the enriched population table; the row index is not written.
media_demand_csv = '../artifacts/data/cleaned/media_demand.csv'
df_population.to_csv(media_demand_csv, index=False)