In [1]:
import pandas as pd
import geopandas as gpd
import warnings

# Suppress all warnings: the 'ignore' action is installed with no category,
# message or module restriction, so it applies to every warning raised from here on.
# NOTE(review): this also hides pandas/geopandas deprecation and data-quality
# warnings; consider narrowing it to specific categories once the noisy ones are known.
warnings.filterwarnings('ignore')

In [2]:
# Shared reader option for the media and Pew CSVs: rows that cannot be parsed
# are skipped instead of raising an error.
lenient_read = {'on_bad_lines': 'skip'}

# Media outlet tables.
df_tv = pd.read_csv('../artifacts/data/cleaned/television.csv', **lenient_read)
df_radio = pd.read_csv('../artifacts/data/cleaned/radio.csv', **lenient_read)
df_newspaper = pd.read_csv('../artifacts/data/cleaned/newspaper.csv', **lenient_read)

# Population table is read with the default (strict) bad-line handling.
df_population = pd.read_csv('../artifacts/data/cleaned/population_groups.csv')

# Pew news facts, read with the same lenient option as the media tables.
df_newsfacts = pd.read_csv('../artifacts/data/cleaned/pew_newsfacts.csv', **lenient_read)

# County table (CSV) and county shapefile (read through geopandas).
df_county = pd.read_csv('../artifacts/shapefiles/county.csv')
df_shapefile_county = gpd.read_file('../artifacts/shapefiles/cb_2018_us_county_500k.shp')

In [3]:
def _sales_volume_to_number(s):
    """
    Convert a single raw 'sales_volume' value to a float.

    Strings are stripped of '$', ',' and '_' and then parsed:
      * "under ..." / "over ..." -> the second space-separated token,
      * "A to B" ranges          -> the lower bound A,
      * anything else            -> float(s).
    Values that are already numeric are passed through unchanged, so the
    conversion can safely be applied to a column that was converted before.

    Returns:
        float | None: the parsed number, or None when the value is missing
        or cannot be interpreted.
    """
    if not isinstance(s, str):
        # Booleans are technically numbers in Python but are never a sales volume.
        if isinstance(s, bool):
            return None
        try:
            value = float(s)
        except (TypeError, ValueError):
            return None  # None, pd.NA, arbitrary objects, ...
        # NaN is the only float that is not equal to itself; treat it as missing.
        return None if value != value else value

    try:
        s = s.replace('$', '').replace(',', '').replace('_', '').strip()
        lowered = s.lower()
        # "Under $500,000" / "Over $1,000,000": use the stated bound directly.
        if 'under' in lowered or 'over' in lowered:
            return float(s.split(' ')[1])
        # Ranges such as "500000 to 999999": use the lower bound.
        if 'to' in s:
            return float(s.split(' ')[0].split('to')[0])
        return float(s)
    except (ValueError, IndexError):
        # ValueError: token is not a number; IndexError: "under"/"over" with no bound.
        return None


def process_media_sales_volume(df_media, threshold=1000000):
    """
    Converts 'sales_volume' to numeric, removes entries with missing 'sales_volume',
    and keeps only entries whose 'sales_volume' is strictly below `threshold`.

    Side effect: the 'sales_volume' column of `df_media` itself is overwritten
    with the parsed numeric values (NaN where parsing failed). Already-numeric
    values are preserved, so calling the function again on the same frame
    yields the same result. Pass a copy if the raw strings must be kept.

    Parameters:
        df_media (pd.DataFrame): DataFrame containing media data with a 'sales_volume' column.
        threshold (float): Exclusive upper bound on 'sales_volume'. Defaults to 1,000,000.

    Returns:
        pd.DataFrame: New DataFrame with only the rows that have a parsable
        'sales_volume' below `threshold`.
    """
    print("Initial count:", len(df_media))

    # Convert 'sales_volume' to numeric; to_numeric guarantees a float column
    # (None -> NaN) even when every value fails to parse.
    df_media['sales_volume'] = pd.to_numeric(
        df_media['sales_volume'].apply(_sales_volume_to_number), errors='coerce'
    )

    # Keep rows that have a sales volume and whose volume is below the threshold.
    has_volume = df_media['sales_volume'].notna()
    below_threshold = df_media['sales_volume'] < threshold
    df_media_filtered = df_media[has_volume & below_threshold]

    print("Filtered count:", len(df_media_filtered))
    return df_media_filtered





In [4]:
# Run the sales-volume clean-up on each media table in turn
# (television, then newspaper, then radio) and keep one filtered frame per table.
df_tv_filtered, df_newspaper_filtered, df_radio_filtered = [
    process_media_sales_volume(media_frame)
    for media_frame in (df_tv, df_newspaper, df_radio)
]

Initial count: 11487
Filtered count: 7246
Initial count: 41549
Filtered count: 32113
Initial count: 44117
Filtered count: 38822
