In [1]:
import pandas as pd
import geopandas as gpd
import warnings

# Suppress all warnings for the rest of the kernel session. With no category,
# message or module argument, this 'ignore' rule applies to every warning.
# NOTE(review): this also hides deprecation and data-quality warnings raised
# by pandas/geopandas in later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Helper functions for loading and preparing the data.
def read_data(file_path, fields):
    """Load a comma-separated file into a DataFrame, keeping only selected columns.

    Args:
        file_path: Path (or file-like object) handed straight to ``pd.read_csv``.
        fields: Column names to load; passed as ``usecols``.

    Returns:
        A DataFrame containing only the requested columns.
    """
    # Parsing options: comma-delimited, double-quoted fields, and lines the
    # parser reports as bad are skipped rather than raising.
    csv_options = {
        'quotechar': '"',
        'sep': ',',
        'on_bad_lines': 'skip',
        'usecols': fields,
    }
    return pd.read_csv(file_path, **csv_options)

def prepare_population_dataframe(df_population):
    """Return the county-level rows of the population table, cleaned for merging.

    Rows are kept when ``Area_Name`` ends with "county" (case-insensitive).
    The result gains a ``County`` column holding ``Area_Name`` with the
    " county" text removed and surrounding whitespace stripped, and its
    ``POP_ESTIMATE_2022`` column is converted to ``float``.

    Args:
        df_population: DataFrame with at least the ``Area_Name`` and
            ``POP_ESTIMATE_2022`` columns. It is not modified.

    Returns:
        A new, independent DataFrame containing only the county rows.

    Raises:
        ValueError: If a population value cannot be converted to a float.
    """
    # na=False: a missing Area_Name counts as "not a county" instead of
    # putting NaN into the boolean mask (which pandas refuses to index with).
    is_county = df_population['Area_Name'].str.lower().str.endswith('county', na=False)

    # Explicit copy: the caller's frame stays untouched and the column
    # assignments below are not chained assignments on a slice.
    df_population_selected = df_population.loc[is_county].copy()

    df_population_selected['County'] = (
        df_population_selected['Area_Name']
        .str.replace(' county', '', case=False)
        .str.strip()
    )

    population = df_population_selected['POP_ESTIMATE_2022']
    if not pd.api.types.is_numeric_dtype(population):
        # Text values such as "59,759": drop the thousands separators first.
        population = population.str.replace(',', '', regex=False)
    df_population_selected['POP_ESTIMATE_2022'] = population.astype(float)

    return df_population_selected

def add_geography_column(df_media, df_county):
    """Attach the county ``Geography`` identifier to each media record.

    Media rows are matched to the county table on
    (``state``, ``county_name``) == (``New_State``, ``New_County``) with an
    inner join, so media rows without a matching county are dropped.

    Args:
        df_media: DataFrame with ``state`` and ``county_name`` columns.
        df_county: DataFrame with ``New_State``, ``New_County`` and
            ``Geography`` columns. Any other columns it has are ignored.

    Returns:
        A new DataFrame with the columns of ``df_media`` (original order)
        followed by ``Geography``.

    Raises:
        KeyError: If a required column is missing from either frame.
    """
    # Merge only the lookup columns. Pulling in the whole county table would
    # suffix (_x/_y) any non-key column name shared with df_media, and the
    # column selection below would then fail with a KeyError.
    county_lookup = df_county[['New_State', 'New_County', 'Geography']]

    merged_mp_df = df_media.merge(
        county_lookup,
        left_on=['state', 'county_name'],
        right_on=['New_State', 'New_County'],
    )

    # Drop the join keys from the county side; copy so callers can assign
    # columns on the result without touching a slice of the merge output.
    return merged_mp_df[df_media.columns.tolist() + ['Geography']].copy()






In [2]:
def convert_sales_volume_to_number(s):
    """Turn a sales-volume string into a float.

    Dollar signs, commas and underscores are removed and the text is
    stripped. If the cleaned text contains "to" (a range), only the part
    before the first space and before "to" is parsed, i.e. the lower bound.

    Args:
        s: The raw value. Non-string values are returned unchanged.

    Returns:
        The parsed float, ``None`` when the text is not a valid number, or
        ``s`` itself when it is not a string.
    """
    if not isinstance(s, str):
        return s

    cleaned = s
    for symbol in ('$', ',', '_'):
        cleaned = cleaned.replace(symbol, '')
    cleaned = cleaned.strip()

    if 'to' in cleaned:
        # Range such as "1000 to 2500" or "1000to2500": keep the lower bound.
        number_text = cleaned.split(' ')[0].split('to')[0]
    else:
        number_text = cleaned

    try:
        return float(number_text)
    except ValueError:
        return None

In [6]:
# Columns to load from the raw newspaper export; all other columns are skipped.
fields = [
    'company_name', 'legal_name',
    'address1', 'city', 'state', 'zip5', 'zip4', 'address_type_code',
    'mail_addr_address1', 'mail_addr_city', 'mail_addr_state',
    'mail_addr_zip5', 'mail_addr_zip4', 'mail_addr_address_type_code',
    'county_code', 'county_name', 'cbsa', 'cbsa_name', 'dma', 'dma_name',
    'latitude', 'longitude', 'census_id', 'census_block', 'census_tract',
    'MedianIncomeCensusArea', 'MeanHousingCensusArea',
    'url', 'naics_desc',
    'sic2code', 'sic4code', 'sic6code',
    'sic2desc', 'sic4desc', 'sic6desc', 'sic8desc', 'sic_division',
    'exact_sales_volume', 'sales_volume',
    'exact_number_of_employees', 'number_of_employees', 'employee_code',
    'location_type',
    'parent_company', 'parent_address', 'parent_city', 'parent_state', 'parent_zip',
    'business_specialty', 'company_year_started', 'business_type',
    'state_where_entity_formed',
    'minority', 'woman', 'government', 'small', 'home_office',
    'franchise', 'chain', 'site_status',
    'zip_centroid_lat', 'zip_centroid_long',
    'owner_company', 'owner_address', 'owner_city', 'owner_state',
    'owner_zip', 'owner_country', 'owner_phone',
]

# Exclusive upper bound on sales volume for an outlet to stay in the analysis.
MAX_SALES_VOLUME = 1000000

# Read the raw inputs
df_newspaper = read_data('../artifacts/data/Raw/newspaper.csv', fields)
df_county = pd.read_csv('../artifacts/data/shapefiles/county.csv')
df_population = read_data('../artifacts/data/Raw/PopulationEstimates.csv', ['Area_Name','State', 'POP_ESTIMATE_2022'])

# Prepare the population DataFrame
df_population_prepared = prepare_population_dataframe(df_population)

# Add geography column to newspaper data
df_newspaper_with_geography = add_geography_column(df_newspaper, df_county)

# Convert sales volume to numeric, then drop rows that could not be parsed
# and rows at or above the sales-volume cap
df_newspaper_with_geography['sales_volume'] = df_newspaper_with_geography['sales_volume'].apply(convert_sales_volume_to_number)
df_newspaper_filtered = df_newspaper_with_geography.dropna(subset=['sales_volume'])
df_newspaper_filtered = df_newspaper_filtered[df_newspaper_filtered['sales_volume'] < MAX_SALES_VOLUME]

# Load the shapefile as a GeoDataFrame
df_shapefile_county = gpd.read_file('../artifacts/data/shapefiles/cb_2018_us_county_500k.shp')

# Merge the shapefile GeoDataFrame with the filtered newspaper data
# (inner join: one row per newspaper record, carrying its county geometry)
merged_gdf = df_shapefile_county.merge(df_newspaper_filtered, left_on='AFFGEOID', right_on='Geography')

# Count records per county BEFORE joining population, so that a population
# table with repeated (State, County) keys cannot inflate the counts.
station_counts = merged_gdf.groupby('AFFGEOID').size()

# Merge the result with the prepared population DataFrame
# (left join: counties without a population match keep NaN population)
merged_gdf_final = merged_gdf.merge(df_population_prepared, how='left', left_on=['state', 'county_name'], right_on=['State', 'County'])

# Calculate station count and news stations per 100,000 population
merged_gdf_final['station_count'] = merged_gdf_final['AFFGEOID'].map(station_counts)
county_population = merged_gdf_final['POP_ESTIMATE_2022']
# A zero population would give an infinite rate and distort the colour range
# of the map; treat it like a missing population (NaN) instead.
merged_gdf_final['newsstation_per_100k_pop'] = (merged_gdf_final['station_count'] / county_population.where(county_population > 0)) * 100000

# Keep a single row per county now that the per-county values are attached
merged_gdf_final = merged_gdf_final.drop_duplicates(subset=['AFFGEOID'])

In [7]:
import json
from pathlib import Path

import pandas as pd
import plotly.express as px

# Convert the county geometries to a GeoJSON string, then to a dictionary
# (which is compatible with plotly)
json_data = merged_gdf_final.geometry.to_json()
geojson = json.loads(json_data)

# Determine the range of the data (pandas min/max skip NaN by default)
min_value = merged_gdf_final['newsstation_per_100k_pop'].min()
max_value = merged_gdf_final['newsstation_per_100k_pop'].max()

# NOTE(review): `locations` uses the frame's index and is expected to line up
# with the feature ids emitted by to_json() above — confirm if the index is
# ever reset between the two calls.
fig = px.choropleth_mapbox(
    merged_gdf_final,
    geojson=geojson,
    locations=merged_gdf_final.index,
    color='newsstation_per_100k_pop',
    color_continuous_scale="Viridis",
    range_color=(min_value, max_value),  # Set to the actual min and max of the data
    mapbox_style="carto-positron",
    zoom=3,
    center={"lat": 37.0902, "lon": -95.7129},
    opacity=0.5,
    hover_name='county_name'
)

# Save the figure to HTML, creating the output directory first so a fresh
# checkout without ../artifacts/output does not fail at the very last step
output_path = Path('../artifacts/output/NewspaperStations_per_100k_people.html')
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_html(str(output_path))