In [1]:
import requests
import pandas as pd
import csv
from bs4 import BeautifulSoup
import re

In [2]:
# Pre-processing
def _group_community_rows(community_info, city):
    """Group scraped ``"<label>:<value>"`` strings into one list per neighborhood.

    Args:
        community_info: Iterable of strings. Strings without a ``:`` are ignored.
        city: City name; any ``"<city>:..."`` (or ``"city:..."``) tail is
            stripped from each value.

    Returns:
        A list of lists shaped like ``[name, value, value, ...]``.
    """
    # re.escape: the city name must be matched literally even if it contains
    # regex metacharacters (e.g. "." or "+").
    comparison_tail = re.compile(r'(city|{}):.*'.format(re.escape(city)))

    result = []
    current_neighborhood = None
    current_list = None

    for item in community_info:
        # "." does not match a newline, so only the text up to the first line
        # break after the colon is kept as the value.
        match = re.match(r'^([^:]+):(.*)', item)
        if not match:
            continue
        name, content = match.group(1), match.group(2)

        # These two labels are not neighborhood names; rows carrying them are
        # attributed to the most recently seen neighborhood (None if no
        # neighborhood has been seen yet).
        if name in ('Males', 'This neighborhood'):
            name = current_neighborhood
        else:
            current_neighborhood = name

        content = comparison_tail.sub('', content)

        # Consecutive rows with the same name extend the open group;
        # a new name closes it and starts the next one.
        if current_list and name == current_list[0]:
            current_list.append(content)
        else:
            if current_list:
                result.append(current_list)
            current_list = [name, content]

    if current_list:
        result.append(current_list)

    return result


def process_community_info(url, city, timeout=30):
    """Download a page and group its ``div.hgraph`` texts by neighborhood.

    Args:
        url: Address of the page to download.
        city: City name used to strip the ``"<city>: ..."`` tail from each value.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of lists, where each inner list is
        ``[neighborhood_name, value, value, ...]``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Without a timeout requests waits indefinitely; without the status check
    # an error page would be parsed and silently yield an empty result.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    community_info = [row.text for row in soup.find_all('div', attrs={'class': 'hgraph'})]

    return _group_community_rows(community_info, city)

In [3]:
# Get population density & median age
def create_neighborhood_dicts(result, city):
    """Build one summary dict per neighborhood from grouped text rows.

    Args:
        result: Iterable of lists shaped like ``[name, text, text, ...]``.
        city: Value stored under the ``'city'`` key of every dict.

    Returns:
        A list of dicts. Every dict has ``'community'`` and ``'city'``.
        ``'population.density'`` (digits as a string, or ``None``) is present
        when some text contains ``'people'``. ``'median.age'`` (string with one
        decimal) is present when a text containing ``'yearsFemales:'`` holds at
        least two decimal numbers.
    """
    # Either a comma-grouped number anywhere in the text ("20,157",
    # "1,234,567"), or a plain integer directly in front of "people"
    # ("853 people"), so densities below 1,000 are not lost. The lookbehind
    # stops the second form from matching the tail of a longer number.
    density_pattern = re.compile(r'\d+(?:,\d+)+|(?<![\d.,])\d+(?=\s*people)')

    community_dicts = []
    for neighborhood_data in result:
        info = {'community': neighborhood_data[0]}
        male_age = female_age = None

        for item in neighborhood_data[1:]:
            if 'people' in item:
                found = density_pattern.search(item)
                info['population.density'] = (
                    found.group(0).replace(',', '') if found else None
                )
            if 'yearsFemales:' in item:
                ages = re.findall(r'(\d+\.\d+)', item)
                # Both values are needed; a row with a single number is
                # skipped instead of raising IndexError.
                if len(ages) >= 2:
                    male_age, female_age = float(ages[0]), float(ages[1])

        # 'median.age' is the plain average of the male and female figures.
        if male_age is not None and female_age is not None:
            info['median.age'] = "{:.1f}".format((male_age + female_age) / 2)

        info['city'] = city
        community_dicts.append(info)
    return community_dicts


In [18]:
# Chicago: scrape the neighborhood page, derive population density and
# median age per community, and save the table to chi_demo1.csv.
url = 'https://www.city-data.com/nbmaps/neigh-Chicago-Illinois.html'
result = process_community_info(url, city='Chicago')
Chi_neighborhood_dicts = create_neighborhood_dicts(result, city='Chicago')
df = pd.DataFrame(data=Chi_neighborhood_dicts)
df.to_csv(path_or_buf='chi_demo1.csv', index=False)

In [19]:
# Los Angeles: scrape the neighborhood page, derive population density and
# median age per community, and save the table to la_demo1.csv.
# 'Los Angeles' is the name stripped from the scraped text, while
# 'Los Angeles City' is the label written to the 'city' column.
url = 'https://www.city-data.com/nbmaps/neigh-Los-Angeles-California.html'
result = process_community_info(url, city='Los Angeles')
LA_neighborhood_dicts = create_neighborhood_dicts(result, city='Los Angeles City')
df = pd.DataFrame(data=LA_neighborhood_dicts)
df.to_csv(path_or_buf='la_demo1.csv', index=False)

In [20]:
# New York: scrape the neighborhood page, derive population density and
# median age per community, and save the table to nyc_demo1.csv.
# 'New York' is the name stripped from the scraped text, while
# 'New York City' is the label written to the 'city' column.
url = 'https://www.city-data.com/nbmaps/neigh-New-York-New-York.html'
result = process_community_info(url, city='New York')
NYC_neighborhood_dicts = create_neighborhood_dicts(result, city='New York City')
df = pd.DataFrame(data=NYC_neighborhood_dicts)
df.to_csv(path_or_buf='nyc_demo1.csv', index=False)

In [23]:
# Combine the Chicago, Los Angeles and New York results into a single table
# and save it to all_demo1.csv.
chi_df = pd.DataFrame(Chi_neighborhood_dicts)
la_df = pd.DataFrame(LA_neighborhood_dicts)
nyc_df = pd.DataFrame(NYC_neighborhood_dicts)
# ignore_index=True renumbers the rows 0..n-1. Without it each city's rows
# keep their own 0-based labels, so the combined frame has duplicate index
# values and label-based lookups return several rows. The CSV is unaffected
# because it is written with index=False.
all_demo1 = pd.concat([chi_df, la_df, nyc_df], axis=0, ignore_index=True)
all_demo1.to_csv('all_demo1.csv', index=False)