In [None]:
import requests
import pandas as pd
import csv
import re
import sys
import time
import random
from bs4 import BeautifulSoup

In [None]:
# Pre-processing
def process_community_info(url, city):
    """Download a page and group its ``hgraph`` rows by community name.

    The text of every ``<div class="hgraph">`` element on the page is read as a
    ``name: content`` row, and consecutive rows sharing a name are collected
    into one list.

    Args:
        url: Address of the page to download.
        city: City name. Any ``city:`` or ``<city>:`` segment, together with
            everything after it, is removed from each row's content.

    Returns:
        A list of lists, each shaped ``[name, content, content, ...]`` for one
        run of consecutive rows with the same name.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of silently parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    community_info = [row.text for row in soup.find_all('div', attrs={'class': 'hgraph'})]
    return _group_community_rows(community_info, city)


def _group_community_rows(community_info, city):
    """Group ``name: content`` strings into ``[name, content, ...]`` lists."""
    # Escape the city so a name containing regex metacharacters matches literally.
    city_suffix = re.compile(r'(city|{}):.*'.format(re.escape(city)))

    result = []
    current_neighborhood = None
    current_list = None

    for item in community_info:
        match = re.match(r'^([^:]+):(.*)', item)
        if not match:
            # Rows without a "name:" prefix are ignored.
            continue

        name, content = match.group(1), match.group(2)
        if name in ('Males', 'This neighborhood'):
            # These labels are filed under the most recent other name.
            name = current_neighborhood
        else:
            current_neighborhood = name

        content = city_suffix.sub('', content)

        if current_list and name == current_list[0]:
            current_list.append(content)
        else:
            if current_list:
                result.append(current_list)
            current_list = [name, content]

    if current_list:
        result.append(current_list)

    return result

In [None]:
# Get population density & median age
def create_neighborhood_dicts(result, city):
    """Turn grouped community rows into one record dict per community.

    Args:
        result: List of ``[name, content, ...]`` lists, as returned by
            ``process_community_info``.
        city: Value stored under the ``'city'`` key of every record.

    Returns:
        A list of dicts, one per entry of ``result``. Every dict has
        ``'community'`` and ``'city'``. ``'population.density'`` (a digit
        string, or ``None`` when no figure could be read) is present when some
        content contains ``'people'``. ``'median.age'`` (a string with one
        decimal, the mean of the first two numbers found) is present when a
        content containing ``'yearsFemales:'`` yields at least two numbers.
    """
    community_dicts = []
    for neighborhood_data in result:
        info = {'community': neighborhood_data[0]}
        age_pair = None

        for item in neighborhood_data[1:]:
            if 'people' in item:
                density = _extract_density(item)
                # A later row without a readable figure must not wipe out a
                # density that was already found.
                if density is not None or 'population.density' not in info:
                    info['population.density'] = density
            if 'yearsFemales:' in item:
                # Accept whole-number ages as well as decimals.
                ages = re.findall(r'\d+(?:\.\d+)?', item)
                # Two values are required; a lone number used to raise IndexError.
                if len(ages) >= 2:
                    age_pair = (float(ages[0]), float(ages[1]))

        # Added after the loop so the key order stays
        # community, population.density, median.age, city.
        if age_pair is not None:
            male_age, female_age = age_pair
            info['median.age'] = "{:.1f}".format((male_age + female_age) / 2)

        info['city'] = city
        community_dicts.append(info)
    return community_dicts


def _extract_density(item):
    """Return the density figure found in ``item`` as a digit string, or None."""
    # NOTE(review): assumes density rows read "<n> people per square mile" --
    # confirm against the scraped page. This form also accepts figures below
    # 1,000 that carry no thousands separator.
    match = re.search(r'(\d[\d,]*)\s*people per square mile', item)
    if match is None:
        # Fallback: first comma-grouped number anywhere in the row, keeping
        # every group so values of a million or more are not truncated.
        match = re.search(r'(\d+(?:,\d+)+)', item)
    if match is None:
        return None
    return match.group(1).replace(',', '')


In [None]:
# Chicago Population density & Median age
# NOTE: `url`, `result` and `df` are plain globals that the Los Angeles and
# New York cells below rebind, so after a full run they hold the last city.
url = 'https://www.city-data.com/nbmaps/neigh-Chicago-Illinois.html'
# Here 'Chicago' is the label whose "<city>: ..." tail is stripped from each row.
result = process_community_info(url, 'Chicago')
# Here 'Chicago' is the value written to the 'city' key of every record.
Chi_neighborhood_dicts = create_neighborhood_dicts(result, 'Chicago')
df = pd.DataFrame(Chi_neighborhood_dicts)
# Written relative to the working directory, without the index column.
df.to_csv('chi_density.age.csv', index=False) 

In [None]:
# Los Angeles City Population density & Median age
# NOTE: rebinds the globals `url`, `result` and `df` used by the Chicago cell.
url = 'https://www.city-data.com/nbmaps/neigh-Los-Angeles-California.html'
# 'Los Angeles' is the label whose "<city>: ..." tail is stripped from each row.
result = process_community_info(url, 'Los Angeles')
# 'Los Angeles City' (different from the string above) is what ends up in the
# 'city' column; any later join on 'city' must use exactly this spelling.
LA_neighborhood_dicts = create_neighborhood_dicts(result, 'Los Angeles City')
df = pd.DataFrame(LA_neighborhood_dicts)
# Written relative to the working directory, without the index column.
df.to_csv('la_density.age.csv', index=False) 

In [None]:
# New York City Population density & Median age
# NOTE: rebinds the globals `url`, `result` and `df` used by the earlier city cells.
url = 'https://www.city-data.com/nbmaps/neigh-New-York-New-York.html'
# 'New York' is the label whose "<city>: ..." tail is stripped from each row.
result = process_community_info(url, 'New York')
# 'New York City' (different from the string above) is what ends up in the
# 'city' column; any later join on 'city' must use exactly this spelling.
NYC_neighborhood_dicts = create_neighborhood_dicts(result, 'New York City')
df = pd.DataFrame(NYC_neighborhood_dicts)
# Written relative to the working directory, without the index column.
df.to_csv('nyc_density.age.csv', index=False) 

In [None]:
# All Population density & Median age
# Build one frame per city from the record lists produced by the cells above.
chi_df, la_df, nyc_df = [
    pd.DataFrame(records)
    for records in (Chi_neighborhood_dicts, LA_neighborhood_dicts, NYC_neighborhood_dicts)
]
# Stack the three frames row-wise and save them without the index column.
all_demo1 = pd.concat([chi_df, la_df, nyc_df], axis=0)
all_demo1.to_csv('all_density.age.csv', index=False)

In [None]:
# Get dominant race
def get_neighborhood_url(base_url, city_url):
    """Collect the neighborhood page URLs linked from a city page.

    Args:
        base_url: Prefix prepended verbatim to every collected ``href``.
        city_url: Address of the city page to download.

    Returns:
        A list of ``base_url + href`` strings, one for every ``<a>`` inside a
        ``<div class="neighborhood">`` whose ``href`` contains
        ``"neighborhood"``, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(city_url, timeout=30)
    # Fail loudly instead of returning an empty list for an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    def is_neighborhood_href(href):
        # Anchors without an href are passed in as None and must be skipped.
        return href and "neighborhood" in href

    return [
        base_url + link['href']
        for element in soup.find_all("div", class_="neighborhood")
        for link in element.find_all('a', href=is_neighborhood_href)
    ]

def get_neighborhood_souptxt(full_urls):
    """Download every URL and return the pages parsed with BeautifulSoup.

    All pages are fetched first, with a pause of 1 or 2 seconds after each
    request; the collected HTML is parsed only once every download is done.

    Args:
        full_urls: Iterable of page addresses to download.

    Returns:
        A list of parsed pages, in the same order as ``full_urls``.
    """
    page_texts = []
    for address in full_urls:
        page_texts.append(requests.get(address).text)
        # Pause between requests.
        time.sleep(random.randint(1, 2))
    return [BeautifulSoup(html, 'html.parser') for html in page_texts]

In [None]:
def get_neighborhood_domininant_race(souped_pages):
    """Read the dominant-race label from each parsed neighborhood page.

    For every page the first ``<span class="badge alert-info">`` is located and
    the stripped text of the next ``<b>`` tag after it is taken as the label.

    Args:
        souped_pages: Iterable of parsed pages exposing ``find``.

    Returns:
        A list aligned with ``souped_pages``; an entry is ``None`` when the
        badge, or a ``<b>`` tag following it, is missing.
    """
    dominant_race = []
    for soup in souped_pages:
        race_info = soup.find('span', class_='badge alert-info')
        # find_next returns None when nothing follows the badge; treat that the
        # same as a missing badge instead of raising AttributeError.
        label_tag = race_info.find_next('b') if race_info else None
        if label_tag is not None:
            dominant_race.append(label_tag.get_text(strip=True))
        else:
            dominant_race.append(None)
    return dominant_race

def get_neighborhood_name(full_urls):
    """Derive readable neighborhood names from neighborhood page URLs.

    The last ``/``-separated segment of each URL has a trailing
    ``-Los-Angeles-CA.html``, ``-New-York-NY.html`` or ``-Chicago-IL.html``
    removed, and every remaining hyphen is replaced by a space.

    Args:
        full_urls: Iterable of URL strings.

    Returns:
        A list of names, in the same order as ``full_urls``.
    """
    city_suffix = re.compile(r'-(Los-Angeles-CA|New-York-NY|Chicago-IL)\.html$')
    return [
        city_suffix.sub('', address.split('/')[-1]).replace('-', ' ')
        for address in full_urls
    ]

def extract_city_name_from_url(city_url):
    """Extract the city name from a ``neigh-<City>-<State>.html`` URL.

    Only URLs whose state part is ``New-York``, ``California`` or ``Illinois``
    are recognised. Hyphens in the city part are replaced by spaces.

    Args:
        city_url: URL string containing ``neigh-<City>-<State>.html``.

    Returns:
        The city name, e.g. ``'Los Angeles'``.

    Raises:
        ValueError: If ``city_url`` does not contain the expected pattern.
    """
    match = re.search(r'neigh-(.*?)-(New-York|California|Illinois)\.html', city_url)
    if match is None:
        # Previously this path fell through to an UnboundLocalError on return.
        raise ValueError(f'cannot extract a city name from URL: {city_url!r}')
    return match.group(1).replace('-', ' ')

def get_dominant_csv(city_name, neighborhood_name, dominant_race,
                     output_dir='C:/Users/13945/Desktop/data mgmt/umai'):
    """Write one city's neighborhood / dominant-race pairs to a CSV file.

    The file is named ``neighborhood_dominant_race_<city_name>.csv`` and has
    the columns ``'Neighborhood Name'`` and ``'Dominant Race'``.

    Args:
        city_name: Used in the output file name.
        neighborhood_name: Sequence of neighborhood names.
        dominant_race: Sequence of labels paired positionally with the names.
            ``zip`` stops at the shorter sequence, so extra items are dropped.
        output_dir: Directory the CSV is written to.
            NOTE(review): the default is the original author's machine-specific
            path, kept only for backward compatibility; pass e.g. ``'.'`` to
            write next to the notebook like the other cells do.

    Returns:
        None.
    """
    df = pd.DataFrame(list(zip(neighborhood_name, dominant_race)),
                      columns=['Neighborhood Name', 'Dominant Race'])
    # Joined with '/' so the default produces exactly the original path.
    filename = f'{output_dir}/neighborhood_dominant_race_{city_name}.csv'
    df.to_csv(filename, index=False)

In [None]:
# Run the whole dominant-race scrape for one city (call once per city URL)
def overall(city_url, base_url):
    """Run the dominant-race scrape for one city and write its CSV.

    Steps, in order: collect the neighborhood URLs linked from ``city_url``,
    download and parse each page, read the dominant race from every page,
    derive the neighborhood names from the URLs and the city name from
    ``city_url``, then hand everything to ``get_dominant_csv``.

    Args:
        city_url: Address of the city page listing the neighborhoods.
        base_url: Prefix prepended to every neighborhood link.
    """
    page_urls = get_neighborhood_url(base_url, city_url)
    races = get_neighborhood_domininant_race(get_neighborhood_souptxt(page_urls))
    names = get_neighborhood_name(page_urls)
    get_dominant_csv(extract_city_name_from_url(city_url), names, races)

In [None]:
#All dominant race
# Load the three per-city race tables from the working directory.
chi_race, la_race, nyc_race = [
    pd.read_csv(csv_name)
    for csv_name in ('chi_race.csv', 'la_race.csv', 'nyc_race.csv')
]
# Wrap each table in a DataFrame. This rebinds chi_df / la_df / nyc_df, the
# same names the density & age cell uses for its frames.
chi_df, la_df, nyc_df = [pd.DataFrame(table) for table in (chi_race, la_race, nyc_race)]
# Stack the three tables row-wise and save them without the index column.
all_race = pd.concat([chi_df, la_df, nyc_df], axis=0)
all_race.to_csv('all_race.csv', index=False)

In [None]:
#All demographic attributes
all_density_and_age = pd.read_csv('all_density.age.csv')
# Rebind instead of mutating in place, so the frame object built by the
# previous cell is left untouched; `all_race` still ends up with 'community'.
all_race = all_race.rename(columns={'Neighborhood Name': 'community'})
# Default inner join: only (community, city) pairs present in both tables survive.
# NOTE(review): get_dominant_csv writes only 'Neighborhood Name' and
# 'Dominant Race', so the *_race.csv inputs must gain a 'city' column elsewhere
# for this merge key to exist -- confirm.
all_demo = pd.merge(all_density_and_age, all_race, on=['community', 'city'])
all_demo.to_csv('all_demo.csv', index=False)