<div align="center">

# Term Project

---

**Author:** Matthew Walczyk  
**Date:** 9/9/2024  
**Modified By:** Matthew Walczyk  
**Description:** Term project showing the use of a flat file, API, and website to gather data and then clean it and display it.

---
</div>

In [1]:
import pandas as pd

# Load the CSV file
url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
covid_data = pd.read_csv(url)

# Step 1: Replace headers - renaming columns for better readability
covid_data.rename(columns={
    'iso_code': 'Country_Code',
    'continent': 'Continent',
    'location': 'Location',
    'date': 'Date',
    'total_cases': 'Total_Cases',
    'new_cases': 'New_Cases',
    'total_deaths': 'Total_Deaths',
    'new_deaths': 'New_Deaths',
    'total_vaccinations': 'Total_Vaccinations',
    'people_vaccinated': 'People_Vaccinated',
    'people_fully_vaccinated': 'People_Fully_Vaccinated'
}, inplace=True)

# Step 2: Convert 'Date' column to datetime format for better analysis
covid_data['Date'] = pd.to_datetime(covid_data['Date'])

# Step 3: Identify and remove duplicate rows
covid_data.drop_duplicates(inplace=True)

# Step 4: Remove any rows where critical columns (Total_Cases, New_Cases, Total_Deaths) have null values
covid_data_cleaned = covid_data.dropna(subset=['Total_Cases', 'New_Cases', 'Total_Deaths'])

# Step 5: Fill missing values in non-critical columns with 0 (e.g., vaccinations data might be missing for some dates)
covid_data_cleaned.fillna({
    'Total_Vaccinations': 0,
    'People_Vaccinated': 0,
    'People_Fully_Vaccinated': 0
}, inplace=True)

# Display cleaned data
print(covid_data_cleaned.head())


  Country_Code Continent     Location       Date  Total_Cases  New_Cases  \
0          AFG      Asia  Afghanistan 2020-01-05          0.0        0.0   
1          AFG      Asia  Afghanistan 2020-01-06          0.0        0.0   
2          AFG      Asia  Afghanistan 2020-01-07          0.0        0.0   
3          AFG      Asia  Afghanistan 2020-01-08          0.0        0.0   
4          AFG      Asia  Afghanistan 2020-01-09          0.0        0.0   

   new_cases_smoothed  Total_Deaths  New_Deaths  new_deaths_smoothed  ...  \
0                 NaN           0.0         0.0                  NaN  ...   
1                 NaN           0.0         0.0                  NaN  ...   
2                 NaN           0.0         0.0                  NaN  ...   
3                 NaN           0.0         0.0                  NaN  ...   
4                 NaN           0.0         0.0                  NaN  ...   

   male_smokers  handwashing_facilities  hospital_beds_per_thousand  \
0        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_data_cleaned.fillna({


In [11]:
import requests
import pandas as pd

# Step 1: Fetch Data from the COVID-API (e.g., Country-Level Data)
url = "https://covid-api.com/api/reports?iso=USA"
try:
    response = requests.get(url, verify=False)
    response.raise_for_status()
    data = response.json()
except requests.exceptions.RequestException as e:
    print(f"Error fetching data: {e}")
    data = None

if data:
    # Step 2: Normalize the main data structure
    main_data = pd.json_normalize(data['data'], sep='_')

    # Step 3: Check if 'region' exists in the data, then flatten it if present
    if 'region' in main_data.columns:
        region_data = pd.json_normalize(main_data['region'], sep='_')
        main_data = main_data.drop(columns=['region'])

        # Combine the two DataFrames (main_data and region_data)
        covid_api_data = pd.concat([main_data, region_data], axis=1)
    else:
        covid_api_data = main_data  # If 'region' is not found, we use the main_data directly

    # Step 4: Replace headers to make them more readable
    covid_api_data.rename(columns={
        'iso': 'Country_Code',
        'name': 'Country',
        'province': 'Province',
        'date': 'Date',
        'confirmed': 'Total_Cases',
        'deaths': 'Total_Deaths',
        'recovered': 'Total_Recovered',
        'active': 'Active_Cases'
    }, inplace=True)

    # Step 5: Convert 'Date' column to datetime format for consistency
    covid_api_data['Date'] = pd.to_datetime(covid_api_data['Date'])

    # Step 6: Identify columns that contain lists (unhashable types) using map
    list_columns = [col for col in covid_api_data.columns if covid_api_data[col].map(lambda x: isinstance(x, list)).any()]

    if list_columns:
        print(f"Columns '{list_columns}' contain lists and will be excluded from drop_duplicates")

    # Step 7: Drop columns with lists before applying drop_duplicates
    covid_api_data_cleaned = covid_api_data.drop(columns=list_columns)

    # Step 8: Remove any duplicate rows
    covid_api_data_cleaned.drop_duplicates(inplace=True)

    # Step 9: Check if 'Province' column exists before filling missing values
    if 'Province' in covid_api_data_cleaned.columns:
        covid_api_data_cleaned['Province'].fillna('N/A', inplace=True)

    # Step 10: Fill missing numerical values (e.g., deaths, recovered cases) with 0
    covid_api_data_cleaned.fillna({
        'Total_Cases': 0,
        'Total_Deaths': 0,
        'Total_Recovered': 0,
        'Active_Cases': 0
    }, inplace=True)

    # Step 11: Display cleaned data
    print(covid_api_data_cleaned.head())

else:
    print("No data available.")




Columns '['region_cities']' contain lists and will be excluded from drop_duplicates
        Date  Total_Cases  Total_Deaths  Total_Recovered  confirmed_diff  \
0 2023-03-09      1928913         15683                0               0   
1 2023-03-09      4083292         41496                0           10320   
2 2023-03-09     12129699        101159                0           19402   
3 2023-03-09      2443514         33102                0               0   
4 2023-03-09      6794738         77157                0            1368   

   deaths_diff  recovered_diff          last_update  Active_Cases  \
0            0               0  2023-03-10 04:21:03       1913230   
1           53               0  2023-03-10 04:21:03       4041796   
2          205               0  2023-03-10 04:21:03      12028540   
3            0               0  2023-03-10 04:21:03       2410412   
4            5               0  2023-03-10 04:21:03       6717581   

   active_diff  fatality_rate region_iso reg

In [11]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO

# Step 1: Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_in_the_United_States"
response = requests.get(url, verify=False)
soup = BeautifulSoup(response.text, 'lxml')

# Step 2: Extract all tables on the page using pandas.read_html
tables = pd.read_html(str(soup))

# Step 3: Display the number of tables found on the page
print(f"Number of tables found: {len(tables)}")

# Step 4: Loop through all the tables and clean each one
cleaned_tables = []
for i, table in enumerate(tables):
    print(f"\nTable {i + 1}:\n")
    df = table.copy()
    
    # Optional: Clean the headers (depending on the table structure)
    df.columns = [str(col).strip() for col in df.columns]  # Clean column names

    # Step 5: Check for and handle missing values
    df.fillna(0, inplace=True)

    # Step 6: Remove duplicate rows if any
    df.drop_duplicates(inplace=True)

    # Display first few rows of the cleaned table
    print(df.head())

    # Append cleaned table to the list
    cleaned_tables.append(df)
    
    print(cleaned_tables)


  tables = pd.read_html(str(soup))


Number of tables found: 63

Table 1:

              COVID-19 pandemic in the United States  \
0  COVID-19 cases per 100,000 people by state, as...   
1                                            Disease   
2                                       Virus strain   
3                                           Location   
4                                     First outbreak   

            COVID-19 pandemic in the United States.1  
0  COVID-19 cases per 100,000 people by state, as...  
1                                           COVID-19  
2                                         SARS-CoV-2  
3                                      United States  
4                             Wuhan, Hubei, China[1]  
[               COVID-19 pandemic in the United States  \
0   COVID-19 cases per 100,000 people by state, as...   
1                                             Disease   
2                                        Virus strain   
3                                            Location   
4        