### **Tutorial 08: Data Ingestion - A Hands-on Guide**


In [None]:
import sqlite3
from pathlib import Path

import pandas as pd
import requests

In [None]:
# 1. Read Data from a CSV File
# The path is relative to the directory the notebook kernel runs in.
csv_file = "./data/raw/mmdt.csv"
print("Reading data from CSV...")
try:
    # Parse the file into a DataFrame, confirm success, then preview the first rows.
    df_csv = pd.read_csv(csv_file)
    print("CSV Data Loaded Successfully!")
    display(df_csv.head(5))
except Exception as err:
    # Any failure in the steps above is printed instead of being raised.
    print(f"Error loading CSV: {err}")

In [None]:
# 2. Read Data from a JSON File
import json

json_file = "./data/raw/wip_2025.json"
print("\nReading data from JSON...")
try:
    # Decode the whole document first; the file is opened as UTF-8 text
    # (read mode is the default for open()).
    with open(json_file, encoding="utf-8") as file:
        data = json.load(file)
    # Flatten the list of records stored under data -> regional into a table.
    df_json = pd.json_normalize(
        data,
        record_path=["data", "regional"],
    )
    print("JSON Data Loaded Successfully!")
    display(df_json.head(5))
except Exception as err:
    # Any failure in the steps above is printed instead of being raised.
    print(f"Error loading JSON: {err}")

In [None]:
# 3. Ingesting Data from a Database (e.g., SQLite)
print("\nReading data from SQLite database...")
db_path = '../Projects/database/mmdt.db3'
try:
    query = "SELECT * FROM participants;"
    # Open the file read-only through a SQLite URI. A plain path (or a
    # "sqlite:///" URL) silently creates an empty database when the file is
    # missing, which then surfaces as a misleading "no such table" error.
    # Path.as_uri() needs an absolute path, hence resolve().
    db_uri = f"{Path(db_path).resolve().as_uri()}?mode=ro"
    conn = sqlite3.connect(db_uri, uri=True)
    try:
        # pandas accepts a sqlite3 connection directly, so SQLAlchemy is not
        # needed for this cell.
        df_sqlite = pd.read_sql_query(query, conn)
    finally:
        # sqlite3 connections are not closed by garbage-free scoping here;
        # release the file handle explicitly whether or not the query worked.
        conn.close()
    display(df_sqlite.head())
except Exception as e:
    # Report the failure (missing file, missing table, ...) instead of raising.
    print(f"Error loading SQLite: {e}")

In [None]:
# 4. Ingesting Data from a Database (Microsoft SQL Server via ODBC)
# Note: the connection below uses the "mssql+pyodbc" dialect and the
# "ODBC Driver 18 for SQL Server" driver, i.e. SQL Server, not MySQL.
# The result variable keeps its original name `df_mysql` so that any later
# cell referring to it keeps working.
import pyodbc
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
# `import urllib` alone does not guarantee that the `parse` submodule is
# loaded; import the submodule explicitly so `urllib.parse` always resolves.
import urllib.parse

# Connection settings are read from the environment (optionally via a .env file).
load_dotenv()
server = os.getenv("DB_SERVER")
database = os.getenv("DB_NAME")
username = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
driver = 'ODBC Driver 18 for SQL Server'

# Build the raw ODBC connection string and URL-encode it so it can be passed
# through SQLAlchemy's `odbc_connect` query parameter.
# NOTE(review): TrustServerCertificate=yes skips validation of the server's
# TLS certificate — confirm this is only used against development servers.
params = urllib.parse.quote_plus(
    f"DRIVER={{{driver}}};SERVER={server};DATABASE={database};"
    f"UID={username};PWD={password};ENCRYPT=yes;TrustServerCertificate=yes"
)

# create_engine() is lazy: no connection is attempted until the first query.
engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")
try:
    # os.getenv() returns None for unset variables, which would otherwise be
    # interpolated as the literal text "None" and fail with an opaque login
    # error. Fail early with a message that names what is missing.
    missing = [
        name
        for name, value in (
            ("DB_SERVER", server),
            ("DB_NAME", database),
            ("DB_USER", username),
            ("DB_PASSWORD", password),
        )
        if not value
    ]
    if missing:
        raise ValueError(f"missing environment variable(s): {', '.join(missing)}")

    query = """ 
        SELECT * FROM Sales.Orders;
        """
    df_mysql = pd.read_sql(query, engine)
    display(df_mysql.head())
except Exception as e:
    # Report the failure (bad credentials, unreachable server, ...) instead of raising.
    print(f"Error loading SQL Server data: {e}")

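#### Get your API key from here: **https://pygis.io/docs/d_access_census.html**

Store the key in an environment variable named `census_api_key` (for example in your `.env` file). The next cell requests the following ACS 5-year variables: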
- '**C17002_001E**': Total population for poverty status, count of ratio of income to poverty in the past 12 months (total)
- '**C17002_002E**': Population below 50% of the poverty level, count of ratio of income to poverty in the past 12 months (< 0.50)
- '**C17002_003E**': Population between 50% and 100% of the poverty level, count of ratio of income to poverty in the past 12 months (0.50 - 0.99)
- '**B01003_001E**': Total population.


In [None]:
# 5. Real-time Ingestion using APIs
# `os` and `load_dotenv` are imported here as well so this cell does not depend
# on the database cell having run successfully first.
import os

from census import Census
from dotenv import load_dotenv
from us import states

# By default load_dotenv() does not override variables that are already set,
# so calling it again here is harmless.
load_dotenv()

try:
    api_key = os.environ.get('census_api_key')
    if not api_key:
        # Without this check an unset key would be passed to Census as None.
        raise ValueError("environment variable 'census_api_key' is not set")

    c = Census(api_key)
    # The request itself is the step that can fail (network, invalid key), so it
    # is made inside the try block together with the DataFrame conversion.
    # Requests the listed ACS 5-year fields for every tract in every county of
    # Massachusetts for 2022.
    data_census = c.acs5.state_county_tract(
        fields=('NAME', 'C17002_001E', 'C17002_002E', 'C17002_003E', 'B01003_001E'),
        state_fips=states.MA.fips,
        county_fips="*",
        tract="*",
        year=2022,
    )
    df_census = pd.DataFrame(data_census)
    display(df_census.head())
except Exception as e:
    # Report the failure instead of raising.
    print(f"Error loading census data: {e}")