In [11]:
# Uncomment to install the notebook's dependencies into the kernel's environment:
# %pip install psycopg2-binary kafka-python pyspark redis xgboost flask apache-airflow dash pandas

## 2. Data Ingestion from PostgreSQL (Batch & Real‑Time)
## 2.1 Batch Extraction from Multiple Tables
Use a loop to read from all four tables. For example, in a Jupyter Notebook cell:

In [12]:
import pandas as pd
import psycopg2

# PostgreSQL connection parameters
# NOTE(review): credentials are hardcoded for the local demo database; move them
# to environment variables or a secrets manager before sharing this notebook.
conn = psycopg2.connect(
    host="localhost",
    port=5433,
    database="nyc",
    user="postgres",
    password="password"
)

# Source tables to sample.
tables = [
    "yellow_tripdata_2024_01",
    "yellow_tripdata_2024_02",
    "green_tripdata_2024_01",
    "green_tripdata_2024_02"
]

# Maps each table name to a DataFrame holding the rows sampled from that table.
dataframes = {}
try:
    for table in tables:
        # The table name is interpolated directly; this is safe only because it
        # comes from the fixed list above, never from user input.
        query = f"SELECT * FROM public.{table} LIMIT 1000;"
        # NOTE(review): pandas documents SQLAlchemy connectables for read_sql; a raw
        # psycopg2 connection works but may emit a UserWarning.
        df = pd.read_sql(query, conn)
        dataframes[table] = df
        print(f"Data from {table}:")
        print(df.head())
finally:
    # Release the connection even if a query fails part-way through, so that
    # re-running this cell does not accumulate open connections.
    conn.close()


Data from yellow_tripdata_2024_01:
   vendorid     pickup_datetime    dropoff_datetime  passenger_count  \
0         2 2024-01-09 15:30:28 2024-01-09 15:31:47                1   
1         2 2024-01-09 15:42:28 2024-01-09 15:51:15                1   
2         2 2024-01-09 15:53:13 2024-01-09 16:01:11                1   
3         2 2024-01-09 15:53:00 2024-01-09 16:00:50                1   
4         2 2024-01-09 15:19:36 2024-01-09 15:33:17                1   

   trip_distance  ratecodeid store_and_fwd_flag  pulocationid  dolocationid  \
0           0.28           1                  N           164           234   
1           1.05           1                  N           114           234   
2           0.96           1                  N           234           125   
3           0.91           1                  N           137           229   
4           1.50           1                  N           237           142   

   payment_type  fare_amount  extra  mta_tax  tip_amount 

  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)


# Before proceeding to Section 2.2, Kafka must be running

Please follow Section 1 of the readme.md for more information.

## 2.2 Simulating Real‑Time Streaming from PostgreSQL to Kafka
Publish data from all tables into Kafka. You can add an extra field (e.g., "table_name") so your downstream processing knows the source.

In [13]:
import psycopg2
import pandas as pd
import json
import time
from kafka import KafkaProducer
from datetime import datetime
from decimal import Decimal

# ✅ Kafka Producer Setup
# Each message is serialized to UTF-8 JSON; default=str is the fallback for any
# value that json cannot encode natively.
producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda v: json.dumps(v, default=str).encode("utf-8")
)

# ✅ PostgreSQL Connection
# NOTE(review): credentials are hardcoded for the local demo database; move them
# to environment variables or a secrets manager before sharing this notebook.
DB_CONFIG = {
    "host": "localhost",
    "port": 5433,
    "database": "nyc",
    "user": "postgres",
    "password": "password"
}

TABLES = ["yellow_tripdata_2024_01", "yellow_tripdata_2024_02", "green_tripdata_2024_01", "green_tripdata_2024_02"]

KAFKA_TOPIC = "nyc_taxi_topic"   # topic every record is published to
ROWS_PER_TABLE = 500             # number of rows read from each table
SEND_DELAY_SECONDS = 0.05        # pause between messages to simulate a live stream


def _to_kafka_record(columns, row, table, taxi_type):
    """Build the message payload for one database row.

    Args:
        columns: Column names, in the same order as the values in ``row``.
        row: One row of values as returned by the cursor.
        table: Name of the source table, stored under ``"table_name"``.
        taxi_type: Taxi type label, stored under ``"taxi_type"``.

    Returns:
        A dict keyed by column name plus ``"table_name"`` and ``"taxi_type"``,
        with datetimes formatted as ``"%Y-%m-%d %H:%M:%S"`` strings, Decimals
        converted to float, and every ``None`` replaced by ``0``.
    """
    record = dict(zip(columns, row))
    record["table_name"] = table
    record["taxi_type"] = taxi_type  # Include taxi type

    # Only existing keys are reassigned, so the dict size never changes while iterating.
    for key, value in record.items():
        if isinstance(value, datetime):
            record[key] = value.strftime("%Y-%m-%d %H:%M:%S")
        elif isinstance(value, Decimal):
            record[key] = float(value)
        elif value is None:
            # Applies to every column regardless of its type, so null text and
            # timestamp fields also become 0.
            record[key] = 0  # Convert null to 0 for numerical consistency
    return record


conn = psycopg2.connect(**DB_CONFIG)
try:
    for table in TABLES:
        taxi_type = "yellow" if "yellow" in table else "green"

        with conn.cursor() as cursor:
            # The table name comes from the fixed TABLES list, never from user input.
            query = f"SELECT * FROM public.{table} LIMIT {ROWS_PER_TABLE};"
            cursor.execute(query)

            # Get column names
            columns = [desc[0] for desc in cursor.description]

            # Fetch rows and send to Kafka
            for row in cursor.fetchall():
                producer.send(KAFKA_TOPIC, _to_kafka_record(columns, row, table, taxi_type))
                time.sleep(SEND_DELAY_SECONDS)  # Simulate real-time streaming

    # Block until every queued message has been handed to the broker.
    producer.flush()
finally:
    # Explicit close is required: psycopg2's `with conn:` only scopes a
    # transaction and does not close the connection. Doing it in `finally`
    # keeps a failed query or send from leaking the connection.
    conn.close()

# NOTE(review): `producer` is left open in case later cells reuse it; call
# producer.close() once nothing else needs it.
print("✅ Streaming data successfully published to Kafka!")


✅ Streaming data successfully published to Kafka!
