# In [None]:
import pandas as pd
from pymongo import MongoClient
import psycopg2
from pyspark.sql import SparkSession

# Load data locally: read the campaign CSV into a pandas DataFrame.
df = pd.read_csv("/mnt/data/marketing_campaign_dataset.csv")

# 1. Load Data into MongoDB
# Connect to the local MongoDB instance and select the target
# database/collection (MongoDB creates both lazily on first write).
client = MongoClient("mongodb://localhost:27017/")
db = client["retail_db"]
collection = db["marketing_campaign"]

# Convert the DataFrame into one dict per row, keyed by column name.
documents = df.to_dict(orient="records")
if documents:
    collection.insert_many(documents)
    print("Data successfully inserted into MongoDB!")
else:
    # insert_many() raises TypeError when given an empty list, so a
    # header-only CSV is reported here instead of crashing the script.
    print("No rows found in the CSV; nothing inserted into MongoDB.")

# 2. Define PostgreSQL Schema
# Connection settings for the local retail_db database.
pg_settings = {
    "dbname": "retail_db",
    "user": "admin",
    "password": "password",
    "host": "localhost",
}
postgres_conn = psycopg2.connect(**pg_settings)
cursor = postgres_conn.cursor()

# DDL for the target table. IF NOT EXISTS makes the statement safe to
# run again when the table is already present.
create_table_sql = """
    CREATE TABLE IF NOT EXISTS marketing_campaign (
        id SERIAL PRIMARY KEY,
        customer_id INT,
        product_category VARCHAR(255),
        amount_spent DECIMAL,
        rating DECIMAL,
        purchase_date DATE
    );
"""
cursor.execute(create_table_sql)

# Commit the DDL so the table exists before any rows are inserted.
postgres_conn.commit()

# 3. Transfer Data from MongoDB to PostgreSQL
# The statement and column order are loop-invariant, so build them once.
insert_sql = (
    "INSERT INTO marketing_campaign "
    "(customer_id, product_category, amount_spent, rating, purchase_date) "
    "VALUES (%s, %s, %s, %s, %s)"
)
insert_columns = (
    "customer_id",
    "product_category",
    "amount_spent",
    "rating",
    "purchase_date",
)

# Exclude MongoDB's generated _id; the table has its own SERIAL key.
records = collection.find({}, {"_id": 0})
try:
    for record in records:
        # A document missing one of the columns raises KeyError, which
        # aborts the whole transfer (handled below).
        cursor.execute(
            insert_sql,
            tuple(record[column] for column in insert_columns),
        )
    # Single commit: either every row is stored or none is.
    postgres_conn.commit()
except Exception:
    # Discard the partially applied transaction before propagating.
    postgres_conn.rollback()
    raise
finally:
    # Always release the cursor and connection, even when a row fails.
    cursor.close()
    postgres_conn.close()
print("Data successfully transferred to PostgreSQL!")

# 4. Spark SQL Analysis
# Obtain (or reuse) the Spark session for this analysis.
spark_builder = SparkSession.builder.appName("RetailAnalysis")
spark = spark_builder.getOrCreate()

# JDBC settings for reading the marketing_campaign table out of
# PostgreSQL.
jdbc_options = {
    "url": "jdbc:postgresql://localhost:5432/retail_db",
    "dbtable": "marketing_campaign",
    "user": "admin",
    "password": "password",
}
jdbc_reader = spark.read.format("jdbc")
for option_name, option_value in jdbc_options.items():
    jdbc_reader = jdbc_reader.option(option_name, option_value)
df_spark = jdbc_reader.load()

# Register the DataFrame under a view name so SQL queries can refer to
# it as "marketing_campaign".
df_spark.createOrReplaceTempView("marketing_campaign")

# Example queries
# sql() is a SparkSession method; a DataFrame has no sql() attribute, so
# the queries run through `spark` against the registered temp view.

# Number of purchases per product category, most popular first.
spark.sql("SELECT product_category, COUNT(*) as purchases FROM marketing_campaign GROUP BY product_category ORDER BY purchases DESC").show()

# Top 5 customers by total amount spent.
spark.sql("SELECT customer_id, SUM(amount_spent) as total_spent FROM marketing_campaign GROUP BY customer_id ORDER BY total_spent DESC LIMIT 5").show()

# Export to Parquet
# Write the Spark DataFrame loaded from PostgreSQL to a Parquet dataset.
# NOTE(review): no save mode is set, so Spark's default applies —
# confirm whether re-runs should overwrite an existing output path.
parquet_output_path = "/mnt/data/marketing_campaign.parquet"
parquet_writer = df_spark.write
parquet_writer.parquet(parquet_output_path)
print("Data exported to Parquet!")
