In [None]:
import numpy as np
import pandas as pd
from pymongo import MongoClient, ASCENDING

def fill_na_values(raw_df, boolean_cols=None):
    """Return a cleaned copy of ``raw_df`` with missing values filled in.

    Missing values are replaced with ``""`` in columns whose dtype is
    ``object`` (or a pandas ``StringDtype``) and with ``0`` in columns that
    have a NumPy numeric dtype. Columns of any other dtype are left as they
    are. Afterwards, every column listed in ``boolean_cols`` is cast with
    ``astype(bool)``.

    Args:
        raw_df: Source DataFrame. It is not modified; all work happens on a
            copy.
        boolean_cols: Optional iterable of column names to cast to ``bool``
            after filling. ``None`` (the default) casts nothing.

    Returns:
        A new DataFrame with the fills and casts applied.
    """
    # Copy first so the caller's frame is never mutated as a side effect.
    clean_df = raw_df.copy()

    string_cols = []
    number_cols = []
    for col in clean_df.columns:
        dtype = clean_df[col].dtype
        if dtype == object or isinstance(dtype, pd.StringDtype):
            string_cols.append(col)
        # np.issubdtype cannot interpret pandas extension dtypes, so only
        # hand it genuine NumPy dtypes.
        elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.number):
            number_cols.append(col)

    # Skip the assignment entirely when there is nothing to fill.
    if string_cols:
        clean_df[string_cols] = clean_df[string_cols].fillna("")
    if number_cols:
        clean_df[number_cols] = clean_df[number_cols].fillna(0)

    # Cast after filling so former NaN cells ("" or 0) become False.
    for col in boolean_cols or ():
        clean_df[col] = clean_df[col].astype(bool)

    return clean_df

# MongoDB connection settings.
MONGO_URI = "mongodb://localhost:27017/"
DB_NAME = "CS3203"

# Names of the collections this script drops and rebuilds.
COLLECTION_CATEGORIES = "categories"
COLLECTION_SOURCES = "sources"
COLLECTION_POSTS = "posts"
COLLECTION_COMPLAINTS = "complaints"

# Open the client and take a handle to the target database.
client = MongoClient(MONGO_URI)
db = client.get_database(DB_NAME)

# 1. Collection: categories
# Each document is built from a (name, hex colour) pair; the trailing note on
# every pair explains the colour choice.
categories_data = [
    {"name": label, "color": hex_code}
    for label, hex_code in (
        ("Housing", "#8D6E63"),  # Warm Earthy Brown - homes and buildings
        ("Healthcare", "#E74C3C"),  # Bold Red - urgency and medical cross color
        ("Public Safety", "#FF5733"),  # Vibrant Red-Orange - alert and emergency
        ("Transport", "#1ABC9C"),  # Teal - modern transport vibes
        ("Education", "#3498DB"),  # Bright Blue - trust and knowledge
        ("Environment", "#2ECC71"),  # Fresh Green - nature and sustainability
        ("Employment", "#F1C40F"),  # Bright Yellow - opportunities and career
        ("Public Health", "#E67E22"),  # Warm Orange - community wellness
        ("Legal", "#9C640C"),  # Deep Brown-Gold - traditional legal scales
        ("Economy", "#F39C12"),  # Golden Orange - wealth and finance
        ("Politics", "#5DADE2"),  # Sky Blue - governance and transparency
        ("Technology", "#8E44AD"),  # Purple - innovation and creativity
        ("Infrastructure", "#34495E"),  # Dark Gray-Blue - solid structures
        ("Others", "#95A5A6"),  # Neutral Gray - miscellaneous
    )
]

# Rebuild the collection from scratch, then enforce unique category names.
categories_collection = db[COLLECTION_CATEGORIES]
categories_collection.drop()
categories_collection.insert_many(categories_data)
categories_collection.create_index([("name", ASCENDING)], unique=True)

print("Collection categories inserted successfully!")

# 2. Collection: sources
# Single known source for now; the colour is Vibrant Red-Orange.
sources_data = [dict(name="Reddit", color="#FF5733")]

# Rebuild the collection from scratch, then enforce unique source names.
sources_collection = db[COLLECTION_SOURCES]
sources_collection.drop()
sources_collection.insert_many(sources_data)
sources_collection.create_index([("name", ASCENDING)], unique=True)

print("Collection sources inserted successfully!")

# 3. Collection: posts
import pandas as pd
from pymongo import MongoClient, ASCENDING
import pandas as pd

# Load the historical posts export and fill missing values by column type.
raw_df = pd.read_csv("./data/historical_posts.csv")
clean_df = fill_na_values(raw_df)

# created_utc is interpreted as seconds since the Unix epoch.
clean_df["created_utc"] = pd.to_datetime(clean_df["created_utc"], unit="s")

# Comments arrive as one "|"-joined string; split them and discard empty
# pieces as well as the "[deleted]" / "[removed]" placeholders.
clean_df["comments"] = clean_df["comments"].apply(
    lambda joined: [
        piece
        for piece in joined.split("|")
        if piece not in ("", "[deleted]", "[removed]")
    ]
)
clean_df = clean_df.dropna()

posts_data = clean_df.to_dict(orient="records")

# Rebuild the collection from scratch, then index by post id (unique) and by
# creation time.
posts_collection = db[COLLECTION_POSTS]
posts_collection.drop()
posts_collection.insert_many(posts_data)
posts_collection.create_index([("id", ASCENDING)], unique=True)
posts_collection.create_index([("created_utc", ASCENDING)])

print("Collection posts inserted successfully!")

# 4. Collection: complaints
# Category names that are kept as-is; any other value is stored as "Others".
categories = {i["name"] for i in categories_data}

raw_df = pd.read_csv("./data/sentiment_scored_2023_data.csv")

# Resolve the post-identifier column before touching the data.
# NOTE(review): a recorded run of this script stopped with KeyError: 'name',
# which suggests the CSV does not always carry a "name" column. Falling back
# to "id" (the key used by the posts collection) is an assumption - confirm
# against the actual CSV schema.
id_column = next((col for col in ("name", "id") if col in raw_df.columns), None)

# Fail early with one descriptive error instead of a bare KeyError part-way
# through the transformation.
required_columns = (
    "Intent Category",
    "title",
    "selftext",
    "Domain Category",
    "created_utc",
    "sentiment_title_selftext_polarity",
)
missing_columns = [col for col in required_columns if col not in raw_df.columns]
if id_column is None:
    missing_columns.insert(0, "name")
if missing_columns:
    raise KeyError(
        "sentiment_scored_2023_data.csv is missing required column(s) "
        f"{missing_columns}; available columns: {list(raw_df.columns)}"
    )

# Only rows classified as direct complaints are loaded.
raw_df = raw_df[raw_df["Intent Category"] == "Direct Complaint"]

clean_df = pd.DataFrame(
    {
        "id": raw_df[id_column],
        "title": raw_df["title"],
        # An empty description is allowed, so it must not trigger dropna().
        "description": raw_df["selftext"].fillna(""),
        "category": raw_df["Domain Category"].apply(
            lambda i: i if i in categories else "Others"
        ),
        # NOTE(review): the posts section parses created_utc with unit='s',
        # while no unit is given here - confirm this CSV stores date strings
        # rather than epoch seconds.
        "date": pd.to_datetime(raw_df["created_utc"]),
        "sentiment": raw_df["sentiment_title_selftext_polarity"],
        "url": raw_df[id_column].apply(
            lambda i: f"https://www.reddit.com/r/singapore/comments/{i}"
        ),
        "source": "Reddit",
    }
)
# Drop rows that still lack any of the remaining fields.
clean_df = clean_df.dropna()

complaints_data = clean_df.to_dict(orient="records")

# Rebuild the collection from scratch, then add a unique id index and a text
# index over title and description.
complaints_collection = db[COLLECTION_COMPLAINTS]
complaints_collection.drop()
complaints_collection.insert_many(complaints_data)
complaints_collection.create_index([("id", 1)], unique=True)
complaints_collection.create_index([("title", "text"), ("description", "text")])

print("Collection complaints inserted successfully!")

Collection categories inserted successfully!
Collection sources inserted successfully!
Collection posts inserted successfully!


KeyError: 'name'