In [1]:
import json
import os
import random
import uuid
from datetime import datetime, timedelta

import psycopg2
import requests
from faker import Faker

In [2]:
# Seed both random sources so that, for the same input ID lists, the sampled
# students/counselors/dates and the Faker report text repeat across a
# Restart & Run All. meeting_id values come from uuid.uuid4() and stay random.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)
fake = Faker()

In [3]:
def get_db_connection():
    """Open a new psycopg2 connection to the university database.

    Each setting is read from an environment variable and falls back to the
    local development value when that variable is unset:

        UNIVERSITY_DB_HOST      default 'localhost'
        UNIVERSITY_DB_PORT      default '5433'
        UNIVERSITY_DB_USER      default 'admin'
        UNIVERSITY_DB_PASSWORD  default 'password'
        UNIVERSITY_DB_NAME      default 'university'

    Returns:
        Whatever ``psycopg2.connect`` returns; the caller is responsible for
        closing it.
    """
    env = os.environ
    return psycopg2.connect(
        host=env.get('UNIVERSITY_DB_HOST', 'localhost'),
        port=env.get('UNIVERSITY_DB_PORT', '5433'),
        user=env.get('UNIVERSITY_DB_USER', 'admin'),
        password=env.get('UNIVERSITY_DB_PASSWORD', 'password'),
        database=env.get('UNIVERSITY_DB_NAME', 'university')
    )

def load_student_ids():
    """Return every ``student_id`` found in the ``students`` table.

    This is best-effort: if connecting or querying fails, the error is printed
    and an empty list is returned instead of raising, so callers should check
    for an empty result.

    Returns:
        list: The first column of each row returned by the query, or ``[]``
        when the IDs could not be loaded.
    """
    student_ids = []
    conn = None
    cur = None
    try:
        conn = get_db_connection()
        cur = conn.cursor()
        cur.execute("SELECT student_id FROM students")
        student_ids = [row[0] for row in cur.fetchall()]
    except Exception as e:
        print(f"Error loading student IDs: {e}")
    finally:
        # Release the cursor and connection even when the query raised;
        # previously they were only closed on the success path.
        for resource in (cur, conn):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception as e:
                print(f"Error closing database resource: {e}")

    return student_ids

def load_counselor_ids(api_url='http://localhost:8002/counselors', timeout=10):
    """Fetch the counselor list over HTTP and return the counselor IDs.

    Args:
        api_url: Endpoint expected to return a JSON array of objects that each
            have a ``counselor_id`` key.
        timeout: Seconds passed to ``requests.get`` so an unresponsive service
            cannot hang the notebook indefinitely.

    Returns:
        list: The ``counselor_id`` of every object in the response, in
        response order.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response (via ``raise_for_status``).
        KeyError: If an object in the response has no ``counselor_id`` key.
    """
    response = requests.get(api_url, timeout=timeout)
    # Surface HTTP errors here rather than failing later on an error payload.
    response.raise_for_status()
    return [obj['counselor_id'] for obj in response.json()]

In [5]:
student_ids = load_student_ids()
counselor_ids = load_counselor_ids()
# load_student_ids() prints database errors and returns [], and random.choice()
# on an empty list only raises IndexError later during generation, so stop
# here with a clear message instead.
assert student_ids, "No student IDs loaded - check the database connection"
assert counselor_ids, "No counselor IDs loaded - check the counselor API"
print(len(student_ids), len(counselor_ids))

9000 12


In [6]:
# Generation parameters: how many reports to create, and the first and last
# calendar day that a meeting may fall on.
NUM_RECORDS = 500
START_DATE = datetime(year=2023, month=8, day=1)
END_DATE = datetime(year=2024, month=8, day=31)

In [7]:
# Month numbers (1-12) that count as busy periods.
REGISTRATION_MONTHS = [4, 11]  # April and November
EXAM_MONTHS = [5, 12]          # May and December

def is_peak_season(dt):
    """Return True when ``dt`` falls in a registration or exam month."""
    month = dt.month
    if month in REGISTRATION_MONTHS:
        return True
    return month in EXAM_MONTHS

def generate_request_time(date):
    """Return ``date`` with a randomly drawn hour, minute and second.

    Saturdays and Sundays draw the hour from 10-14 with 12 weighted highest;
    other days draw from 9-11 and 13-16 with 11 and 13 weighted highest.
    Minute and second are each drawn uniformly from 0-59.
    """
    is_weekend = date.weekday() >= 5
    if is_weekend:
        hours, hour_weights = [10, 11, 12, 13, 14], [1, 2, 3, 2, 1]
    else:
        hours, hour_weights = [9, 10, 11, 13, 14, 15, 16], [1, 2, 3, 3, 2, 2, 1]

    (hour,) = random.choices(hours, weights=hour_weights, k=1)
    # Keyword arguments evaluate left to right: minute is drawn before second.
    return date.replace(
        hour=hour,
        minute=random.randint(0, 59),
        second=random.randint(0, 59),
    )

In [8]:
def weighted_dates():
    """Build the sampling pool of dates between START_DATE and END_DATE.

    Every day in the inclusive range appears in the returned list once, plus
    two extra times if it is a weekday and two extra times if it is in peak
    season, so a uniform pick from the list favours those days.
    """
    pool = []
    total_days = (END_DATE - START_DATE).days

    for offset in range(total_days + 1):
        day = START_DATE + timedelta(days=offset)
        copies = 1
        copies += 2 if day.weekday() < 5 else 0  # Weekday
        copies += 2 if is_peak_season(day) else 0
        pool += [day] * copies

    return pool

In [12]:
def generate_reports(student_ids, counselor_ids, num_records=None):
    """Build synthetic past-meeting report records.

    Args:
        student_ids: Non-empty sequence of student IDs to sample from.
        counselor_ids: Non-empty sequence of counselor IDs to sample from.
        num_records: Number of records to generate. When None (the default),
            the module-level ``NUM_RECORDS`` is used.

    Returns:
        list[dict]: One dict per record with the keys ``meeting_id`` (a
        uuid4 string), ``student_id``, ``counselor_id``, ``meeting_date``
        (ISO-format timestamp string), ``format`` ("in-person" or "online")
        and ``report_content`` (Faker paragraph text).

    Raises:
        ValueError: If ``student_ids`` or ``counselor_ids`` is empty.
    """
    # random.choice() on an empty sequence raises a bare IndexError; report
    # the real cause up front instead.
    if len(student_ids) == 0:
        raise ValueError("student_ids is empty; cannot generate reports")
    if len(counselor_ids) == 0:
        raise ValueError("counselor_ids is empty; cannot generate reports")

    if num_records is None:
        num_records = NUM_RECORDS

    date_pool = weighted_dates()
    data = []

    for _ in range(num_records):
        meeting_id = str(uuid.uuid4())
        student_id = random.choice(student_ids)
        counselor_id = random.choice(counselor_ids)
        date = random.choice(date_pool)
        timestamp = generate_request_time(date)
        # Named meeting_format so the builtin format() is not shadowed.
        meeting_format = random.choice(["in-person", "online"])
        report_content = fake.paragraph(nb_sentences=10)

        data.append({
            "meeting_id": meeting_id,
            "student_id": student_id,
            "counselor_id": counselor_id,
            "meeting_date": timestamp.isoformat(),
            "format": meeting_format,
            "report_content": report_content
        })

    return data

In [13]:
# Generate the synthetic meeting reports from the loaded IDs and show how many
# records were produced.
data = generate_reports(student_ids, counselor_ids)
len(data)

500

In [17]:
# 'meeting_date' holds the strings produced by timestamp.isoformat(), so this
# is a plain string sort on that key (earliest timestamp string first).
data_sorted = sorted(data, key=lambda d: d['meeting_date'])

In [18]:
# Preview the first five records of the sorted list.
data_sorted[:5]

[{'meeting_id': '6c8c3ee2-e259-430f-9dfd-ecdd22b7b811',
  'student_id': 'lr451',
  'counselor_id': 'jthompson',
  'meeting_date': '2023-08-02T13:45:40',
  'format': 'in-person',
  'report_content': 'Weight little product single parent themselves. Area marriage throughout no. Economic represent cut yard plan Republican. Really soldier though town increase indeed. Magazine tend ready citizen beautiful clearly. House window worker prepare might design various exactly. Least approach success face hit race themselves machine. Admit modern possible loss skin. Hospital argue anything. Debate amount light real color. Describe page gas hold collection stay new. Language catch magazine ever speech.'},
 {'meeting_id': 'ec963d09-d7cf-49d0-9bbd-a90f025ad2a6',
  'student_id': 'ak647',
  'counselor_id': 'acasados',
  'meeting_date': '2023-08-02T14:16:27',
  'format': 'in-person',
  'report_content': 'Result read have success. Character operation personal table benefit computer. Notice near set. Film 

In [19]:
# Destination for the generated reports. Set the MEETING_REPORTS_PATH
# environment variable to write somewhere else; when it is unset the original
# Google Drive location below is used.
OUTPUT_PATH = os.environ.get(
    "MEETING_REPORTS_PATH",
    "/Users/ngochoang/Library/CloudStorage/GoogleDrive-nhungoc1508@gmail.com/My Drive/Graduate/Semester 2 (Spring 2025)/Big Data Management/Project/P1 - Landing zone/Data/past_meeting_reports.json",
)

# Explicit encoding so the file does not depend on the platform default.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(data_sorted, f, indent=4)