In [None]:
%%sql -r dataframe_2
-- Select the demo database, create the demo schema if it does not exist yet,
-- and make it the current schema so later cells can refer to tables and
-- stages by unqualified name.
USE DATABASE AI_DEMOS;
CREATE SCHEMA IF NOT EXISTS AI_DEMOS.CALL_CENTER_DEMO;
USE SCHEMA AI_DEMOS.CALL_CENTER_DEMO;

In [None]:
# Import python packages
import pandas as pd

# Obtain the Snowpark session that is already active for this notebook.
# Later cells use it to write tables, upload files to stages and run SQL.
from snowflake.snowpark.context import get_active_session
session = get_active_session()

## Generate Structured Data

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def create_call_center_dataset(num_calls=3000, num_employees=25, num_teams=5, seed=42):
    """
    Creates a synthetic call center dataset with call detail records and employee information.

    Calls are placed on weekdays between 2025-07-01 and 2025-09-30. Each call
    starts between 08:00 and 17:30 and carries three increasing timestamps:
    TIMESTAMP1 (call start), TIMESTAMP2 (5-150 seconds after TIMESTAMP1) and
    TIMESTAMP3 (60-900 seconds after TIMESTAMP2), all before 18:00.

    Args:
        num_calls (int): Number of call detail records to generate (>= 0).
        num_employees (int): Number of employees (>= 1); IDs are EMP_0001, EMP_0002, ...
        num_teams (int): Number of teams (>= 1); employees are assigned round-robin
            to TEAM1..TEAM<num_teams>.
        seed (int): Seed applied to both the global ``random`` and ``numpy.random``
            generators so the same arguments always produce the same data.

    Returns:
        tuple: (call_detail_records_df, employees_df) - Two pandas DataFrames.
            call_detail_records_df has columns CALL_ID, EMP_ID, TIMESTAMP1,
            TIMESTAMP2, TIMESTAMP3, SENTIMENT and is sorted by TIMESTAMP1.
            employees_df has columns EMPLOYEE_ID, TEAM.

    Raises:
        ValueError: If num_calls is negative, or num_employees / num_teams is
            smaller than 1.
    """
    if num_calls < 0:
        raise ValueError("num_calls must be >= 0")
    if num_employees < 1:
        raise ValueError("num_employees must be >= 1")
    if num_teams < 1:
        raise ValueError("num_teams must be >= 1")

    # Set random seeds for reproducibility (this reseeds the *global* generators).
    np.random.seed(seed)
    random.seed(seed)

    # Sentiment is drawn per call from {1, 0, -1}. The value 0 is listed twice,
    # so its total probability is 0.3 + 0.1 = 0.4. The list is kept in this
    # exact shape so the seeded draw stays the same.
    SENTIMENTS = [1, 0, -1, 0]
    sentiments = np.random.choice(SENTIMENTS, num_calls, p=[0.4, 0.3, 0.2, 0.1])

    # Employees, assigned to teams round-robin (roughly equal distribution).
    employee_ids = [f"EMP_{i:04d}" for i in range(1, num_employees + 1)]
    teams = [f"TEAM{i}" for i in range(1, num_teams + 1)]
    employees_df = pd.DataFrame(
        [
            {'EMPLOYEE_ID': emp_id, 'TEAM': teams[i % num_teams]}
            for i, emp_id in enumerate(employee_ids)
        ],
        columns=['EMPLOYEE_ID', 'TEAM'],
    )

    # All weekdays (Monday = 0 ... Friday = 4) between July and September 2025.
    start_date = datetime(2025, 7, 1)
    end_date = datetime(2025, 9, 30)
    total_days = (end_date - start_date).days + 1
    weekdays = [
        day.date()
        for day in (start_date + timedelta(days=offset) for offset in range(total_days))
        if day.weekday() < 5
    ]

    call_records = []
    for call_num in range(1, num_calls + 1):
        call_id = f"CALL_{call_num:04d}"

        # NOTE: the order of the random.* calls below determines the generated
        # data for a given seed; do not reorder them.
        emp_id = random.choice(employee_ids)
        random_date = random.choice(weekdays)

        # Call start: a random time from 08:00:00 up to 17:30:59.
        start_hour = random.randint(8, 17)
        start_minute = random.randint(0, 59)
        start_second = random.randint(0, 59)
        if start_hour == 17:
            # Cap late starts at 5:30pm so the call can finish by 6pm.
            start_minute = min(start_minute, 30)

        timestamp1 = datetime(
            random_date.year, random_date.month, random_date.day,
            start_hour, start_minute, start_second,
        )

        # TIMESTAMP1 -> TIMESTAMP2: 5 seconds to 2.5 minutes.
        diff1_seconds = random.randint(5, 150)
        timestamp2 = timestamp1 + timedelta(seconds=diff1_seconds)

        # TIMESTAMP2 -> TIMESTAMP3: 1 minute to 15 minutes.
        diff2_seconds = random.randint(60, 900)
        timestamp3 = timestamp2 + timedelta(seconds=diff2_seconds)

        # Safety net: shift the whole call backwards if it would run past 6pm.
        # With the bounds above the latest possible end is 17:48:29, so this
        # only matters if the start window or durations are widened.
        end_of_day = datetime(random_date.year, random_date.month, random_date.day, 18)
        if timestamp3 > end_of_day:
            overflow = (timestamp3 - end_of_day).total_seconds()
            timestamp1 -= timedelta(seconds=overflow)
            timestamp2 -= timedelta(seconds=overflow)
            timestamp3 -= timedelta(seconds=overflow)

        call_records.append({
            'CALL_ID': call_id,
            'EMP_ID': emp_id,
            'TIMESTAMP1': timestamp1,
            'TIMESTAMP2': timestamp2,
            'TIMESTAMP3': timestamp3
        })

    # Explicit columns keep the frame well-formed even when num_calls == 0.
    call_detail_records_df = pd.DataFrame(
        call_records,
        columns=['CALL_ID', 'EMP_ID', 'TIMESTAMP1', 'TIMESTAMP2', 'TIMESTAMP3'],
    )
    # Sentiments are attached in generation order, before sorting.
    call_detail_records_df['SENTIMENT'] = sentiments

    # Sort by timestamp for more realistic data
    call_detail_records_df = call_detail_records_df.sort_values('TIMESTAMP1').reset_index(drop=True)

    return call_detail_records_df, employees_df


# Build the call detail records and employee tables as pandas DataFrames.
cdr_df, emp_df = create_call_center_dataset()

# Preview the first few call records.
cdr_df.head()

In [None]:
# Upload the pandas call records to AI_DEMOS.CALL_CENTER_DEMO.CALL_DETAIL_RECORDS,
# creating the table if needed and replacing any existing contents.
# The returned Snowpark DataFrame gets its own name: rebinding `cdr_df` would
# turn it into a Snowpark DataFrame, and re-running this cell would then pass
# the wrong type to write_pandas.
cdr_table = session.write_pandas(
    df=cdr_df,
    database='AI_DEMOS',
    schema='CALL_CENTER_DEMO',
    table_name='CALL_DETAIL_RECORDS',
    overwrite=True,
    auto_create_table=True,
    use_logical_type=True
)

# Show the rows as stored in Snowflake.
cdr_table.show()

In [None]:
# Upload the pandas employee table to AI_DEMOS.CALL_CENTER_DEMO.EMPLOYEES,
# creating the table if needed and replacing any existing contents.
# The returned Snowpark DataFrame gets its own name: rebinding `emp_df` would
# turn it into a Snowpark DataFrame, and re-running this cell would then pass
# the wrong type to write_pandas.
emp_table = session.write_pandas(
    df=emp_df,
    database='AI_DEMOS',
    schema='CALL_CENTER_DEMO',
    table_name='EMPLOYEES',
    overwrite=True,
    auto_create_table=True,
    use_logical_type=True
)

# Show the rows as stored in Snowflake.
emp_table.show()

In [None]:
%%sql -r dataframe_1
-- Declare EMPLOYEE_ID as the primary key of EMPLOYEES.
-- The foreign key from CALL_DETAIL_RECORDS is added in the next cell on its
-- EMP_ID column; that table has no EMPLOYEE_ID column, so a foreign key on
-- CALL_DETAIL_RECORDS (EMPLOYEE_ID) cannot be created here.
ALTER TABLE EMPLOYEES ADD CONSTRAINT PK_EMPLOYEE_ID PRIMARY KEY (EMPLOYEE_ID);

In [None]:
%%sql -r dataframe_4
-- Link each call record to the employee who handled it:
-- CALL_DETAIL_RECORDS.EMP_ID references EMPLOYEES.EMPLOYEE_ID.
ALTER TABLE CALL_DETAIL_RECORDS
ADD CONSTRAINT FK_EMPLOYEE_ID
FOREIGN KEY (EMP_ID)
REFERENCES EMPLOYEES (EMPLOYEE_ID);

## Generate Unstructured Data

In [None]:
%%sql -r dataframe_3
CREATE STAGE IF NOT EXISTS CALL_CENTER_RECORDINGS 
	DIRECTORY = ( ENABLE = true ) 
	ENCRYPTION = ( TYPE = 'SNOWFLAKE_SSE' );

CREATE STAGE IF NOT EXISTS DOCUMENTS 
	DIRECTORY = ( ENABLE = true ) 
	ENCRYPTION = ( TYPE = 'SNOWFLAKE_SSE' );

In [None]:
# Upload each local folder to its stage (uncompressed, overwriting existing
# files), then refresh the stage so its directory table lists the new files.
for local_glob, stage_location, refresh_sql in (
    ('call-recordings/*', '@CALL_CENTER_RECORDINGS', "ALTER STAGE CALL_CENTER_RECORDINGS REFRESH"),
    ('sla-documents/*', '@DOCUMENTS', "ALTER STAGE DOCUMENTS REFRESH"),
):
    session.file.put(local_glob, stage_location, auto_compress=False, overwrite=True)
    _ = session.sql(refresh_sql).collect()