# Customer Service Survey Data Generation

Author: Mary Radau

This notebook creates a synthetic dataset of 150 observations and 27 columns representing post-call customer service surveys for a phone company. The dataset includes numeric, categorical, 0/1 flag, datetime, and text fields, with some injected outliers and missing comments to make it suitable for data exploration and cleaning exercises. The generated CSV is saved as `customer_service_survey.csv`.

In [5]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta
from pathlib import Path
import os

# --- Reproducibility ---
# Seed NumPy's global generator and the stdlib `random` module; both are
# drawn from further down in this cell.
np.random.seed(42)
random.seed(42)

# Number of survey rows to generate.
n = 150

# --- Output location ---
# The CSV is written to a `data` folder located beside (not inside) the
# directory the kernel is currently running in.
current_working_directory = os.getcwd()
file_path = Path(current_working_directory)
root_dir = file_path.parent
data_dir = root_dir.joinpath('data')
# Create the folder, plus any missing parents; do nothing if it already exists.
data_dir.mkdir(parents=True, exist_ok=True)

# Helper to create customer IDs
def make_client_ids(n):
    """Return ``n`` sequential client identifiers.

    Each identifier is the prefix ``CID`` followed by a number; numbering
    starts at 10000 and increases by one per identifier
    (``CID10000``, ``CID10001``, ...).

    Parameters
    ----------
    n : int
        How many identifiers to produce.

    Returns
    -------
    list of str
        The identifiers in ascending order (empty when ``n`` is 0 or less).
    """
    first_number = 10000
    return ['CID' + str(number) for number in range(first_number, first_number + n)]

# One sequential identifier per survey row
client_id = make_client_ids(n)

# Call start and end times (end = start + duration)
start_base = datetime(2025, 1, 1)
# Draw order is kept fixed so seeded output is unchanged:
# day offsets first, then minute offsets, then durations.
day_offsets = np.random.exponential(scale=30, size=n)
minute_offsets = np.random.uniform(0, 60, size=n)
# Offsets are truncated to whole days / whole minutes before being added to the base date
start_times = [
    start_base + timedelta(days=int(day_off), minutes=int(minute_off))
    for day_off, minute_off in zip(day_offsets, minute_offsets)
]
# Durations in minutes: normal draw with a floor of 0.5 and no upper bound
raw_durations = np.random.normal(loc=8, scale=6, size=n)
dur_minutes = np.clip(raw_durations, 0.5, None)
end_times = [
    began + timedelta(minutes=float(length))
    for began, length in zip(start_times, dur_minutes)
]

# Satisfaction score on a 1-10 scale: normal draw centred on 8, rounded to
# whole numbers and clipped into range
raw_scores = np.random.normal(loc=8, scale=1.8, size=n)
satisfaction = np.clip(np.round(raw_scores), 1, 10).astype(int)
# Overwrite four distinct, randomly chosen rows with an extreme score (1 or 10)
extreme_rows = np.random.choice(n, size=4, replace=False)
for row in extreme_rows:
    satisfaction[row] = np.random.choice([1, 10])

# Monthly bill in dollars - right skewed with some extreme bills
monthly_bill = np.round(np.random.lognormal(mean=4, sigma=0.6, size=n), 2)  # median bill is exp(4), roughly 54.6
# Scale three distinct random rows by a factor drawn from uniform(5, 20)
# to mimic very large corporate accounts
for idx in np.random.choice(n, size=3, replace=False):
    monthly_bill[idx] *= np.random.uniform(5, 20)
# Round again: the scaling above leaves the three outlier rows with arbitrary
# decimals, while every other row is already at cent precision and is unchanged.
monthly_bill = np.round(monthly_bill, 2)

# Agent ID: the letter 'A' followed by a random integer from 100 to 199,
# drawn with one generator call per row
agent_id = ['A' + str(np.random.randint(100, 200)) for _ in range(n)]

# Team assignment, sampled with fixed weights
team_names = ['Retention', 'Billing', 'Technical Support', 'Sales']
team_weights = [0.25, 0.25, 0.35, 0.15]
team = np.random.choice(team_names, size=n, p=team_weights)

# Resolution flag (1 resolved, 0 unresolved); the success probability is looked up per team
team_prob = {'Retention': 0.9, 'Billing': 0.85, 'Technical Support': 0.75, 'Sales': 0.8}
resolved = []
for team_name in team:
    resolved.append(np.random.binomial(1, team_prob[team_name]))

# Hold time in whole seconds: exponential draw (scale 30), rounded to integers
raw_hold = np.random.exponential(scale=30, size=n)
hold_time = np.round(raw_hold).astype(int)
# Multiply five distinct random rows by 10 to inject long-hold outliers
long_hold_rows = np.random.choice(n, size=5, replace=False)
hold_time[long_hold_rows] = hold_time[long_hold_rows] * 10

# Queue position (1 = first); NaN marks rows with no queue
no_queue = np.random.rand(n) < 0.2              # drawn first: True where the position is missing
positions = np.random.randint(1, 20, size=n)    # drawn second: integers from 1 to 19
queue_pos = np.where(no_queue, np.nan, positions).astype(float)

# Call rating (thumbs up/down) derived from satisfaction with noise:
#   score >= 7 -> 1 unless a random draw is <= 0.1
#   score <= 4 -> 0 unless a random draw is <= 0.2
#   anything else (including the "unless" cases) -> NaN
# A random number is only drawn when the score test on the same line passes.
thumbs_up = []
for score in satisfaction:
    if score >= 7 and np.random.rand() > 0.1:
        thumbs_up.append(1)
    elif score <= 4 and np.random.rand() > 0.2:
        thumbs_up.append(0)
    else:
        thumbs_up.append(np.nan)

# Comments - a phrase from the fixed list below about 60% of the time,
# otherwise an empty string (written to the CSV as an empty field)
sample_comments = ['Agent was helpful and resolved my issue quickly',
                   'Wait time was too long',
                   'Billing error corrected',
                   'Not happy with the service',
                   'Excellent support!',
                   'Transferred multiple times before resolution',
                   'I was disconnected',
                   'Great, thanks',
                   'Agent could not help',
                   'Resolved after supervisor escalation']
# The presence test is evaluated first; a phrase is only picked when it passes
comments = [random.choice(sample_comments) if np.random.rand() < 0.6 else '' for _ in range(n)]
# Make some comments extremely long for text analysis edge cases:
# four distinct rows get 20-49 repetitions of the word 'very' appended
for idx in np.random.choice(n, size=4, replace=False):
    filler = ' '.join(['very'] * np.random.randint(20, 50))
    # Join only the non-empty parts so a row whose comment was empty
    # does not end up with a leading space
    comments[idx] = ' '.join(part for part in (comments[idx], filler) if part)

# Follow-up required flag: 1 when the comment mentions a supervisor or billing, else 0.
# Both keywords are matched case-insensitively (previously only 'billing' was).
follow_up_keywords = ('supervisor', 'billing')
follow_up = [
    1 if any(keyword in c.lower() for keyword in follow_up_keywords) else 0
    for c in comments
]

# Customer tenure in months: exponential draw (scale 24) truncated to whole
# months and limited to the range 0..600
raw_tenure = np.random.exponential(scale=24, size=n)
tenure_months = np.clip(raw_tenure.astype(int), 0, 600)
# Two distinct random rows are overwritten with 999 as extreme outliers
# (set after clipping, so they exceed the 600 cap)
tenure_outlier_rows = np.random.choice(n, size=2, replace=False)
tenure_months[tenure_outlier_rows] = 999

# Number of prior calls in last 3 months - Poisson (lam=0.6) with occasional spikes
prior_calls_3m = np.random.poisson(lam=0.6, size=n).astype(int)
# Add a random integer from 5 to 29 to five distinct rows to create the spikes
for idx in np.random.choice(n, size=5, replace=False):
    prior_calls_3m[idx] += np.random.randint(5, 30)

# Language preference, sampled with fixed shares
language_share = {'English': 0.78, 'Spanish': 0.18, 'Other': 0.04}
language = np.random.choice(list(language_share), size=n, p=list(language_share.values()))

# Customer age in whole years: normal draw truncated to an integer, limited to 18..90
raw_age = np.random.normal(loc=40, scale=12, size=n)
age = np.clip(raw_age.astype(int), 18, 90)

# Device type, sampled with fixed shares
device_share = {'Android': 0.45, 'iOS': 0.35, 'Feature Phone': 0.05, 'Web': 0.15}
device = np.random.choice(list(device_share), size=n, p=list(device_share.values()))

# Promotion flag - 1 when the customer is on a promotional plan (probability 0.12)
promo = np.random.binomial(1, 0.12, size=n).astype(int)

# Contract type, sampled with fixed shares
contract_share = {'Month-to-month': 0.6, '1-year': 0.25, '2-year': 0.15}
contract = np.random.choice(list(contract_share), size=n, p=list(contract_share.values()))

# Satisfaction delta - difference between expectation and outcome,
# rounded to whole numbers and limited to -5..5
raw_delta = np.random.normal(loc=0, scale=1.6, size=n)
satisfaction_delta = np.clip(np.round(raw_delta).astype(int), -5, 5)

# Net promoter score style (0-10): normal draw centred on 7, rounded then clipped
raw_nps = np.random.normal(loc=7, scale=2, size=n)
nps = np.clip(np.round(raw_nps), 0, 10).astype(int)

# Support topic, sampled with fixed shares
topic_names = ['Connectivity', 'Billing', 'Account', 'Technical', 'Plan Change', 'Other']
topic_weights = [0.35, 0.2, 0.15, 0.2, 0.05, 0.05]
topic = np.random.choice(topic_names, size=n, p=topic_weights)

# Escalation level (0 none, 1 supervisor, 2 manager)
escalation_levels = [0, 1, 2]
escalation_weights = [0.9, 0.08, 0.02]
escalation = np.random.choice(escalation_levels, size=n, p=escalation_weights)

# Discount given ($) - zero unless the row is selected (probability 0.18),
# in which case an exponential amount (scale 8) rounded to cents
gets_discount = np.random.rand(n) < 0.18                    # drawn first
discount_amounts = np.random.exponential(scale=8, size=n)   # drawn second
discount_given = np.round(np.where(gets_discount, discount_amounts, 0), 2)
# Two distinct random rows are overwritten with 999.99 as absurd outliers
absurd_rows = np.random.choice(n, size=2, replace=False)
discount_given[absurd_rows] = 999.99

# Satisfaction reason code: an integer category from 1 to 5, lower codes more likely
reason_values = [1, 2, 3, 4, 5]
reason_weights = [0.3, 0.25, 0.2, 0.15, 0.1]
reason_code = np.random.choice(reason_values, size=n, p=reason_weights)

# Platform used to call (VoIP, Mobile, Landline)
platform_names = ['VoIP', 'Mobile', 'Landline']
platform_weights = [0.5, 0.4, 0.1]
platform = np.random.choice(platform_names, size=n, p=platform_weights)

# Build DataFrame
# Take the generation timestamp once so every row of this batch carries the
# same `created_at` value instead of per-row microsecond differences.
generated_at = datetime.now()
# NOTE(review): `promo` and `reason_code` are generated earlier in this cell
# but are not added as columns here - confirm whether that is intentional.
df = pd.DataFrame({
    'client_id': client_id,
    'call_start': start_times,
    'call_end': end_times,
    'call_duration_min': np.round(dur_minutes, 2),
    'agent_id': agent_id,
    'team': team,
    'topic': topic,
    'resolved': resolved,
    'escalation_level': escalation,
    'satisfaction_score': satisfaction,
    'satisfaction_delta': satisfaction_delta,
    'nps_score': nps,
    'monthly_bill_usd': monthly_bill,
    'discount_given_usd': discount_given,
    'tenure_months': tenure_months,
    'prior_calls_3m': prior_calls_3m,
    'hold_time_seconds': hold_time,
    'queue_position': queue_pos,
    'thumbs_up': thumbs_up,
    'comments': comments,
    'follow_up_required': follow_up,
    'language': language,
    'age': age,
    'device': device,
    'contract_type': contract,
    'platform': platform,
    'created_at': [generated_at] * n
})

# Quick sanity checks: one row per generated observation and at least 25 columns
row_count, column_count = df.shape
assert row_count == n
assert column_count >= 25

# Save to CSV (without the index column) inside the data directory
out_path = data_dir.joinpath('customer_service_survey.csv')
df.to_csv(out_path, index=False)
print(f'Wrote {out_path} with shape', df.shape)
# Show the first five rows transposed so every column is visible
df.head().T


Wrote c:\Users\MFSALASG\Dropbox\Project Inventory\Data_Exploration\data\customer_service_survey.csv with shape (150, 27)


Unnamed: 0,0,1,2,3,4
client_id,CID10000,CID10001,CID10002,CID10003,CID10004
call_start,2025-01-15 00:54:00,2025-04-01 00:14:00,2025-02-09 00:08:00,2025-01-28 00:29:00,2025-01-06 00:59:00
call_end,2025-01-15 01:02:16.405862,2025-04-01 00:18:05.423875,2025-02-09 00:28:51.819872,2025-01-28 00:40:48.210848,2025-01-06 00:59:30
call_duration_min,8.27,4.09,20.86,11.8,0.5
agent_id,A119,A192,A162,A153,A173
team,Retention,Technical Support,Technical Support,Retention,Technical Support
topic,Billing,Connectivity,Account,Plan Change,Technical
resolved,1,1,1,1,1
escalation_level,0,0,0,0,0
satisfaction_score,4,7,7,8,9
