## 0. Importing and preprocessing the data

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

# Load the raw call log; the path is relative to the notebook's working directory.
df = pd.read_excel('Call-Center-Dataset.xlsx')

# Numeric columns whose missing values are replaced with 0 later in this cell.
target_columns = ['Speed of answer in seconds', 'AvgTalkDuration', 'Satisfaction rating']

def time_to_seconds(t):
    """Convert a ``datetime.time`` value to whole seconds since midnight.

    Returns ``np.nan`` for missing values and for anything that is not a
    ``datetime.time`` instance.
    """
    if not pd.isnull(t) and isinstance(t, datetime.time):
        return 3600 * t.hour + 60 * t.minute + t.second
    return np.nan

# Convert the datetime.time values of both columns to seconds (NaN where not convertible).
for time_column in ('AvgTalkDuration', 'Time'):
    df[time_column] = df[time_column].apply(time_to_seconds)

# Replace the remaining missing values in the numeric target columns with 0.
df[target_columns] = df[target_columns].mask(df[target_columns].isna(), 0)

In [2]:
df

Unnamed: 0,Call Id,Agent,Date,Time,Topic,Answered (Y/N),Resolved,Speed of answer in seconds,AvgTalkDuration,Satisfaction rating
0,ID0001,Diane,2021-01-01,33178,Contract related,Y,Y,109.0,143.0,3.0
1,ID0002,Becky,2021-01-01,33178,Technical Support,Y,N,70.0,242.0,3.0
2,ID0003,Stewart,2021-01-01,35251,Contract related,Y,Y,10.0,131.0,3.0
3,ID0004,Greg,2021-01-01,35251,Contract related,Y,Y,53.0,37.0,2.0
4,ID0005,Becky,2021-01-01,36029,Payment related,Y,Y,95.0,60.0,3.0
...,...,...,...,...,...,...,...,...,...,...
4995,ID4996,Jim,2021-03-31,59875,Payment related,Y,Y,22.0,340.0,1.0
4996,ID4997,Diane,2021-03-31,60307,Payment related,Y,Y,100.0,196.0,3.0
4997,ID4998,Diane,2021-03-31,60826,Payment related,Y,Y,84.0,109.0,4.0
4998,ID4999,Jim,2021-03-31,61344,Streaming,Y,Y,98.0,58.0,5.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Call Id                     5000 non-null   object 
 1   Agent                       5000 non-null   object 
 2   Date                        5000 non-null   object 
 3   Time                        5000 non-null   int64  
 4   Topic                       5000 non-null   object 
 5   Answered (Y/N)              5000 non-null   object 
 6   Resolved                    5000 non-null   object 
 7   Speed of answer in seconds  5000 non-null   float64
 8   AvgTalkDuration             5000 non-null   float64
 9   Satisfaction rating         5000 non-null   float64
dtypes: float64(3), int64(1), object(6)
memory usage: 390.8+ KB


## 1. Exploratory Data Analysis (EDA)

**Mean Clients Served and Abandonment Count**

In [4]:
# Mean number of distinct call ids (answered or not) logged per date.
calls_per_day = df.groupby('Date')['Call Id'].nunique()
average_client_calling = calls_per_day.mean()

In [5]:
# Count abandoned (unanswered) calls for every date present in the data set.
# Filtering first and grouping afterwards silently drops dates that have no
# abandoned call, which biases the daily mean upwards; re-indexing over all
# dates keeps those days as 0.
abandoned_calls_per_day = (
    df.loc[df["Answered (Y/N)"] == "N"]
    .groupby('Date')['Call Id']
    .nunique()
    .reindex(df['Date'].dropna().unique(), fill_value=0)
)
average_abandonment_count = abandoned_calls_per_day.mean()

**Average working time**

In [6]:
# Earliest and latest call timestamp observed on each date.
daily_time_range = df.groupby('Date')['Time'].agg(['min', 'max'])
# NOTE: despite its name, 'working_hours' is a span in seconds, because 'Time'
# was converted to seconds since midnight during preprocessing.
daily_time_range['working_hours'] = daily_time_range['max'] - daily_time_range['min']
# 'Date' stays as the index: the former bare `daily_time_range.reset_index()`
# discarded its result and therefore had no effect, so it has been removed.
average_working_time = daily_time_range['working_hours'].mean()

**Average service time per call, overall and per agent and topic**

In [7]:
# Mean talk duration over answered calls only.
answered_mask = df["Answered (Y/N)"] == "Y"
average_service_duration = df.loc[answered_mask, "AvgTalkDuration"].mean()

In [8]:
# Mean talk duration for every (Agent, Topic) pair, answered calls only.
(
    df.loc[df["Answered (Y/N)"] == "Y"]
    .groupby(["Agent", "Topic"])[["AvgTalkDuration"]]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,AvgTalkDuration
Agent,Topic,Unnamed: 2_level_1
Becky,Admin Support,224.31
Becky,Contract related,220.275862
Becky,Payment related,225.535714
Becky,Streaming,210.630252
Becky,Technical Support,220.454545
Dan,Admin Support,230.528736
Dan,Contract related,228.544554
Dan,Payment related,240.933962
Dan,Streaming,227.377193
Dan,Technical Support,228.808696


**Abandonment Rate**

In [9]:
# Abandonment rate: mean daily abandoned-call count divided by mean daily call count.
average_abandonment_rate = average_abandonment_count/average_client_calling

**Idle time**

In [10]:
# Order each agent's calls chronologically within every date.
df_sorted = df.sort_values(by=['Date', 'Agent', 'Time'])

# Gap between consecutive call timestamps of the same agent on the same date.
# NOTE(review): this is the spacing between call timestamps; the previous call's
# talk duration is not subtracted — confirm this is the intended "idle time".
df_sorted['IdleTime'] = df_sorted.groupby(['Date', 'Agent'])['Time'].diff()

# The first call of every (Date, Agent) group has no predecessor, so its gap is
# NaN; mean() skips those NaN values (no rows are dropped).
idle_time_per_agent_date = (
    df_sorted.groupby(['Date', 'Agent'])['IdleTime']
    .mean()
    .rename('Average Idle Time (seconds)')
    .reset_index()
)

average_idle_time = idle_time_per_agent_date["Average Idle Time (seconds)"].mean()

#### Summary

In [11]:
# (label, value) pairs for the headline EDA figures computed above.
summary_rows = [
    ('Average number of client served', average_client_calling),
    ('Average idle time', average_idle_time),
    ('Average service duration', average_service_duration),
    ('Average working time', average_working_time),
    ('Average abandonment rate', average_abandonment_rate),
]

# Column-oriented view of the same pairs.
summary_data = {
    'Name': [label for label, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}

# Two-column table, one row per metric.
summary_df = pd.DataFrame(summary_data)

# Rendered as the cell output.
summary_df


Unnamed: 0,Name,Value
0,Average number of client served,55.555556
1,Average idle time,4605.545152
2,Average service duration,224.922792
3,Average working time,30504.933333
4,Average abandonment rate,0.1892


## 2. Distribution Fitting

In [12]:
seed_num = 111 # Seed passed to np.random.seed in the KS-test helpers, for reproducibility

### 2.1 Data Collection

In [13]:
# Topic and agent names passed to the fitting and KS-test helpers further down.
# NOTE(review): hard-coded; presumably mirrors the distinct values of df['Topic'] / df['Agent'] — confirm.
topics = ["Admin Support", "Contract related", "Payment related",  "Streaming", "Technical Support"]
agents = ["Becky", "Dan", "Diane", "Greg", "Jim", "Joe" , "Martha", "Stewart"]


def get_avg_talk_duration(agent, topic):
    """Return the talk durations of the answered calls of one agent on one topic.

    Parameters
    ----------
    agent : str
        Value matched against the ``Agent`` column of the module-level ``df``.
    topic : str
        Value matched against the ``Topic`` column of the module-level ``df``.

    Returns
    -------
    list or str
        The ``AvgTalkDuration`` values of the matching rows, in the row order
        of ``df``. When no answered call matches, the message
        ``"No records found for Agent: ..., Topic: ..."`` is returned instead,
        so callers must check the type before treating the result as data.
    """
    # Select the rows directly. The previous version regrouped the whole frame
    # and built the list of every (Agent, Topic) pair on each call, only to keep
    # a single one of them.
    matches = (
        (df["Answered (Y/N)"] == "Y")
        & (df["Agent"] == agent)
        & (df["Topic"] == topic)
    )
    durations = df.loc[matches, "AvgTalkDuration"].tolist()
    if durations:
        return durations
    return f"No records found for Agent: {agent}, Topic: {topic}"


# Gaps between consecutive distinct timestamps
def calculate_inter_arrival_times(time_list):
    """Return the differences between consecutive distinct values of ``time_list``.

    Duplicate timestamps are collapsed first and the remaining values are
    sorted in ascending order, so every returned gap is strictly positive.
    The result is a NumPy array with one element fewer than the number of
    distinct timestamps (empty when there are fewer than two).
    """
    distinct_times = list(set(time_list))
    distinct_times.sort()
    return np.diff(distinct_times)

def _topic_inter_arrival_table(answered_flag):
    """Build the per-day and per-topic inter-arrival tables for one answered flag.

    ``answered_flag`` is compared with the "Answered (Y/N)" column of ``df``.
    Returns ``(per_day, per_topic)``: ``per_day`` has one row per (Date, Topic)
    with the call times and their inter-arrival gaps; ``per_topic`` pools the
    gaps of all days per topic and adds their mean.
    """
    per_day = (
        df[df["Answered (Y/N)"] == answered_flag]
        .groupby(['Date', 'Topic'])['Time']
        .apply(list)
        .reset_index()
    )
    per_day['InterArrivalTimes'] = per_day['Time'].apply(calculate_inter_arrival_times)

    per_topic = (
        per_day.groupby('Topic')['InterArrivalTimes']
        .apply(lambda lists: [time for sublist in lists for time in sublist])
        .reset_index()
    )
    # A topic whose days never hold two distinct timestamps has no gap at all;
    # report NaN instead of raising ZeroDivisionError.
    per_topic['Mean_InterArrivalTime'] = per_topic['InterArrivalTimes'].apply(
        lambda x: sum(x) / len(x) if len(x) else np.nan
    )
    return per_day, per_topic


# Answered calls: inter-arrival times per topic.
topic_time, topic_iat = _topic_inter_arrival_table("Y")

# Abandoned calls: inter-abandonment times per topic.
abandonment_time, abandonment_iat = _topic_inter_arrival_table("N")

**Number of inter-arrival time observations**

In [14]:
# Total number of inter-arrival observations pooled over all topics.
total_iat = sum(len(gaps) for gaps in topic_iat["InterArrivalTimes"])

total_iat

3233

**Number of inter-abandonment time observations**

In [15]:
# Total number of inter-abandonment observations pooled over all topics.
total_abandonment_iat = sum(len(gaps) for gaps in abandonment_iat["InterArrivalTimes"])

total_abandonment_iat

529

### 2.2 Maximum Likelihood Estimation (MLE)

In [16]:
import numpy as np
import pandas as pd
from scipy import stats

def fit_distributions_mle(data, data_name, distributions=None, top_k=3):
    """Fit candidate ``scipy.stats`` distributions by maximum likelihood and rank them.

    Parameters
    ----------
    data : array-like
        Sample to fit.
    data_name : str
        Label copied into every result under the ``'data_name'`` key.
    distributions : list of str, optional
        Names of ``scipy.stats`` continuous distributions to try. Defaults to
        ``['norm', 'expon', 'uniform', 'pareto']``.
    top_k : int, optional
        Accepted for call compatibility but not used here: the full ranking is
        returned and the callers slice it themselves.

    Returns
    -------
    list of dict
        One dict per distribution with the keys ``'distribution'``,
        ``'log_likelihood'``, ``'aic'``, ``'bic'`` and ``'data_name'``, sorted
        by ascending BIC (entries with a non-finite BIC are placed last).
    """
    if distributions is None:
        distributions = ['norm', 'expon', 'uniform', 'pareto']

    # Display names for the known distributions. Built once rather than on
    # every loop iteration; names missing from the table fall back to the scipy
    # name instead of raising KeyError.
    formal_names = {
        'norm': 'Normal',
        'expon': 'Exponential',
        'uniform': 'Uniform',
        'pareto': 'Pareto',
        'lognorm': 'Log-Normal'
    }

    data = np.asarray(data)
    n = len(data)
    results = []

    for dist_name in distributions:
        dist = getattr(stats, dist_name)
        params = dist.fit(data)
        k = len(params)  # number of fitted parameters (shape(s), loc, scale)
        log_likelihood = np.sum(dist.logpdf(data, *params))
        aic = 2 * k - 2 * log_likelihood
        bic = np.log(n) * k - 2 * log_likelihood

        results.append({
            'distribution': formal_names.get(dist_name, dist_name),
            'log_likelihood': log_likelihood,
            'aic': aic,
            'bic': bic,
            'data_name': data_name
        })

    # Rank by lowest BIC. A NaN key would leave the sort order undefined, so
    # non-finite values are ranked last.
    results.sort(key=lambda r: r['bic'] if np.isfinite(r['bic']) else np.inf)

    return results


def summarize_agent_mle(agents, topics, get_data_function, top_k=3):
    """Tabulate the best MLE fits for every agent/topic combination.

    ``get_data_function(agent, topic)`` supplies the sample. Anything other
    than a non-empty list is reported and skipped. For valid samples the first
    ``top_k`` entries returned by ``fit_distributions_mle`` are kept.

    Returns a DataFrame with the columns Agent, Topic, Distribution,
    Log-Likelihood, AIC and BIC.
    """
    rows = []

    for agent in agents:
        for topic in topics:
            samples = get_data_function(agent, topic)
            label = f"{agent} - Topic: {topic}"

            if not (isinstance(samples, list) and samples):
                print(f"⚠️ Skipping {label} — no valid data")
                continue

            ranked = fit_distributions_mle(samples, label, top_k=top_k)
            rows.extend(
                {
                    'Agent': agent,
                    'Topic': topic,
                    'Distribution': fit['distribution'],
                    'Log-Likelihood': fit['log_likelihood'],
                    'AIC': fit['aic'],
                    'BIC': fit['bic'],
                }
                for fit in ranked[:top_k]  # keep only the top_k fits
            )

    return pd.DataFrame(rows)


def summarize_topic_mle(topics, topic_df, display_prefix, top_k=3):
    """Tabulate the best MLE fits of the inter-arrival times of each topic.

    ``topic_df`` must provide the columns ``Topic`` and ``InterArrivalTimes``.
    Only the first row matching a topic is used. Topics that are missing from
    ``topic_df``, or whose data is not a non-empty list, are reported and
    skipped. ``display_prefix`` is only used to label the sample handed to
    ``fit_distributions_mle``.

    Returns a DataFrame with the columns Topic, Distribution, Log-Likelihood,
    AIC and BIC.
    """
    rows = []

    for topic in topics:
        matching = topic_df[topic_df["Topic"] == topic]
        if matching.empty:
            print(f"⚠️ Topic {topic} not found in DataFrame")
            continue

        samples = matching["InterArrivalTimes"].values[0]
        label = f"{display_prefix}: {topic}"

        if not (isinstance(samples, list) and samples):
            print(f"⚠️ Skipping {topic} — empty or invalid data")
            continue

        ranked = fit_distributions_mle(samples, label, top_k=top_k)
        rows.extend(
            {
                'Topic': topic,
                'Distribution': fit['distribution'],
                'Log-Likelihood': fit['log_likelihood'],
                'AIC': fit['aic'],
                'BIC': fit['bic'],
            }
            for fit in ranked[:top_k]
        )

    return pd.DataFrame(rows)

In [18]:
summarize_agent_mle(agents, topics, get_avg_talk_duration, top_k=1)

Unnamed: 0,Agent,Topic,Distribution,Log-Likelihood,AIC,BIC
0,Becky,Admin Support,Uniform,-594.017125,1192.034251,1197.244591
1,Becky,Contract related,Uniform,-516.105331,1036.210663,1041.142479
2,Becky,Payment related,Uniform,-666.179919,1336.359838,1341.796835
3,Becky,Streaming,Uniform,-709.665942,1423.331884,1428.890131
4,Becky,Technical Support,Uniform,-588.59664,1181.193281,1186.38352
5,Dan,Admin Support,Uniform,-518.382948,1040.765897,1045.697713
6,Dan,Contract related,Uniform,-601.800894,1207.601788,1212.832029
7,Dan,Payment related,Uniform,-630.768111,1265.536221,1270.863099
8,Dan,Streaming,Uniform,-674.753899,1353.507799,1358.980196
9,Dan,Technical Support,Uniform,-683.72337,1371.44674,1376.936604


In [20]:
summarize_topic_mle(topics, topic_iat , "Topic Inter-Arrival Times", top_k=1)

Unnamed: 0,Topic,Distribution,Log-Likelihood,AIC,BIC
0,Admin Support,Exponential,-5793.810737,11591.621473,11600.531871
1,Contract related,Exponential,-5745.409883,11494.819767,11503.70166
2,Payment related,Exponential,-5907.715724,11819.431447,11828.382313
3,Streaming,Exponential,-6211.357296,12426.714591,12435.767581
4,Technical Support,Exponential,-5842.209707,11688.419414,11697.336091


In [22]:
summarize_topic_mle(topics, abandonment_iat , "Topic Inter-Arrival Times", top_k=1)

Unnamed: 0,Topic,Distribution,Log-Likelihood,AIC,BIC
0,Admin Support,Exponential,-965.831508,1935.663017,1940.812439
1,Contract related,Exponential,-1041.14966,2086.29932,2091.607241
2,Payment related,Exponential,-1044.066969,2092.133937,2097.441858
3,Streaming,Exponential,-958.022029,1920.044058,1925.19348
4,Technical Support,Exponential,-1237.864895,2479.72979,2485.386417


### 2.3 Method of Moments (MoM)

In [None]:
import numpy as np

def sample_moment(data, order):
    """Return the raw sample moment of the given order: the mean of ``data ** order``."""
    powered = np.power(data, order)
    return np.mean(powered)

def solve_uniform(data):
    """Method-of-moments estimate of the bounds of a Uniform(a, b) distribution.

    Matches the first two raw sample moments m1 and m2. With b = 2*m1 - a the
    lower bound solves the quadratic a^2 - 2*m1*a + (4*m1^2 - 3*m2) = 0.

    Returns ``(lower, upper)``: the smaller real part of the two roots for a,
    and the larger real part of the two corresponding values of b.
    """
    mean_1 = sample_moment(data, 1)
    mean_2 = sample_moment(data, 2)

    # Quadratic coefficients in the unknown lower bound.
    lead = 1
    linear = -2 * mean_1
    const = 4 * mean_1**2 - 3 * mean_2

    disc = linear**2 - 4 * lead * const

    # Fall back to a complex square root when the discriminant is negative.
    if disc >= 0:
        root = np.sqrt(disc)
    else:
        root = np.sqrt(complex(disc))

    # Both roots for a, each paired with its b = 2*m1 - a.
    lower_candidates = [
        (-linear + root) / (2 * lead),
        (-linear - root) / (2 * lead),
    ]
    upper_candidates = [2 * mean_1 - candidate for candidate in lower_candidates]

    # Keep the widest interval, using real parts only.
    lower = min(np.real(candidate) for candidate in lower_candidates)
    upper = max(np.real(candidate) for candidate in upper_candidates)

    return lower, upper

### 2.4 Kolmogorov-Smirnov (KS) test

In [24]:
from scipy.stats import ks_2samp
import numpy as np
import pandas as pd

def ks_test_summary_by_agent_topic(agents, topics, get_data_function):
    """Two-sample KS check of a method-of-moments Uniform fit per agent/topic.

    For every (agent, topic) pair the strictly positive values returned by
    ``get_data_function(agent, topic)`` are compared, with
    ``scipy.stats.ks_2samp``, against an equally sized sample drawn from
    Uniform(min_val, max_val), where the bounds come from ``solve_uniform``.
    The global NumPy RNG is re-seeded with ``seed_num`` before every draw.

    Parameters
    ----------
    agents, topics : iterable of str
        Combinations to evaluate.
    get_data_function : callable
        ``(agent, topic) -> durations``. A ``None`` or string result (the
        "No records found ..." message of ``get_avg_talk_duration``) is treated
        as missing data.

    Returns
    -------
    pandas.DataFrame
        One row per combination whose p-value exceeds 0.05, with the columns
        Agent, Topic, Distribution, KS Statistic, p-value, min_val and max_val
        (bounds truncated to int). Rejected or skipped combinations are absent.
    """
    all_results = []

    for agent in agents:
        for topic in topics:
            durations = get_data_function(agent, topic)
            data_name = f"{agent} - {topic}"

            # The data provider signals "no records" with a message string.
            # Iterating it would compare characters with 0 and raise TypeError.
            if durations is None or isinstance(durations, str):
                print(f"⚠️ Skipping {data_name} — no valid data")
                continue

            durations_seconds = np.array([d for d in durations if d > 0])
            size = len(durations_seconds)
            # Same minimum-sample guard as ks_test_summary_by_topic: an empty
            # sample makes ks_2samp fail, and a single point is a degenerate fit.
            if size < 2:
                print(f"⚠️ Skipping {data_name} — insufficient data")
                continue

            min_val, max_val = solve_uniform(durations_seconds)

            # Re-seed so every combination is tested against a reproducible draw.
            np.random.seed(seed_num)

            distributions = {
                'Uniform': np.random.uniform(min_val, max_val, size),
            }

            for dist_name, synthetic in distributions.items():
                stat, p_value = ks_2samp(durations_seconds, synthetic)
                if p_value > 0.05:
                    all_results.append({
                        'Agent': agent,
                        'Topic': topic,
                        'Distribution': dist_name,
                        'KS Statistic': stat,
                        'p-value': p_value,
                        'min_val': int(min_val),
                        'max_val': int(max_val)
                    })

    return pd.DataFrame(all_results)

def ks_test_summary_by_topic(topics, topic_df):
    """Two-sample KS check of an Exponential fit of each topic's inter-arrival times.

    For every topic found in ``topic_df`` (columns ``Topic`` and
    ``InterArrivalTimes``) the strictly positive gaps are compared, with
    ``scipy.stats.ks_2samp``, against an equally sized sample drawn from an
    exponential distribution whose scale is the sample mean. The global NumPy
    RNG is re-seeded with ``seed_num`` before every draw. Topics that are
    missing or have fewer than two positive gaps are reported and skipped.

    Returns a DataFrame with the columns Topic, Distribution, KS Statistic,
    p-value and Mean (truncated to int), holding only the topics whose
    p-value exceeds 0.05.
    """
    rows = []

    for topic in topics:
        match = topic_df[topic_df["Topic"] == topic]
        if match.empty:
            print(f"⚠️ Skipping {topic} — not found")
            continue

        data_name = f"{topic}"
        raw_gaps = match["InterArrivalTimes"].values[0]

        positive_gaps = np.array([gap for gap in raw_gaps if gap > 0])
        size = len(positive_gaps)
        if size < 2:
            print(f"⚠️ Skipping {data_name} — insufficient data")
            continue

        mean = np.mean(positive_gaps)

        # Re-seed so every topic is tested against a reproducible draw.
        np.random.seed(seed_num)

        candidates = [
            ('Exponential', np.random.exponential(scale=mean, size=size)),
        ]

        for dist_name, synthetic in candidates:
            stat, p_value = ks_2samp(positive_gaps, synthetic)
            if p_value > 0.05:
                rows.append({
                    'Topic': topic,
                    'Distribution': dist_name,
                    'KS Statistic': stat,
                    'p-value': p_value,
                    'Mean': int(mean),
                })

    return pd.DataFrame(rows)


In [25]:
ks_test_summary_by_agent_topic(agents, topics, get_avg_talk_duration)

Unnamed: 0,Agent,Topic,Distribution,KS Statistic,p-value,min_val,max_val
0,Becky,Admin Support,Uniform,0.09,0.815415,29,419
1,Becky,Contract related,Uniform,0.114943,0.61642,27,413
2,Becky,Payment related,Uniform,0.080357,0.864736,27,423
3,Becky,Streaming,Uniform,0.07563,0.887478,7,413
4,Becky,Technical Support,Uniform,0.090909,0.810627,28,412
5,Dan,Admin Support,Uniform,0.137931,0.381215,42,418
6,Dan,Contract related,Uniform,0.09901,0.707902,31,425
7,Dan,Payment related,Uniform,0.103774,0.620308,37,444
8,Dan,Streaming,Uniform,0.078947,0.871642,35,419
9,Dan,Technical Support,Uniform,0.086957,0.779703,25,431


In [26]:
ks_test_summary_by_topic(topics, topic_iat)

Unnamed: 0,Topic,Distribution,KS Statistic,p-value,Mean
0,Admin Support,Exponential,0.033019,0.879121,3412
1,Contract related,Exponential,0.046252,0.514006,3595
2,Payment related,Exponential,0.049307,0.40948,3389
3,Streaming,Exponential,0.064422,0.11748,3361
4,Technical Support,Exponential,0.043887,0.570997,3573


In [27]:
ks_test_summary_by_topic(topics, abandonment_iat)

Unnamed: 0,Topic,Distribution,KS Statistic,p-value,Mean
0,Admin Support,Exponential,0.175258,0.101648,8108
1,Contract related,Exponential,0.142857,0.234976,7534
2,Payment related,Exponential,0.114286,0.501191,7743
3,Streaming,Exponential,0.154639,0.196996,7247
4,Technical Support,Exponential,0.12,0.330121,7439
