In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

In [2]:
df = pd.read_csv('../data/clean_data.csv')

## Compute Engagement Score
### Aggregate Engagement Metrics

In [5]:
# Example of data aggregation and correction
df_agg = pd.DataFrame()
df_agg['MSISDN/Number'] = df['MSISDN/Number']
df_agg['Handset Type'] = df['Handset Type']

# If the column does not exist, create or correct the engagement metric
df_agg['Total Data Usage (Bytes)'] = (
    df['Total DL (Bytes)'] + df['Total UL (Bytes)']
) / df.shape[0]

# Rename or create the columns you need for clustering
df_agg['Engagement Metric'] = (
    (df['Total DL (Bytes)'] + df['Total UL (Bytes)']) / 2
)
df_agg['Experience Metric'] = (
    (df['Avg RTT DL (ms)'] + df['Avg RTT UL (ms)']) / 2
)

print(df_agg.columns)

Index(['MSISDN/Number', 'Handset Type', 'Total Data Usage (Bytes)',
       'Engagement Metric', 'Experience Metric'],
      dtype='object')


### K-Means Clustering

In [6]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

# Prepare data for clustering
X_engagement = df_agg[['Engagement Metric']]
X_experience = df_agg[['Experience Metric']]

# Run k-means clustering
kmeans_engagement = KMeans(n_clusters=3, random_state=42)
df_agg['Engagement Cluster'] = kmeans_engagement.fit_predict(X_engagement)

kmeans_experience = KMeans(n_clusters=3, random_state=42)
df_agg['Experience Cluster'] = kmeans_experience.fit_predict(X_experience)

### Engagement and Experience Scores

In [8]:
from sklearn.metrics.pairwise import euclidean_distances

# Assuming you have the cluster centers from previous clustering
engagement_cluster_centers = kmeans_engagement.cluster_centers_
experience_cluster_centers = kmeans_experience.cluster_centers_

# Identify the least engaged and worst experience clusters
least_engaged_cluster = engagement_cluster_centers.argmin()
worst_experience_cluster = experience_cluster_centers.argmax()

# Compute engagement scores
df_agg['Engagement Score'] = euclidean_distances(
    X_engagement, engagement_cluster_centers[least_engaged_cluster].reshape(1, -1)
).flatten()

# Compute experience scores
df_agg['Experience Score'] = euclidean_distances(
    X_experience, experience_cluster_centers[worst_experience_cluster].reshape(1, -1)
).flatten()

### Satisfaction Score and Report Top 10 Customers

In [9]:
# Calculate satisfaction score
df_agg['Satisfaction Score'] = (df_agg['Engagement Score'] + df_agg['Experience Score']) / 2

# Report top 10 satisfied customers
top_10_satisfied = df_agg.nlargest(10, 'Satisfaction Score')
print(top_10_satisfied[['MSISDN/Number', 'Satisfaction Score']])

        MSISDN/Number  Satisfaction Score
51666    3.367492e+10        1.863890e+08
138863   3.366469e+10        1.855531e+08
66304    3.366855e+10        1.854696e+08
76298    3.365881e+10        1.853367e+08
107682   3.366783e+10        1.850830e+08
60549    3.361083e+10        1.848897e+08
5382     3.360667e+10        1.847924e+08
118931   3.366261e+10        1.847159e+08
50450    3.366205e+10        1.846860e+08
146885   3.366853e+10        1.845953e+08


### Regression Model

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Features and target
X = df_agg[['Engagement Score', 'Experience Score']]
y = df_agg['Satisfaction Score']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

Mean Squared Error: 1.4812030114354862e-15
R2 Score: 1.0


### K-means on Engagement and Experience Scores

In [11]:
# Prepare data for clustering
X_scores = df_agg[['Engagement Score', 'Experience Score']]

# Run k-means clustering
kmeans_scores = KMeans(n_clusters=2, random_state=42)
df_agg['Score Cluster'] = kmeans_scores.fit_predict(X_scores)

# Inspect cluster centers
print("Score Cluster Centers:\n", kmeans_scores.cluster_centers_)

Score Cluster Centers:
 [[6.80262502e+07 1.54081263e+04]
 [2.58926908e+08 1.54101696e+04]]


### Aggregate Average Satisfaction and Experience Score per Cluster

In [12]:
# Aggregate average scores per cluster
cluster_summary = df_agg.groupby('Score Cluster').agg({
    'Satisfaction Score': 'mean',
    'Experience Score': 'mean'
}).reset_index()

print(cluster_summary)

   Score Cluster  Satisfaction Score  Experience Score
0              0        3.390075e+07      15408.090064
1              1        1.293260e+08      15410.207381
