In [19]:
import pickle
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import sys,os

In [20]:
sys.path.append(os.path.abspath(os.path.join('../utils')))
from cleaner import *
from helper import *
from plot import *
from outlier_handler import OutlierHandler

In [21]:
#read data
user_engagement_df = pd.read_csv("../data/user_engagement_analysis.csv")
user_engagement_df.head()

Unnamed: 0,MSISDN/Number,cluster,xdr Sessions,Dur. (ms),Total Data Volume (Bytes)
0,33601001722,1,1.0,116720.0,878690600.0
1,33601001754,5,1.0,181230.0,156859600.0
2,33601002511,5,1.0,134969.0,595966500.0
3,33601007832,3,1.0,49878.0,422320700.0
4,33601008617,4,2.0,37104.0,1457411000.0


In [22]:
user_experience_df = pd.read_csv("../data/experience_analytics_data.csv")
user_experience_df.head()

Unnamed: 0,MSISDN/Number,cluster,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes)
0,33601001722,2,46.0,76.0,2895381.0
1,33601001754,1,31.0,99.0,7685875.5
2,33601002511,2,59.0,97.0,4150403.0
3,33601007832,2,84.0,248.0,2396.0
4,33601008617,1,119.0,43204.5,8047304.5


Engagement Score

In [23]:
eng_df = user_engagement_df.set_index('MSISDN/Number')[
    ['xdr Sessions', 'Dur. (ms)', 'Total Data Volume (Bytes)']]

In [24]:
def normalizer(df, columns):
    norm = Normalizer()
    return pd.DataFrame(norm.fit_transform(df), columns=columns)
def scaler(df, columns):
    minmax_scaler = MinMaxScaler()
    return pd.DataFrame(minmax_scaler.fit_transform(df), columns=columns)

In [25]:
normalized_metrics = scaler(eng_df, ['Total Avg RTT (ms)',
    'Total Avg Bearer TP (kbps)',
    'Total TCP Retrans. Vol (Bytes)'])
normalized_metrics.describe()

Unnamed: 0,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes)
count,105716.0,105716.0,105716.0
mean,0.142654,0.386189,0.400389
std,0.260005,0.268673,0.25124
min,0.0,0.0,0.0
25%,0.0,0.2077,0.206669
50%,0.0,0.303929,0.371524
75%,0.4,0.52462,0.524002
max,1.0,1.0,1.0


In [26]:
with open("../models/user_engagement_analysis.pkl", "rb") as f:
    kmeans1 = pickle.load(f)

In [27]:
less_engaged_cluster = 5

In [28]:
distance = kmeans1.fit_transform(normalized_metrics)
distance_from_less_engaged_cluster = list(
    map(lambda x: x[less_engaged_cluster], distance))
user_engagement_df['engagement_score'] = distance_from_less_engaged_cluster
user_engagement_df.head()

Unnamed: 0,MSISDN/Number,cluster,xdr Sessions,Dur. (ms),Total Data Volume (Bytes),engagement_score
0,33601001722,1,1.0,116720.0,878690600.0,0.315788
1,33601001754,5,1.0,181230.0,156859600.0,0.214956
2,33601002511,5,1.0,134969.0,595966500.0,0.155875
3,33601007832,3,1.0,49878.0,422320700.0,0.413304
4,33601008617,4,2.0,37104.0,1457411000.0,0.854176


Experience score

In [29]:
exp_df = user_experience_df.set_index('MSISDN/Number')[
    ['Total Avg RTT (ms)', 'Total Avg Bearer TP (kbps)', 'Total TCP Retrans. Vol (Bytes)']]

In [30]:
normalized_metrics = scaler(exp_df, ['Total Avg RTT (ms)',
    'Total Avg Bearer TP (kbps)',
    'Total TCP Retrans. Vol (Bytes)'])
normalized_metrics.describe()

Unnamed: 0,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes)
count,105716.0,105716.0,105716.0
mean,0.348834,0.222822,0.227383
std,0.234005,0.326709,0.294822
min,0.0,0.0,0.0
25%,0.164905,0.001531,0.007261
50%,0.270613,0.004767,0.060832
75%,0.498943,0.400919,0.404356
max,1.0,1.0,1.0


In [31]:
with open("../models/user_experience_analytics.pkl", "rb") as f:
    kmeans2 = pickle.load(f)

In [32]:
worst_experience_cluster = 1

In [33]:
distance = kmeans2.fit_transform(normalized_metrics)
distance_from_worst_experience_cluster = list(
    map(lambda x: x[worst_experience_cluster], distance))
user_experience_df['experience_score'] = distance_from_worst_experience_cluster
user_experience_df.head()

Unnamed: 0,MSISDN/Number,cluster,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes),experience_score
0,33601001722,2,46.0,76.0,2895381.0,0.631399
1,33601001754,1,31.0,99.0,7685875.5,0.471144
2,33601002511,2,59.0,97.0,4150403.0,0.530726
3,33601007832,2,84.0,248.0,2396.0,0.794562
4,33601008617,1,119.0,43204.5,8047304.5,0.358125


satisfaction score

In [34]:
user_engagement_df.rename(columns={'cluster': 'engagement_cluster'}, inplace=True)
user_engagement_df.head()

Unnamed: 0,MSISDN/Number,engagement_cluster,xdr Sessions,Dur. (ms),Total Data Volume (Bytes),engagement_score
0,33601001722,1,1.0,116720.0,878690600.0,0.315788
1,33601001754,5,1.0,181230.0,156859600.0,0.214956
2,33601002511,5,1.0,134969.0,595966500.0,0.155875
3,33601007832,3,1.0,49878.0,422320700.0,0.413304
4,33601008617,4,2.0,37104.0,1457411000.0,0.854176


In [35]:
user_experience_df.rename(columns={'cluster': 'experience_cluster'}, inplace=True)
user_experience_df.head()

Unnamed: 0,MSISDN/Number,experience_cluster,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes),experience_score
0,33601001722,2,46.0,76.0,2895381.0,0.631399
1,33601001754,1,31.0,99.0,7685875.5,0.471144
2,33601002511,2,59.0,97.0,4150403.0,0.530726
3,33601007832,2,84.0,248.0,2396.0,0.794562
4,33601008617,1,119.0,43204.5,8047304.5,0.358125


In [36]:
user_satisfaction_df = pd.merge(user_engagement_df, user_experience_df, on='MSISDN/Number')
user_satisfaction_df['satisfaction_score'] = (
    user_satisfaction_df['engagement_score'] + user_satisfaction_df['experience_score'])/2
user_satisfaction_df.head()

Unnamed: 0,MSISDN/Number,engagement_cluster,xdr Sessions,Dur. (ms),Total Data Volume (Bytes),engagement_score,experience_cluster,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes),experience_score,satisfaction_score
0,33601001722,1,1.0,116720.0,878690600.0,0.315788,2,46.0,76.0,2895381.0,0.631399,0.473593
1,33601001754,5,1.0,181230.0,156859600.0,0.214956,1,31.0,99.0,7685875.5,0.471144,0.34305
2,33601002511,5,1.0,134969.0,595966500.0,0.155875,2,59.0,97.0,4150403.0,0.530726,0.343301
3,33601007832,3,1.0,49878.0,422320700.0,0.413304,2,84.0,248.0,2396.0,0.794562,0.603933
4,33601008617,4,2.0,37104.0,1457411000.0,0.854176,1,119.0,43204.5,8047304.5,0.358125,0.60615


In [37]:
user_satisfaction_df = user_satisfaction_df[['MSISDN/Number', 'engagement_score',
                        'experience_score', 'satisfaction_score']]
user_satisfaction_df.set_index('MSISDN/Number', inplace=True)
user_satisfaction_df.head()

Unnamed: 0_level_0,engagement_score,experience_score,satisfaction_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33601001722,0.315788,0.631399,0.473593
33601001754,0.214956,0.471144,0.34305
33601002511,0.155875,0.530726,0.343301
33601007832,0.413304,0.794562,0.603933
33601008617,0.854176,0.358125,0.60615


In [38]:
top10_satisfied = user_satisfaction_df.sort_values('satisfaction_score', ascending=False).head(10)
top10_satisfied

Unnamed: 0_level_0,engagement_score,experience_score,satisfaction_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33663325062,1.296374,1.114188,1.205281
33698522306,1.296374,1.111982,1.204178
33665743237,1.296374,1.11109,1.203732
33665077427,1.296374,1.108978,1.202676
33663545447,1.296374,1.107904,1.202139
33658063955,1.296374,1.107353,1.201864
33682179297,1.296374,1.105919,1.201146
33667854000,1.296374,1.105352,1.200863
33667083269,1.296374,1.104854,1.200614
33687453952,1.296374,1.10456,1.200467


Linear regression for satisfaction score prediction

In [39]:
# splitting training and testing data
X = user_satisfaction_df[['engagement_score', 'experience_score']]
y = user_satisfaction_df[['satisfaction_score']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [40]:
model = LinearRegression()
model.fit(X_train, y_train)

In [41]:
y_pred = model.predict(X_test)

In [42]:
print(f'Intercept: {model.intercept_}')
print(f'Coefficients: { model.coef_}')
print(f"Mean squared error: {np.mean((y_pred - y_test.values) ** 2)}")

Intercept: [2.99760217e-15]
Coefficients: [[0.5 0.5]]
Mean squared error: 4.982290044117643e-31
