In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [27]:
import warnings
# Silence library warnings so they do not clutter the rendered outputs.
# NOTE(review): this blanket filter also hides deprecation warnings.
warnings.filterwarnings('ignore')
# Show every column when displaying wide frames. The fully qualified key is
# used because option names are matched as patterns, and the abbreviation
# 'max_column' is ambiguous on pandas versions that also define
# 'styler.render.max_columns'.
pd.set_option('display.max_columns', None)
# Load the engagement table (engagement metrics plus a cluster label per
# MSISDN/Number); '?' and 'undefined' cells are read as missing values.
df_engagement = pd.read_csv('../data/engagement_clusters.csv', na_values=['?', None, 'undefined'])
# Display five randomly chosen rows as a sanity check (unseeded, so the rows differ between runs).
df_engagement.sample(5)

Unnamed: 0,MSISDN/Number,Bearer Id,Dur. (ms),Total Data,cluster
43533,33664755403,1,24200.0,756668358.0,0
49976,33667466157,1,102551.0,597798827.0,0
6946,33618885648,1,86400.0,238852745.0,1
43706,33664817276,1,86399.0,730079409.0,0
37533,33662644108,1,94287.0,634232178.0,0


In [28]:
# Load the experience table (network metrics, handset and a cluster label per
# MSISDN/Number); '?' and 'undefined' cells are read as missing values.
df_experience = pd.read_csv('../data/experience_clusters.csv', na_values=['?', None, 'undefined'])
# Display five randomly chosen rows as a sanity check (unseeded, so the rows differ between runs).
df_experience.sample(5)

Unnamed: 0,MSISDN/Number,Total RTT,Total TCP retransmission,Total Throughput,Handset Type,encoded,cluster
32362,33763734494,797.0,3462231.0,5029.0,Apple iPhone Se (A1723),57,0
10654,33659416397,1103.0,11289444.0,123959.0,Huawei B528S-23A,218,2
14875,33662229287,203.0,2423533.0,82204.0,Huawei B528S-23A,218,0
10402,33659288467,61.0,94783.0,40199.0,Apple iPhone 7 (A1778),46,0
25794,33684542746,55.0,50984.0,3631.0,Apple iPhone 6 (A1586),36,0


In [29]:
def percent_missing(df):
    """Print the percentage of cells in ``df`` that are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The percentage is printed, rounded to two decimals.
    """
    # Total number of cells. DataFrame.size replaces np.product(df.shape),
    # which no longer exists in current NumPy releases.
    total_cells = df.size

    # Missing values summed over every column.
    total_missing = int(df.isnull().sum().sum())

    # A frame without cells has nothing missing; this also avoids 0/0.
    percent = (total_missing / total_cells) * 100 if total_cells else 0.0

    print("The dataset contains", round(percent, 2), "%", "missing values.")

# Report the share of missing cells in each input table before scoring.
percent_missing(df_engagement)
percent_missing(df_experience)

The dataset contains 0.0 % missing values.
The dataset contains 0.0 % missing values.


In [30]:
# Centroid of engagement cluster 0: the per-column mean of its members, kept
# as a one-row frame indexed by the cluster label.
# NOTE(review): cluster 0 is presumed to be the least-engaged group; confirm
# against the notebook that produced the cluster labels.
less_engaged_centroid = (
    df_engagement.loc[df_engagement['cluster'] == 0]
    .groupby('cluster')
    .mean()
)
less_engaged_centroid

Unnamed: 0_level_0,MSISDN/Number,Bearer Id,Dur. (ms),Total Data
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,42346350000.0,1.0,101469.777192,706454800.0


In [31]:
def compute_engagement_score(df,centroid):
    """Add an 'engagement score' column: Euclidean distance to ``centroid``.

    The distance is taken over 'Bearer Id', 'Dur. (ms)' and 'Total Data'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three feature columns. It is modified in place.
    centroid : one-row DataFrame, Series or mapping
        Holds exactly one value for each of the three feature columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.

    Raises
    ------
    TypeError
        If ``centroid`` holds anything other than one value for a feature.
    """
    def coordinate(column):
        # float(<one-element Series>) is deprecated in pandas, so the single
        # value is extracted explicitly instead.
        values = np.asarray(centroid[column], dtype=float).ravel()
        if values.size != 1:
            raise TypeError(
                "centroid must hold exactly one value for %r, got %d" % (column, values.size)
            )
        return float(values[0])

    x = coordinate('Bearer Id')
    y = coordinate('Dur. (ms)')
    z = coordinate('Total Data')
    df['engagement score'] = ((df['Bearer Id'] - x)**2 + (df['Dur. (ms)'] - y)**2 + (df['Total Data'] - z)**2)**0.5
    return df
# Score every user against the cluster-0 centroid; the function adds the
# 'engagement score' column to the frame it is given and returns that frame.
df_engagement = compute_engagement_score(df_engagement,less_engaged_centroid)

In [32]:
# Preview the first rows, now including the 'engagement score' column.
df_engagement.head()

Unnamed: 0,MSISDN/Number,Bearer Id,Dur. (ms),Total Data,cluster,engagement score
0,33601001722,1,116720.0,878690574.0,0,172235800.0
1,33601001754,1,181230.0,156859643.0,1,549595100.0
2,33601002511,1,134969.0,595966483.0,0,110488300.0
3,33601007832,1,49878.0,422320698.0,1,284134100.0
4,33601011634,2,128360.0,654723066.0,2,51731720.0


In [33]:
# Centroid of experience cluster 0, averaged over the listed feature columns only.
# NOTE(review): cluster 0 is presumed to be the worst-experience group; confirm
# against the notebook that produced the cluster labels.
worst_experience_centroid = (
    df_experience.loc[
        df_experience['cluster'] == 0,
        ['Total RTT', 'Total TCP retransmission', 'Total Throughput', 'encoded', 'cluster'],
    ]
    .groupby('cluster')
    .mean()
)
worst_experience_centroid.head()

Unnamed: 0_level_0,Total RTT,Total TCP retransmission,Total Throughput,encoded
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,124.876095,4584649.0,42207.289489,145.086606


In [34]:
def compute_experience_score(df,centroid):
    """Add an 'experience score' column: Euclidean distance to ``centroid``.

    The distance is taken over 'Total RTT', 'Total TCP retransmission',
    'Total Throughput' and 'encoded'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four feature columns. It is modified in place.
    centroid : one-row DataFrame, Series or mapping
        Holds exactly one value for each of the four feature columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.

    Raises
    ------
    TypeError
        If ``centroid`` holds anything other than one value for a feature.
    """
    def coordinate(column):
        # float(<one-element Series>) is deprecated in pandas, so the single
        # value is extracted explicitly instead.
        values = np.asarray(centroid[column], dtype=float).ravel()
        if values.size != 1:
            raise TypeError(
                "centroid must hold exactly one value for %r, got %d" % (column, values.size)
            )
        return float(values[0])

    x = coordinate('Total RTT')
    y = coordinate('Total TCP retransmission')
    z = coordinate('Total Throughput')
    w = coordinate('encoded')
    df['experience score'] = ((df['Total RTT'] - x)**2 + (df['Total TCP retransmission'] - y)**2 \
                              + (df['Total Throughput'] - z)**2 + (df['encoded'] - w)**2)**0.5
    return df
# Score every user against the cluster-0 centroid; the function adds the
# 'experience score' column to the frame it is given and returns that frame.
df_experience = compute_experience_score(df_experience,worst_experience_centroid)
df_experience.head()

Unnamed: 0,MSISDN/Number,Total RTT,Total TCP retransmission,Total Throughput,Handset Type,encoded,cluster,experience score
0,33601008617,91.0,9370832.0,56781.0,Apple iPhone Se (A1723),57,0,4786205.0
1,33601011634,39.0,110232.0,42416.0,Huawei Mate 10 Pro Porsche Design Huawei Mate 10,284,0,4474417.0
2,33601021217,160.0,14594645.0,19256.0,Apple iPhone 7 Plus (A1784),49,0,10010020.0
3,33601031129,60.0,2325497.0,38190.0,Apple iPhone 8 Plus (A1897),55,0,2259155.0
4,33601034530,656.0,2006261.0,8539.0,Apple iPhone 7 (A1778),46,0,2578608.0


In [35]:
# Pair each user's engagement score with their experience score. The merge's
# default inner join keeps only MSISDNs that appear in both tables.
df_satisfaction = pd.merge(
    df_engagement[['MSISDN/Number', 'engagement score']],
    df_experience[['MSISDN/Number', 'experience score']],
    on="MSISDN/Number",
)
df_satisfaction.head()

Unnamed: 0,MSISDN/Number,engagement score,experience score
0,33601011634,51731720.0,4474417.0
1,33601021217,76362370.0,10010020.0
2,33601031129,69401720.0,2259155.0
3,33601062786,63261520.0,2747921.0
4,33601071144,591160400.0,4553683.0


In [36]:
# Satisfaction score = plain average of the two distance-based scores.
# NOTE(review): in the rows displayed, engagement scores are one to two orders
# of magnitude larger than experience scores, so they dominate this average;
# consider normalising both scores first.
df_satisfaction['Satisfaction Score'] = (df_satisfaction['engagement score'] + df_satisfaction['experience score'])/2

In [37]:
# Preview the first rows, now including the 'Satisfaction Score' column.
df_satisfaction.head()

Unnamed: 0,MSISDN/Number,engagement score,experience score,Satisfaction Score
0,33601011634,51731720.0,4474417.0,28103070.0
1,33601021217,76362370.0,10010020.0,43186200.0
2,33601031129,69401720.0,2259155.0,35830440.0
3,33601062786,63261520.0,2747921.0,33004720.0
4,33601071144,591160400.0,4553683.0,297857000.0


### Top 10 satisfied customers

In [38]:
# The ten users with the highest satisfaction score, in descending order.
(
    df_satisfaction[['MSISDN/Number', 'Satisfaction Score']]
    .sort_values(by="Satisfaction Score", ascending=False)
    .head(10)
)

Unnamed: 0,MSISDN/Number,Satisfaction Score
18829,33761164622,350621700.0
14162,33668534621,350176200.0
12565,33666235447,349355700.0
6295,33659111802,345702300.0
3251,33641356514,344982200.0
5924,33658792462,344172600.0
13649,33667821744,342408400.0
5598,33658511672,341160300.0
6752,33659472904,340441100.0
17009,33698481232,339735400.0


### Regression model for Satisfaction Score

In [39]:
# Feature table for the regression: engagement metrics joined with experience
# metrics per user (inner join on MSISDN/Number).
df_feature = pd.merge(
    df_engagement[['MSISDN/Number', 'Bearer Id' , 'Dur. (ms)', 'Total Data']],
    df_experience[['MSISDN/Number', 'Total RTT', 'Total TCP retransmission', 'Total Throughput', 'encoded']],
    on="MSISDN/Number",
)
df_feature.head()

Unnamed: 0,MSISDN/Number,Bearer Id,Dur. (ms),Total Data,Total RTT,Total TCP retransmission,Total Throughput,encoded
0,33601011634,2,128360.0,654723066.0,39.0,110232.0,42416.0,284
1,33601021217,1,38416.0,630092434.0,160.0,14594645.0,19256.0,49
2,33601031129,1,55730.0,637053075.0,60.0,2325497.0,38190.0,55
3,33601062786,1,10532.0,769716233.0,55.0,1836731.0,46431.0,679
4,33601071144,1,86399.0,115294379.0,33.0,31013.0,62865.0,43


In [40]:
# Attach the regression target (satisfaction score) to the feature table.
df_feature = pd.merge(
    df_feature,
    df_satisfaction[['MSISDN/Number', 'Satisfaction Score']],
    on="MSISDN/Number",
)
df_feature.head()

Unnamed: 0,MSISDN/Number,Bearer Id,Dur. (ms),Total Data,Total RTT,Total TCP retransmission,Total Throughput,encoded,Satisfaction Score
0,33601011634,2,128360.0,654723066.0,39.0,110232.0,42416.0,284,28103070.0
1,33601021217,1,38416.0,630092434.0,160.0,14594645.0,19256.0,49,43186200.0
2,33601031129,1,55730.0,637053075.0,60.0,2325497.0,38190.0,55,35830440.0
3,33601062786,1,10532.0,769716233.0,55.0,1836731.0,46431.0,679,33004720.0
4,33601071144,1,86399.0,115294379.0,33.0,31013.0,62865.0,43,297857000.0


In [41]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
# Feature matrix: the seven engagement and experience metrics, standardised per column.
# NOTE(review): the fitted scaler is not kept, so raw data seen later cannot be
# scaled the same way.
X = StandardScaler().fit_transform(
    df_feature[
        ['Bearer Id', 'Dur. (ms)', 'Total Data', 'Total RTT', 'Total TCP retransmission', 'Total Throughput', 'encoded']
    ].values
)
X.shape

(21370, 7)

In [42]:
# Target as a single-column 2-D array (the scaler requires 2-D input), standardised.
# NOTE(review): the fitted scaler is not kept, so predictions cannot be mapped
# back to the original score scale.
y = StandardScaler().fit_transform(df_feature[['Satisfaction Score']].values)
y.shape

(21370, 1)

In [43]:
# Linear regression of the standardised target on the standardised features.
reg = LinearRegression()
reg.fit(X, y)
# Coefficient of determination on the same rows the model was fitted on
# (not a held-out estimate).
reg.score(X, y)

0.6745208819037907

In [44]:
# Save the fitted regression model. The context manager closes the file handle
# even if pickling fails; the previous bare open() call never closed it.
# NOTE(review): the model was fitted on standardised X and y, and the scalers
# are not saved alongside it.
with open('../models/satisfaction_model.sav', 'wb') as model_file:
    pickle.dump(reg, model_file)

## K-means clustering on the engagement and experience scores

In [45]:
from sklearn.preprocessing import StandardScaler
# Two-column matrix of the engagement and experience scores, standardised so
# that neither score dominates the clustering distance.
x_score = StandardScaler().fit_transform(
    df_satisfaction[['engagement score', 'experience score']].values
)
x_score.shape

(21370, 2)

In [46]:
from sklearn.cluster import KMeans
# Split users into two groups by their standardised scores.
# n_init is pinned to 10: newer scikit-learn releases default to 'auto', which
# runs fewer initialisations and can change the resulting labels.
kmeans = KMeans(n_clusters=2, random_state=0, n_init=10).fit(x_score)
# Labels follow the row order of x_score, which was built from df_satisfaction.
df_satisfaction['cluster'] = kmeans.labels_

In [47]:
# Preview the first rows, now including the k-means 'cluster' label.
df_satisfaction.head()

Unnamed: 0,MSISDN/Number,engagement score,experience score,Satisfaction Score,cluster
0,33601011634,51731720.0,4474417.0,28103070.0,0
1,33601021217,76362370.0,10010020.0,43186200.0,0
2,33601031129,69401720.0,2259155.0,35830440.0,0
3,33601062786,63261520.0,2747921.0,33004720.0,0
4,33601071144,591160400.0,4553683.0,297857000.0,1


### Average engagement and experience scores per cluster

In [51]:
# Mean engagement and experience score within each satisfaction cluster.
df_satisfaction.groupby('cluster')[['engagement score', 'experience score']].mean()

Unnamed: 0_level_0,engagement score,experience score
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,131538900.0,6513352.0
1,457550800.0,6379523.0


In [52]:
# Export the per-user scores and cluster labels; index=False leaves the row index out of the file.
df_satisfaction.to_csv('../data/user_experience_scores.csv', index=False)