In [61]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
%matplotlib inline

In [62]:
pwd = os.getcwd()
file_path = os.path.join(pwd, os.pardir, 'data/clean_data.csv')

In [63]:
data = pd.read_csv(file_path, parse_dates=["Start", "End"])

In [64]:
data_modified = data.copy()

In [65]:
user_agg_engagement = data_modified.groupby(["MSISDN/Number"], as_index=False).agg({'Bearer Id':'count', "Dur. (s)": "sum", "Total UL (Bytes)": "sum", "Total DL (Bytes)": "sum",  "Social Media UL (Bytes)": "sum", "Social Media DL (Bytes)": "sum", "Google UL (Bytes)": "sum", "Google DL (Bytes)": "sum", "Email UL (Bytes)": "sum", "Email DL (Bytes)": "sum", "Youtube UL (Bytes)": "sum", "Youtube DL (Bytes)": "sum", "Netflix UL (Bytes)": "sum", "Netflix DL (Bytes)": "sum", "Gaming UL (Bytes)": "sum", "Gaming DL (Bytes)": "sum", "Other UL (Bytes)": "sum", "Other DL (Bytes)": "sum"})
user_agg_engagement.reset_index(drop=True, inplace=True)

In [66]:
user_agg_engagement["MSISDN/Number"] = user_agg_engagement["MSISDN/Number"].astype(str)

In [67]:
user_agg_engagement.rename(columns={"Bearer Id": "Session Count"}, inplace=True)

In [68]:
user_agg_engagement["Total Data Transferred"] = user_agg_engagement["Total UL (Bytes)"] + user_agg_engagement["Total DL (Bytes)"]

In [69]:
user_agg_engagement["Google Volume (Bytes)"] = user_agg_engagement["Google UL (Bytes)"] + user_agg_engagement["Google DL (Bytes)"]
user_agg_engagement["Netflix Volume (Bytes)"] = user_agg_engagement["Netflix UL (Bytes)"] + user_agg_engagement["Netflix DL (Bytes)"]
user_agg_engagement["Youtube Volume (Bytes)"] = user_agg_engagement["Youtube UL (Bytes)"] + user_agg_engagement["Youtube DL (Bytes)"]
user_agg_engagement["Gaming Volume (Bytes)"] = user_agg_engagement["Gaming UL (Bytes)"] + user_agg_engagement["Gaming DL (Bytes)"]
user_agg_engagement["Email Volume (Bytes)"] = user_agg_engagement["Email UL (Bytes)"] + user_agg_engagement["Email DL (Bytes)"]
user_agg_engagement["Social Media Volume (Bytes)"] = user_agg_engagement["Social Media UL (Bytes)"] + user_agg_engagement["Social Media DL (Bytes)"]
user_agg_engagement["Other Volume (Bytes)"] = user_agg_engagement["Other UL (Bytes)"] + user_agg_engagement["Other DL (Bytes)"]

In [70]:
volume_per_user = user_agg_engagement.drop(columns=[
    "Total UL (Bytes)",
    "Total DL (Bytes)",
    "Social Media UL (Bytes)",
    "Social Media DL (Bytes)",
    "Google UL (Bytes)",
    "Google DL (Bytes)",
    "Email UL (Bytes)",
    "Email DL (Bytes)",
    "Youtube UL (Bytes)",
    "Youtube DL (Bytes)",
    "Netflix UL (Bytes)",
    "Netflix DL (Bytes)",
    "Gaming UL (Bytes)",
    "Gaming DL (Bytes)",
    "Other UL (Bytes)",
    "Other DL (Bytes)"
])

In [71]:
# to convert the scientific notation to decimal format
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [72]:
volume_per_user.head()

Unnamed: 0,MSISDN/Number,Session Count,Dur. (s),Total Data Transferred,Google Volume (Bytes),Netflix Volume (Bytes),Youtube Volume (Bytes),Gaming Volume (Bytes),Email Volume (Bytes),Social Media Volume (Bytes),Other Volume (Bytes)
0,33601001722.0,1,116720.0,878690574.0,4389005.0,27180981.0,21624548.0,812458661.0,1331362.0,2232135.0,386570872.0
1,33601001754.0,1,181230.0,156859643.0,5334863.0,11221763.0,12432223.0,119750078.0,3307781.0,2660565.0,281710071.0
2,33601002511.0,1,134969.0,595966483.0,3443126.0,19353900.0,21333570.0,538827713.0,3205380.0,3195623.0,501693672.0
3,33601007832.0,1,49878.0,422320698.0,9678493.0,1942092.0,6977321.0,391126127.0,2284670.0,280294.0,35279702.0
4,33601008617.0,2,37104.0,1457410944.0,18499616.0,49201724.0,41533002.0,1314797820.0,3305469.0,2912542.0,804804484.0


In [73]:
plt.style.use('seaborn')

In [74]:
kmeans = KMeans(n_clusters=3)

In [75]:
engagement_metrics = volume_per_user[["Session Count", "Dur. (s)", "Total Data Transferred"]]

In [76]:
engagement_metrics

Unnamed: 0,Session Count,Dur. (s),Total Data Transferred
0,1,116720.00,878690574.00
1,1,181230.00,156859643.00
2,1,134969.00,595966483.00
3,1,49878.00,422320698.00
4,2,37104.00,1457410944.00
...,...,...,...
106466,1,8810.00,714641641.00
106467,1,140988.00,480307335.00
106468,1,877385.00,232123971.00
106469,1,253030.00,596287824.00


In [77]:
kmeans.fit_predict(engagement_metrics)

array([0, 1, 0, ..., 1, 0, 1])

In [78]:
kmeans.cluster_centers_

array([[1.38820006e+00, 1.39101407e+05, 8.27260962e+08],
       [1.06058568e+00, 1.08950848e+05, 3.33201023e+08],
       [3.12626890e+00, 3.74741022e+05, 1.84150621e+09]])

In [79]:
kmeans.cluster_centers_[0]

array([1.38820006e+00, 1.39101407e+05, 8.27260962e+08])

In [85]:
least_eng = kmeans.cluster_centers_[0]
for index,row in engagement_metrics.iterrows():
    x = row['Session Count']
    y = row['Dur. (s)']
    z = row['Total Data Transferred']

    ec_dis = ( (x - least_eng[0])**2 + (y - least_eng[1])**2 + (z - least_eng[2])**2)**0.5

    print(ec_dis)

51429616.38834029
670401320.8053865
231294479.5186038
404940274.3113061
630149989.7730696
212043772.60210562
172537896.81604314
494600608.28955334
162871616.89982215
94797202.5523347
712284934.8711885
197168554.18951923
344841883.0076206
468349073.85510534
596936567.280287
285481015.62876105
129701304.12412894
190207905.75325197
108808609.72642265
180057258.9300453
805499685.1755106
295601665.51320934
408269140.4961587
667996467.6044818
366374616.74949926
227527155.39373025
19432972.949558705
555239703.3434923
576819964.4660112
1112265958.7576597
651877983.771918
413468956.4818541
40582670.32280588
622591784.7092133
365626116.49406546
229848394.6822599
57544873.10971322
275347747.5191333
106494743.0048426
134662561.672012
711966585.4323019
453554688.95359886
347954113.7663866
197943539.73574165
50583308.32140886
28984954.84325912
209694395.48285452
273849804.5529784
293125738.23294985
725392380.5962371
152204590.953398
684222386.2270358
686111194.9951445
664330179.6354356
464213430.904