In [23]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
%matplotlib inline

In [2]:
pwd = os.getcwd()
file_path = os.path.join(pwd, os.pardir, 'data/clean_data.csv')

In [3]:
data = pd.read_csv(file_path, parse_dates=["Start", "End"])

In [4]:
data_modified = data.copy()

In [6]:
user_agg = data_modified.groupby(["MSISDN/Number"], as_index=False).agg({'Bearer Id':'count', "Dur. (s)": "sum", "Total UL (Bytes)": "sum", "Total DL (Bytes)": "sum",  "Social Media UL (Bytes)": "sum", "Social Media DL (Bytes)": "sum", "Google UL (Bytes)": "sum", "Google DL (Bytes)": "sum", "Email UL (Bytes)": "sum", "Email DL (Bytes)": "sum", "Youtube UL (Bytes)": "sum", "Youtube DL (Bytes)": "sum", "Netflix UL (Bytes)": "sum", "Netflix DL (Bytes)": "sum", "Gaming UL (Bytes)": "sum", "Gaming DL (Bytes)": "sum", "Other UL (Bytes)": "sum", "Other DL (Bytes)": "sum"})
user_agg.reset_index(drop=True, inplace=True)

In [7]:
user_agg["MSISDN/Number"] = user_agg["MSISDN/Number"].astype(str)

In [8]:
user_agg.rename(columns={"Bearer Id": "Session Count"}, inplace=True)

In [9]:
user_agg["Total Data Transferred"] = user_agg["Total UL (Bytes)"] + user_agg["Total DL (Bytes)"]

In [10]:
user_agg["Google Volume (Bytes)"] = user_agg["Google UL (Bytes)"] + user_agg["Google DL (Bytes)"]
user_agg["Netflix Volume (Bytes)"] = user_agg["Netflix UL (Bytes)"] + user_agg["Netflix DL (Bytes)"]
user_agg["Youtube Volume (Bytes)"] = user_agg["Youtube UL (Bytes)"] + user_agg["Youtube DL (Bytes)"]
user_agg["Gaming Volume (Bytes)"] = user_agg["Gaming UL (Bytes)"] + user_agg["Gaming DL (Bytes)"]
user_agg["Email Volume (Bytes)"] = user_agg["Email UL (Bytes)"] + user_agg["Email DL (Bytes)"]
user_agg["Social Media Volume (Bytes)"] = user_agg["Social Media UL (Bytes)"] + user_agg["Social Media DL (Bytes)"]
user_agg["Other Volume (Bytes)"] = user_agg["Other UL (Bytes)"] + user_agg["Other DL (Bytes)"]

In [11]:
volume_per_user = user_agg.drop(columns=[
    "Total UL (Bytes)",
    "Total DL (Bytes)",
    "Social Media UL (Bytes)",
    "Social Media DL (Bytes)",
    "Google UL (Bytes)",
    "Google DL (Bytes)",
    "Email UL (Bytes)",
    "Email DL (Bytes)",
    "Youtube UL (Bytes)",
    "Youtube DL (Bytes)",
    "Netflix UL (Bytes)",
    "Netflix DL (Bytes)",
    "Gaming UL (Bytes)",
    "Gaming DL (Bytes)",
    "Other UL (Bytes)",
    "Other DL (Bytes)"
])

In [12]:
# to convert the scientific notation to decimal format
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [13]:
volume_per_user.head()

Unnamed: 0,MSISDN/Number,Session Count,Dur. (s),Total Data Transferred,Google Volume (Bytes),Netflix Volume (Bytes),Youtube Volume (Bytes),Gaming Volume (Bytes),Email Volume (Bytes),Social Media Volume (Bytes),Other Volume (Bytes)
0,33601001722.0,1,116720.0,878690574.0,4389005.0,27180981.0,21624548.0,812458661.0,1331362.0,2232135.0,386570872.0
1,33601001754.0,1,181230.0,156859643.0,5334863.0,11221763.0,12432223.0,119750078.0,3307781.0,2660565.0,281710071.0
2,33601002511.0,1,134969.0,595966483.0,3443126.0,19353900.0,21333570.0,538827713.0,3205380.0,3195623.0,501693672.0
3,33601007832.0,1,49878.0,422320698.0,9678493.0,1942092.0,6977321.0,391126127.0,2284670.0,280294.0,35279702.0
4,33601008617.0,2,37104.0,1457410944.0,18499616.0,49201724.0,41533002.0,1314797820.0,3305469.0,2912542.0,804804484.0


In [14]:
scaler = MinMaxScaler()

In [15]:
df_norm = pd.DataFrame(scaler.fit_transform(volume_per_user.select_dtypes(include=np.number)), columns=volume_per_user.select_dtypes(include=np.number).columns)

In [16]:
df_norm.head()

Unnamed: 0,Session Count,Dur. (s),Total Data Transferred,Google Volume (Bytes),Netflix Volume (Bytes),Youtube Volume (Bytes),Gaming Volume (Bytes),Email Volume (Bytes),Social Media Volume (Bytes),Other Volume (Bytes)
0,0.0,0.01,0.1,0.03,0.07,0.05,0.1,0.03,0.05,0.05
1,0.0,0.01,0.01,0.03,0.03,0.03,0.02,0.08,0.06,0.03
2,0.0,0.01,0.06,0.02,0.05,0.05,0.07,0.08,0.07,0.06
3,0.0,0.0,0.04,0.06,0.0,0.02,0.05,0.05,0.01,0.0
4,0.06,0.0,0.16,0.12,0.12,0.09,0.17,0.08,0.07,0.1


In [17]:
kmeans = KMeans(n_clusters=3)
y_predicted = kmeans.fit_predict(volume_per_user.select_dtypes(include=np.number))
y_predicted

array([0, 1, 1, ..., 1, 1, 1])

In [18]:
volume_per_user["Cluster"] = kmeans.labels_

In [19]:
less_engaged_cluster = volume_per_user.loc[volume_per_user["Cluster"] == 0]

In [22]:
less_engaged_cluster.head()

Unnamed: 0,MSISDN/Number,Session Count,Dur. (s),Total Data Transferred,Google Volume (Bytes),Netflix Volume (Bytes),Youtube Volume (Bytes),Gaming Volume (Bytes),Email Volume (Bytes),Social Media Volume (Bytes),Other Volume (Bytes),Cluster
0,33601001722.0,1,116720.0,878690574.0,4389005.0,27180981.0,21624548.0,812458661.0,1331362.0,2232135.0,386570872.0,0
4,33601008617.0,2,37104.0,1457410944.0,18499616.0,49201724.0,41533002.0,1314797820.0,3305469.0,2912542.0,804804484.0,0
8,33601014694.0,2,495702.0,990132189.0,14134211.0,54216117.0,63443802.0,833087952.0,4599825.0,5288685.0,1041882992.0,0
9,33601020306.0,1,124854.0,732463761.0,12935169.0,35525071.0,28569712.0,647095674.0,2783280.0,956775.0,138339248.0,0
11,33601021217.0,1,38416.0,630092434.0,10995863.0,21062897.0,2708699.0,578111252.0,3143898.0,1227381.0,607878546.0,0


In [34]:
volume_per_user[["Session Count", "Dur. (s)", "Total Data Transferred"]]

Unnamed: 0,Session Count,Dur. (s),Total Data Transferred
0,1,116720.00,878690574.00
1,1,181230.00,156859643.00
2,1,134969.00,595966483.00
3,1,49878.00,422320698.00
4,2,37104.00,1457410944.00
...,...,...,...
106466,1,8810.00,714641641.00
106467,1,140988.00,480307335.00
106468,1,877385.00,232123971.00
106469,1,253030.00,596287824.00


In [35]:
less_engaged_cluster[["Session Count", "Dur. (s)", "Total Data Transferred"]]

Unnamed: 0,Session Count,Dur. (s),Total Data Transferred
0,1,116720.00,878690574.00
4,2,37104.00,1457410944.00
8,2,495702.00,990132189.00
9,1,124854.00,732463761.00
11,1,38416.00,630092434.00
...,...,...,...
106455,1,173035.00,852939331.00
106456,1,44199.00,902088414.00
106463,1,157069.00,710287575.00
106465,2,210389.00,1094693386.00


In [49]:
volume_per_user.iloc[:1, 1:]

Unnamed: 0,Session Count,Dur. (s),Total Data Transferred,Google Volume (Bytes),Netflix Volume (Bytes),Youtube Volume (Bytes),Gaming Volume (Bytes),Email Volume (Bytes),Social Media Volume (Bytes),Other Volume (Bytes),Cluster
0,1,116720.0,878690574.0,4389005.0,27180981.0,21624548.0,812458661.0,1331362.0,2232135.0,386570872.0,0


In [52]:
less_engaged_cluster.iloc[:, 1:]

Unnamed: 0,Session Count,Dur. (s),Total Data Transferred,Google Volume (Bytes),Netflix Volume (Bytes),Youtube Volume (Bytes),Gaming Volume (Bytes),Email Volume (Bytes),Social Media Volume (Bytes),Other Volume (Bytes),Cluster
0,1,116720.00,878690574.00,4389005.00,27180981.00,21624548.00,812458661.00,1331362.00,2232135.00,386570872.00,0
4,2,37104.00,1457410944.00,18499616.00,49201724.00,41533002.00,1314797820.00,3305469.00,2912542.00,804804484.00,0
8,2,495702.00,990132189.00,14134211.00,54216117.00,63443802.00,833087952.00,4599825.00,5288685.00,1041882992.00,0
9,1,124854.00,732463761.00,12935169.00,35525071.00,28569712.00,647095674.00,2783280.00,956775.00,138339248.00,0
11,1,38416.00,630092434.00,10995863.00,21062897.00,2708699.00,578111252.00,3143898.00,1227381.00,607878546.00,0
...,...,...,...,...,...,...,...,...,...,...,...
106455,1,173035.00,852939331.00,3985257.00,38233089.00,27209316.00,774396461.00,1006355.00,227137.00,680777687.00,0
106456,1,44199.00,902088414.00,14577153.00,30788883.00,7844833.00,829371743.00,3442797.00,3499011.00,173813829.00,0
106463,1,157069.00,710287575.00,5990586.00,24293718.00,28947763.00,638609868.00,3984626.00,2486946.00,712062086.00,0
106465,2,210389.00,1094693386.00,10246473.00,51950311.00,38012812.00,972344996.00,5315327.00,4250312.00,1075140407.00,0


In [60]:
score = euclidean_distances(volume_per_user.iloc[:1, 1:], less_engaged_cluster.iloc[:, 1:])

In [56]:
score.mean()

436484252.0604281