# User Engagement analysis

In [2]:
import pandas as pd
import numpy as np
import matplotlib
from sqlalchemy import create_engine
from urllib.parse import quote

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Connecting to the database

In [3]:
username = 'postgres'
password = 'nati@postgres'
hostname = 'localhost'
port = '5432'
database_name = 'TellCo'

# Escape the special characters in the password
escaped_password = quote(password, safe='')

# Create the database engine
engine = create_engine(f'postgresql://{username}:{escaped_password}@{hostname}:{port}/{database_name}')

# Establish a connection
with engine.connect() as connection:
    # Query the data and load it into a pandas DataFrame
    query = """
        SELECT *
        FROM xdr_data
    """
    df = pd.read_sql(query, connection)

df.head()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.311448e+19,4/4/2019 12:01,770.0,4/25/2019 14:35,662.0,1823652.0,208201400000000.0,33664960000.0,35521210000000.0,9.16456699548519E+015,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
1,1.311448e+19,4/9/2019 13:04,235.0,4/25/2019 8:15,606.0,1365104.0,208201900000000.0,33681850000.0,35794010000000.0,L77566A,...,20247395.0,19111729.0,18338413.0,17227132.0,608750074.0,1170709.0,526904238.0,15055145.0,53800391.0,653384965.0
2,1.311448e+19,4/9/2019 17:42,1.0,4/25/2019 11:58,652.0,1361762.0,208200300000000.0,33760630000.0,35281510000000.0,D42335A,...,19725661.0,14699576.0,17587794.0,6163408.0,229584621.0,395630.0,410692588.0,4215763.0,27883638.0,279807335.0
3,1.311448e+19,4/10/2019 0:31,486.0,4/25/2019 7:36,171.0,1321509.0,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,21388122.0,15146643.0,13994646.0,1097942.0,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,846028530.0
4,1.311448e+19,4/12/2019 20:10,565.0,4/25/2019 10:40,954.0,1089009.0,208201400000000.0,33699800000.0,35407010000000.0,D88865A,...,15259380.0,18962873.0,17124581.0,415218.0,527707248.0,3529801.0,550709500.0,13910322.0,38542814.0,569138589.0


### Top 10 customers per engagement metric 

The top 10 engagement metric we are required to aggregate are:

- Sessions frequency 
- The duration of the session 
- The sessions total traffic (download and upload (bytes))


Let's aggregate the above metrics per customer id (MSISDN/Number) and report the top 10 customers per engagement metric.


In [11]:
# Calculate session frequency for each customer
session_frequency = df.groupby('MSISDN/Number')['IMEI'].nunique()

# Group the data by "MSISDN/Number" and calculate engagement metrics
engagement_metrics = df.groupby('MSISDN/Number').agg({
    'IMEI': 'nunique',                             # Sessions frequency
    'Dur. (ms)': 'sum',                            # Duration of the session
    'Total UL (Bytes)': 'sum',                      # Sessions total traffic (upload)
    'Total DL (Bytes)': 'sum'                       # Sessions total traffic (download)
})

# Merge session frequency with other engagement metrics
engagement_metrics['Session Frequency'] = session_frequency

# Calculate total data volume (UL + DL)
engagement_metrics['Total Data Volume'] = engagement_metrics['Total UL (Bytes)'] + engagement_metrics['Total DL (Bytes)']

# Sort the customers based on each engagement metric
top_10_frequency = engagement_metrics.sort_values('IMEI', ascending=False).head(10)
top_10_duration = engagement_metrics.sort_values('Dur. (ms)', ascending=False).head(10)
top_10_traffic = engagement_metrics.sort_values('Total Data Volume', ascending=False).head(10)

# Print the top 10 customers for each engagement metric
print("Top 10 Customers by Sessions Frequency:")
print(top_10_frequency[['IMEI', 'Session Frequency']])

print("\nTop 10 Customers by Session Duration:")
print(top_10_duration[['IMEI', 'Dur. (ms)']])

print("\nTop 10 Customers by Sessions Total Traffic:")
print(top_10_traffic[['IMEI', 'Total Data Volume']])

Top 10 Customers by Sessions Frequency:
               IMEI  Session Frequency
MSISDN/Number                         
3.368132e+10      2                  2
3.369948e+10      2                  2
3.365077e+10      2                  2
3.360192e+10      2                  2
3.365803e+10      2                  2
3.365982e+10      2                  2
3.368732e+10      2                  2
3.360100e+10      1                  1
3.366916e+10      1                  1
3.366916e+10      1                  1

Top 10 Customers by Session Duration:
               IMEI   Dur. (ms)
MSISDN/Number                  
3.362578e+10      1  18553754.0
3.361489e+10      1   9966898.0
3.376054e+10      1   9279434.0
3.362632e+10      1   8791927.0
3.366716e+10      1   8744914.0
3.366284e+10      1   6614270.0
3.366469e+10      1   6288730.0
3.360313e+10      1   6287761.0
3.366746e+10      1   5649882.0
3.376041e+10      1   5321667.0

Top 10 Customers by Sessions Total Traffic:
               IMEI  Tot