In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

In [7]:
load_dotenv('../.venv/.env')

True

**Retrieve database connection details from environment variables**

In [8]:
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
db_name = os.getenv('DB_NAME')
table_name = 'xdr_data'

**Create a connection string**

In [9]:
connection_string = f'postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}'
engine = create_engine(connection_string)

In [19]:

# Query the table to verify the write
query = 'SELECT * FROM xdr_data'
data = pd.read_sql(query, engine)

** **

In [22]:
print(data.head)

<bound method NDFrame.head of            Bearer Id            Start  Start ms              End     End ms  \
0       1.311448e+19   4/4/2019 12:01  770.0000  4/25/2019 14:35  662.00000   
1       1.311448e+19   4/9/2019 13:04  235.0000   4/25/2019 8:15  606.00000   
2       1.311448e+19   4/9/2019 17:42    1.0000  4/25/2019 11:58  652.00000   
3       1.311448e+19   4/10/2019 0:31  486.0000   4/25/2019 7:36  171.00000   
4       1.311448e+19  4/12/2019 20:10  565.0000  4/25/2019 10:40  954.00000   
...              ...              ...       ...              ...        ...   
149996  7.277826e+18   4/29/2019 7:28  451.0000   4/30/2019 6:02  214.00000   
149997  7.349883e+18   4/29/2019 7:28  483.0000  4/30/2019 10:41  187.00000   
149998  1.311448e+19   4/29/2019 7:28  283.0000  4/30/2019 10:46  810.00000   
149999  1.311448e+19   4/29/2019 7:28  696.0000  4/30/2019 10:40  327.00000   
150000  1.013887e+19              N/A  499.1882              N/A  498.80088   

           Dur. (ms) 

In [42]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 55 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Bearer Id                                 150001 non-null  float64
 1   Start                                     150001 non-null  object 
 2   Start ms                                  150001 non-null  float64
 3   End                                       150001 non-null  object 
 4   End ms                                    150001 non-null  float64
 5   Dur. (ms)                                 150001 non-null  float64
 6   IMSI                                      150001 non-null  float64
 7   MSISDN/Number                             150001 non-null  float64
 8   IMEI                                      150001 non-null  float64
 9   Last Location Name                        150001 non-null  object 
 10  Avg RTT DL (ms)     

**Task 3 - Experience Analytics**

**Task 3.1: Aggregate metrics per customer**

In [43]:
# Aggregate per customer
aggregated_data = data.groupby('IMSI').agg({
    'TCP DL Retrans. Vol (Bytes)': 'mean',
    'TCP UL Retrans. Vol (Bytes)': 'mean',
    'Avg RTT DL (ms)': 'mean',
    'Avg RTT UL (ms)': 'mean',
    'Avg Bearer TP DL (kbps)': 'mean',
    'Avg Bearer TP UL (kbps)': 'mean',
    'Handset Type': lambda x: x.mode()[0]
}).reset_index()

# Calculate total TCP retransmission, RTT, and throughput
aggregated_data['Total_TCP'] = (aggregated_data['TCP DL Retrans. Vol (Bytes)'] + aggregated_data['TCP UL Retrans. Vol (Bytes)'])
aggregated_data['Total_RTT'] = (aggregated_data['Avg RTT DL (ms)'] + aggregated_data['Avg RTT UL (ms)'])
aggregated_data['Total_Throughput'] = (aggregated_data['Avg Bearer TP DL (kbps)'] + aggregated_data['Avg Bearer TP UL (kbps)'])

# Drop the intermediary columns
aggregated_data = aggregated_data.drop(columns=[
    'TCP DL Retrans. Vol (Bytes)',
    'TCP UL Retrans. Vol (Bytes)',
    'Avg RTT DL (ms)',
    'Avg RTT UL (ms)',
    'Avg Bearer TP DL (kbps)',
    'Avg Bearer TP UL (kbps)'
])

aggregated_data.head()

Unnamed: 0,IMSI,Handset Type,Total_TCP,Total_RTT,Total_Throughput
0,204047100000000.0,Quectel Wireless. Quectel Ec21-E,21569570.0,127.458589,2.0
1,204080800000000.0,Quectel Wireless. Quectel Ec25-E,21569570.0,127.458589,1.0
2,208200100000000.0,Dn Electronics Danew Konnect 350,762355.7,125.795706,109.0
3,208200100000000.0,Samsung Galaxy Grand (Gt-I9060X),27979.0,84.0,754.0
4,208200100000000.0,Apple iPhone 6S (A1688),21569570.0,92.0,80.5


**Task 3.2: Compute and list 10 of the top, bottom, and most frequent:**

In [71]:
def compute_top_bottom_frequent(data, column, top_n=10):
    top_values = data[column].nlargest(top_n).reset_index(drop=True)
    bottom_values = data[column].nsmallest(top_n).reset_index(drop=True)
    most_frequent_values = data[column].mode()
    
    # If there are fewer than top_n most frequent values, adjust
    if len(most_frequent_values) > top_n:
        most_frequent_values = most_frequent_values.head(top_n)
    
    return {
        'top': top_values,
        'bottom': bottom_values,
        'most_frequent': most_frequent_values
    }

# Compute statistics for each metric
tcp_stats = compute_top_bottom_frequent(aggregated_data, 'Total_TCP')
rtt_stats = compute_top_bottom_frequent(aggregated_data, 'Total_RTT')
throughput_stats = compute_top_bottom_frequent(aggregated_data, 'Total_Throughput')

# Function to print the results in a readable format
def print_stats(stats, title):
    print(f"\n{title}")
    print("\nTop 10 values:")
    print(stats['top'])
    print("\nBottom 10 values:")
    print(stats['bottom'])
    print("\nMost Frequent values:")
    print(stats['most_frequent'])

# Print the results for each metric
print_stats(tcp_stats, "Total TCP Stats")
print_stats(rtt_stats, "Total RTT Stats")
print_stats(throughput_stats, "Total Throughput Stats")


Total TCP Stats

Top 10 values:
0   50604849.00
1   50317993.00
2   50259399.00
3   50167726.66
4   50136015.00
5   50133425.00
6   50087802.00
7   50039092.66
8   50004063.00
9   49842840.00
Name: Total_TCP, dtype: float64

Bottom 10 values:
0    97.00
1   128.00
2   129.00
3   134.00
4   143.00
5   176.00
6   176.00
7   177.00
8   179.00
9   182.00
Name: Total_TCP, dtype: float64

Most Frequent values:
0   21569572.94
Name: Total_TCP, dtype: float64

Total RTT Stats

Top 10 values:
0   258.00
1   255.00
2   251.00
3   251.00
4   249.00
5   248.00
6   248.00
7   247.00
8   246.00
9   243.00
Name: Total_RTT, dtype: float64

Bottom 10 values:
0    0.00
1    0.00
2    2.00
3    4.00
4    5.00
5    6.00
6    8.00
7    9.00
8    9.00
9   10.00
Name: Total_RTT, dtype: float64

Most Frequent values:
0   127.46
Name: Total_RTT, dtype: float64

Total Throughput Stats

Top 10 values:
0   51675.00
1   51513.00
2   51440.00
3   51423.00
4   51383.00
5   51230.00
6   51225.00
7   51220.00
8   511

**The distribution of the average throughput per handset type**

In [73]:
def report_distribution(df, group_col, value_col, top_n=10):
    distribution = df.groupby(group_col)[value_col].mean()
    top_distribution = distribution.sort_values(ascending=False).head(top_n)
    return top_distribution

throughput_distribution = report_distribution(agg_df, 'Handset Type', 'total_throughput')

print("\nTop 10 Average Throughput per Handset Type:\n")
print(throughput_distribution)


Top 10 Average Throughput per Handset Type:

Handset Type
Xiaomi Communica. M1803E1A           50942.43
Xiaomi Communica. Redmi Note 2       49381.00
Huawei Nova 2I Huawei Mate 10 Lite   47661.00
Htc 2Q6E100                          47493.00
Lephone U Pro                        45669.00
Huawei Stf-Tl10                      45257.00
Huawei Vns-Dl00                      45143.00
Xiaomi Communica. B5                 44366.00
Samsung Galaxy S9 Sm-G960N           44355.00
Samsung Galaxy On 7                  43773.43
Name: total_throughput, dtype: float64


**Average TCP per handset**

In [74]:
def report_distribution(df, group_col, value_col, top_n=10):
    distribution = df.groupby(group_col)[value_col].mean()
    top_distribution = distribution.sort_values(ascending=False).head(top_n)
    return top_distribution


tcp_distribution = report_distribution(agg_df, 'Handset Type', 'total_tcp')

print("\nTop 10 Average TCP Retransmission per Handset Type:\n")
print(tcp_distribution)


Top 10 Average TCP Retransmission per Handset Type:

Handset Type
Samsung Galaxy Core 2 (Sm-G355X)        46559007.00
A-Link Telecom I. Cubot Note S          41411731.00
Spa Condor Elect. Allure M1 Plus        31770168.00
Quartel Infotech. Maximus M84           31293828.47
Tcl Communicatio. Pixi 4 6 3G Android   30861953.00
Asustek Asus Zenfone Selfie Zd551Kl     29993503.30
Lg-M400Dy                               28412371.00
Rim Blackberry Stl100-2 Z10 Rfh121Lw    28249175.70
Lg G6+                                  27079473.00
Samsung Galaxy J3 (Sm-J327)             25369808.00
Name: total_tcp, dtype: float64


**k-means clustering**