In [11]:
import os, sys
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 60)

In [12]:
# Put the parent directory of the notebook on the import path (once), so that
# the project-local `src` package can be imported by the cells below.
rpath = os.path.abspath(os.pardir)
if sys.path.count(rpath) == 0:
    sys.path[:0] = [rpath]

In [13]:
# load database connection class
from src.loader import ConnectToDatabase
# load util functions
from src.utils import UtilFunctions

In [14]:
# load env variables
load_dotenv()

True

In [15]:
# Database settings are taken from environment variables so each user can
# point the notebook at their own database. A variable that is not set yields
# None, and DB_PORT is kept as the string read from the environment.
db_host = os.environ.get("DB_HOST")
db_user = os.environ.get("DB_USER")
db_password = os.environ.get("DB_PASSWORD")
db_port = os.environ.get("DB_PORT")
db_database = os.environ.get("DB_NAME")

# Connection parameters handed to ConnectToDatabase in the next cell.
db_params = dict(
    host=db_host,
    user=db_user,
    password=db_password,
    port=db_port,
    database=db_database,
)

In [16]:
# Build the project-local database helper from the env-derived settings and
# obtain the engine that pd.read_sql_table uses below.
# NOTE(review): presumably get_engine() returns a SQLAlchemy engine — confirm in src.loader.
connect_to_database = ConnectToDatabase(db_params)
engine = connect_to_database.get_engine()

In [17]:
# Instantiate the project-local helper class from src.utils
# NOTE(review): `utils` is not referenced in the visible part of this notebook.
utils = UtilFunctions()

In [18]:
# Name of the database table that is read into `df` by pd.read_sql_table
table_name = 'xdr_data'

In [19]:
# Read the whole table into a DataFrame; every later cell depends on `df`.
# NOTE: the recorded run of this cell failed with psycopg2.OperationalError
# (connection refused on localhost:5439), so the database must be reachable
# before the rest of the notebook can run.
df = pd.read_sql_table(table_name, con=engine)

OperationalError: (psycopg2.OperationalError) could not connect to server: Connection refused
	Is the server running on host "localhost" (::1) and accepting
	TCP/IP connections on port 5439?

(Background on this error at: https://sqlalche.me/e/20/e3q8)

## Explore the given dataset

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Print the non-null count and data type of each column
df.info()

In [None]:
# Number of missing values per column
df.isnull().sum()

In [None]:
# Descriptive statistics of the dataset's columns
df.describe()

In [None]:
# Return the first five records of the data
df.head()

#### Advantages of calculating skew
- it helps us assess the symmetry of the distribution
- it will help us fill the missing values with the mean or the median, depending on the distribution

In [None]:
# Email UL (Bytes): example of a uniform distribution
# TODO: filling missing values of a uniformly distributed column may need more investigation
fig, ax = plt.subplots()
ax.hist(df["Email UL (Bytes)"], bins=180 // 5, color='blue', edgecolor='black')
ax.set_title("Email data volume (in Bytes) sent by the MS during this session")
ax.set_xlabel("Email UL in Bytes")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# DL TP < 50 Kbps (%): example of a left-skewed distribution
# The title previously ended in a truncated "< …."; it now names the 50 Kbps
# threshold that the plotted column and the x-axis label refer to.
plt.hist(df["DL TP < 50 Kbps (%)"], color='green', edgecolor='orange', bins=180 // 5)
plt.title("Duration ratio when Bearer Downlink Throughput < 50 Kbps")
plt.xlabel("DL TP < 50 Kbps")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Nb of sec with 1250B < Vol UL < 6250B: example of a right-skewed distribution
# TODO: find a better example column
fig, ax = plt.subplots()
ax.hist(
    df["Nb of sec with 1250B < Vol UL < 6250B"],
    bins=180 // 5,
    color='blue',
    edgecolor="orange",
)
ax.set_title("Nb of sec with 1250B < Vol UL < 6250B")
ax.set_xlabel("1250B < Vol UL < 6250B")
ax.set_ylabel("Frequency")
plt.show()

## Data Cleaning

In [None]:
# Drop the 'Dur. (ms).1' column, which this notebook treats as a duplicate.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# the cell idempotent: re-running it after the column is gone no longer raises
# a KeyError.
df = df.drop(columns=['Dur. (ms).1'], errors='ignore')
df.head()

In [None]:
# Dropping columns with a lot of missing values
# can't drop these columns since they are needed for task 4
# columns_to_drop = [
#     'Nb of sec with 125000B < Vol DL',
#     'Nb of sec with 1250B < Vol UL < 6250B',
#     'Nb of sec with 31250B < Vol DL < 125000B',
#     'Nb of sec with 37500B < Vol UL',
#     'Nb of sec with 6250B < Vol DL < 31250B',
#     'Nb of sec with 6250B < Vol UL < 37500B'
# ]
# df = df.drop(columns=columns_to_drop, axis=1)

In [None]:
# Preview the first rows of the DataFrame at this point of the cleaning
df.head()

In [None]:
# Names of the float / int columns; the imputation loop below checks the skew
# of each of them to decide how to fill missing values.
# (The earlier assignment of the selected DataFrame to the same name was
# overwritten immediately and has been removed.)
# To inspect the skew of every numeric column at once:
# df[numeric_columns].skew(axis=0).round(1)
numeric_columns = df.select_dtypes(include=['float', 'int']).columns

In [None]:
# Names of the object-dtype columns
# NOTE(review): `object_columns` is not referenced in the visible part of this notebook.
object_columns  = df.select_dtypes(include=["object"]).columns

In [None]:
# Fill the missing values of every numeric column, choosing the statistic from
# the column's skew rounded to the nearest integer:
#   - non-zero rounded skew (either direction) -> median
#   - zero rounded skew, or undefined (NaN) skew -> mean
# NOTE(review): numeric_columns presumably also contains identifier-like
# columns (e.g. 'MSISDN/Number', 'Bearer Id'); filling those with a mean or
# median looks questionable — confirm whether they should be excluded.
for column_name in numeric_columns:
    column = df[column_name]
    # np.round works for NumPy scalars as well as plain Python floats, so it
    # does not depend on the exact type skew() returns.
    column_skew = np.round(column.skew())

    # abs(NaN) > 0 is False, so an undefined skew falls back to the mean,
    # the same branch the original if/elif/else chain took.
    if abs(column_skew) > 0:
        fill_value = column.median()
    else:
        fill_value = column.mean()

    # Assign the filled Series back to the frame. Calling
    # df[column_name].fillna(..., inplace=True) is chained assignment: it
    # operates on an intermediate Series and leaves `df` unchanged when pandas
    # Copy-on-Write is active.
    df[column_name] = column.fillna(fill_value)

In [None]:
# Replace the placeholder value 'undefined' in 'Handset Type' with the most
# frequent real handset type.
# Assuming there is no phone brand called undefined
undefined_rows = df[df['Handset Type'] == 'undefined']
# Compute the mode over the known values only: if 'undefined' itself were the
# most frequent value, using it as the replacement would change nothing.
known_handset_types = df.loc[df['Handset Type'] != 'undefined', 'Handset Type']
handest_type_mode = known_handset_types.mode()[0]
df.loc[undefined_rows.index, 'Handset Type'] = handest_type_mode

In [None]:
# Extract columns needed for user analysis.
# 'Total UL (Bytes)' used to be listed twice, which duplicated that column in
# df_user_analysis and in the exported CSV; each column is now listed once.
all_columns_for_user_analysis = [
    'MSISDN/Number', 'Handset Type', 'Handset Manufacturer',
    'Bearer Id', 'Dur. (ms)',
    'Total DL (Bytes)', 'Total UL (Bytes)',
    'Social Media DL (Bytes)', 'Social Media UL (Bytes)',
    'Google DL (Bytes)', 'Google UL (Bytes)',
    'Email DL (Bytes)', 'Email UL (Bytes)',
    'Youtube DL (Bytes)', 'Youtube UL (Bytes)',
    'Netflix DL (Bytes)', 'Netflix UL (Bytes)',
    'Gaming DL (Bytes)', 'Gaming UL (Bytes)',
    'Other UL (Bytes)', 'Other DL (Bytes)',
]
df_user_analysis = df[all_columns_for_user_analysis]

In [None]:
# Columns used by the user engagement analysis: identifier columns, session
# duration, and the DL/UL byte counts in total and per application.
all_columns_for_user_engagement_analysis = [
    'MSISDN/Number',
    'Bearer Id',
    'Dur. (ms)',
    'Total DL (Bytes)', 'Total UL (Bytes)',
    'Social Media DL (Bytes)', 'Social Media UL (Bytes)',
    'Youtube DL (Bytes)', 'Youtube UL (Bytes)',
    'Netflix DL (Bytes)', 'Netflix UL (Bytes)',
    'Google DL (Bytes)', 'Google UL (Bytes)',
    'Email DL (Bytes)', 'Email UL (Bytes)',
    'Gaming DL (Bytes)', 'Gaming UL (Bytes)',
    'Other DL (Bytes)', 'Other UL (Bytes)',
]
df_user_engagment_analysis = df[all_columns_for_user_engagement_analysis]

In [None]:
# Columns used by the experience analytics, kept in their original order
# because that order becomes the column order of df_user_experience_analysis.
all_columns_needed_for_user_experience_analysis = [
    'MSISDN/Number',
    # TCP retransmission volume
    'TCP DL Retrans. Vol (Bytes)',
    'TCP UL Retrans. Vol (Bytes)',
    # average round-trip time
    'Avg RTT DL (ms)',
    'Avg RTT UL (ms)',
    'Handset Type',
    # average bearer throughput
    'Avg Bearer TP DL (kbps)',
    'Avg Bearer TP UL (kbps)',
    # byte counts: total, then per application
    'Total DL (Bytes)',
    'Total UL (Bytes)',
    'Social Media DL (Bytes)',
    'Social Media UL (Bytes)',
    'Youtube DL (Bytes)',
    'Youtube UL (Bytes)',
    'Netflix DL (Bytes)',
    'Netflix UL (Bytes)',
    'Google DL (Bytes)',
    'Google UL (Bytes)',
    'Email DL (Bytes)',
    'Email UL (Bytes)',
    'Gaming DL (Bytes)',
    'Gaming UL (Bytes)',
    'Other DL (Bytes)',
    'Other UL (Bytes)',
]
df_user_experience_analysis = df[all_columns_needed_for_user_experience_analysis]

In [None]:
# Export csv for User analysis
file_path = '../data/user_analysis.csv'
# Create the target directory first so the export also works on a checkout
# where ../data does not exist yet (to_csv does not create directories).
os.makedirs(os.path.dirname(file_path), exist_ok=True)
df_user_analysis.to_csv(file_path, index=False)

In [None]:
# Export csv for User Engagement analysis
file_path = '../data/user_engagement.csv'
# Create the target directory first so the export also works on a checkout
# where ../data does not exist yet (to_csv does not create directories).
os.makedirs(os.path.dirname(file_path), exist_ok=True)
df_user_engagment_analysis.to_csv(file_path, index=False)

In [None]:
# Export csv for User Experience analysis
# TODO: not implemented yet — df_user_experience_analysis is built above but never written to disk

In [None]:
# Export csv for User Satisfaction analysis