In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from process_data import  get_df

def _format_without_decimals(value):
    """Format a float with zero fractional digits (fixed-point, no exponent)."""
    return '%.0f' % value


# Use the formatter above for every float pandas displays from here on.
pd.set_option('display.float_format', _format_without_decimals)

# Load the working frame via get_df(), which is imported from process_data.
df = get_df()


In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Data cleaning for the exploratory analysis: keep only the numeric columns,
# then replace both missing values and outliers with the mean of their column.

# Tukey fence multiplier: a value lying more than this many interquartile
# ranges below the first quartile or above the third quartile of its column
# is treated as an outlier.
OUTLIER_IQR_FACTOR = 1.5

non_numeric_columns = df.select_dtypes(exclude=['number']).columns

df_numeric = df.drop(non_numeric_columns, axis=1)

# Column means (NaNs are skipped) are computed once and used as the
# replacement value for missing entries and for outliers alike.
column_means = df_numeric.mean()

# Quartiles come from the observed values only, so the fences are not
# distorted by imputed entries.
first_quartile = df_numeric.quantile(0.25)
third_quartile = df_numeric.quantile(0.75)
interquartile_range = third_quartile - first_quartile
lower_fence = first_quartile - OUTLIER_IQR_FACTOR * interquartile_range
upper_fence = third_quartile + OUTLIER_IQR_FACTOR * interquartile_range

# Comparisons against NaN are False, so missing entries are never flagged here.
is_outlier = (
    df_numeric.lt(lower_fence, axis='columns')
    | df_numeric.gt(upper_fence, axis='columns')
)

# Blank out the outliers, then fill every gap (originally missing or just
# blanked) with the column mean.
# NOTE(review): columns are selected by dtype only, so numeric identifier
# columns (e.g. 'MSISDN/Number') are treated as well - confirm that is wanted.
cleaned_data = df_numeric.mask(is_outlier).fillna(column_means)

In [None]:
# Engagement metric 1 - session frequency: how many rows of the dataset
# belong to each MSISDN/Number (each row presumably being one session).
rows_per_msisdn = df['MSISDN/Number'].value_counts()

# Turn the counts into a two-column frame; value_counts() already orders the
# numbers from most to least frequent.
session_frequency = (
    rows_per_msisdn
    .rename_axis('MSISDN/Number')
    .reset_index(name='session_frequency')
)
session_frequency.head(10)


In [None]:
# Engagement metrics 2 and 3 - total session duration and total data usage
# (upload + download) per MSISDN/Number.
#
# `session_frequency` (metric 1) is built by the preceding cell and `df` is
# not modified in between, so it is not recomputed here. pandas is imported
# once at the top of the notebook.

# Per-row traffic volume in bytes: upload plus download.
df['Total Data Usage'] = df['Total UL (Bytes)'] + df['Total DL (Bytes)']

# Sum the data usage and the duration over all rows of each MSISDN/Number;
# the result is indexed by MSISDN/Number.
total_data_usage_dur = df.groupby('MSISDN/Number')[['Total Data Usage', 'Dur. (ms)']].sum()

total_data_usage_dur

In [None]:

# Combine the three engagement metrics into a single frame.
# 'MSISDN/Number' is a column of `session_frequency` and the index level of
# `total_data_usage_dur`; the outer join keeps numbers found in either one.
merged_df = session_frequency.merge(
    total_data_usage_dur,
    how='outer',
    on='MSISDN/Number',
)
merged_df

In [None]:
# Rank the MSISDN/Numbers by their summed 'Dur. (ms)', longest first,
# and show the top 10.
duration_columns = ['MSISDN/Number', 'Dur. (ms)']
duration_by_msisdn = merged_df.loc[:, duration_columns]
total_duration = duration_by_msisdn.sort_values(by='Dur. (ms)', ascending=False)
total_duration.head(10)

In [None]:
# Rank the MSISDN/Numbers by their summed 'Total Data Usage'
# (upload + download bytes), largest first, and show the top 10.
usage_columns = ['MSISDN/Number', 'Total Data Usage']
usage_by_msisdn = merged_df.loc[:, usage_columns]
total_Total_Data_Usage = usage_by_msisdn.sort_values(by='Total Data Usage', ascending=False)
total_Total_Data_Usage.head(10)

In [None]:
# Normalize each engagement metric and run k-means (k=3) to classify customers
# into three engagement groups, then show the groups in a 3D scatter plot.

N_CLUSTERS = 3      # number of engagement groups requested by the task
RANDOM_STATE = 42   # fixed seed so the centroid initialisation is repeatable
# Number of k-means restarts. Set explicitly because scikit-learn's default
# changed from 10 to 'auto' in version 1.4 (and 1.2/1.3 warn when it is left
# unset); pinning it keeps the clustering the same across versions.
N_INIT = 10

# Engagement metrics used as clustering features.
engagement_metrics = [
    'Total Data Usage',
    'Dur. (ms)',
    'session_frequency',
]

# Keep only rows where all three metrics are present. The explicit copy makes
# df_cluster an independent frame, so adding the 'Cluster' column below can
# never be an assignment into a slice of merged_df.
df_cluster = merged_df[engagement_metrics].dropna().copy()

# Standardize the metrics (zero mean, unit variance) so that no single metric
# dominates the Euclidean distances k-means relies on.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_cluster)

# Fit k-means on the standardized metrics and store each row's cluster label.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=N_INIT, random_state=RANDOM_STATE)
df_cluster['Cluster'] = kmeans.fit_predict(scaled_data)

# 3D scatter of the clusters. The axes show the original (unscaled) metric
# values; only the clustering itself used the standardized data.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

scatter = ax.scatter(
    df_cluster['session_frequency'],
    df_cluster['Dur. (ms)'],
    df_cluster['Total Data Usage'],
    c=df_cluster['Cluster'],
    cmap='viridis',
    s=50,
    alpha=0.6,
    edgecolors='w'
)

ax.set_xlabel('Session Frequency')
ax.set_ylabel('Duration (ms)')
ax.set_zlabel('Total Data Usage')
ax.set_title('K-Means Clustering of User Engagement')

# ax.legend() already attaches the legend to the axes, so it is not added a
# second time with add_artist().
legend1 = ax.legend(*scatter.legend_elements(), title='Clusters')

plt.show()

