In [15]:
import sys
import os
import pandas as pd # type: ignore
import numpy as np # type: ignore
from sklearn.cluster import KMeans # type: ignore
from sklearn.metrics.pairwise import euclidean_distances # type: ignore
from sklearn.linear_model import LinearRegression # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore
import matplotlib.pyplot as plt # type: ignore
import seaborn as sns # type: ignore
sys.path.append(os.path.abspath('C:/Users/nejat/AIM Projects/Telecommunication Data Analysis/src'))
from db_connection import PostgresConnection
from data_cleaning import preprocess_data  

engagement_df = pd.read_csv('user_engagement_results.csv')
experience_df = pd.read_csv('experience_analysis_results.csv')

print(engagement_df.head())
print(experience_df.head())

   MSISDN/Number  Dur. (ms)  Total DL (Bytes)  Total UL (Bytes)  Start  \
0   3.360100e+10   116720.0      8.426375e+08        36053108.0      1   
1   3.360100e+10   181230.0      1.207552e+08        36104459.0      1   
2   3.360100e+10   134969.0      5.566597e+08        39306820.0      1   
3   3.360101e+10    49878.0      4.019932e+08        20327526.0      1   
4   3.360101e+10    37104.0      1.363130e+09        94280527.0      2   

   cluster  
0        0  
1        0  
2        0  
3        0  
4        2  
   MSISDN/Number  TCP DL Retrans. Vol (Bytes)  Avg RTT DL (ms)  \
0   3.360100e+10                 2.080991e+07        46.000000   
1   3.360100e+10                 2.080991e+07        30.000000   
2   3.360100e+10                 2.080991e+07       109.795706   
3   3.360101e+10                 1.066000e+03        69.000000   
4   3.360101e+10                 1.507977e+07        57.000000   

   Avg Bearer TP DL (kbps)                    Handset Type  cluster  
0         

In [14]:
def calculate_engagement_experience_scores(engagement_df, experience_df):
    kmeans_engagement = KMeans(n_clusters=3, random_state=42)
    engagement_clusters = kmeans_engagement.fit_predict(combined_df[['Dur. (ms)', 'Total DL (Bytes)', 'Total UL (Bytes)']])
    least_engaged_centroid = kmeans_engagement.cluster_centers_[0] 

    combined_df['engagement_score'] = euclidean_distances(combined_df[['Dur. (ms)', 'Total DL (Bytes)', 'Total UL (Bytes)']], 
                                                         [least_engaged_centroid]).flatten()

    kmeans_experience = KMeans(n_clusters=3, random_state=42)
    experience_clusters = kmeans_experience.fit_predict(combined_df[['TCP DL Retrans. Vol (Bytes)', 'Avg RTT DL (ms)', 'Avg Bearer TP DL (kbps)']])
    worst_experience_centroid = kmeans_experience.cluster_centers_[0]  

    combined_df['experience_score'] = euclidean_distances(combined_df[['TCP DL Retrans. Vol (Bytes)', 'Avg RTT DL (ms)', 'Avg Bearer TP DL (kbps)']], 
                                                           [worst_experience_centroid]).flatten()
    combined_df['satisfaction_score'] = (combined_df['engagement_score'] + combined_df['experience_score']) / 2
    return combined_df


combined_df = calculate_engagement_experience_scores(engagement_df, experience_df)

top_10_satisfied_customers = combined_df[['MSISDN/Number', 'satisfaction_score']].sort_values(by='satisfaction_score').head(10)
print(top_10_satisfied_customers)



        MSISDN/Number  satisfaction_score
30153    3.365825e+10        2.380003e+06
42669    3.366106e+10        3.454811e+06
67560    3.366795e+10        4.755155e+06
49573    3.366268e+10        4.755995e+06
67561    3.366795e+10        4.908502e+06
100494   3.376388e+10        5.522642e+06
46237    3.366191e+10        6.153185e+06
59908    3.366541e+10        6.256232e+06
41908    3.366086e+10        6.385396e+06
88677    3.369947e+10        6.416834e+06


In [21]:
X = combined_df[['engagement_score', 'experience_score']]
y = combined_df['satisfaction_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error of the model: {mse:.4f}")

Mean Squared Error of the model: 0.0000


In [25]:
features = combined_df[['engagement_score', 'experience_score']]

kmeans = KMeans(n_clusters=2, random_state=42)
combined_df['experience_engagement_cluster'] = kmeans.fit_predict(features)

print(combined_df.head())



   MSISDN/Number  Dur. (ms)  Total DL (Bytes)  Total UL (Bytes)  Start  \
0   3.360100e+10   116720.0      8.426375e+08        36053108.0      1   
1   3.360100e+10   181230.0      1.207552e+08        36104459.0      1   
2   3.360100e+10   134969.0      5.566597e+08        39306820.0      1   
3   3.360101e+10    49878.0      4.019932e+08        20327526.0      1   
4   3.360101e+10    37104.0      1.363130e+09        94280527.0      2   

   cluster_x  TCP DL Retrans. Vol (Bytes)  Avg RTT DL (ms)  \
0          0                 2.080991e+07        46.000000   
1          0                 2.080991e+07        30.000000   
2          0                 2.080991e+07       109.795706   
3          0                 1.066000e+03        69.000000   
4          2                 1.507977e+07        57.000000   

   Avg Bearer TP DL (kbps)                    Handset Type  cluster_y  \
0                     37.0  Huawei P20 Lite Huawei Nova 3E          0   
1                     48.0          

In [26]:
print(combined_df.columns)

Index(['MSISDN/Number', 'Dur. (ms)', 'Total DL (Bytes)', 'Total UL (Bytes)',
       'Start', 'cluster_x', 'TCP DL Retrans. Vol (Bytes)', 'Avg RTT DL (ms)',
       'Avg Bearer TP DL (kbps)', 'Handset Type', 'cluster_y',
       'engagement_score', 'experience_score', 'satisfaction_score',
       'experience_engagement_cluster'],
      dtype='object')


In [27]:
cluster_summary = combined_df.groupby('experience_engagement_cluster')[['satisfaction_score', 'experience_score']].mean()
print("Cluster Summary:\n", cluster_summary)

Cluster Summary:
                                satisfaction_score  experience_score
experience_engagement_cluster                                      
0                                    6.857367e+08      1.313037e+07
1                                    3.825166e+08      1.239207e+07
