In [14]:
# my_notebook.ipynb

import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [15]:


# Read the cleaned dataset from disk into `new_df`; the cells below attach
# cluster labels and scores to this frame.
cleaned_data_path = 'C:\\Users\\Nole\\Desktop\\Teleco\\Notebook\\Cleaned_data_csv\\cleaned_data.csv'
new_df = pd.read_csv(cleaned_data_path)


In [16]:
# Load the user engagement table from its CSV export and display it.
engagement_csv_path = 'C:/Users/Nole/Desktop/Teleco/Notebook/Cleaned_data_csv/user_engagement_table.csv'
user_engagement_table = pd.read_csv(engagement_csv_path)
user_engagement_table

Unnamed: 0,MSISDN/Number,session frequency,Dur. (ms),Total UL (Bytes),session total traffic,mean_engagement,engagement_group
0,3.366496e+10,47.0,1823652.0,36749741.0,345629377.0,1.158177e+08,1
1,3.368185e+10,70.0,1365104.0,53800391.0,707185356.0,2.361835e+08,2
2,3.376063e+10,70.0,1361762.0,27883638.0,307690973.0,1.030176e+08,1
3,3.375034e+10,70.0,1321509.0,43324218.0,889352748.0,2.968914e+08,2
4,3.369980e+10,70.0,1089009.0,38542814.0,607681403.0,2.029235e+08,0
...,...,...,...,...,...,...,...
146886,3.366865e+10,322.0,59587.0,53745392.0,872761860.0,2.909406e+08,2
146887,3.365069e+10,32.0,81230.0,57628851.0,631804110.0,2.106285e+08,0
146888,3.366345e+10,29.0,97970.0,39135081.0,705783925.0,2.352940e+08,2
146889,3.362189e+10,49.0,98249.0,34912224.0,627698629.0,2.092656e+08,0


In [17]:
# Load the user experience table from its CSV export and display it.
experience_csv_path = 'C:/Users/Nole/Desktop/Teleco/Notebook/Cleaned_data_csv/user_experience_table.csv'
user_experience_table = pd.read_csv(experience_csv_path)
user_experience_table

Unnamed: 0,Average TCP retransmission,Average RTT,Average throughput
0,13375.0,23.5,33.5
1,13375.0,35.0,21.0
2,13375.0,35.0,7.5
3,13375.0,35.0,44.0
4,13375.0,35.0,7.5
...,...,...,...
146886,8357731.0,161.0,32406.5
146887,8357731.0,16.0,58.5
146888,8357731.0,14.5,38.5
146889,8357731.0,24.5,45.0


# engagement_metrics

In [18]:

# Engagement score: cluster users on their normalized engagement metrics and
# score each user by the Euclidean distance to the least-engaged cluster.

# Metrics used to describe each user's engagement
engagement_metrics = user_engagement_table[['session total traffic', 'session frequency', 'Dur. (ms)']]

# Rescale every metric to [0, 1] so that no single unit dominates the distances
scaler = MinMaxScaler()
engagement_metrics_normalized = scaler.fit_transform(engagement_metrics)

# Perform K-means clustering with k=3 (seeded for reproducible labels)
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(engagement_metrics_normalized)

# Add the cluster labels to the original DataFrame.
# NOTE(review): this assignment is positional, so it relies on new_df and
# user_engagement_table holding the same rows in the same order - confirm.
new_df['Engagement_Cluster'] = cluster_labels

# Use the centroids fitted by K-means instead of re-deriving them by hand
cluster_centroids = list(kmeans.cluster_centers_)

# K-means numbers its clusters arbitrarily, so index 0 is not a meaningful
# reference point. Pick the centroid with the smallest coordinate sum instead.
# NOTE(review): assumes larger traffic / frequency / duration means "more
# engaged", which makes that centroid the least-engaged cluster - confirm.
least_engaged_cluster = int(np.argmin(kmeans.cluster_centers_.sum(axis=1)))

# Euclidean distance from each user to the least-engaged centroid
distances = np.linalg.norm(
    engagement_metrics_normalized - cluster_centroids[least_engaged_cluster],
    axis=1,
)

# Assign the engagement score to each user
new_df['Engagement_Score'] = distances

# Display the DataFrame with the assigned engagement scores
print(new_df[['MSISDN/Number', 'Engagement_Score']])


        MSISDN/Number  Engagement_Score
0        3.366496e+10          0.942178
1        3.368185e+10          0.716448
2        3.376063e+10          0.708781
3        3.375034e+10          0.780384
4        3.369980e+10          0.543981
...               ...               ...
146886   3.366865e+10          0.404678
146887   3.365069e+10          0.144584
146888   3.366345e+10          0.223831
146889   3.362189e+10          0.139617
146890   3.361962e+10          0.104500

[146891 rows x 2 columns]


# experience_metrics

In [19]:


# Experience score: cluster users on their standardized experience metrics
# and score each user by the Euclidean distance to its own cluster centroid.
# NOTE(review): distance to the user's own centroid measures how atypical the
# user is within its cluster; if the score is meant to be the distance from a
# fixed reference (e.g. the worst-experience cluster), this needs changing.

# Keep only the metric columns. 'Unnamed: 0' is the row index that was written
# into the CSV; feeding it to the scaler would turn the row number into a
# clustering feature. errors='ignore' keeps this working if the column is absent.
experience_metrics = user_experience_table.drop(columns=['Unnamed: 0'], errors='ignore')

# Standardize the experience metrics (zero mean, unit variance per column)
scaler = StandardScaler()
experience_metrics_standardized = scaler.fit_transform(experience_metrics)

# Perform K-means clustering with k=3 (seeded for reproducible labels)
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(experience_metrics_standardized)

# Add the cluster labels to the original DataFrame.
# NOTE(review): this assignment is positional, so it relies on new_df and
# user_experience_table holding the same rows in the same order - confirm.
new_df['Experience_Cluster'] = cluster_labels

# Use the centroids fitted by K-means instead of re-deriving them by hand
cluster_centroids = list(kmeans.cluster_centers_)

# Indexing the centroid matrix with the label vector yields, for every row,
# the centroid of the cluster that row belongs to - no per-cluster loop needed.
own_centroids = kmeans.cluster_centers_[cluster_labels]
distances = np.linalg.norm(experience_metrics_standardized - own_centroids, axis=1)

# Assign the experience score to each user
new_df['Experience_Score'] = distances

# Display the DataFrame with the assigned experience scores
print(new_df[['MSISDN/Number', 'Experience_Score']])


        MSISDN/Number  Experience_Score
0        3.366496e+10          0.199729
1        3.368185e+10          0.182025
2        3.376063e+10          0.182928
3        3.375034e+10          0.180490
4        3.369980e+10          0.182928
...               ...               ...
146886   3.366865e+10          0.501974
146887   3.365069e+10          0.215946
146888   3.366345e+10          0.220402
146889   3.362189e+10          0.199400
146890   3.361962e+10          0.206704

[146891 rows x 2 columns]


# Calculate the satisfaction score as the average of engagement and experience scores

In [23]:
# Satisfaction score = row-wise mean of the engagement and experience scores.
# DataFrame.mean skips NaN by default, so a row with one missing score keeps
# the value of the other one.
score_columns = ['Engagement_Score', 'Experience_Score']
new_df['Satisfaction_Score'] = new_df[score_columns].mean(axis=1)

# Report the 10 customers with the highest satisfaction score
top_10_satisfied_customers = new_df.nlargest(10, 'Satisfaction_Score')[['MSISDN/Number', 'Satisfaction_Score']]

# Display the top 10 satisfied customers
print(top_10_satisfied_customers)


0         0.570954
1         0.449236
2         0.445854
3         0.480437
4         0.363454
            ...   
146886    0.453326
146887    0.180265
146888    0.222117
146889    0.169509
146890    0.155602
Name: Satisfaction_Score, Length: 146891, dtype: float64

# Linear Regression model 

In [None]:


# Fit a linear regression that predicts the satisfaction score from two
# engagement features and one experience feature, then evaluate it on a
# held-out 20% split.

# Engagement predictors
engagement_features = user_engagement_table[['session frequency', 'session total traffic']]

# Experience predictor: only the average round-trip time is used
experience_features = user_experience_table[['Average RTT']]

# Combine the selected columns into the feature set X (aligned on the index)
X = pd.concat([engagement_features, experience_features], axis=1)

# Target: the satisfaction score computed earlier on new_df
y = new_df['Satisfaction_Score']

# X and y come from different frames; fail early with a clear message if the
# row counts disagree instead of deep inside scikit-learn.
# NOTE(review): equal length does not prove the rows are in the same order -
# confirm that the three tables are row-aligned.
assert len(X) == len(y), "feature matrix and target must have the same number of rows"

# Split the data into training and testing sets (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


Mean Squared Error: 0.3042275603022315
R-squared: 0.12307112421965127


# Run a k-means (k=2) on the engagement & the experience scores.

In [24]:

# Two-cluster K-means over the (engagement score, experience score) pairs
# stored on new_df.
engagement_experience_scores = new_df[['Engagement_Score', 'Experience_Score']]

# Fit the model (k=2, seeded) and read the per-row labels off the fitted estimator
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(engagement_experience_scores)
cluster_labels = kmeans.labels_

# Store the label of each row next to its scores
new_df['Engagement_Experience_Cluster'] = cluster_labels

# Show the identifiers, both scores and the assigned cluster
report_columns = ['MSISDN/Number', 'Engagement_Score', 'Experience_Score', 'Engagement_Experience_Cluster']
print(new_df[report_columns])


        MSISDN/Number  Engagement_Score  Experience_Score  \
0        3.366496e+10          0.942178          0.199729   
1        3.368185e+10          0.716448          0.182025   
2        3.376063e+10          0.708781          0.182928   
3        3.375034e+10          0.780384          0.180490   
4        3.369980e+10          0.543981          0.182928   
...               ...               ...               ...   
146886   3.366865e+10          0.404678          0.501974   
146887   3.365069e+10          0.144584          0.215946   
146888   3.366345e+10          0.223831          0.220402   
146889   3.362189e+10          0.139617          0.199400   
146890   3.361962e+10          0.104500          0.206704   

        Engagement_Experience_Cluster  
0                                   0  
1                                   0  
2                                   0  
3                                   0  
4                                   0  
...                        

# Aggregate the average satisfaction & experience score per cluster. 

In [None]:
# Mean satisfaction and experience score within each score cluster
cluster_aggregates = (
    new_df
    .groupby('Engagement_Experience_Cluster')[['Satisfaction_Score', 'Experience_Score']]
    .mean()
)

# Show the per-cluster averages
print(cluster_aggregates)


                               Satisfaction_Score  Experience_Score
Engagement_Experience_Cluster                                      
0                                        0.348590          0.463430
1                                        6.694859         13.144885


In [None]:

# Collect the customer identifier and the three scores, then write them to
# 'scores_table.csv' in the working directory without the row index.
export_columns = ['MSISDN/Number', 'Engagement_Score', 'Experience_Score', 'Satisfaction_Score']
scores_table = new_df.loc[:, export_columns]
scores_table.to_csv('scores_table.csv', index=False)

