# 1)**Introduction:** Why Product Simplification link here

In [3]:
from google.colab import files

# New data file upload. Do not execute this unless you want to change the  datafile.

uploaded = files.upload() #Uncheck only if you want to upload a new file.

Saving mobileData.csv to mobileData.csv


# 2)	Data Gathering & Preparation: Explain key product data, handling multiple sources, establishing features.

In [4]:
import pandas as pd
import io

df = pd.read_csv(io.BytesIO(uploaded['mobileData.csv']))
print(df.head())

  ProductID               ProductName  \
0     P0001      Mobile Postpaid Fish   
1     P0002      Mobile Prepaid Chair   
2     P0003  Mobile Postpaid Keyboard   
3     P0004        Mobile Prepaid Hat   
4     P0005     Mobile Postpaid Shoes   

                                  ProductDescription ProductType ServiceType  \
0  Mobile Postpaid plan for Residential customers...      Mobile    Postpaid   
1  Mobile Prepaid plan for Residential customers....      Mobile     Prepaid   
2  Mobile Postpaid plan for Business customers. 1...      Mobile    Postpaid   
3  Mobile Prepaid plan for Enterprise customers. ...      Mobile     Prepaid   
4  Mobile Postpaid plan for Enterprise customers....      Mobile    Postpaid   

  LifecycleStatus TargetCustomerSegment  Price Currency  BandwidthDataLimit  \
0        Obsolete           Residential  46.20      USD                  46   
1       In Design           Residential  54.71      USD                  15   
2          Active              Busi

# 3)	**Feature engineering:** Normalize feature to make it suitable for the AI training

In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
def normalize_features(df):
    """Normalizes features for K-means clustering."""
    numerical_features = ['Price', 'BandwidthDataLimit', 'PS1001:DataLimit', 'PS1001:CallMinutes',
                          'PS2001:DownloadSpeed', 'PS2001:UploadSpeed', 'PS4001:StorageCapacity',
                          'PS4001:CameraResolution', 'PS2002:SLA']

    categorical_features = ['ServiceType', 'LifecycleStatus', 'TargetCustomerSegment',
                            'SalesChannel', 'GeographicAvailability', 'Unit',
                            'RS1001:ResourceType', 'SS2001:ServiceName', 'SR3001:ResourceStatus']

    boolean_features = ['PS1002:InternationalRoaming', 'PS3001:AdFree', 'PS2002:StaticIP']

    text_features = ['ProductDescription', 'TermsAndConditions', 'ProductFeatures', 'SR3001:ResourceID', 'SS2001:ServiceDescription']

    imputer_numerical = SimpleImputer(strategy='mean')
    df[numerical_features] = imputer_numerical.fit_transform(df[numerical_features])

    for feature in boolean_features:
        df[feature] = df[feature].fillna(False).astype(int)

    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_categorical = encoder.fit_transform(df[categorical_features])
    encoded_columns = encoder.get_feature_names_out(categorical_features)
    encoded_df = pd.DataFrame(encoded_categorical, columns=encoded_columns)
    df = pd.concat([df, encoded_df], axis=1)

    tfidf_vectorizer = TfidfVectorizer(max_features=100)
    for text_feature in text_features:
        tfidf_matrix = tfidf_vectorizer.fit_transform(df[text_feature].fillna(''))
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'{text_feature}_tfidf_{i}' for i in range(tfidf_matrix.shape[1])])
        df = pd.concat([df, tfidf_df], axis=1)

    fields_to_remove = ['ProductName', 'Currency', 'RelatedProducts', 'ProductVersion',
                        'ProductType', 'IncludedServices','PO1001:OfferName','PO1001:OfferDescription','PO1001:MarketingCampaign'] + categorical_features + text_features

    df = df.drop(fields_to_remove, axis=1, errors='ignore')

    return df, scaler, encoder, tfidf_vectorizer

df_normalized, scaler, encoder, tfidf_vectorizer = normalize_features(df)
df_normalized.head(1)


Unnamed: 0,ProductID,Price,BandwidthDataLimit,PS1001:DataLimit,PS1001:CallMinutes,PS1002:InternationalRoaming,PS2001:DownloadSpeed,PS2001:UploadSpeed,PS3001:AdFree,PS4001:StorageCapacity,...,SR3001:ResourceID_tfidf_2,SR3001:ResourceID_tfidf_3,SR3001:ResourceID_tfidf_4,SR3001:ResourceID_tfidf_5,SR3001:ResourceID_tfidf_6,SR3001:ResourceID_tfidf_7,SS2001:ServiceDescription_tfidf_0,SS2001:ServiceDescription_tfidf_1,SS2001:ServiceDescription_tfidf_2,SS2001:ServiceDescription_tfidf_3
0,P0001,-0.624616,1.738799,1.738799,0.624033,1,-1.436556,-1.479591,0,-0.246321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.377964,0.377964,0.377964,0.755929


Now let us makesure we have productID separately mapped and we create a dataframe for kmeans

In [6]:
product_ids = df["ProductID"]
kmeans_data = df.drop("ProductID",axis =1)
# Validate data
print(kmeans_data.isnull().sum()) #check for nulls
print(kmeans_data.describe()) #check for statistical anomolies.


ProductName                    0
ProductDescription             0
ProductType                    0
ServiceType                    0
LifecycleStatus                0
TargetCustomerSegment          0
Price                          0
Currency                       0
BandwidthDataLimit             0
Unit                           0
IncludedServices               0
SalesChannel                   0
GeographicAvailability         0
RelatedProducts                0
ProductVersion                 0
TermsAndConditions             0
ProductFeatures                0
PS1001:DataLimit               0
PS1001:CallMinutes             0
PS1002:InternationalRoaming    0
PS2001:DownloadSpeed           0
PS2001:UploadSpeed             0
PS3001:AdFree                  0
PS4001:StorageCapacity         0
PS4001:CameraResolution        0
PS2002:StaticIP                0
PS2002:SLA                     0
PO1001:OfferName               0
PO1001:OfferDescription        0
PO1001:MarketingCampaign       0
RS1001:Res

# 4)**Training**: Cluster products with similar characteristics, guidelines to evaluate the outcome

In [7]:

from sklearn.cluster import KMeans
# Set the number of clusters
k = 3

# Instantiate the KMeans model
kmeans = KMeans(n_clusters=k, random_state=42)  # Set random_state for reproducibility

# Train the model (fit the model to the data)
kmeans.fit(df_normalized.drop('ProductID', axis=1)) #remove product ID as it is not needed for training.

# Add the cluster labels to the DataFrame
df_normalized['cluster'] = kmeans.labels_

# Print the cluster assignments
print(df_normalized[['ProductID', 'cluster']])

# Map the clusters to the products.
df_original = pd.read_csv("mobileData.csv")
df_clustered = pd.merge(df_original, df_normalized[['ProductID', 'cluster']], on='ProductID')
df_clustered.to_csv("mobileData_clustered.csv")




  ProductID  cluster
0     P0001        1
1     P0002        2
2     P0003        0
3     P0004        2
4     P0005        0
5     P0006        1
6     P0007        2
7     P0008        0


Cluster validation:

In [8]:

from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity


# Cluster Size
print("Cluster Sizes:\n", df_clustered['cluster'].value_counts())

# Use the numerical_features list
numerical_features = ['Price', 'BandwidthDataLimit', 'PS1001:DataLimit', 'PS1001:CallMinutes',
                        'PS2001:DownloadSpeed', 'PS2001:UploadSpeed', 'PS4001:StorageCapacity',
                        'PS4001:CameraResolution', 'PS2002:SLA']

# Filter df_clustered to include only numerical and cluster columns
df_clustered_numeric = df_clustered[['cluster'] + numerical_features]

# Cluster Means
print("\nCluster Means:\n", df_clustered_numeric.groupby('cluster').mean())

# Silhouette Score
silhouette_avg = silhouette_score(df_normalized.drop(['ProductID', 'cluster'], axis=1), df_normalized['cluster'])
print("\nSilhouette Score:", silhouette_avg)

# WCSS
print("\nWCSS:", kmeans.inertia_)

# Example showing how to get the similarity matrix for one cluster.
cluster_number = 0
cluster_data = df_normalized[df_normalized['cluster'] == cluster_number].drop(['ProductID', 'cluster'], axis=1)
similarity_matrix = cosine_similarity(cluster_data)
print(similarity_matrix)

Cluster Sizes:
 cluster
2    3
0    3
1    2
Name: count, dtype: int64

Cluster Means:
          Price  BandwidthDataLimit  PS1001:DataLimit  PS1001:CallMinutes  \
cluster                                                                    
0        71.45           20.333333         20.333333           43.666667   
1        56.58           40.500000         40.500000           80.000000   
2        53.41           25.666667         25.666667           49.333333   

         PS2001:DownloadSpeed  PS2001:UploadSpeed  PS4001:StorageCapacity  \
cluster                                                                     
0                   24.000000           12.000000              149.333333   
1                   11.000000            5.500000               40.000000   
2                   19.333333            9.666667               42.666667   

         PS4001:CameraResolution  PS2002:SLA  
cluster                                       
0                      25.333333         0.0  
1   

This step saves our kmeans model for future use.

In [9]:
import joblib

joblib.dump(kmeans, 'kmeans_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(encoder, 'encoder.joblib')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
# Save the clustered data
df_normalized['cluster'] = kmeans.labels_
df_normalized.to_csv("mobileData_clustered.csv", index=False)



# 5)	**Recommendations engine:** Show similar products to business while they are creating. So that they can take conscious decision.


As a next step we shall build a system which takes new products as input and give a list of similar products which are already present. This helps marketing team for decision making.

Select a file that contants new products in the same format as original file we used for training.

In [10]:
new = files.upload() #uncheck only if you want to upload another file.

Saving NewProduct.csv to NewProduct.csv


In [21]:
import io
df_new = pd.read_csv(io.BytesIO(new['NewProduct.csv']))


In [23]:
import pandas as pd
import io
import joblib
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer

# Load saved models
kmeans = joblib.load('kmeans_model.joblib')
scaler = joblib.load('scaler.joblib')
encoder = joblib.load('encoder.joblib')
tfidf_vectorizer = joblib.load('tfidf_vectorizer.joblib')

original_columns = df_normalized.drop('ProductID', axis=1).columns  # Get columns from training data

def normalize_new_product_features(new_product_df, scaler, encoder, tfidf_vectorizer, original_columns):
    """Normalizes new product features using saved models."""
    numerical_features = ['Price', 'BandwidthDataLimit', 'PS1001:DataLimit', 'PS1001:CallMinutes',
                          'PS2001:DownloadSpeed', 'PS2001:UploadSpeed', 'PS4001:StorageCapacity',
                          'PS4001:CameraResolution', 'PS2002:SLA']
    categorical_features = ['ServiceType', 'LifecycleStatus', 'TargetCustomerSegment',
                            'SalesChannel', 'GeographicAvailability', 'Unit',
                            'RS1001:ResourceType', 'SS2001:ServiceName', 'SR3001:ResourceStatus']
    boolean_features = ['PS1002:InternationalRoaming', 'PS3001:AdFree', 'PS2002:StaticIP']
    text_features = ['ProductDescription', 'TermsAndConditions', 'ProductFeatures', 'SR3001:ResourceID', 'SS2001:ServiceDescription']

    imputer_numerical = SimpleImputer(strategy='mean')
    imputer_numerical.fit(df[numerical_features])
    new_product_df[numerical_features] = imputer_numerical.transform(new_product_df[numerical_features])

    for feature in boolean_features:
        new_product_df[feature] = new_product_df[feature].fillna(False).astype(int)

    new_product_df[numerical_features] = scaler.transform(new_product_df[numerical_features])

    encoded_categorical = encoder.transform(new_product_df[categorical_features])
    encoded_columns = encoder.get_feature_names_out(categorical_features)
    encoded_df = pd.DataFrame(encoded_categorical, columns=encoded_columns)
    new_product_df = pd.concat([new_product_df, encoded_df], axis=1)

    for text_feature in text_features:
        tfidf_matrix = tfidf_vectorizer.transform(new_product_df[text_feature].fillna(''))
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'{text_feature}_tfidf_{i}' for i in range(tfidf_matrix.shape[1])])
        new_product_df = pd.concat([new_product_df, tfidf_df], axis=1)

    fields_to_remove = ['ProductName', 'Currency', 'RelatedProducts', 'ProductVersion',
                        'ProductType', 'IncludedServices', 'PO1001:OfferName', 'PO1001:OfferDescription',
                        'PO1001:MarketingCampaign'] + categorical_features + text_features

    new_product_df = new_product_df.drop(fields_to_remove, axis=1, errors='ignore')

    new_product_df = new_product_df.reindex(columns=original_columns, fill_value=0) # ensure column alignment.

    return new_product_df

def recommend_similar_products(new_product_features, df_normalized, kmeans, product_id):

    new_product_features = new_product_features.drop('cluster', axis=1, errors='ignore')
    new_product_cluster = kmeans.predict(new_product_features)[0]
    cluster_products = df_normalized[df_normalized['cluster'] == new_product_cluster].drop(['ProductID', 'cluster'], axis=1)
    similarity_scores = cosine_similarity(new_product_features, cluster_products)[0]
    product_ids = df_normalized.loc[df_normalized['cluster'] == new_product_cluster, 'ProductID'].tolist()
    similar_products = sorted(zip(product_ids, similarity_scores), key=lambda x: x[1], reverse=True)

    print(f"New product (ProductID: {product_id}) belongs to cluster {new_product_cluster}.")
    print("Similar products:")
    for similar_product_id, similarity in similar_products:
        print(f"  ProductID: {similar_product_id}, Similarity: {similarity:.4f}")

# Process new products and generate recommendations
for index, row in df_new.iterrows():
    product_id = row['ProductID']
    new_product_features = normalize_new_product_features(row.drop('ProductID').to_frame().T, scaler, encoder, tfidf_vectorizer, original_columns)
    recommend_similar_products(new_product_features, df_normalized, kmeans, product_id)
    print("-" * 20)

New product (ProductID: P0012) belongs to cluster 1.
Similar products:
  ProductID: P0001, Similarity: 0.9031
  ProductID: P0006, Similarity: 0.6793
--------------------


  new_product_df[feature] = new_product_df[feature].fillna(False).astype(int)
  new_product_df[feature] = new_product_df[feature].fillna(False).astype(int)
  new_product_df[feature] = new_product_df[feature].fillna(False).astype(int)


Ends of Demonstration.