In [None]:
!pip install sentence_transformers
!pip install pandas

In [None]:
import pandas as pd

from sentence_transformers import SentenceTransformer, util

In [None]:
# Clustering configuration.
# NOTE(review): `cluster_accuracy` is not referenced by any other cell in this
# notebook; the clustering calls pass accuracy=.85 literally — confirm which
# threshold is intended.
cluster_accuracy = .75  
# Minimum number of members per cluster; passed as the third argument of cluster_corpus.
min_cluster_size = 2  

In [None]:
# Name of the pretrained sentence-embedding model handed to SentenceTransformer.
# Chosen as the best model listed at
# https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models
transformer = 'all-mpnet-base-v2'  

## Load Data

In [None]:
# Read every column as text so purely numeric keywords (e.g. "2024") stay
# strings and the `.str` accessor used by later cells keeps working.
df = pd.read_csv('keywords.csv', dtype=str)

In [None]:
# Validate the input schema: exactly one column, named exactly 'keyword'.
# An equality check is required here — a substring test would accept names such
# as 'keywords' or ' keyword', and later `df.keyword` accesses would then fail.
assert len(df.columns) == 1, "dataframe must contain one column"
assert df.columns[0] == 'keyword', "column name must be 'keyword'"

In [None]:
# Preview the first rows of the loaded keyword table.
df.head()

## Clustering One Shot

In [None]:
# Build the corpus: lower-cased, de-duplicated keywords as a plain list.
# Missing values (blank CSV rows) are dropped because they are not text and
# cannot be encoded by the sentence model.
corpus = df.keyword.str.lower().dropna().drop_duplicates().tolist()

In [None]:
def cluster_corpus(_corpus: list, _transformer, _min_cluster_size=2, accuracy=.75, batch=256, debug=True, enumerate_start=0):
    """Cluster sentences by embedding them and running community detection.

    Parameters
    ----------
    _corpus : sequence of str
        Sentences/keywords to cluster (a list or an array).
    _transformer : str or model object
        A model name passed to ``SentenceTransformer``. Any non-string value is
        treated as an already-loaded model exposing ``encode`` and is reused
        as is, which avoids reloading the model on every call.
    _min_cluster_size : int
        Forwarded to ``util.community_detection`` as ``min_community_size``.
    accuracy : float
        Forwarded to ``util.community_detection`` as ``threshold``.
    batch : int
        Batch size used when encoding the corpus.
    debug : bool
        Whether to show the encoding progress bar.
    enumerate_start : int
        Id assigned to the first cluster; later clusters count up from it.

    Returns
    -------
    pandas.DataFrame
        One row per clustered sentence with columns ``cluster`` and
        ``keyword``. Sentences that joined no cluster are absent. Both columns
        are present even when no cluster was found.
    """
    import inspect

    columns = ['cluster', 'keyword']
    sentences = list(_corpus)

    # A cluster needs at least `_min_cluster_size` members, so a smaller (or
    # empty) corpus cannot produce one; return early without loading the model.
    if len(sentences) < max(_min_cluster_size, 1):
        return pd.DataFrame(columns=columns)

    model = SentenceTransformer(_transformer) if isinstance(_transformer, str) else _transformer

    # Always request a tensor: the output type must not depend on the `debug`
    # flag, which only controls the progress bar.
    embeddings = model.encode(
        sentences,
        batch_size=batch,
        show_progress_bar=debug,
        convert_to_tensor=True
    )

    detection_kwargs = {
        'min_community_size': _min_cluster_size,
        'threshold': accuracy,
    }
    # NOTE(review): newer sentence-transformers releases appear to have dropped
    # `init_max_size`; only pass it when the installed function accepts it.
    if 'init_max_size' in inspect.signature(util.community_detection).parameters:
        detection_kwargs['init_max_size'] = len(embeddings)

    communities = util.community_detection(embeddings, **detection_kwargs)

    # Flatten the communities (lists of corpus indices) into one row per member.
    rows = [
        {'cluster': cluster_id, 'keyword': sentences[member_index]}
        for cluster_id, members in enumerate(communities, start=enumerate_start)
        for member_index in members
    ]

    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Single clustering pass over the whole corpus.
# NOTE(review): the threshold is hard-coded to .85 here while `cluster_accuracy`
# (.75) is defined in the configuration cell but never passed — confirm intent.
df_clusters = cluster_corpus(corpus, transformer, min_cluster_size, accuracy=.85, batch=256)

In [None]:
# First rows of the one-shot clustering result (cluster id, keyword).
df_clusters.head()

In [None]:
# Last rows of the one-shot clustering result (cluster id, keyword).
df_clusters.tail()

In [None]:
# Displays (number of distinct clusters, (rows, columns)) for the one-shot result.
df_clusters.cluster.nunique(), df_clusters.shape

In [None]:
# Keywords that did not join any cluster in the first pass.
# Filtering the corpus in its original order (instead of a set difference)
# keeps the result deterministic across runs; `.get` tolerates a result frame
# that has no 'keyword' column because no cluster was found.
clustered_keywords = set(df_clusters.get('keyword', []))
keyword_remaining = [keyword for keyword in corpus if keyword not in clustered_keywords]

In [None]:
# Second clustering pass over the keywords left unclustered by the first pass.
# Cluster ids start at the number of distinct clusters already found, so they
# continue after the first pass's ids.
df_remaining_clusters = cluster_corpus(
    keyword_remaining, 
    transformer, 
    min_cluster_size, 
    accuracy=.85, 
    batch=256,
    enumerate_start=df_clusters.cluster.nunique()
)

In [None]:
# Displays ((rows, columns), number of distinct clusters) for the second pass.
df_remaining_clusters.shape, df_remaining_clusters.cluster.nunique()

In [None]:
# First rows of the second-pass clustering result.
df_remaining_clusters.head()

In [None]:
# Last rows of the second-pass clustering result.
df_remaining_clusters.tail()

## Find Clusters Iteratively

In [None]:
# Rebuild the corpus for the iterative run: lower-cased, de-duplicated keywords
# as a plain list. Missing values (blank CSV rows) are dropped because they are
# not text and cannot be encoded by the sentence model.
corpus = df.keyword.str.lower().dropna().drop_duplicates().tolist()

In [None]:
# Cluster repeatedly: after each pass, drop the keywords that were clustered and
# run again on the rest, until a pass finds no new cluster.
# Work on a copy so `corpus` stays intact and this cell can be re-run as is.
remaining_keywords = list(corpus)
cluster_number_start_at = 0  # id given to the first cluster of the next pass
clusters_items = []          # one result frame per productive pass

# A cluster needs at least `min_cluster_size` members, so stop once fewer
# keywords remain instead of calling the clusterer on too-small input.
while len(remaining_keywords) >= max(min_cluster_size, 1):

    df_clusters = cluster_corpus(
        remaining_keywords,
        transformer,
        min_cluster_size,
        accuracy=.85,
        batch=256,
        enumerate_start=cluster_number_start_at
    )

    if df_clusters.shape[0] == 0:
        # Nothing new was clustered in this pass: we are done.
        break

    cluster_number_start_at += df_clusters.cluster.nunique()
    # Filter in order (rather than via a set difference) so the input of the
    # next pass is deterministic across runs.
    clustered_keywords = set(df_clusters.keyword)
    remaining_keywords = [
        keyword for keyword in remaining_keywords if keyword not in clustered_keywords
    ]
    clusters_items.append(df_clusters)
    print(f"{len(remaining_keywords)} remaining keywords")

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no pass produced a cluster.
if clusters_items:
    df_all_clusters = pd.concat(clusters_items, ignore_index=True)
else:
    df_all_clusters = pd.DataFrame(columns=['cluster', 'keyword'])
print("Done.")

In [None]:
# (rows, columns) of the combined result; each row is one clustered keyword.
df_all_clusters.shape

## Save

In [None]:
# Attach a cluster id to every input keyword.
# - The keyword is lower-cased on a copy (via `assign`) so `df` is not mutated
#   and the cell stays idempotent; the join key matches the lower-cased corpus.
# - Only the `cluster` column is filled: unclustered keywords get -1, and the
#   ids are cast back to integers (the left join turns them into floats).
df_ready = (
    df.assign(keyword=df.keyword.str.lower())
    .merge(df_all_clusters, on='keyword', how='left')
    .assign(cluster=lambda merged: merged['cluster'].fillna(-1).astype(int))
    .sort_values('cluster', ascending=True)
)

In [None]:
# Write the cluster id and keyword columns to CSV, without the row index.
df_ready[['cluster', 'keyword']].to_csv('keywords_with_clusters.csv', index=False)

## Future Work

1. Find duplicate keywords/products/questions/answers
2. Categorize keywords/products/questions/answers
3. Similarity search

## Resources

1. https://www.sbert.net/examples/applications/clustering/README.html#fast-clustering
2. https://quoraengineering.quora.com/A-Machine-Learning-Approach-to-Ranking-Answers-on-Quora
3. https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs