# Importing libraries

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import re
import os
import string

from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from collections import defaultdict, Counter

from matplotlib.colors import to_hex
import matplotlib.cm as cm

from plotly import graph_objects as go

import umap
from umap import UMAP
import hdbscan
from hdbscan import HDBSCAN
import community as community_louvain

from nltk.corpus import stopwords
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

In [6]:
from utilities import (
    get_data,
    num_of_cluster,
    preprocess,
    get_top_tfidf_words_per_cluster,
    plot_clusters,
    visualize_clusters_with_communities
)

In [7]:
########### Topic modelling ################################################
# Import the corpus reader under an alias: the original cell rebound the name
# `stopwords` from the nltk reader to a list and then called `.words` on it,
# which fails as soon as the cell is run a second time.
from nltk.corpus import stopwords as nltk_stopwords

ENGLISH_STOPWORDS = list(nltk_stopwords.words('english'))
# Backward compatibility: `stopwords` was previously left bound to this list,
# and other cells may still read it under that name.
stopwords = ENGLISH_STOPWORDS


def bertopicModelling(n_components, min_cluster_size, top_n_words, data, random_state=None):
    """Fit a BERTopic model on ``data`` and return it with its assignments.

    Parameters
    ----------
    n_components : int
        Passed to ``UMAP(n_components=...)``.
    min_cluster_size : int
        Passed to ``HDBSCAN(min_cluster_size=...)``.
    top_n_words : int
        Passed to ``BERTopic(top_n_words=...)``.
    data : iterable of str
        Documents to model. Any iterable (e.g. a pandas Series) is accepted
        and materialised as a list before fitting.
    random_state : int or None, optional
        Passed to ``UMAP(random_state=...)``. ``None`` (the default) keeps
        the previous, unseeded behaviour.

    Returns
    -------
    tuple
        ``(model, topics, probs)``: the fitted ``BERTopic`` instance and the
        two values returned by ``model.fit_transform``.
    """
    # BERTopic documents its input as a list of strings; a Series that has
    # been filtered carries a gapped index, so hand over plain values.
    documents = list(data)

    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    umap_model = UMAP(n_components=n_components, random_state=random_state)
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, gen_min_span_tree=True, prediction_data=True)
    vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=ENGLISH_STOPWORDS)

    model = BERTopic(
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        top_n_words=top_n_words,
        language='english',
        calculate_probabilities=True,
        verbose=True
    )
    topics, probs = model.fit_transform(documents)
    return model, topics, probs

In [3]:
# Full description corpus: the two paths handed to get_data, followed by
# 128 and 34 (the same values later given to bertopicModelling as
# n_components and min_cluster_size; presumably the same meaning here).
input_path, output_path = (
    '../data dir/desc_embeddings.npy',
    "../data dir/hdbscan_desc_labels.npy",
)

get_data(input_path, output_path, 128, 34)

(array([[9.493816 , 3.2410421, 5.2901673, ..., 7.2728086, 3.307083 ,
         4.2876344],
        [9.507233 , 3.2492406, 5.28946  , ..., 7.2824283, 3.3078213,
         4.2744827],
        [9.435705 , 3.2293234, 5.29892  , ..., 7.2340937, 3.3009698,
         4.293758 ],
        ...,
        [9.370011 , 3.2297873, 5.2959065, ..., 7.2290783, 3.3645866,
         4.2819724],
        [9.486353 , 3.2454233, 5.2920256, ..., 7.3052917, 3.298674 ,
         4.284469 ],
        [9.541647 , 3.2443283, 5.275956 , ..., 7.2736764, 3.3334036,
         4.3023067]], dtype=float32),
 array([-1, -1, 47, ..., -1, -1, 32], dtype=int64))

In [58]:
# Reload the embeddings and the cluster labels for the full description corpus from disk.
# NOTE(review): 'desc_embeddings.npy' is the path that was passed to get_data as
# its *input*, while the other sections use the arrays *returned* by get_data.
# Confirm this file really holds the reduced embeddings the variable name promises.
reduced_embeddings, cluster_labels = np.load('../data dir/desc_embeddings.npy'), np.load('../data dir/hdbscan_desc_labels.npy')

In [59]:
# Translated descriptions for the full corpus.
df = pd.read_csv("../data dir/translated_desc.csv")

In [60]:
# Add the cluster label of every row and replace missing descriptions with ''.
df = df.assign(
    cluster_labels=cluster_labels,
    translated_description=df['translated_description'].fillna(''),
)
df

Unnamed: 0.1,Unnamed: 0,translated_description,cluster_labels
0,0,kuhudi chapter1 this raja on 14th june 2024,-1
1,1,bhai is so kind to kids in real lifein reel li...,-1
2,2,he duly worshiped with his children at shri ka...,47
3,3,tree tree we have solery forests the same line...,-1
4,4,,0
...,...,...,...
15348,15348,honorable prime ministerji shri jí aspires dis...,-1
15349,15349,jai mahadev,1
15350,15350,tawang marathon is one of the most exciting ma...,-1
15351,15351,talented literary songwriter cthegreetings to ...,-1


In [61]:
# Run the project's `preprocess` helper over every description, overwriting the column.
df['translated_description'] = df['translated_description'].apply(preprocess)

In [62]:
# Top TF-IDF words per cluster (per the helper's name) for the full description corpus.
top_words = get_top_tfidf_words_per_cluster(df, cluster_labels, 'translated_description')

In [None]:

# Plot the embeddings together with their cluster labels.
# NOTE(review): this cell has no execution count, so it was not run in the saved notebook.
plot_clusters(reduced_embeddings, cluster_labels)

# Description clustering

In [64]:
# Group the description clusters into communities and visualise them.
# The last argument (0.95) is presumably a similarity threshold — TODO confirm in utilities.
visualize_clusters_with_communities(reduced_embeddings, cluster_labels, top_words, 0.95)

unique communities are: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 3, 6: 5, 7: 6, 8: 7, 9: 8, 10: 8, 11: 8, 12: 6, 13: 8, 14: 8, 15: 2, 16: 2, 17: 2, 18: 2, 19: 2, 20: 2, 21: 8, 22: 8, 23: 9, 24: 6, 25: 6, 26: 6, 27: 6, 28: 8, 29: 12, 30: 6, 31: 12, 32: 12, 33: 8, 34: 12, 35: 8, 36: 8, 37: 2, 38: 2, 39: 6, 40: 2, 41: 2, 42: 6, 43: 10, 44: 11, 45: 11, 46: 12, 47: 12, 48: 12, 49: 6, 50: 12, 51: 11, 52: 5}
Counter({2: 11, 8: 11, 6: 10, 12: 8, 11: 3, 3: 2, 5: 2, 0: 1, 1: 1, 4: 1, 7: 1, 9: 1, 10: 1})


In [65]:
# Same visualisation with the last argument lowered to 0.90 (presumably a similarity threshold — TODO confirm).
visualize_clusters_with_communities(reduced_embeddings, cluster_labels, top_words, 0.90)

unique communities are: {0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 2, 6: 3, 7: 2, 8: 3, 9: 3, 10: 2, 11: 2, 12: 3, 13: 3, 14: 2, 15: 2, 16: 2, 17: 2, 18: 2, 19: 2, 20: 2, 21: 2, 22: 2, 23: 3, 24: 2, 25: 3, 26: 3, 27: 2, 28: 3, 29: 3, 30: 3, 31: 3, 32: 3, 33: 3, 34: 3, 35: 2, 36: 2, 37: 2, 38: 3, 39: 2, 40: 2, 41: 2, 42: 2, 43: 3, 44: 3, 45: 3, 46: 3, 47: 3, 48: 3, 49: 3, 50: 3, 51: 3, 52: 3}
Counter({3: 27, 2: 24, 0: 1, 1: 1})


In [66]:
# Same visualisation with the last argument raised to 0.99 (presumably a similarity threshold — TODO confirm).
visualize_clusters_with_communities(reduced_embeddings, cluster_labels, top_words, 0.99)

unique communities are: {0: 0, 1: 1, 2: 17, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 19, 17: 17, 18: 19, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 16, 51: 2, 52: 18}
Counter({19: 3, 17: 2, 0: 1, 1: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 28: 1, 29: 1, 30: 1, 31: 1, 32: 1, 33: 1, 34: 1, 35: 1, 36: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 1, 48: 1, 49: 1, 16: 1, 2: 1, 18: 1})


In [67]:
# Same visualisation with the last argument set to 0.96 (presumably a similarity threshold — TODO confirm).
visualize_clusters_with_communities(reduced_embeddings, cluster_labels, top_words, 0.96)

unique communities are: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 3, 6: 5, 7: 6, 8: 7, 9: 2, 10: 18, 11: 2, 12: 10, 13: 11, 14: 2, 15: 2, 16: 2, 17: 2, 18: 2, 19: 2, 20: 2, 21: 18, 22: 2, 23: 12, 24: 3, 25: 3, 26: 14, 27: 3, 28: 3, 29: 15, 30: 3, 31: 16, 32: 17, 33: 18, 34: 16, 35: 18, 36: 18, 37: 3, 38: 3, 39: 3, 40: 3, 41: 3, 42: 3, 43: 19, 44: 20, 45: 20, 46: 15, 47: 15, 48: 9, 49: 3, 50: 15, 51: 13, 52: 8}
Counter({3: 14, 2: 11, 18: 5, 15: 4, 16: 2, 20: 2, 0: 1, 1: 1, 4: 1, 5: 1, 6: 1, 7: 1, 10: 1, 11: 1, 12: 1, 14: 1, 17: 1, 19: 1, 9: 1, 13: 1, 8: 1})


### Topic modelling for full description

In [77]:
# Fit BERTopic on the full description corpus and show the per-topic bar chart.
# The unused results get explicit names: assigning to `_` in a notebook
# shadows IPython's last-output variable for the rest of the session.
model, _topics, _probs = bertopicModelling(128, 34, 5, df['translated_description'])
model.visualize_barchart()

2024-06-22 13:43:43,477 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/480 [00:00<?, ?it/s]

2024-06-22 13:44:11,714 - BERTopic - Embedding - Completed ✓
2024-06-22 13:44:11,717 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-22 13:44:33,095 - BERTopic - Dimensionality - Completed ✓
2024-06-22 13:44:33,113 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-22 13:45:17,295 - BERTopic - Cluster - Completed ✓
2024-06-22 13:45:17,340 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-22 13:45:20,111 - BERTopic - Representation - Completed ✓


## Cluster analysis for Left Alignment

In [78]:
# Left-aligned descriptions: paths handed to get_data, plus 128 and 34
# (presumably n_components and min_cluster_size, as in bertopicModelling).
input_path, output_path = (
    '../data dir/left_desc_embeddings.npy',
    "../data dir/hdbscan_left_desc_labels.npy",
)

left_embeddings, left_labels = get_data(input_path, output_path, 128, 34)

In [79]:
# Report how many clusters the left-aligned description labels contain.
num_of_cluster(left_labels)

Number of clusters: 7


In [80]:
# Left-aligned descriptions with their cluster labels attached; missing text becomes ''.
# NOTE(review): `preprocess` is not applied here, unlike the full-corpus
# description cell — confirm that is intended.
df = pd.read_csv("../data dir/translated_description_left.csv").assign(
    cluster_labels=left_labels,
    translated_description=lambda frame: frame['translated_description'].fillna(''),
)
top_words = get_top_tfidf_words_per_cluster(df, left_labels, 'translated_description')

In [81]:
# Communities among the left-aligned description clusters; 0.95 is presumably a similarity threshold — TODO confirm.
visualize_clusters_with_communities(left_embeddings, left_labels, top_words, 0.95)

unique communities are: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
Counter({0: 7})


In [82]:
# Same visualisation with the last argument raised to 0.99 (presumably a similarity threshold — TODO confirm).
visualize_clusters_with_communities(left_embeddings, left_labels, top_words, 0.99)

unique communities are: {0: 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1}
Counter({0: 4, 1: 3})


### Topic modelling for left description

In [83]:
# Fit BERTopic on the left-aligned descriptions and show the per-topic bar chart.
# The unused results get explicit names: assigning to `_` in a notebook
# shadows IPython's last-output variable for the rest of the session.
model, _topics, _probs = bertopicModelling(128, 34, 5, df['translated_description'])
model.visualize_barchart()

2024-06-22 13:47:04,404 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/56 [00:00<?, ?it/s]

2024-06-22 13:47:07,922 - BERTopic - Embedding - Completed ✓
2024-06-22 13:47:07,923 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-22 13:47:25,484 - BERTopic - Dimensionality - Completed ✓
2024-06-22 13:47:25,493 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-22 13:47:26,161 - BERTopic - Cluster - Completed ✓
2024-06-22 13:47:26,172 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-22 13:47:26,433 - BERTopic - Representation - Completed ✓


## Cluster analysis for Centre Alignment

In [38]:
# Centre-aligned descriptions: paths handed to get_data, plus 128 and 15
# (presumably n_components and min_cluster_size, as in bertopicModelling).
input_path, output_path = (
    '../data dir/centre_desc_embeddings.npy',
    "../data dir/hdbscan_centre_desc_labels.npy",
)

centre_embeddings, centre_labels = get_data(input_path, output_path, 128, 15)

In [39]:
# Sanity check: the embeddings and the labels should have the same number of rows.
print(len(centre_embeddings), len(centre_labels))

3478 3478


In [40]:
# Report how many clusters the centre-aligned description labels contain.
num_of_cluster(centre_labels)

Number of clusters: 31


In [41]:
# Centre-aligned descriptions with their cluster labels attached; missing text becomes ''.
# NOTE(review): `preprocess` is not applied here, unlike the full-corpus
# description cell — confirm that is intended.
df = pd.read_csv("../data dir/translated_description_centre.csv").assign(
    cluster_labels=centre_labels,
    translated_description=lambda frame: frame['translated_description'].fillna(''),
)
top_words = get_top_tfidf_words_per_cluster(df, centre_labels, 'translated_description')

In [42]:
# Communities among the centre-aligned description clusters; 0.95 is presumably a similarity threshold — TODO confirm.
visualize_clusters_with_communities(centre_embeddings, centre_labels, top_words, 0.95)

unique communities are: {0: 0, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 28: 1, 29: 1, 30: 1}
Counter({1: 30, 0: 1})


In [None]:
# Same visualisation with the last argument raised to 0.99 (presumably a similarity threshold — TODO confirm).
# NOTE(review): this cell has no execution count, so it was not run in the saved notebook.
visualize_clusters_with_communities(centre_embeddings, centre_labels, top_words, 0.99)

### Topic modelling for Centre descriptions

In [46]:
# Stand-alone copies of the components that bertopicModelling builds internally
# (here with min_cluster_size=15).
# NOTE(review): none of these objects is passed to bertopicModelling in the
# visible cells; confirm whether this cell is still needed.
from nltk.corpus import stopwords as nltk_stopwords

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
umap_model = UMAP(n_components=128)
hdbscan_model = HDBSCAN(min_cluster_size=15, gen_min_span_tree=True, prediction_data=True)
# The topic-modelling cell has already rebound `stopwords` to a plain list, so
# calling `stopwords.words(...)` here raises AttributeError. Read the corpus
# through an alias instead, and keep `stopwords` bound to the list as before.
english_stopwords = list(nltk_stopwords.words('english'))
stopwords = english_stopwords
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=english_stopwords)

In [89]:
# Fit BERTopic on the centre-aligned descriptions and show the per-topic bar chart.
# The unused results get explicit names: assigning to `_` in a notebook
# shadows IPython's last-output variable for the rest of the session.
model, _topics, _probs = bertopicModelling(128, 15, 5, df['translated_description'])
model.visualize_barchart()

2024-06-22 14:10:43,181 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/109 [00:00<?, ?it/s]

2024-06-22 14:10:51,383 - BERTopic - Embedding - Completed ✓
2024-06-22 14:10:51,386 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-22 14:11:05,319 - BERTopic - Dimensionality - Completed ✓
2024-06-22 14:11:05,328 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-22 14:11:07,640 - BERTopic - Cluster - Completed ✓
2024-06-22 14:11:07,661 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-22 14:11:08,553 - BERTopic - Representation - Completed ✓


## Cluster analysis for Right Alignment

In [90]:
# Right-aligned descriptions: paths handed to get_data, plus 128 and 20
# (presumably n_components and min_cluster_size, as in bertopicModelling).
input_path, output_path = (
    '../data dir/right_desc_embeddings.npy',
    "../data dir/hdbscan_right_desc_labels.npy",
)

right_embeddings, right_labels = get_data(input_path, output_path, 128, 20)

In [91]:
# Report how many clusters the right-aligned description labels contain.
num_of_cluster(right_labels)

Number of clusters: 15


In [92]:
# Right-aligned descriptions with their cluster labels attached; missing text becomes ''.
# NOTE(review): `preprocess` is not applied here, unlike the full-corpus
# description cell — confirm that is intended.
df = pd.read_csv("../data dir/translated_description_right.csv").assign(
    cluster_labels=right_labels,
    translated_description=lambda frame: frame['translated_description'].fillna(''),
)
top_words = get_top_tfidf_words_per_cluster(df, right_labels, 'translated_description')

In [93]:
# Communities among the right-aligned description clusters; 0.95 is presumably a similarity threshold — TODO confirm.
visualize_clusters_with_communities(right_embeddings, right_labels, top_words, 0.95)

unique communities are: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0}
Counter({0: 15})


### Topic modelling for Right descriptions


In [94]:
# Fit BERTopic on the right-aligned descriptions and show the per-topic bar chart.
# The unused results get explicit names: assigning to `_` in a notebook
# shadows IPython's last-output variable for the rest of the session.
model, _topics, _probs = bertopicModelling(128, 20, 5, df['translated_description'])
model.visualize_barchart()

2024-06-22 14:13:24,185 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/316 [00:00<?, ?it/s]

2024-06-22 14:13:54,300 - BERTopic - Embedding - Completed ✓
2024-06-22 14:13:54,301 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-22 14:14:10,085 - BERTopic - Dimensionality - Completed ✓
2024-06-22 14:14:10,100 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-22 14:14:27,976 - BERTopic - Cluster - Completed ✓
2024-06-22 14:14:28,018 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-22 14:14:31,086 - BERTopic - Representation - Completed ✓


# OCR TEXT Clustering

In [95]:
# Full OCR corpus: paths handed to get_data, plus 128 and 7
# (presumably n_components and min_cluster_size, as in bertopicModelling).
# Note that `cluster_labels` is rebound here; from this point on it refers to
# the OCR labels, not the description labels.
input_path, output_path = (
    '../data dir/ocr_embeddings.npy',
    "../data dir/hdbscan_ocr_labels.npy",
)
ocr_embeddings, cluster_labels = get_data(input_path, output_path, 128, 7)

With 13 we get __ clusters (TODO: fill in the count); from 14 onwards we get 4 clusters.

In [96]:
# Show the shapes of the OCR embeddings and of their labels side by side.
ocr_embeddings.shape, cluster_labels.shape

((2715, 128), (2715,))

In [97]:
# Number of distinct labels, not counting -1 when it occurs.
num_clusters = len(set(cluster_labels) - {-1})
print(f"Number of clusters: {num_clusters}")

Number of clusters: 52


In [98]:
# Translated OCR text for the full corpus.
ocr_df = pd.read_csv("../data dir/translated_ocr.csv")

In [99]:
# Inspect the OCR text column as loaded.
ocr_df['translated_image_ocr']

0        ASK mt MAR RNY: SORT messy on\n\nomc ST TTHAK,...
1        CAMERA QUEEN PRODUCTION\n\n‘LyRit VIDEO’ 3 4\n...
2                                                      NaN
3                                                      NaN
4                                                      NaN
                               ...                        
15348    सिर्फ बजट नहीं लाता बदलाव, संसाधन\nके सही प्रय...
15349                                                  NaN
15350                                                  NaN
15351                                                  NaN
15352                                                  NaN
Name: translated_image_ocr, Length: 15353, dtype: object

In [100]:
# Keep only rows that have OCR text. The explicit copy makes the result an
# independent frame, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning.
ocr_df = ocr_df.dropna(subset=['translated_image_ocr']).copy()

In [101]:
# Display the OCR frame. The former `col = col.dropna()` assignment was removed:
# pandas realigns the shortened Series on the index when it is assigned back,
# so that statement could never drop anything.
ocr_df

Unnamed: 0.1,Unnamed: 0,translated_image_ocr
0,0,"ASK mt MAR RNY: SORT messy on\n\nomc ST TTHAK,..."
1,1,CAMERA QUEEN PRODUCTION\n\n‘LyRit VIDEO’ 3 4\n...
6,6,4 is ASR Hum\n\nKachchchchchchni sis
11,11,KUHUDI DRAWING CONTEST\n\nSUBMIT YOUR DRAWING ...
13,13,"asadowaisioff 9 V\n\nTHANK YOU\n\n6,850\nFollo..."
...,...,...
15313,15313,(Governor of xarnatake)\n\nOn ist October 2023...
15315,15315,cl\n& eek
15330,15330,"Wes o FPTE TON On, ORIEN] mae Ier) Dare,\n\n:"
15335,15335,The river has 1\n\nAad sports\n\nMen's Trap Te...


In [102]:
# Attach one cluster label per remaining OCR row and drop the leftover
# "Unnamed: 0" column.
assert len(cluster_labels) == len(ocr_df), (
    f"{len(cluster_labels)} labels for {len(ocr_df)} OCR rows"
)
ocr_df = (
    ocr_df
    .assign(cluster_labels=cluster_labels)
    # errors="ignore" keeps the cell re-runnable: on a second run the column
    # is already gone and a plain drop would raise KeyError.
    .drop(columns="Unnamed: 0", errors="ignore")
)

In [103]:
# Run the project's `preprocess` helper over every OCR text, overwriting the column.
ocr_df['translated_image_ocr'] = ocr_df['translated_image_ocr'].apply(preprocess)

In [104]:
# Display the OCR frame after preprocessing.
ocr_df

Unnamed: 0,translated_image_ocr,cluster_labels
0,ask mt mar rny sort messy on\n\nomc st tthak h...,-1
1,camera queen production\n\nlyrit video 3 4\n\n...,1
6,4 is asr hum\n\nkachchchchchchni sis,-1
11,kuhudi drawing contest\n\nsubmit your drawing ...,1
13,asadowaisioff 9 v\n\nthank you\n\n6850\nfollow...,-1
...,...,...
15313,governor of xarnatake\n\non ist october 2023\n...,-1
15315,cl\n eek,-1
15330,wes o fpte ton on orien mae ier dare\n\n,-1
15335,the river has 1\n\naad sports\n\nmens trap tea...,25


In [105]:
# Top TF-IDF words per cluster (per the helper's name) for the full OCR corpus.
top_words = get_top_tfidf_words_per_cluster(ocr_df, cluster_labels, 'translated_image_ocr')

In [106]:

# Communities among the OCR clusters. The last argument is much higher here (0.9996)
# than in the description sections; presumably a similarity threshold — TODO confirm.
visualize_clusters_with_communities(ocr_embeddings, cluster_labels, top_words, 0.9996)

unique communities are: {0: 0, 1: 1, 2: 1, 3: 2, 4: 3, 5: 3, 6: 4, 7: 5, 8: 4, 9: 4, 10: 4, 11: 4, 12: 4, 13: 1, 14: 4, 15: 6, 16: 4, 17: 4, 18: 4, 19: 4, 20: 4, 21: 4, 22: 5, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 28: 6, 29: 6, 30: 6, 31: 6, 32: 1, 33: 1, 34: 5, 35: 5, 36: 5, 37: 5, 38: 5, 39: 5, 40: 6, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 6, 47: 6, 48: 6, 49: 6, 50: 6, 51: 6}
Counter({1: 15, 4: 13, 6: 12, 5: 8, 3: 2, 0: 1, 2: 1})


### Topic modelling for full OCR text

In [109]:
# Fit BERTopic on the full OCR corpus and show the per-topic bar chart.
# The unused results get explicit names: assigning to `_` in a notebook
# shadows IPython's last-output variable for the rest of the session.
model, _topics, _probs = bertopicModelling(128, 7, 5, ocr_df['translated_image_ocr'])
model.visualize_barchart()

2024-06-22 14:26:34,575 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/85 [00:00<?, ?it/s]

2024-06-22 14:26:40,035 - BERTopic - Embedding - Completed ✓
2024-06-22 14:26:40,037 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-22 14:26:50,798 - BERTopic - Dimensionality - Completed ✓
2024-06-22 14:26:50,806 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-22 14:26:52,452 - BERTopic - Cluster - Completed ✓
2024-06-22 14:26:52,497 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-22 14:26:53,497 - BERTopic - Representation - Completed ✓


## Left OCR clustering

In [14]:
# Left-aligned OCR text: paths handed to get_data, plus 128 and 5
# (presumably n_components and min_cluster_size, as in bertopicModelling).
input_path, output_path = (
    '../data dir/left_ocr_embeddings.npy',
    "../data dir/hdbscan_left_ocr_labels.npy",
)
left_ocr_embeddings, left_cluster_labels = get_data(input_path, output_path, 128, 5)

In [15]:
# Report how many clusters the left-aligned OCR labels contain.
num_of_cluster(left_cluster_labels)

Number of clusters: 2


In [18]:
# Left-aligned OCR text with its cluster labels attached; missing text becomes ''.
# NOTE(review): `preprocess` is not applied here, unlike the full OCR
# corpus cell — confirm that is intended.
df = pd.read_csv("../data dir/left_ocr.csv").assign(
    cluster_labels=left_cluster_labels,
    translated_image_ocr=lambda frame: frame['translated_image_ocr'].fillna(''),
)
top_words = get_top_tfidf_words_per_cluster(df, left_cluster_labels, 'translated_image_ocr')

In [19]:
# Communities among the left-aligned OCR clusters; 0.95 is presumably a similarity threshold — TODO confirm.
visualize_clusters_with_communities(left_ocr_embeddings, left_cluster_labels, top_words, 0.95)

unique communities are: {0: 0, 1: 0}
Counter({0: 2})


In [20]:
# Fit BERTopic on the left-aligned OCR text and show the per-topic bar chart.
# The unused results get explicit names: assigning to `_` in a notebook
# shadows IPython's last-output variable for the rest of the session.
model, _topics, _probs = bertopicModelling(128, 5, 5, df['translated_image_ocr'])
model.visualize_barchart()

2024-06-27 13:08:12,756 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/56 [00:00<?, ?it/s]

2024-06-27 13:08:14,245 - BERTopic - Embedding - Completed ✓
2024-06-27 13:08:14,245 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-27 13:08:23,526 - BERTopic - Dimensionality - Completed ✓
2024-06-27 13:08:23,526 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-27 13:08:24,225 - BERTopic - Cluster - Completed ✓
2024-06-27 13:08:24,235 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-27 13:08:24,299 - BERTopic - Representation - Completed ✓


## Centre OCR clustering

In [27]:
# Centre-aligned OCR text: paths handed to get_data, plus 128 and 10
# (presumably n_components and min_cluster_size, as in bertopicModelling).
input_path, output_path = (
    '../data dir/centre_ocr_embeddings.npy',
    "../data dir/hdbscan_centre_ocr_labels.npy",
)
centre_ocr_embeddings, centre_cluster_labels = get_data(input_path, output_path, 128, 10)

In [28]:
# Report how many clusters the centre-aligned OCR labels contain.
num_of_cluster(centre_cluster_labels)

Number of clusters: 2


In [26]:
# Centre-aligned OCR text with its cluster labels attached; missing text becomes ''.
# NOTE(review): `preprocess` is not applied here, unlike the full OCR
# corpus cell — confirm that is intended.
df = pd.read_csv("../data dir/centre_ocr.csv").assign(
    cluster_labels=centre_cluster_labels,
    translated_image_ocr=lambda frame: frame['translated_image_ocr'].fillna(''),
)
top_words = get_top_tfidf_words_per_cluster(df, centre_cluster_labels, 'translated_image_ocr')

In [29]:
# Communities among the centre-aligned OCR clusters. This cell previously passed
# the left-OCR embeddings and labels (copy-paste slip) alongside the centre
# `top_words`; it now uses the centre arrays. 0.95 is presumably a similarity
# threshold — TODO confirm.
visualize_clusters_with_communities(centre_ocr_embeddings, centre_cluster_labels, top_words, 0.95)

unique communities are: {0: 0, 1: 0}
Counter({0: 2})


In [30]:
# Fit BERTopic on the centre-aligned OCR text and show the per-topic bar chart.
# min_cluster_size is 10 to match the centre-OCR get_data call; the previous
# value of 5 looked carried over from the left-OCR section, and every other
# section keeps the two values equal.
# The unused results get explicit names: assigning to `_` in a notebook
# shadows IPython's last-output variable for the rest of the session.
model, _topics, _probs = bertopicModelling(128, 10, 5, df['translated_image_ocr'])
model.visualize_barchart()

2024-06-27 13:16:14,823 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

2024-06-27 13:16:15,900 - BERTopic - Embedding - Completed ✓
2024-06-27 13:16:15,901 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-27 13:16:19,227 - BERTopic - Dimensionality - Completed ✓
2024-06-27 13:16:19,227 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-27 13:16:19,312 - BERTopic - Cluster - Completed ✓
2024-06-27 13:16:19,315 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-27 13:16:19,403 - BERTopic - Representation - Completed ✓


## Right OCR clustering


In [31]:
# Right-aligned OCR text: paths handed to get_data, plus 128 and 10
# (presumably n_components and min_cluster_size, as in bertopicModelling).
input_path, output_path = (
    '../data dir/right_ocr_embeddings.npy',
    "../data dir/hdbscan_right_ocr_labels.npy",
)
right_ocr_embeddings, right_cluster_labels = get_data(input_path, output_path, 128, 10)

In [34]:
# Report how many clusters the right-aligned OCR labels contain.
num_of_cluster(right_cluster_labels)

Number of clusters: 2


In [35]:
# Right-aligned OCR text with its cluster labels attached; missing text becomes ''.
# NOTE(review): `preprocess` is not applied here, unlike the full OCR
# corpus cell — confirm that is intended.
df = pd.read_csv("../data dir/right_ocr.csv").assign(
    cluster_labels=right_cluster_labels,
    translated_image_ocr=lambda frame: frame['translated_image_ocr'].fillna(''),
)
top_words = get_top_tfidf_words_per_cluster(df, right_cluster_labels, 'translated_image_ocr')

In [36]:
# Communities among the right-aligned OCR clusters. This cell previously passed
# the left-OCR embeddings and labels (copy-paste slip) alongside the right
# `top_words`; it now uses the right arrays. 0.95 is presumably a similarity
# threshold — TODO confirm.
visualize_clusters_with_communities(right_ocr_embeddings, right_cluster_labels, top_words, 0.95)

unique communities are: {0: 0, 1: 0}
Counter({0: 2})


In [37]:
# Fit BERTopic on the right-aligned OCR text and show the per-topic bar chart.
# The unused results get explicit names: assigning to `_` in a notebook
# shadows IPython's last-output variable for the rest of the session.
model, _topics, _probs = bertopicModelling(128, 10, 5, df['translated_image_ocr'])
model.visualize_barchart()

2024-06-27 13:20:15,808 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/65 [00:00<?, ?it/s]

2024-06-27 13:20:20,092 - BERTopic - Embedding - Completed ✓
2024-06-27 13:20:20,093 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-27 13:20:26,404 - BERTopic - Dimensionality - Completed ✓
2024-06-27 13:20:26,406 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-27 13:20:27,090 - BERTopic - Cluster - Completed ✓
2024-06-27 13:20:27,093 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-27 13:20:27,503 - BERTopic - Representation - Completed ✓


# ITT Clustering