# This script is used to perform LDA clustering on the data

In [9]:
# Importing the necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import LdaModel
from gensim import corpora
from pprint import pprint
from loguru import logger
import numpy as np
from scipy.stats import chi2
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Download the NLTK resources this notebook relies on.
#   - 'punkt' / 'punkt_tab': tokenizer data (presumably what word_tokenize
#     needs; which of the two is used depends on the NLTK version - TODO confirm)
#   - 'stopwords': the word lists read via stopwords.words('english')
# The final call is the cell's last expression, so its return value is what
# the notebook displays as the cell output.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shubhamluharuka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/shubhamluharuka/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shubhamluharuka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# English stopword list, materialised once as a set so that the per-token
# membership test in preprocess() does not rebuild or rescan the list.
STOP_WORDS = set(stopwords.words('english'))

def preprocess(text):
    """Turn one raw text value into a list of cleaned, lower-cased tokens.

    Steps: lower-case the text, tokenize it with ``word_tokenize``, keep only
    tokens for which ``str.isalnum()`` is true, then drop tokens that appear
    in ``STOP_WORDS``.

    Args:
        text: The raw document. ``None`` and float ``NaN`` (what pandas yields
            for an empty CSV cell) are treated as an empty document.

    Returns:
        list[str]: The remaining tokens, in their original order. Empty for
        missing input.

    Raises:
        Exception: Any error raised while tokenizing is re-raised unchanged
        after the offending value has been printed for debugging.
    """
    # Missing values have no text to tokenize. ``text != text`` is the
    # dependency-free NaN check (NaN is the only float unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return []

    try:
        tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
        # Single pass: keep alphanumeric tokens that are not stopwords
        return [
            word for word in tokens
            if word.isalnum() and word not in STOP_WORDS
        ]
    except Exception:
        # Show which input broke the pipeline, then let the error propagate.
        # ``Exception`` (not a bare except) so Ctrl-C / SystemExit pass through
        # untouched.
        print(text)
        raise

def dictionary_creation(df,text_column=None):
    """Build a gensim dictionary and bag-of-words corpus from one column.

    Args:
        df: DataFrame holding the documents.
        text_column: Name of the column to read. Each entry is passed to
            ``corpora.Dictionary`` and ``doc2bow`` as one document; in this
            notebook that is the token list produced by ``preprocess``.

    Returns:
        tuple: ``(dictionary, corpus)`` where ``dictionary`` is the
        ``corpora.Dictionary`` built from the column and ``corpus`` is a list
        with one ``doc2bow`` result per row, in row order.
    """
    documents = df[text_column]
    vocabulary = corpora.Dictionary(documents)
    bow_corpus = list(map(vocabulary.doc2bow, documents))
    return vocabulary, bow_corpus

In [11]:
def start_clustering(df,text_column=None,lda_config:dict=None):
    """Tokenize a text column, fit an LDA model on it and label every row.

    NOTE: ``df`` is modified in place. ``text_column`` is overwritten with
    token lists and the columns ``bow``, ``topic_distribution`` and
    ``lda_label`` are added. Calling this twice on the same frame will
    therefore feed token lists (not strings) into ``preprocess``.

    Args:
        df: DataFrame containing the raw documents.
        text_column: Name of the column holding the raw text.
        lda_config: Keyword arguments forwarded to ``LdaModel`` (for example
            ``num_topics``, ``passes``, ``random_state``). ``None`` means
            no extra arguments.

    Returns:
        tuple: ``(lda_model, dictionary, corpus, df)`` - the fitted model, the
        gensim dictionary, the bag-of-words corpus, and the same (mutated)
        DataFrame.
    """
    # Avoid a shared mutable default: build a fresh dict per call instead.
    if lda_config is None:
        lda_config = {}

    df[text_column] = df[text_column].apply(preprocess)
    logger.info("Preprocessing done")
    dictionary,corpus = dictionary_creation(df,text_column)
    logger.info("Dictionary  and corpus created")

    logger.info("Starting LDA model for clustering to find the outliers")
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_config)
    # Preview: up to 15 topics, top 5 words each.
    pprint(lda_model.print_topics(num_words=5,num_topics=15),width=500)

    df['bow'] = df[text_column].apply(lambda x: dictionary.doc2bow(x))
    df['topic_distribution'] = df['bow'].apply(lambda x: lda_model.get_document_topics(x))

    labels = []
    for bow, distribution in zip(df['bow'], df['topic_distribution']):
        if not distribution:
            # No topic cleared the model's minimum probability, so the stored
            # distribution is empty and max() would raise ValueError.
            # Fall back to the unfiltered distribution for labelling only.
            distribution = lda_model.get_document_topics(bow, minimum_probability=0.0)
        # Dominant topic = topic id with the highest probability.
        labels.append(max(distribution, key=lambda item: item[1])[0])
    df['lda_label'] = labels

    return lda_model,dictionary,corpus,df

In [17]:
# Input location and the free-text column that gets clustered.
FILE = "../data.csv"
TEXT_COLUMN = "crimeaditionalinfo"

# Keyword arguments handed straight to the LDA model.
LDA_CONFIG = dict(
    num_topics=57,    # Number of topics to be used for clustering
    passes=10,        # Number of passes through the corpus
    random_state=42,  # Random state for reproducibility
)

# Keep only the text column, then tokenize, fit and label in one call.
df = pd.read_csv(FILE)[[TEXT_COLUMN]]
lda_model, dictionary, corpus, df = start_clustering(
    df,
    text_column=TEXT_COLUMN,
    lda_config=LDA_CONFIG,
)

# Interactive topic visualisation; must stay the last expression so it renders.
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_vis)


[32m2025-02-20 00:41:31.479[0m | [1mINFO    [0m | [36m__main__[0m:[36mstart_clustering[0m:[36m3[0m - [1mPreprocessing done[0m
[32m2025-02-20 00:41:35.042[0m | [1mINFO    [0m | [36m__main__[0m:[36mstart_clustering[0m:[36m5[0m - [1mDictionary  and corpus created[0m
[32m2025-02-20 00:41:35.042[0m | [1mINFO    [0m | [36m__main__[0m:[36mstart_clustering[0m:[36m7[0m - [1mStarting LDA model for clustering to find the outliers[0m


[(5, '0.129*"locked" + 0.126*"customer" + 0.105*"care" + 0.092*"app" + 0.063*"google"'),
 (56, '0.114*"u" + 0.081*"blackmail" + 0.054*"record" + 0.053*"accept" + 0.045*"vedio"'),
 (6, '0.192*"bank" + 0.161*"reverse" + 0.060*"account" + 0.053*"hdfc" + 0.048*"name"'),
 (36, '0.210*"utr" + 0.112*"accidentally" + 0.112*"citizen" + 0.062*"mother" + 0.060*"sim"'),
 (32, '0.133*"glitch" + 0.057*"iam" + 0.044*"frouder" + 0.044*"game" + 0.043*"ur"'),
 (47, '0.280*"transaction" + 0.148*"strange" + 0.076*"refund" + 0.065*"account" + 0.051*"amount"'),
 (27, '0.112*"pan" + 0.096*"without" + 0.089*"aadhar" + 0.052*"documents" + 0.051*"photo"'),
 (26, '0.330*"card" + 0.124*"credit" + 0.068*"debit" + 0.044*"bank" + 0.037*"otp"'),
 (11, '0.110*"health" + 0.046*"dec" + 0.036*"manager" + 0.028*"pnb" + 0.024*"hrs"'),
 (31, '0.126*"usually" + 0.070*"punjab" + 0.055*"house" + 0.053*"several" + 0.037*"false"'),
 (3, '0.048*"ani" + 0.043*"ki" + 0.041*"e" + 0.032*"lo" + 0.031*"ni"'),
 (53, '0.167*"account" + 0

  default_term_info = default_term_info.sort_values(


In [18]:
# Predict the dominant topic (cluster) for every document in the corpus.
# minimum_probability=0.0 requests the unfiltered topic distribution, so max()
# never receives an empty list (which would raise ValueError when no topic
# clears the model's default threshold). The winning topic is unchanged
# whenever the filtered list would have been non-empty.
cluster_id = [
    max(
        lda_model.get_document_topics(bow, minimum_probability=0.0),
        key=lambda topic: topic[1],  # compare by probability
    )[0]  # keep the topic id only
    for bow in corpus
]
df['cluster_id'] = cluster_id
df.head()

Unnamed: 0,crimeaditionalinfo,bow,topic_distribution,lda_label,cluster_id
0,"[sir, get, sms, pre, apporved, loan, ijust, cl...","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[(13, 0.14422968), (20, 0.063852094), (21, 0.2...",21,21
1,"[number, frauder, call, ordered, amazon, money...","[(10, 3), (11, 1), (20, 1), (33, 1), (44, 3), ...","[(2, 0.09144272), (5, 0.15966721), (9, 0.05929...",34,34
2,"[received, notification, chrome, say, scratch,...","[(0, 1), (45, 2), (50, 1), (54, 1), (55, 1), (...","[(10, 0.22455561), (15, 0.050755057), (18, 0.0...",10,10
3,"[app, playstore, name, five, plus, download, p...","[(45, 1), (54, 1), (55, 2), (56, 1), (69, 1), ...","[(5, 0.16419528), (20, 0.044141524), (21, 0.10...",34,34
4,"[tr, id, sbiupi, id, mpmtirbaqptvuzxbawlwhdqcs...","[(37, 1), (93, 1), (94, 2), (95, 1), (96, 1), ...","[(17, 0.078253046), (20, 0.24160783), (26, 0.2...",55,55


In [19]:
# Now separate the df into two parts: validated and non-validated.
# Validated: rows whose cluster has at least MIN_CLUSTER_SIZE complaints.
# Non-validated: rows whose cluster has fewer than MIN_CLUSTER_SIZE complaints.
MIN_CLUSTER_SIZE = 100

# Size of the cluster each row belongs to, aligned with df's index.
cluster_sizes = df.groupby('cluster_id')['cluster_id'].transform('size')
is_validated = cluster_sizes >= MIN_CLUSTER_SIZE

# Boolean masks also work when one side has no rows (pd.concat on an empty
# list raises ValueError). The stable sort keeps rows grouped by cluster_id in
# their original relative order, i.e. the same layout as concatenating the
# per-cluster groups.
validated_df = df[is_validated].sort_values('cluster_id', kind='stable')
print("Total length of validated df",len(validated_df))

non_validated_df = df[~is_validated].sort_values('cluster_id', kind='stable')
print("Total length of non validated df",len(non_validated_df))


Total length of validated df 103246
Total length of non validated df 569


In [20]:
# Write both partitions to disk, without the index column.
VALIDATED_PATH = "../validated_df.csv"
NON_VALIDATED_PATH = "../non_validated_df.csv"

validated_df.to_csv(VALIDATED_PATH, index=False)
non_validated_df.to_csv(NON_VALIDATED_PATH, index=False)

In [None]:
# The non-validated data (rows from clusters with fewer than 100 complaints) will now be manually validated and corrected.

# END