## Load the Frameworks

In [1]:
# Load all the important libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import LdaModel
from pprint import pprint

# Donload the important files for NLTK
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load the dataset for the clustering

In [13]:
train_df=pd.read_csv("data/train.csv")
test_df=pd.read_csv("data/test.csv")
train_df.rename(columns={'crimeadditionalinfo':'processed_content'},inplace=True)
test_df.rename(columns={'crimeadditionalinfo':'processed_content'},inplace=True)

In [17]:
# Processing the content given the dataset 
stop_words = set(stopwords.words('english'))

def preprocess(text):
    try:
        tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
        tokens = [word for word in tokens if word.isalnum()]  # Remove non-alphanumeric tokens
        tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
        return tokens
    except:
        print(text)
        raise

# Apply preprocessing
df['processed_text'] = df['processed_content'].apply(preprocess)
print(df['processed_text'])

0         [occured, jan, asleep, got, call, hdfc, bank, ...
1         [mentioned, site, impersonating, hitchi, servi...
2         [fouji, h, jo, apni, bullat, sail, kar, rha, t...
3         [maine, dhani, credit, card, ko, deactivate, k...
4         [website, kncchaicom, platform, ask, mining, m...
                                ...                        
117080    [mera, yono, app, kam, nahi, kar, raha, tha, g...
117081    [namesher, singh, cheeta, add, ajay, sir, badi...
117082    [taking, online, loan, critical, condition, re...
117083    [govind, singh, lured, naked, screen, video, c...
117084    [identity, theft, fake, customer, care, servic...
Name: processed_text, Length: 117085, dtype: object


In [None]:
# Create a dictionary and corpus for LDA

dictionary = corpora.Dictionary(df['processed_text'])
print("Dictionary created")
corpus = [dictionary.doc2bow(text) for text in df['processed_text']]
print("Corpus created")

In [None]:
# Train the LDA model with (67 sub categories cluster)
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=67, passes=10, random_state=42)
pprint(lda_model.print_topics(num_words=5,num_topics=15),width=500)

In [None]:
# Visualization of the LDA clusters
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the visualization
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_vis)  # View the visualization

In [25]:
# segrgrgate the datapoints by mapping to respective cluster.

df['bow'] = df['processed_text'].apply(lambda x: dictionary.doc2bow(x))
df['topic_distribution'] = df['bow'].apply(lambda x: lda_model.get_document_topics(x))
label=[]
for i in df['topic_distribution']:
    label.append(max(i, key=lambda item: item[1])[0])

In [29]:
df['label']=label
df_i=df[['processed_content','sub_category','category','label']]
df_i[(df_i['label']==14)]['category'].value_counts()

In [52]:
df_i[(df_i['label']==14)]['category'].value_counts()

category
online financial fraud                                  9462
any other cyber crime                                    921
online and social media related crime                    516
hacking  damage to computercomputer system etc           227
sexually explicit act                                     47
sexually obscene material                                 42
crime against women & children                            39
cyber attack/ dependent crimes                            31
online gambling  betting                                  29
cryptocurrency crime                                      28
online cyber trafficking                                  21
child pornography cpchild sexual abuse material csam      16
cyber terrorism                                           13
rapegang rape rgrsexually abusive content                  2
Name: count, dtype: int64

In [None]:
# Remove the data points belongs to one cluster and having the frequency in the grouped data more than 40%.
# Validated data will be considered as proper GT
# Nonvalidated data will be retagged by using the Human feedback and semi supervised learning

validated=[]
non_validated=[]
for group, group_df in df_i.groupby("label"):
    category_counts = group_df["category"].value_counts()
    max_category = category_counts.idxmax()
    filtered_df = group_df[group_df["category"] == max_category]
    print(max_category,group,len(filtered_df),(len(filtered_df)*100)/len(group_df))
    
    # filtered_df.loc['validated']=True
    validated.append(filtered_df)
    false_df = group_df[~group_df.isin(filtered_df.to_dict(orient='list')).all(axis=1)]
    # false_df.loc['validated']=False
    non_validated.append(false_df)
    print(len(false_df),(len(false_df)*100)/len(group_df))

In [104]:
validated_df=pd.concat(validated,ignore_index=True)
nonvaludated_df=pd.concat(non_validated,ignore_index=True)
validated_df.to_csv("validated_data_point1.csv")
nonvaludated_df.to_csv("nonvalidated_data_point1.csv")

In [106]:
# Save the validated data for first iteration 
validated_df.to_csv("validated_data_point1.csv")
nonvaludated_df.to_csv("nonvalidated_data_point1.csv")

In [110]:
# Apply preprocessing on remaining data
nonvaludated_df['processed_text'] = nonvaludated_df['processed_content'].apply(preprocess)
print(nonvaludated_df['processed_text'])

0        [wrhjbdfg, ggthjfdgh, fghvddhv, cfgnbdghn, ggh...
1                                              [amazonpay]
2        [httpheroinexxxcomtagyashumashettyporn, httpsh...
3        [deceived, person, known, ever, knows, well, f...
4        [someone, sent, cod, order, pages, name, peopl...
                               ...                        
33055    [payment, deducted, account, found, checking, ...
33056    [autodebit, rs, setup, bhanix, finance, plz, h...
33057                         [pappu, yadav, mobile, lakh]
33058    [sir, someone, illegally, log, flipkart, accou...
33059    [fraud, happened, us, rs, dec, ten, days, also...
Name: processed_text, Length: 33060, dtype: object


In [None]:
# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(df['processed_text'])
corpus = [dictionary.doc2bow(text) for text in df['processed_text']]

In [112]:
# Unique values
nonvaludated_df['category'].unique()

array(['online financial fraud', 'sexually explicit act',
       'any other cyber crime', 'online and social media related crime',
       'crime against women & children', 'report unlawful content',
       'child pornography cpchild sexual abuse material csam',
       'online gambling  betting',
       'hacking  damage to computercomputer system etc',
       'sexually obscene material', 'cyber attack/ dependent crimes',
       'online cyber trafficking', 'cyber terrorism',
       'rapegang rape rgrsexually abusive content',
       'cryptocurrency crime'], dtype=object)

In [119]:
# Train the LDA model 
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=15, passes=10, random_state=42)
# Print the topics discovered
print("\nLDA Topics")


LDA Topics:


In [121]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the visualization
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_vis)  # View the visualization

In [122]:
nonvaludated_df['bow'] = nonvaludated_df['processed_text'].apply(lambda x: dictionary.doc2bow(x))
nonvaludated_df['topic_distribution'] = nonvaludated_df['bow'].apply(lambda x: lda_model.get_document_topics(x))
label=[]
for i in nonvaludated_df['topic_distribution']:
    try:
        label.append(max(i, key=lambda item: item[1])[0])
    except:
        print("Not found")
        label.append(-1)

In [124]:
nonvaludated_df['label']=label
nonvaludated_df_i=nonvaludated_df[['processed_content','sub_category','category','label']]

In [125]:
nonvaludated_df_i[(nonvaludated_df_i['label']==0)]['category'].value_counts()

category
online financial fraud                                  50
online and social media related crime                   25
any other cyber crime                                   21
sexually explicit act                                    4
crime against women & children                           3
hacking  damage to computercomputer system etc           3
child pornography cpchild sexual abuse material csam     2
report unlawful content                                  1
online gambling  betting                                 1
Name: count, dtype: int64

In [131]:
# Categorization between validationand nonvalidated dataset
validated=[]
non_validated=[]
for group, group_df in nonvaludated_df_i.groupby("label"):
    category_counts = group_df["category"].value_counts()
    max_category = category_counts.idxmax()
    filtered_df = group_df[group_df["category"] == max_category]
    # print(max_category,group,len(filtered_df),(len(filtered_df)*100)/len(group_df))
    if (len(filtered_df)*100)/len(group_df)>40:
        validated.append(filtered_df)
        false_df = group_df[~group_df.isin(filtered_df.to_dict(orient='list')).all(axis=1)]
        non_validated.append(false_df)
    else:
        non_validated.append(group_df)
    # print(len(false_df),(len(false_df)*100)/len(group_df))

In [132]:
validated_df=pd.concat(validated,ignore_index=True)
nonvaludated_df=pd.concat(non_validated,ignore_index=True)

In [136]:
validated_df.to_csv("validated_data_point2.csv")
nonvaludated_df.to_csv("nonvalidated_data_point2.csv")

#### After 2 clustering iteration data is not further clustered because the datapoint to each cluster is less tha 40%. We will use the validated data as the groud truth adn nonvalidated data for further retagging using human feedback.


# End of the clustering