In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import RandomOverSampler

import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
# Preprocess text data
max_words = 200000
max_len = 500
# Load your datasets
df = pd.read_csv('../../processed_data/full_data.csv')  





[nltk_data] Downloading package stopwords to /home/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Step 1: Text Cleaning and Tokenization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [3]:
import re
def clean_text(text):
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Lemmatization
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())
    return text

In [15]:
df['category'].value_counts()

category
online financial fraud                            76306
online and social media related crime             16277
any other cyber crime                             14547
women/child related crime                          8826
cyber attack/ dependent crimes                     4869
hacking  damage to computercomputer system etc     1710
cryptocurrency crime                                646
hacking  damage to computer system etc              592
online gambling  betting                            578
online cyber trafficking                            244
cyber terrorism                                     213
ransomware                                           74
crime against women & children                        4
report unlawful content                               1
Name: count, dtype: int64

In [16]:
df.shape

(124887, 3)

In [17]:
df['sub_category'].value_counts()


sub_category
upi related frauds                                                      35729
other                                                                   14547
debit/credit card fraud or sim swap fraud                               14357
internet banking related fraud                                          11844
fraud call/vishing                                                       7628
cyber bullying/stalking/sexting                                          5455
ewallet related fraud                                                    5385
rape/gang rape-sexually abusive content                                  3734
fakeimpersonating profile                                                3062
profile hacking identity theft                                           2823
cheating by impersonation                                                2706
sexually obscene material                                                2503
sexually explicit act                              

In [18]:
df['crimeaditionalinfo'] = df['crimeaditionalinfo'].astype(str).apply(clean_text)


In [19]:
df['crimeaditionalinfo'].head()

0    continue received random call abusive message ...
1    fraudster continuously messaging asking pay mo...
2    acting like police demanding money adding sect...
3    apna job applied job interview telecalling res...
4    received call lady stating send new phone vivo...
Name: crimeaditionalinfo, dtype: object

In [23]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['crimeaditionalinfo'])

# Get all unique words (vocabulary)
unique_words = vectorizer.get_feature_names_out()

# Display the unique words
print("Total unique words:", len(unique_words))


Total unique words: 132377


In [35]:
X=df['crimeaditionalinfo']
y=df['category']
# Encode category and subcategory labels
label_encoder_category = LabelEncoder()
y_category = label_encoder_category.fit_transform(df['category'])
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X.values.reshape(-1, 1), y_category)
X_resampled = X_resampled.flatten()

In [36]:


tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(X_resampled)

X_seq= tokenizer.texts_to_sequences(X_resampled)
X_pad = pad_sequences(X_seq, maxlen=max_len)


In [38]:
from tensorflow.keras.utils import to_categorical

y_cat = to_categorical(y_resampled)


In [40]:
from sklearn.utils import class_weight

# Step 6: Calculate Class Weights for Model Training
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_resampled),
    y=y_resampled
)
class_weights = {i: class_weights[i] for i in range(len(class_weights))}


In [45]:
class_weights

{0: np.float64(0.9878914651032613),
 1: np.float64(3592.714285714286),
 2: np.float64(22.24590888987174),
 3: np.float64(2.951500748173576),
 4: np.float64(67.46881287726359),
 5: np.float64(24.275096525096526),
 6: np.float64(8.404010025062657),
 7: np.float64(0.882893478089153),
 8: np.float64(58.89695550351288),
 9: np.float64(0.18833194169340736),
 10: np.float64(24.863074641621356),
 11: np.float64(194.2007722007722),
 12: np.float64(0.18833194169340736),
 13: np.float64(1.6282412353112556)}

In [41]:
X_train, X_val, y_train, y_val = train_test_split(X_pad, y_cat, test_size=0.2, random_state=42, stratify=y_resampled)



In [42]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
# Define the model architecture
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(max_words, 128, input_length=max_len)(input_layer)
lstm_layer = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)
dense_layer = Dense(64, activation='relu')(lstm_layer)
dropout_layer = Dropout(0.5)(dense_layer)
output_layer = Dense(len(np.unique(y_resampled)), activation='softmax')(dropout_layer)

# Compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    loss='categorical_crossentropy',  # Suitable for multiclass classification
    optimizer='adam',
    metrics=['accuracy']
)

# Display the model summary
model.summary()

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping]
)

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
print(f"Validation Loss: {loss:.4f}, Validation Accuracy: {accuracy:.4f}")

2024-11-13 20:33:24.026137: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/10
[1m5030/5030[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1570s[0m 311ms/step - accuracy: 0.6118 - loss: 2.4220 - val_accuracy: 0.6631 - val_loss: 0.9526
Epoch 2/10
[1m5030/5030[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1573s[0m 313ms/step - accuracy: 0.6645 - loss: 3.1594 - val_accuracy: 0.6857 - val_loss: 0.9151
Epoch 3/10
[1m5030/5030[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1580s[0m 314ms/step - accuracy: 0.6691 - loss: 1.7516 - val_accuracy: 0.6676 - val_loss: 0.9343
Epoch 4/10
[1m5030/5030[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1577s[0m 314ms/step - accuracy: 0.6867 - loss: 1.9436 - val_accuracy: 0.6985 - val_loss: 0.7774
Epoch 5/10
[1m5030/5030[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1574s[0m 313ms/step - accuracy: 0.7092 - loss: 1.2918 - val_accuracy: 0.7024 - val_loss: 0.7873
Epoch 6/10
[1m5030/5030[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1574s[0m 313ms/step - accuracy: 0.7224 - loss: 1.6901 - val_accuracy: 0.7289 - val

In [46]:
# Save the model in keras format

# model.save('category_text_classification_model.keras')


In [50]:
# import pickle

# with open('category_tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [49]:
# # Save the label encoders to files
# with open('label_encoder_category.pickle', 'wb') as handle:
#     pickle.dump(label_encoder_category, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [1]:
import pickle
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# Load the saved model
model = load_model('category_text_classification_model.keras')

# Load the tokenizer
with open('category_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the label encoder
with open('label_encoder_category.pickle', 'rb') as handle:
    label_encoder = pickle.load(handle)


2024-11-15 15:57:34.987768: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-15 15:57:35.109262: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731666455.177837    8297 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731666455.200079    8297 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-15 15:57:35.317767: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [21]:
# Sample new data
new_data = ["""hello sir and mam
this very urgent and emergency  to inform you
some one created a facebook fake account of my name and he demand for money of my facebook friends 
kindly request to you please take serious action on this matter"""]

# Clean and preprocess
new_data_cleaned = [clean_text(text) for text in new_data]

# Convert text to sequences
new_sequences = tokenizer.texts_to_sequences(new_data_cleaned)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_len)  

In [22]:
# Make predictions
predictions = model.predict(new_padded_sequences)

# Convert predictions to label indices
predicted_indices = np.argmax(predictions, axis=1)

# Decode indices to original category labels
predicted_labels = label_encoder.inverse_transform(predicted_indices)

# Display results
for i, text in enumerate(new_data):
    print(f"Text: {new_data_cleaned}")
    print(f"Predicted Category: {predicted_labels[i]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Text: ['hello sir mam urgent emergency inform one created facebook fake account name demand money facebook friend kindly request please take serious action matter']
Predicted Category: cyber terrorism
