In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Read the two pre-processed CSV splits (paths are relative to this notebook).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../processed_data/processed_train.csv',
        '../../processed_data/processed_test.csv',
    )
)

# Stack both splits into one frame with a fresh 0..n-1 index (a later cell
# re-splits this combined frame), and write a copy of the merged data to disk.
df = pd.concat((df_train, df_test), ignore_index=True)
df.to_csv('../../processed_data/full_data.csv', index=False)



2024-11-07 07:18:49.932711: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-07 07:18:49.937920: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-07 07:18:49.990177: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-07 07:18:50.027423: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730944130.082438    6729 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730944130.09

In [2]:
# Class balance of the top-level label. In the recorded output the largest class
# has 76306 rows and the smallest has 1, and two labels differ only by a
# duplicated word ("computercomputer" vs "computer").
# NOTE(review): those two "hacking ..." labels look like one class spelled two
# ways — confirm and merge upstream if so.
df['category'].value_counts()

category
online financial fraud                            76306
online and social media related crime             16277
any other cyber crime                             14547
women/child related crime                          8826
cyber attack/ dependent crimes                     4869
hacking  damage to computercomputer system etc     1710
cryptocurrency crime                                646
hacking  damage to computer system etc              592
online gambling  betting                            578
online cyber trafficking                            244
cyber terrorism                                     213
ransomware                                           74
crime against women & children                        4
report unlawful content                               1
Name: count, dtype: int64

In [3]:
# (rows, columns) of the combined frame.
df.shape

(124887, 3)

In [4]:
# Row count per fine-grained label, used to eyeball how skewed the sub-categories are.
df['sub_category'].value_counts()


sub_category
upi related frauds                                                      35729
other                                                                   14547
debit/credit card fraud or sim swap fraud                               14357
internet banking related fraud                                          11844
fraud call/vishing                                                       7628
cyber bullying/stalking/sexting                                          5455
ewallet related fraud                                                    5385
rape/gang rape-sexually abusive content                                  3734
fakeimpersonating profile                                                3062
profile hacking identity theft                                           2823
cheating by impersonation                                                2706
sexually obscene material                                                2503
sexually explicit act                              

In [5]:
# Same category count as an earlier cell; the recorded output is identical.
# NOTE(review): duplicate inspection cell — candidate for removal.
df['category'].value_counts()


category
online financial fraud                            76306
online and social media related crime             16277
any other cyber crime                             14547
women/child related crime                          8826
cyber attack/ dependent crimes                     4869
hacking  damage to computercomputer system etc     1710
cryptocurrency crime                                646
hacking  damage to computer system etc              592
online gambling  betting                            578
online cyber trafficking                            244
cyber terrorism                                     213
ransomware                                           74
crime against women & children                        4
report unlawful content                               1
Name: count, dtype: int64

In [6]:

# Preprocess text data
max_words = 5000  # Tokenizer `num_words`: only the most frequent words get an index
max_len = 200     # every sequence is padded/truncated to this many tokens

# Keras' Tokenizer calls string methods on every item, so a missing value (NaN,
# which is what an empty CSV field becomes after read_csv) would raise
# AttributeError. Coerce the column to clean strings first; rows that already
# hold text are unchanged.
texts = df['crimeaditionalinfo'].fillna('').astype(str)

tokenizer = Tokenizer(num_words=max_words, lower=True)
# NOTE(review): the vocabulary is fitted on the full frame, before the
# train/test split done in a later cell, so held-out text influences the word index.
tokenizer.fit_on_texts(texts)

X = tokenizer.texts_to_sequences(texts)
# pad_sequences defaults: zeros are added at the front and over-long sequences
# keep their last `max_len` tokens.
X = pad_sequences(X, maxlen=max_len)


In [7]:

# Integer-encode both label columns. Each fitted encoder is kept because its
# inverse_transform is used later to turn predicted ids back into label text.
label_encoder_category = LabelEncoder().fit(df['category'])
y_category = label_encoder_category.transform(df['category'])

label_encoder_subcategory = LabelEncoder().fit(df['sub_category'])
y_subcategory = label_encoder_subcategory.transform(df['sub_category'])



In [8]:
# Hold out 20% of the rows. Features and both label vectors go through a single
# call so they stay row-aligned; the fixed seed keeps the split repeatable.
_split = train_test_split(
    X, y_category, y_subcategory,
    test_size=0.2,
    random_state=42,
)
X_train, X_test = _split[0], _split[1]
y_train_category, y_test_category = _split[2], _split[3]
y_train_subcategory, y_test_subcategory = _split[4], _split[5]


In [18]:
# Number of rows in the held-out split (recorded output: 24978).
len(y_test_category)

24978

In [32]:

# Build the feature extractor: token ids -> embeddings -> LSTM summary ->
# dense features with dropout. `dropout_layer` is the tensor consumed by the
# model-definition cell that follows.
input_layer = Input(shape=(max_len,))
# `input_length` is intentionally not passed: the Input layer above already
# fixes the sequence length, and Keras 3 deprecates the argument (passing it
# only produces a warning there).
embedding_layer = Embedding(max_words, 128)(input_layer)
lstm_layer = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)
dense_layer = Dense(64, activation='relu')(lstm_layer)
dropout_layer = Dropout(0.5)(dense_layer)




In [33]:

def _build_text_trunk(inputs):
    """Return a fresh embedding -> LSTM -> dense -> dropout feature stack on `inputs`.

    Uses the same layer sizes as the trunk that produces `dropout_layer`, but
    every call creates new layers, so the result shares no weights with it.
    """
    x = Embedding(max_words, 128)(inputs)
    x = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(x)
    x = Dense(64, activation='relu')(x)
    return Dropout(0.5)(x)


def _balanced_class_weights(y, n_classes):
    """Return 'balanced' class weights as an array indexed by class id.

    The array has length `n_classes`. Ids that do not occur in `y` get a
    neutral weight of 1.0, so position i always belongs to class id i and
    `dict(enumerate(...))` produces a correct id -> weight mapping.
    """
    present = np.unique(y)
    weights = np.ones(n_classes, dtype=float)
    weights[present] = compute_class_weight('balanced', classes=present, y=y)
    return weights


# Size each softmax from the label encoders rather than from the training
# split: a rare class that lands only in the held-out rows would otherwise make
# the output layer too narrow for the ids it is validated against.
n_categories = len(label_encoder_category.classes_)
n_subcategories = len(label_encoder_subcategory.classes_)

# Separate output layers for each model.
# The category head sits on the existing trunk (`dropout_layer`). The
# subcategory head gets its own trunk: the two models are fitted one after the
# other, and with a shared trunk the second fit would overwrite weights the
# already-trained first model relies on.
category_output = Dense(n_categories, activation='softmax')(dropout_layer)
subcategory_output = Dense(n_subcategories, activation='softmax')(
    _build_text_trunk(input_layer)
)

# Define two separate models
category_model = Model(inputs=input_layer, outputs=category_output)
subcategory_model = Model(inputs=input_layer, outputs=subcategory_output)

# Compute class weights for each output (arrays indexed by encoded class id)
category_class_weights = _balanced_class_weights(y_train_category, n_categories)
subcategory_class_weights = _balanced_class_weights(y_train_subcategory, n_subcategories)


In [34]:
# Targets are integer class ids, hence the sparse cross-entropy loss. Each
# compile call receives the string 'adam', so every model gets its own optimizer.
for _model in (category_model, subcategory_model):
    _model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

In [37]:
# Map position in the weight array to the class id expected by Keras.
category_weight_by_id = {
    class_id: weight for class_id, weight in enumerate(category_class_weights)
}

# Train the category classifier with per-class loss weighting.
# NOTE(review): the held-out split is passed as validation data, so the val_*
# numbers are the only held-out estimate this notebook produces.
category_model.fit(
    x=X_train,
    y=y_train_category,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test_category),
    class_weight=category_weight_by_id,
)

# Save the entire model to a file
category_model.save('category_model.h5')

Epoch 1/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 56ms/step - accuracy: 0.5019 - loss: 1.5601 - val_accuracy: 0.4471 - val_loss: 1.5038
Epoch 2/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 53ms/step - accuracy: 0.5119 - loss: 1.9586 - val_accuracy: 0.5635 - val_loss: 1.2216
Epoch 3/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 50ms/step - accuracy: 0.5355 - loss: 1.5742 - val_accuracy: 0.5649 - val_loss: 1.1677
Epoch 4/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 50ms/step - accuracy: 0.5313 - loss: 1.2843 - val_accuracy: 0.5473 - val_loss: 1.2334
Epoch 5/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 51ms/step - accuracy: 0.5557 - loss: 1.4510 - val_accuracy: 0.5479 - val_loss: 1.1985




In [38]:
# Map position in the weight array to the class id expected by Keras.
subcategory_weight_by_id = {
    class_id: weight for class_id, weight in enumerate(subcategory_class_weights)
}

# Train the sub-category classifier with per-class loss weighting; the held-out
# split is again used as validation data.
subcategory_model.fit(
    x=X_train,
    y=y_train_subcategory,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test_subcategory),
    class_weight=subcategory_weight_by_id,
)

# Save the entire model to a file
# NOTE(review): the file name repeats "_model"; it looks like a typo but is
# kept unchanged because loaders elsewhere may depend on it.
subcategory_model.save('subcategory_model_model.h5')


Epoch 1/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 51ms/step - accuracy: 0.2104 - loss: 2.3780 - val_accuracy: 0.3256 - val_loss: 2.2129
Epoch 2/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 53ms/step - accuracy: 0.2829 - loss: 2.1115 - val_accuracy: 0.3327 - val_loss: 2.1592
Epoch 3/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 53ms/step - accuracy: 0.3147 - loss: 2.3114 - val_accuracy: 0.3157 - val_loss: 2.2162
Epoch 4/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 53ms/step - accuracy: 0.3264 - loss: 2.1440 - val_accuracy: 0.3568 - val_loss: 2.1099
Epoch 5/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 54ms/step - accuracy: 0.3353 - loss: 2.0888 - val_accuracy: 0.3466 - val_loss: 2.1286




In [39]:
import pickle

# Persist the fitted tokenizer so text can be turned into the same word indices
# outside this notebook.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.Pickler(tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL).dump(tokenizer)

In [40]:
# Save the label encoders to files (one pickle per encoder) so predicted class
# ids can be decoded back to label strings later.
for _pickle_path, _encoder in (
    ('label_encoder_category.pickle', label_encoder_category),
    ('label_encoder_subcategory.pickle', label_encoder_subcategory),
):
    with open(_pickle_path, 'wb') as handle:
        pickle.dump(_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
# Function to preprocess and classify new text
def classify_text(text):
    """Predict the (category, sub_category) label pair for one piece of text.

    The text is converted with the notebook's fitted `tokenizer` and padded to
    `max_len`, then scored by `category_model` and `subcategory_model`. For
    each model the highest-probability class id is mapped back to its label
    string with the matching label encoder.

    Args:
        text: A single raw text string.

    Returns:
        Tuple ``(category_label, subcategory_label)``.
    """
    model_input = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=max_len)

    def _top_label(model, encoder):
        # Only one text was submitted, so row 0 holds its class probabilities.
        best_id = model.predict(model_input).argmax(axis=-1)[0]
        return encoder.inverse_transform([best_id])[0]

    category_label = _top_label(category_model, label_encoder_category)
    subcategory_label = _top_label(subcategory_model, label_encoder_subcategory)
    return category_label, subcategory_label

# Example usage
# Smoke test of classify_text on one hand-written complaint.
# NOTE(review): the recorded output labels this text "any other cyber crime" /
# "business email compromise/email takeover", which looks like a poor match for
# a complaint about money charged for a job offer — verify model quality.
new_text = "In apna Job I have applied for job interview for telecalling and the resource management wrote that twelve hundred will be charged for security amount of laptop and work from home when I have given interview on the given address next day they charged twelve hundred and six hundred more money in the name of insurance after that they have referred me to the job calling there is no work of laptop neither a work from home kindly please take action against it as soon as possible and if possible please help me to recover my financial loss"
category, subcategory = classify_text(new_text)
print(f"Category: {category}, Subcategory: {subcategory}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Category: any other cyber crime, Subcategory: business email compromise/email takeover
