In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load your datasets
df_train = pd.read_csv('../../processed_data/processed_train.csv')  
df_test = pd.read_csv('../../processed_data/processed_test.csv')  

# Combine both datasets
df = pd.concat([df_train, df_test], ignore_index=True)

# Preprocess text data
max_words = 5000
max_len = 200
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df['crimeaditionalinfo'])

X = tokenizer.texts_to_sequences(df['crimeaditionalinfo'])
X = pad_sequences(X, maxlen=max_len)

# Encode category and subcategory labels
label_encoder_category = LabelEncoder()
y_category = label_encoder_category.fit_transform(df['category'])

label_encoder_subcategory = LabelEncoder()
y_subcategory = label_encoder_subcategory.fit_transform(df['sub_category'])

# Split the combined dataset into training and testing sets
X_train, X_test, y_train_category, y_test_category, y_train_subcategory, y_test_subcategory = train_test_split(
    X, y_category, y_subcategory, test_size=0.2, random_state=42
)

# Build a multi-output model
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(max_words, 128, input_length=max_len)(input_layer)
lstm_layer = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)
dense_layer = Dense(64, activation='relu')(lstm_layer)
dropout_layer = Dropout(0.5)(dense_layer)

# Output layer for category
output_category = Dense(len(label_encoder_category.classes_), activation='softmax', name='category_output')(dropout_layer)

# Output layer for sub-category
output_subcategory = Dense(len(label_encoder_subcategory.classes_), activation='softmax', name='subcategory_output')(dropout_layer)

# Combine into a model
model = Model(inputs=input_layer, outputs=[output_category, output_subcategory])

# Compile the model
# Compile the model
model.compile(
    loss={
        'category_output': 'sparse_categorical_crossentropy', 
        'subcategory_output': 'sparse_categorical_crossentropy'
    },
    optimizer='adam',
    metrics={
        'category_output': 'accuracy', 
        'subcategory_output': 'accuracy'
    }
)
# Train the model
model.fit(
    X_train, 
    {'category_output': y_train_category, 'subcategory_output': y_train_subcategory},
    batch_size=32,
    epochs=5,
    validation_data=(X_test, {'category_output': y_test_category, 'subcategory_output': y_test_subcategory})
)


Epoch 1/5




[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 62ms/step - category_output_accuracy: 0.7001 - category_output_loss: 0.9487 - loss: 3.1381 - subcategory_output_accuracy: 0.3412 - subcategory_output_loss: 2.1894 - val_category_output_accuracy: 0.7457 - val_category_output_loss: 0.7055 - val_loss: 2.3814 - val_subcategory_output_accuracy: 0.4836 - val_subcategory_output_loss: 1.6756
Epoch 2/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 62ms/step - category_output_accuracy: 0.7481 - category_output_loss: 0.7078 - loss: 2.3780 - subcategory_output_accuracy: 0.4936 - subcategory_output_loss: 1.6701 - val_category_output_accuracy: 0.7493 - val_category_output_loss: 0.6622 - val_loss: 2.2044 - val_subcategory_output_accuracy: 0.5252 - val_subcategory_output_loss: 1.5420
Epoch 3/5
[1m3123/3123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m370s[0m 119ms/step - category_output_accuracy: 0.7592 - category_output_loss: 0.6522 - loss: 2.1991 - subc

<keras.src.callbacks.history.History at 0x7cf274753fe0>

In [10]:
# Save the entire model to a file
model.save('multi_output_model.h5')




In [16]:
import pickle

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
# Save the label encoders to files
with open('label_encoder_category.pickle', 'wb') as handle:
    pickle.dump(label_encoder_category, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('label_encoder_subcategory.pickle', 'wb') as handle:
    pickle.dump(label_encoder_subcategory, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Function to preprocess and classify new text
def classify_text(text):
    # Tokenize and pad the input text
    text_sequence = tokenizer.texts_to_sequences([text])
    text_padded = pad_sequences(text_sequence, maxlen=max_len)

    # Make predictions
    category_pred, subcategory_pred = model.predict(text_padded)

    # Get the class with the highest probability
    category_index = category_pred.argmax(axis=-1)[0]
    subcategory_index = subcategory_pred.argmax(axis=-1)[0]

    # Decode the labels
    category_label = label_encoder_category.inverse_transform([category_index])[0]
    subcategory_label = label_encoder_subcategory.inverse_transform([subcategory_index])[0]

    return category_label, subcategory_label

# Example usage
new_text = "Your input text here"
category, subcategory = classify_text(new_text)
print(f"Category: {category}, Subcategory: {subcategory}")


In [13]:
# Example input text data
input_text = ["I am amit kumar from karwi chitrakoot I am totally depressed on fraud calls and msgs who harrashed me on phone plz arrest fraud caller"]

input_text=[input_text[0].lower]
# Preprocess the input text
input_sequence = tokenizer.texts_to_sequences(input_text)
input_sequence = pad_sequences(input_sequence, maxlen=max_len)

# Make predictions
predictions = model.predict(input_sequence)

# Extract predictions for category and subcategory
category_prediction = predictions[0]  # Predictions for category output
subcategory_prediction = predictions[1]  # Predictions for subcategory output

# Get the index of the class with the highest probability
predicted_category_index = category_prediction.argmax(axis=-1)[0]
predicted_subcategory_index = subcategory_prediction.argmax(axis=-1)[0]

# Decode the index back to the original label
predicted_category = label_encoder_category.inverse_transform([predicted_category_index])[0]
predicted_subcategory = label_encoder_subcategory.inverse_transform([predicted_subcategory_index])[0]

# Print the results
print("Predicted Category:", predicted_category)
print("Predicted Subcategory:", predicted_subcategory)


AttributeError: 'builtin_function_or_method' object has no attribute 'lower'