In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import joblib

# Load your datasets
df_train = pd.read_csv('../../processed_data/processed_train.csv')  
df_test = pd.read_csv('../../processed_data/processed_test.csv')  


In [2]:
df_train['sub_category'].unique()

array(['cyber bullying/stalking/sexting', 'fraud call/vishing',
       'online gambling  betting', 'online job fraud',
       'upi related frauds', 'internet banking related fraud',
       'rape/gang rape-sexually abusive content', 'other',
       'profile hacking identity theft',
       'debit/credit card fraud or sim swap fraud',
       'ewallet related fraud', 'data breach/theft',
       'cheating by impersonation',
       'denial of service (dos)/distributed denial of service (ddos) attacks',
       'fakeimpersonating profile', 'cryptocurrency fraud',
       'sexually explicit act', 'sexually obscene material',
       'malware attack', 'business email compromise/email takeover',
       'email hacking', 'hacking/defacement',
       'unauthorised access/data breach', 'sql injection',
       'provocative speech for unlawful acts', 'ransomware attack',
       'cyber terrorism',
       'child pornography/child sexual abuse material (csam)',
       'tampering with computer source documen

In [3]:

# Define features and targets for training
X_train = df_train['crimeaditionalinfo']  # Use the appropriate text column
y_sub_train = df_train['sub_category']

# Create pipelines for subcategory and category classification
sub_category_model = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])




In [4]:

# Train models
sub_category_model.fit(X_train, y_sub_train)
joblib.dump(sub_category_model, 'sub_category_model.pkl')




['sub_category_model.pkl']

In [5]:
# Optional: Make predictions on the test set
X_test = df_test['crimeaditionalinfo']  # Use the appropriate text column
y_sub_pred = sub_category_model.predict(X_test)






In [6]:
from sklearn.metrics import accuracy_score

# Correct target for sub-category accuracy calculation
y_sub_test = df_test['sub_category']


# Calculate accuracy for sub-category
sub_category_accuracy = accuracy_score(y_sub_test, y_sub_pred)
print(f'Sub-category Accuracy: {sub_category_accuracy:.2f}')

Sub-category Accuracy: 0.47


In [9]:

# Define features and targets for training

X_train = df_train['sub_category']
y_cat_train = df_train['category']

from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

stop_words = stopwords.words('english')

# Custom tokenizer to clean text
def custom_tokenizer(text):
    text = text.lower()  # Lowercasing
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    tokens = [word for word in text.split() if word not in stop_words]  # Remove stopwords
    return tokens

category_model = Pipeline([
    ('vectorizer',TfidfVectorizer(tokenizer=custom_tokenizer)),
    ('classifier', MultinomialNB())
])


[nltk_data] Downloading package stopwords to /home/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
category_model.fit(X_train, y_cat_train)



In [11]:
X_test = df_test['sub_category']  # Use the appropriate text column
y_cat_pred = category_model.predict(X_test)



In [12]:
from sklearn.metrics import accuracy_score

y_cat_test = df_test['sub_category']

# Calculate accuracy for sub-category
category_accuracy = accuracy_score(y_cat_test, y_cat_pred)
print(f'Category Accuracy: {category_accuracy:.2f}')

Category Accuracy: 0.01


In [13]:
joblib.dump(sub_category_model, 'category_model.pkl')


['category_model.pkl']

In [20]:
import joblib

# Load the model
sub_category_model = joblib.load('category_model.pkl')

# Sample subcategory input
sample_subcategory = ["other"]  # Replace with actual subcategory input

# Predict the category
predicted_category = sub_category_model.predict(sample_subcategory)

print(f"Predicted Category: {predicted_category[0]}")


Predicted Category: upi related frauds
