<a href="https://colab.research.google.com/github/lpon3692-svg/AI-DRIVEN-CYBER-CRIME-REPORTING-AND-CLASSIFICATION/blob/main/Untitled13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =====================================================
# AI DRIVEN CYBERCRIME REPORTING AND CLASSIFICATION SYSTEM
# Using a 560-sample synthetic dataset + LinearSVC
# =====================================================

import pandas as pd
import numpy as np
import random
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch the NLTK stop-word corpus, then hold the English list as a set so the
# per-word membership checks done during preprocessing are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# =====================================================
# STEP 1: GENERATE 560-SAMPLE DATASET
# =====================================================

# Complaint templates per cybercrime category: label -> list of example
# complaint sentences (seven templates for each of the eight labels).
categories = {
    "Phishing": [
        "Received fake email asking for bank account details",
        "Suspicious OTP request from unknown website",
        "Fake banking website collected my login credentials",
        "Scam message pretending to be from RBI",
        "Phishing SMS asking for KYC update",
        "Fraud link sent through WhatsApp",
        "Fake customer care executive requested OTP"
    ],

    "Malware": [
        "Malicious software infected my laptop",
        "Virus attacked my computer system",
        "Trojan detected in downloaded attachment",
        "Spyware installed automatically",
        "System slowed due to malware infection",
        "Unknown application stealing data",
        "Antivirus detected harmful program"
    ],

    "Ransomware": [
        "Files encrypted and ransom demanded in bitcoin",
        "System locked asking for payment",
        "All documents inaccessible due to ransomware",
        "Ransom note displayed on computer screen",
        "Data locked by hacker group",
        "Server encrypted and ransom requested",
        "Important files renamed with unknown extension"
    ],

    "Online Fraud": [
        "Lost money in online shopping scam",
        "Fake investment platform cheated me",
        "UPI fraud transaction occurred",
        "Credit card fraud detected",
        "Online job scam took registration fee",
        "Fake loan app threatened me",
        "E-commerce refund scam"
    ],

    "Identity Theft": [
        "Someone misused my PAN card details",
        "Aadhar card information stolen",
        "Fake bank account opened in my name",
        "Personal documents used illegally",
        "SIM card issued without my permission",
        "Impersonation fraud reported",
        "Digital signature misused"
    ],

    "Cyberbullying": [
        "Receiving threatening messages on Instagram",
        "Harassed continuously on social media",
        "Abusive comments posted publicly",
        "Online blackmail with morphed images",
        "Fake rumors spread about me",
        "Cyberstalking incident reported",
        "Defamation through fake account"
    ],

    "Hacking": [
        "My Gmail account was hacked",
        "Facebook account compromised",
        "Unauthorized login from unknown location",
        "Password changed without consent",
        "Website hacked by attackers",
        "Server breach detected",
        "Admin access stolen"
    ],

    "Data Breach": [
        "Company database leaked my personal data",
        "Sensitive customer information exposed",
        "Personal email address leaked online",
        "Banking details leaked from server",
        "Confidential records shared publicly",
        "Employee data breach incident",
        "Database accessed illegally"
    ]
}

# Rows drawn per category: 70 x 8 categories = 560 rows in total.
SAMPLES_PER_CATEGORY = 70

# A dedicated, seeded generator makes the sampled dataset identical on every
# run without touching the global `random` module state.
rng = random.Random(42)

# NOTE: each row is drawn with replacement from only seven templates per
# label, so the frame consists almost entirely of exact duplicates. The same
# sentences will almost certainly appear in both the train and test split,
# which inflates any accuracy measured on this data.
data = [
    [rng.choice(texts), label]
    for label, texts in categories.items()
    for _ in range(SAMPLES_PER_CATEGORY)
]

df = pd.DataFrame(data, columns=["text", "label"])

print("Dataset Size:", len(df))

# =====================================================
# STEP 2: TEXT PREPROCESSING
# =====================================================

# Matches any run of characters other than lowercase ASCII letters (the text
# is lowercased before matching). Compiled once and reused by every call.
_NON_LETTERS = re.compile(r'[^a-z]+')


def preprocess(text, custom_stop_words=None):
    """Normalise a complaint string before TF-IDF vectorisation.

    The text is lowercased, every run of characters that is not an ASCII
    letter is replaced by a single space, and stop words are dropped.

    Args:
        text: Raw complaint text.
        custom_stop_words: Optional collection of words to remove. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    if custom_stop_words is None:
        custom_stop_words = stop_words

    # Replace with a space rather than deleting: deleting would glue together
    # words separated only by punctuation, tabs or newlines
    # (e.g. "fraud,scam" would become "fraudscam").
    lowered = _NON_LETTERS.sub(' ', text.lower())
    return " ".join(
        word for word in lowered.split() if word not in custom_stop_words
    )

df['clean_text'] = df['text'].apply(preprocess)

# =====================================================
# STEP 3: TRAIN TEST SPLIT
# =====================================================

# Split the cleaned text *before* vectorising so that the TF-IDF vocabulary
# and IDF weights are learned from training rows only. `stratify` keeps each
# label at the same 80/20 proportion in both partitions.
y = df['label']
train_text, test_text, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, random_state=42, stratify=y
)

# =====================================================
# STEP 4: TF-IDF VECTORIZATION
# =====================================================

# Unigram + bigram features; fitted on the training text only, then applied
# unchanged to the held-out text.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

# Full-dataset feature matrix, kept under its original name for any code that
# still reads `X`.
X = vectorizer.transform(df['clean_text'])

# =====================================================
# STEP 5: MODEL TRAINING (Linear SVM)
# =====================================================

# Build a linear SVM with default settings and fit it on the training split;
# `fit` returns the estimator itself, so `model` is the trained classifier.
model = LinearSVC().fit(X_train, y_train)

# =====================================================
# STEP 6: MODEL EVALUATION
# =====================================================

# Score the trained classifier on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy as a percentage, followed by the per-label report.
accuracy_percent = round(accuracy * 100, 2)
report = classification_report(y_test, y_pred)

print("\nModel Accuracy:", accuracy_percent, "%")
print("\nClassification Report:\n")
print(report)

# =====================================================
# STEP 7: USER INPUT PREDICTION
# =====================================================

def predict_crime(text):
    """Return the predicted cybercrime category for a raw complaint.

    The complaint is cleaned with ``preprocess``, turned into a one-row
    feature matrix by the fitted ``vectorizer``, and classified by ``model``.

    Args:
        text: Raw complaint text.

    Returns:
        The first (and only) label predicted by ``model``.
    """
    cleaned = preprocess(text)
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = vectorizer.transform([cleaned])
    return model.predict(features)[0]

# Example Test
# Read one complaint from the console and show the category predicted for it.
user_input = input("\nEnter Cybercrime Complaint: ")
predicted_category = predict_crime(user_input)
print("Predicted Category:", predicted_category)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Dataset Size: 560

Model Accuracy: 100.0 %

Classification Report:

                precision    recall  f1-score   support

 Cyberbullying       1.00      1.00      1.00        12
   Data Breach       1.00      1.00      1.00        14
       Hacking       1.00      1.00      1.00        11
Identity Theft       1.00      1.00      1.00        13
       Malware       1.00      1.00      1.00        23
  Online Fraud       1.00      1.00      1.00        11
      Phishing       1.00      1.00      1.00        11
    Ransomware       1.00      1.00      1.00        17

      accuracy                           1.00       112
     macro avg       1.00      1.00      1.00       112
  weighted avg       1.00      1.00      1.00       112

