<a href="https://colab.research.google.com/github/lpon3692-svg/AI-DRIVEN-CYBER-CRIME-REPORTING-AND-CLASSIFICATION/blob/main/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import random
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download NLTK stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

# =========================================================
# PART 1: CREATE LARGE CYBERCRIME DATASET (2500+ RECORDS)
# =========================================================

# Keywords that get substituted into the complaint templates, per category.
# NOTE: "Instagram" and "Facebook" are listed under both Identity Theft and
# Cyberbullying, so the same complaint text can be generated with either
# label; those two classes cannot be fully separated from the text alone.
categories_keywords = {
    "Phishing": ["OTP", "KYC", "bank verification", "UPI link", "lottery", "refund", "bank email"],
    "Financial Fraud": ["credit card", "debit card", "UPI", "loan app", "investment", "ATM", "online payment"],
    "Identity Theft": ["Aadhaar", "PAN", "SIM card", "Gmail", "Facebook", "Instagram", "LinkedIn"],
    "Cyberbullying": ["Instagram", "WhatsApp", "Facebook", "college group", "gaming app", "TikTok", "Snapchat"],
    "Malware Attack": ["ransomware", "trojan", "virus", "spyware", "malicious app", "keylogger", "malware link"]
}

# Sentence skeletons; "{item}" is replaced by one keyword of the category.
templates = [
    "I received a fake {item} message.",
    "My {item} was hacked yesterday.",
    "Fraud happened through {item}.",
    "Suspicious activity detected in {item}.",
    "Unauthorized access in my {item}.",
    "Online scam related to {item}.",
    "{item} related cybercrime complaint.",
    "Fake notification regarding {item}.",
    "My account compromised via {item}.",
    "Hacker tried to steal my {item}."
]

# The train/test split further down is pinned with random_state=42, but that
# only helps if the corpus itself is reproducible. Draw from a dedicated,
# seeded generator so every run builds the same dataset (and therefore reports
# the same metrics) without touching the global `random` state.
DATASET_SEED = 42
SAMPLES_PER_CATEGORY = 500
_dataset_rng = random.Random(DATASET_SEED)

# Generate 500 samples per category -> total 2500 complaints
data = []
for cat, items in categories_keywords.items():
    for _ in range(SAMPLES_PER_CATEGORY):
        template = _dataset_rng.choice(templates)
        item = _dataset_rng.choice(items)
        complaint = template.replace("{item}", item)
        data.append([complaint, cat])

df = pd.DataFrame(data, columns=["complaint", "category"])
print("Total Dataset Size:", len(df))  # Should print 2500

# =========================================================
# PART 2: TEXT CLEANING
# =========================================================

# Build the stopword lookup once so every clean_text call reuses the same set.
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Normalise a complaint string before vectorisation.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, and tokens found in
    ``stop_words`` are dropped. The remaining tokens are returned joined by
    single spaces (an empty string when nothing survives).
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    kept_tokens = filter(lambda token: token not in stop_words, letters_only.split())
    return " ".join(kept_tokens)


# Cleaned copy of every complaint, stored next to the raw text.
df['cleaned_text'] = df['complaint'].apply(clean_text)

# =========================================================
# PART 3: TRAIN LOGISTIC REGRESSION CLASSIFIER
# =========================================================

y = df['category']

# Split the cleaned text *before* vectorising. Fitting TF-IDF on the whole
# corpus lets the vocabulary and IDF weights see the held-out complaints,
# which leaks test information into training and can inflate the metrics.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(text_train)  # learn vocabulary/IDF on train only
X_test = vectorizer.transform(text_test)        # reuse the train-fitted mapping

# Full-corpus feature matrix in the train-fitted feature space; kept so the
# name X stays available to any code that expects it.
X = vectorizer.transform(df['cleaned_text'])

# class_weight='balanced' reweights classes by inverse frequency in y_train.
clf = LogisticRegression(max_iter=3000, class_weight='balanced')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("\n==== CYBERCRIME CLASSIFICATION METRICS ====")
print("Accuracy:", round(accuracy_score(y_test, y_pred)*100,2), "%")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# =========================================================
# PART 4: CYBERCRIME RATE DATASET (2015-2024)
# =========================================================

# One row per year, in the column order given by rate_columns.
rate_columns = (
    "Year",
    "Internet_Users_Million",
    "Digital_Transactions_Billion",
    "Cybercrime_Cases",
)
yearly_rows = [
    (2015, 320, 10, 300),
    (2016, 350, 15, 420),
    (2017, 380, 20, 550),
    (2018, 410, 30, 700),
    (2019, 450, 50, 950),
    (2020, 500, 80, 1300),
    (2021, 550, 120, 1900),
    (2022, 600, 160, 2600),
    (2023, 650, 210, 3400),
    (2024, 700, 270, 4300),
]

# Transpose the rows into the column-name -> list-of-values mapping.
rate_data = {
    column: list(values)
    for column, values in zip(rate_columns, zip(*yearly_rows))
}

df_rate = pd.DataFrame(rate_data)

# The first three columns are the regressors; the case count is the target.
X_rate = df_rate[['Year','Internet_Users_Million','Digital_Transactions_Billion']]
y_rate = df_rate['Cybercrime_Cases']

# fit() returns the estimator itself, so construction and fitting can be chained.
rate_model = LinearRegression().fit(X_rate, y_rate)

def risk_level(cases):
    """Translate a case count into a risk label.

    Returns "Low Risk" when ``cases`` is below 2000, "Medium Risk" when it is
    below 4000, and "High Risk" otherwise.
    """
    # Exclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((2000, "Low Risk"), (4000, "Medium Risk")):
        if cases < upper_bound:
            return label
    return "High Risk"

# =========================================================
# PART 5: REAL-TIME USER INPUT
# =========================================================

print("\n==== AI-DRIVEN CYBERCRIME REPORTING SYSTEM ====")

# Interactive loop: classify a free-text report, then optionally forecast the
# yearly case count from user-supplied figures. Typing 'exit' leaves the loop.
while True:
    report = input("\nEnter Cybercrime Report (type 'exit' to quit):\n> ")
    # Tolerate stray whitespace around the quit command.
    if report.strip().lower() == 'exit':
        print("Exiting system...")
        break

    cleaned = clean_text(report)
    if not cleaned:
        # Nothing survived cleaning (blank input, digits/punctuation only, or
        # stopwords only). The TF-IDF vector would be all zeros, so whatever
        # label the classifier returned would not reflect the report.
        print("Report is empty after cleaning; please describe the incident in words.")
        continue

    vec = vectorizer.transform([cleaned])
    prediction = clf.predict(vec)[0]
    print(f"Predicted Crime Type: {prediction}")

    # Only the int() conversions are expected to fail on bad input. Catching
    # ValueError alone keeps Ctrl-C / SystemExit working and lets genuine
    # model errors surface instead of being reported as "invalid input".
    try:
        future_year = int(input("Enter year to predict Cybercrime Cases (2015-2025): "))
        future_internet_users = int(input("Enter Internet Users (Million): "))
        future_transactions = int(input("Enter Digital Transactions (Billion): "))
    except ValueError:
        print("Invalid input for prediction.")
        continue

    # Column names must match the ones rate_model was fitted with.
    future_df = pd.DataFrame({
        "Year": [future_year],
        "Internet_Users_Million": [future_internet_users],
        "Digital_Transactions_Billion": [future_transactions]
    })

    # int() truncates the regression output to a whole number of cases.
    predicted_cases = int(rate_model.predict(future_df)[0])
    print(f"Predicted Cybercrime Cases in {future_year}: {predicted_cases}")
    print(f"Risk Level: {risk_level(predicted_cases)}")

# =========================================================
# PART 6: PREDICT 2025 CASES AND VISUALIZATION
# =========================================================

# Single-row feature frame with the inputs used for the 2025 forecast.
future_2025 = pd.DataFrame(
    [[2025, 750, 340]],
    columns=["Year", "Internet_Users_Million", "Digital_Transactions_Billion"],
)

# int() truncates the regression output to a whole number of cases.
pred_2025 = int(rate_model.predict(future_2025)[0])
print(f"\nPredicted Cybercrime Cases in 2025: {pred_2025}")
print(f"Risk Level for 2025: {risk_level(pred_2025)}")

# Append the forecast to the recorded series so both are drawn on one line.
years = df_rate['Year'].tolist()
years.append(2025)
cases = df_rate['Cybercrime_Cases'].tolist()
cases.append(pred_2025)

fig, ax = plt.subplots()
ax.plot(years, cases, marker='o', linestyle='--', color='blue')
ax.set_title("Cybercrime Cases Prediction (2015-2025)")
ax.set_xlabel("Year")
ax.set_ylabel("Cybercrime Cases")
ax.grid(True)
plt.show()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Total Dataset Size: 2500

==== CYBERCRIME CLASSIFICATION METRICS ====
Accuracy: 93.4 %
Confusion Matrix:
 [[ 96   0  15   0   0]
 [  0  88   0   0   0]
 [ 18   0  90   0   0]
 [  0   0   0  93   0]
 [  0   0   0   0 100]]

Classification Report:
                  precision    recall  f1-score   support

  Cyberbullying       0.84      0.86      0.85       111
Financial Fraud       1.00      1.00      1.00        88
 Identity Theft       0.86      0.83      0.85       108
 Malware Attack       1.00      1.00      1.00        93
       Phishing       1.00      1.00      1.00       100

       accuracy                           0.93       500
      macro avg       0.94      0.94      0.94       500
   weighted avg       0.93      0.93      0.93       500


==== AI-DRIVEN CYBERCRIME REPORTING SYSTEM ====
