<a href="https://colab.research.google.com/github/lpon3692-svg/GI/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import random
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

nltk.download('stopwords')
from nltk.corpus import stopwords

# =========================================================
# PART 1: CYBERCRIME REPORT DATASET
# =========================================================

# Template complaints per crime category. The synthetic corpus below is drawn
# (with replacement) from these five sentences per class.
categories = {
    "Phishing": [
        "Received fake email asking for bank OTP",
        "Scam link sent through SMS",
        "Fraud email claiming lottery win",
        "Fake website collected my login credentials",
        "Suspicious email requesting verification"
    ],
    "Financial Fraud": [
        "Money withdrawn without authorization",
        "Debit card misused for online purchase",
        "Fraud transaction in bank account",
        "Unknown charges on credit card",
        "Online investment scam"
    ],
    "Identity Theft": [
        "Someone used my Aadhaar details",
        "Fake social media account created",
        "Personal details stolen online",
        "Account hacked and impersonated",
        "Email account compromised"
    ],
    "Cyberbullying": [
        "Threatened on social media",
        "Abusive messages repeatedly",
        "Online harassment in chat groups",
        "Fake rumors spread online",
        "Trolling and hate messages"
    ],
    "Malware Attack": [
        "Laptop infected with ransomware",
        "Downloaded file installed spyware",
        "System crashed after clicking link",
        "Virus detected in computer",
        "Mobile phone infected by malware"
    ]
}

# Create synthetic dataset
# A dedicated, seeded generator makes the sampled complaints reproducible from
# run to run (the train/test split further down already pins random_state=42)
# without touching the state of the global `random` module.
DATASET_SEED = 42
SAMPLES_PER_CATEGORY = 100
_rng = random.Random(DATASET_SEED)

# One [complaint, category] row per draw; categories are visited in dict order.
data = [
    [_rng.choice(texts), cat]
    for cat, texts in categories.items()
    for _ in range(SAMPLES_PER_CATEGORY)
]

df = pd.DataFrame(data, columns=["complaint", "category"])

# English stop words collected into a set, so the per-word membership tests
# performed while cleaning complaints are hash lookups rather than list scans.
stop_words = {*stopwords.words('english')}

def clean_text(text, stopword_set=None):
    """Normalize a complaint string before TF-IDF vectorization.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, and stop words are removed.

    Args:
        text: Raw complaint text.
        stopword_set: Optional collection of lower-case words to discard.
            When omitted (``None``) the module-level ``stop_words`` is used,
            which is the original behavior. Pass an explicit collection
            (an empty one disables filtering) to clean text without relying
            on that global.

    Returns:
        The remaining words joined by single spaces; ``""`` if none remain.
    """
    if stopword_set is None:
        # Looked up at call time so the default always reflects the current
        # module-level stop-word set.
        stopword_set = stop_words

    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    return " ".join(
        word for word in letters_only.split() if word not in stopword_set
    )

df['cleaned_text'] = df['complaint'].apply(clean_text)

# =========================================================
# PART 2: TRAIN LOGISTIC REGRESSION CLASSIFIER
# =========================================================

y = df['category']

# Split dataset for accuracy evaluation
# The raw text is split BEFORE the vectorizer is fitted so that the TF-IDF
# vocabulary and IDF weights are learned from training rows only; fitting on
# the whole corpus first would leak test-set statistics into preprocessing.
# stratify=y keeps the class proportions equal in both partitions.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus feature matrix, kept under its original name for any later code
# that still refers to `X`. It is transformed only, never fitted on.
X = vectorizer.transform(df['cleaned_text'])

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate Accuracy
# NOTE(review): complaints are sampled with replacement from five templates
# per class, so test rows are duplicates of training rows and these metrics
# remain optimistic regardless of how the split is made.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\n==== CYBERCRIME CLASSIFICATION METRICS ====")
print("Accuracy:", round(accuracy * 100,2), "%\n")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), "\n")
print("Classification Report:\n", classification_report(y_test, y_pred))

# =========================================================
# PART 3: CYBERCRIME RATE DATASET (2015-2024)
# =========================================================

# Yearly observations laid out one row per year, in the column order below.
_rate_columns = [
    "Year",
    "Internet_Users_Million",
    "Digital_Transactions_Billion",
    "Cybercrime_Cases",
]
_rate_rows = [
    (2015, 320, 10, 300),
    (2016, 350, 15, 420),
    (2017, 380, 20, 550),
    (2018, 410, 30, 700),
    (2019, 450, 50, 950),
    (2020, 500, 80, 1300),
    (2021, 550, 120, 1900),
    (2022, 600, 160, 2600),
    (2023, 650, 210, 3400),
    (2024, 700, 270, 4300),
]

# Transpose the rows into the column-name -> list-of-values mapping.
rate_data = {
    column: list(values)
    for column, values in zip(_rate_columns, zip(*_rate_rows))
}

df_rate = pd.DataFrame(rate_data)

# Every column except the target is a predictor; the target is the case count.
_target_column = "Cybercrime_Cases"
X_rate = df_rate[[name for name in df_rate.columns if name != _target_column]]
y_rate = df_rate[_target_column]

# Least-squares fit of the yearly case counts on the predictor columns.
# fit() returns the estimator itself, so construction and fitting are chained.
rate_model = LinearRegression().fit(X_rate, y_rate)

def risk_level(cases):
    """Map a cybercrime case count to a coarse risk label.

    Counts below 2000 are "Low Risk", counts below 4000 are "Medium Risk",
    and everything else is "High Risk".
    """
    # Exclusive upper bounds, checked in ascending order; first match wins.
    bands = (
        (2000, "Low Risk"),
        (4000, "Medium Risk"),
    )
    for upper_bound, label in bands:
        if cases < upper_bound:
            return label
    return "High Risk"

# =========================================================
# PART 4: REAL-TIME USER INPUT
# =========================================================

print("\n==== AI-DRIVEN CYBERCRIME REPORTING & PREDICTION SYSTEM ====")

# Interactive loop: classify one free-text report, then forecast the case
# count for a user-supplied year / usage figures. Repeats until 'exit'.
while True:
    report = input("\nEnter Cybercrime Report (or type 'exit' to quit):\n> ")
    # Ignore surrounding whitespace so "exit " or " EXIT" also quit.
    if report.strip().lower() == 'exit':
        print("Exiting system...")
        break

    # Classify report
    cleaned = clean_text(report)
    vec = vectorizer.transform([cleaned])
    if vec.nnz:
        prediction = clf.predict(vec)[0]
    else:
        # None of the words are in the fitted vocabulary (or the report was
        # empty), so the feature vector is all zeros and the classifier would
        # return a label that does not depend on the text at all.
        prediction = "Unknown (no recognizable keywords in report)"
    print(f"\nPredicted Crime Type: {prediction}")

    # Predict future cybercrime cases
    try:
        future_year = int(input("Enter year to predict Cybercrime Cases (2015-2025): "))
        future_internet_users = int(input("Enter Internet Users (Million): "))
        future_transactions = int(input("Enter Digital Transactions (Billion): "))
    except ValueError:
        # Only non-integer text is treated as bad input; Ctrl-C, SystemExit and
        # end-of-input are no longer swallowed as they were by a bare except.
        print("Invalid input for prediction.")
    else:
        # Column names and order must match the frame the model was fitted on.
        future_df = pd.DataFrame({
            "Year": [future_year],
            "Internet_Users_Million": [future_internet_users],
            "Digital_Transactions_Billion": [future_transactions]
        })

        predicted_cases = int(rate_model.predict(future_df)[0])
        print(f"Predicted Cybercrime Cases in {future_year}: {predicted_cases}")
        print(f"Risk Level: {risk_level(predicted_cases)}")

# =========================================================
# PART 5: PREDICT 2025 CASES AND VISUALIZATION
# =========================================================

# Predict 2025
# Single-row frame whose columns match, in name and order, the predictors the
# regression model was fitted on.
future_2025 = pd.DataFrame(
    [[2025, 750, 340]],
    columns=["Year", "Internet_Users_Million", "Digital_Transactions_Billion"],
)
pred_2025 = int(rate_model.predict(future_2025)[0])
print(f"\nPredicted Cybercrime Cases in 2025: {pred_2025}")
print(f"Risk Level for 2025: {risk_level(pred_2025)}")

# Visualization 2015-2025
# Observed series with the 2025 forecast appended as the final point.
years = [*df_rate['Year'].tolist(), 2025]
cases = [*df_rate['Cybercrime_Cases'].tolist(), pred_2025]

plt.figure()
plt.plot(years, cases, color='blue', linestyle='--', marker='o')
plt.title("Cybercrime Cases Prediction (2015-2025)")
plt.xlabel("Year")
plt.ylabel("Cybercrime Cases")
plt.grid(True)
plt.show()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



==== CYBERCRIME CLASSIFICATION METRICS ====
Accuracy: 100.0 %

Confusion Matrix:
 [[24  0  0  0  0]
 [ 0 14  0  0  0]
 [ 0  0 10  0  0]
 [ 0  0  0 24  0]
 [ 0  0  0  0 28]] 

Classification Report:
                  precision    recall  f1-score   support

  Cyberbullying       1.00      1.00      1.00        24
Financial Fraud       1.00      1.00      1.00        14
 Identity Theft       1.00      1.00      1.00        10
 Malware Attack       1.00      1.00      1.00        24
       Phishing       1.00      1.00      1.00        28

       accuracy                           1.00       100
      macro avg       1.00      1.00      1.00       100
   weighted avg       1.00      1.00      1.00       100


==== AI-DRIVEN CYBERCRIME REPORTING & PREDICTION SYSTEM ====

Enter Cybercrime Report (or type 'exit' to quit):
> sexual

Predicted Crime Type: Malware Attack
Enter year to predict Cybercrime Cases (2015-2025): 2026
Enter Internet Users (Million): 987
Enter Digital Transactions (Bil