<a href="https://colab.research.google.com/github/lpon3692-svg/AI-DRIVEN-CYBER-CRIME-REPORTING-AND-CLASSIFICATION/blob/main/Untitled14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

nltk.download('stopwords')
from nltk.corpus import stopwords

# =========================================================
# PART 1: LOAD YOUR DATASET
# =========================================================

df = pd.read_csv("Enhanced_Cybercrime_Dataset.csv")

print("Dataset Loaded Successfully!")
print("Total Records:", len(df))
print(df.head())

# Ensure correct column names
# (Make sure your dataset has these columns)
# Complaint_Description, Crime_Category

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', str(text))
    text = text.lower()
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df['cleaned_text'] = df['Complaint_Description'].apply(clean_text)

# =========================================================
# PART 2: TRAIN CLASSIFICATION MODEL
# =========================================================

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['Crime_Category']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = LogisticRegression(max_iter=2000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("\n==== CYBERCRIME CLASSIFICATION METRICS ====")
print("Accuracy:", round(accuracy * 100,2), "%\n")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), "\n")
print("Classification Report:\n", classification_report(y_test, y_pred))

# =========================================================
# PART 3: CYBERCRIME RATE PREDICTION MODEL
# =========================================================

rate_data = {
    "Year": [2015,2016,2017,2018,2019,2020,2021,2022,2023,2024],
    "Internet_Users_Million": [320,350,380,410,450,500,550,600,650,700],
    "Digital_Transactions_Billion": [10,15,20,30,50,80,120,160,210,270],
    "Cybercrime_Cases": [300,420,550,700,950,1300,1900,2600,3400,4300]
}

df_rate = pd.DataFrame(rate_data)

X_rate = df_rate[['Year','Internet_Users_Million','Digital_Transactions_Billion']]
y_rate = df_rate['Cybercrime_Cases']

rate_model = LinearRegression()
rate_model.fit(X_rate, y_rate)

def risk_level(cases):
    if cases < 2000:
        return "Low Risk"
    elif cases < 4000:
        return "Medium Risk"
    else:
        return "High Risk"

# =========================================================
# PART 4: REAL-TIME USER INPUT
# =========================================================

print("\n==== AI-DRIVEN CYBERCRIME REPORTING & PREDICTION SYSTEM ====")

while True:
    report = input("\nEnter Cybercrime Report (or type 'exit' to quit):\n> ")
    if report.lower() == 'exit':
        print("Exiting system...")
        break

    cleaned = clean_text(report)
    vec = vectorizer.transform([cleaned])
    prediction = clf.predict(vec)[0]

    print(f"\nPredicted Crime Type: {prediction}")

    try:
        future_year = int(input("Enter year to predict Cybercrime Cases (2015-2025): "))
        future_internet_users = int(input("Enter Internet Users (Million): "))
        future_transactions = int(input("Enter Digital Transactions (Billion): "))

        future_df = pd.DataFrame({
            "Year": [future_year],
            "Internet_Users_Million": [future_internet_users],
            "Digital_Transactions_Billion": [future_transactions]
        })

        predicted_cases = int(rate_model.predict(future_df)[0])
        print(f"Predicted Cybercrime Cases in {future_year}: {predicted_cases}")
        print(f"Risk Level: {risk_level(predicted_cases)}")

    except:
        print("Invalid input for prediction.")

# =========================================================
# PART 5: PREDICT 2025 & VISUALIZATION
# =========================================================

future_2025 = pd.DataFrame({
    "Year": [2025],
    "Internet_Users_Million": [750],
    "Digital_Transactions_Billion": [340]
})

pred_2025 = int(rate_model.predict(future_2025)[0])

print(f"\nPredicted Cybercrime Cases in 2025: {pred_2025}")
print(f"Risk Level for 2025: {risk_level(pred_2025)}")

years = df_rate['Year'].tolist() + [2025]
cases = df_rate['Cybercrime_Cases'].tolist() + [pred_2025]

plt.figure()
plt.plot(years, cases, marker='o', linestyle='--')
plt.xlabel("Year")
plt.ylabel("Cybercrime Cases")
plt.title("Cybercrime Cases Prediction (2015-2025)")
plt.grid(True)
plt.show()


Dataset Loaded Successfully!
Total Records: 2000
                                           Complaint             Category  \
0  I entered my login credentials on a website th...  Phishing & Spoofing   
1  Got a text message about my IRCTC account bein...  Phishing & Spoofing   
2  Received a fake email from HDFC asking for my ...  Phishing & Spoofing   
3  Received a WhatsApp message claiming I won a G...  Phishing & Spoofing   
4  A caller impersonating Amazon customer care as...  Phishing & Spoofing   

                                      Recommendation  
0  [Incident REF-32019] Change your login credent...  
1  [Incident REF-66103] Change your login credent...  
2  [Incident REF-32262] Change your login credent...  
3  [Incident REF-83367] Change your login credent...  
4  [Incident REF-95247] Change your login credent...  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyError: 'Complaint_Description'