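## Define Problem Statement

Predict each city's total number of cyber crimes (`Total`) from its per-category crime counts, and report each category's share of that city's crimes.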

## Load Dataset

In [22]:
import pandas as pd

# Read the cyber-crime CSV into a DataFrame; the path is resolved relative to
# the notebook's working directory.
file_path = 'Dataset_CyberCrime_Sean.csv'
data = pd.read_csv(filepath_or_buffer=file_path)


## Explore Dataset

In [23]:
# Summarise column dtypes and non-null counts. DataFrame.info() writes the
# summary itself and returns None, so wrapping it in print() would emit a
# stray "None" line after the summary.
data.info()

# Preview the first few rows of the dataset
print(data.head())

# Count missing values in each column
print(data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   City                         190 non-null    object 
 1   Personal Revenge             190 non-null    float64
 2   Anger                        190 non-null    float64
 3   Fraud                        190 non-null    float64
 4   Extortion                    190 non-null    float64
 5   Causing Disrepute            190 non-null    float64
 6   Prank                        190 non-null    float64
 7   Sexual Exploitation          190 non-null    float64
 8   Disrupt Public Service       190 non-null    float64
 9   Sale purchase illegal drugs  190 non-null    float64
 10  Developing own business      190 non-null    float64
 11  Spreading Piracy             190 non-null    float64
 12  Psycho or Pervert            190 non-null    float64
 13  Steal Information   

## Data Cleaning

In [31]:
# Rows without the target 'Total' cannot be used, so they are discarded first;
# every gap that remains in the other columns is then replaced with 0.
data = data.dropna(subset=['Total']).fillna(0)

# Confirm that no missing values are left in any column
print(data.isnull().sum())


City                           0
Personal Revenge               0
Anger                          0
Fraud                          0
Extortion                      0
Causing Disrepute              0
Prank                          0
Sexual Exploitation            0
Disrupt Public Service         0
Sale purchase illegal drugs    0
Developing own business        0
Spreading Piracy               0
Psycho or Pervert              0
Steal Information              0
Abetment to Suicide            0
Others                         0
Total                          0
dtype: int64


## Feature Engineering

In [25]:
from sklearn.preprocessing import StandardScaler

# Target: the 'Total' column. Features: every column except the city label
# and the target itself.
y = data['Total']
X = data.drop(columns=['City', 'Total'])

# Standardize the features to zero mean / unit variance.
# Note: this fit uses every row of `data`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


## Split Data

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features first (80% train / 20% test, fixed seed).
# The row partition depends only on the number of rows and random_state, so it
# is the same partition that splitting pre-scaled features would produce.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
# Fitting on all rows before splitting would let test-set statistics leak
# into preprocessing and bias the evaluation.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


## Select Models

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Candidate regressors. Both tree-based models get a fixed random_state so
# that re-running the notebook reproduces the same metrics (a decision tree
# permutes features at each split, so it is not deterministic without a seed).
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(max_depth=20, min_samples_split=2, n_estimators=200, random_state=42)
}

# Fit every candidate on the training split and score it on the held-out split
model_performance = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate performance metrics on the test split
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    model_performance[model_name] = {
        "Mean Squared Error": mse,
        "R-squared": r2
    }

# Display performance of all models
for model_name, metrics in model_performance.items():
    print(f"{model_name}:")
    print(f"  Mean Squared Error: {metrics['Mean Squared Error']:.2f}")
    print(f"  R-squared: {metrics['R-squared']:.2f}")
    print()

# Continue with the Random Forest.
# NOTE(review): this choice is hard-coded, not derived from
# `model_performance`. Compare it against the metrics printed above (lowest
# Mean Squared Error / highest R-squared) before relying on it.
best_model_name = "Random Forest"
best_model = models[best_model_name]


Linear Regression:
  Mean Squared Error: 307.61
  R-squared: 1.00

Decision Tree:
  Mean Squared Error: 1227228.97
  R-squared: 0.98

Random Forest:
  Mean Squared Error: 587787.83
  R-squared: 0.99



## Train Model

In [28]:
# Refit the selected estimator on the training data. `best_model` is used
# explicitly: the bare name `model` is only whatever the comparison loop
# happened to bind last, which silently depends on dict ordering.
best_model.fit(X_train, y_train)


## Evaluate Model

In [29]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the held-out test split with the selected estimator.
# `best_model` is referenced explicitly rather than the leftover loop
# variable `model`, so this cell evaluates the model that was actually chosen.
y_pred = best_model.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")


Mean Squared Error: 587787.83
R-squared: 0.99


## Make Predictions

In [33]:
import joblib

# Persist the selected estimator and the fitted scaler so predictions can be
# made later without retraining.
model_filename = "best_random_forest_model.pkl"
scaler_filename = "scaler.pkl"
# Dump `best_model` explicitly; the bare name `model` is only the variable
# left over from the model-comparison loop.
joblib.dump(best_model, model_filename)
joblib.dump(scaler, scaler_filename)


def load_model_and_predict_city(city_name=None):
    """Predict the total crimes for one city and print its category breakdown.

    Loads the saved model and scaler from ``model_filename`` /
    ``scaler_filename``, looks the city up in the module-level ``data`` frame,
    and prints the predicted total together with each category's percentage
    share of that city's category counts ('Others' is listed last).

    Parameters
    ----------
    city_name : str, optional
        City to look up in ``data['City']``. When omitted, the name is read
        interactively with ``input()``. Matching ignores letter case and
        surrounding whitespace.

    Returns
    -------
    None
        All results are printed. Returns early (after printing a message)
        when the city is not found.
    """
    # joblib.load unpickles its input; only load files written by this notebook.
    loaded_model = joblib.load(model_filename)
    loaded_scaler = joblib.load(scaler_filename)

    # Prompt only when the caller did not supply a name
    if city_name is None:
        city_name = input("Enter the city name: ")
    city_name = city_name.strip()

    # Normalise both sides so stray whitespace or a different letter case in
    # the typed name does not cause a spurious "not found".
    normalized_cities = data['City'].astype(str).str.strip().str.casefold()
    city_data = data[normalized_cities == city_name.casefold()]

    if city_data.empty:
        print(f"City '{city_name}' not found in the dataset.")
        return

    # Report the name as it is spelled in the dataset
    display_name = city_data['City'].iloc[0]

    city_features = city_data.drop(['City', 'Total'], axis=1)
    scaled_city_features = loaded_scaler.transform(city_features)

    # Predict total crimes; if several rows match, the first one is used for
    # both the prediction and the breakdown.
    predicted_total_crimes = loaded_model.predict(scaled_city_features)[0]
    total_category_crimes = city_features.iloc[0]
    category_sum = total_category_crimes.sum()

    # Round to the nearest whole crime instead of truncating toward zero
    rounded_prediction = int(round(float(predicted_total_crimes)))
    print(f"\nPredicted Total Crimes for {display_name}: {rounded_prediction}")

    # Without any recorded crimes the shares are undefined (0 / 0 -> NaN)
    if category_sum == 0:
        print("\nNo crimes recorded in any category; percentage breakdown unavailable.")
        return

    category_percentages = (total_category_crimes / category_sum) * 100

    # Sort categories by percentage in descending order
    sorted_categories = category_percentages.sort_values(ascending=False)

    print("\nPercentage Breakdown of Crimes by Category:")

    for category, percentage in sorted_categories.items():
        if percentage > 0 and category != 'Others':
            print(f"  {category}: {percentage:.2f}%")

    # 'Others' is always shown last, regardless of its size
    if 'Others' in sorted_categories and sorted_categories['Others'] > 0:
        print(f"  Others: {sorted_categories['Others']:.2f}%")

# Call the function to get predictions for a city
load_model_and_predict_city()


Enter the city name:  Odisha



Predicted Total Crimes for Odisha: 1574

Percentage Breakdown of Crimes by Category:
  Fraud: 64.99%
  Extortion: 6.98%
  Spreading Piracy: 5.26%
  Sexual Exploitation: 1.71%
  Developing own business: 1.59%
  Personal Revenge: 1.35%
  Causing Disrepute: 0.49%
  Steal Information: 0.12%
  Others: 17.50%
