<a href="https://colab.research.google.com/github/naman3309/MajorProject/blob/colab-v2/CYBER_VIGILANTE_v2_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from getpass import getpass


def ensure_env_credential(name, prompt, reader=getpass):
    """Return environment variable ``name``, asking the user for it when unset.

    Parameters
    ----------
    name : str
        Environment variable to look up (and to fill in when it is missing
        or empty).
    prompt : str
        Text shown to the user when a value has to be requested.
    reader : callable, optional
        One-argument callable that receives ``prompt`` and returns the value.
        Defaults to ``getpass`` so secrets are not echoed; pass ``input`` for
        non-secret values.

    Returns
    -------
    str
        The value now stored in ``os.environ[name]``.
    """
    value = os.environ.get(name)
    if not value:
        value = reader(prompt)
        os.environ[name] = value
    return value


# Credentials must never be written into the notebook: it is shared publicly,
# so anything typed here ends up in version-control history.
# NOTE(review): the App Password that used to be hard-coded in this cell has
# already been published and should be revoked and regenerated.
#
# Values already present in the environment are reused; otherwise the user is
# prompted. The __main__ guard keeps the prompts from firing when this code is
# imported rather than run as the notebook/script itself.
if __name__ == "__main__":
    ensure_env_credential("GMAIL_ADDRESS", "Gmail address: ", reader=input)
    ensure_env_credential("GMAIL_APP_PASSWORD", "Gmail App Password: ")


In [3]:
import pandas as pd
import random
from datetime import datetime, timedelta
from sklearn.ensemble import IsolationForest

In [4]:
# Activity types a synthetic log entry can have.
# The order matters: entries are picked by position via random.choice.
activities = [
    'login',
    'file_access',
    'data_download',
    'logout',
    'restricted_access',
]

# Random private IPv4 address generator

def gen_ip():
    """Return a random dotted-quad IPv4 address from a private address block.

    One of the blocks 10.x.x.x, 172.16-31.x.x or 192.168.x.x is chosen with
    equal probability; every octet that is not fixed by the block is drawn
    uniformly from its allowed (inclusive) range.

    Returns
    -------
    str
        Address formatted as ``"a.b.c.d"``.
    """
    # (first octet, inclusive bounds of the second octet).
    # None marks the block whose second octet is fixed at 168; no random
    # number is drawn for it.
    blocks = [
        (10, (0, 255)),    # 10.0.0.0 – 10.255.255.255
        (172, (16, 31)),   # 172.16.0.0 – 172.31.255.255
        (192, None),       # 192.168.0.0 – 192.168.255.255
    ]
    leading, second_bounds = random.choice(blocks)
    second = 168 if second_bounds is None else random.randint(*second_bounds)

    # The last two octets are unconstrained in all three blocks.
    trailing = [random.randint(0, 255) for _ in range(2)]
    return ".".join(str(octet) for octet in (leading, second, *trailing))

# Build the synthetic activity log: one record per user, User1 .. User1000
def _random_event(i):
    """Return one log record (dict) for the user numbered ``i + 1``.

    The activity is drawn from ``activities``, the address comes from
    ``gen_ip()``, and the timestamp lies 0-10000 minutes (inclusive) before
    the moment the record is created.
    """
    picked_activity = random.choice(activities)
    source_ip = gen_ip()
    occurred_at = datetime.now() - timedelta(minutes=random.randint(0, 10000))
    return {
        'user_id': f'User{i+1}',
        'activity': picked_activity,
        'timestamp': occurred_at,
        'ip_addresses': source_ip,
    }


data = [_random_event(i) for i in range(1000)]

# Assemble the records into a DataFrame with a datetime 'timestamp' column
df = pd.DataFrame(data).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

# Preview the generated dataset
print("Synthetic Data:")
print(df.head())


Synthetic Data:
  user_id           activity                  timestamp    ip_addresses
0   User1        file_access 2024-12-30 15:08:22.159813    10.163.21.78
1   User2              login 2024-12-25 14:34:22.159840  192.168.157.52
2   User3              login 2024-12-30 13:09:22.159858    10.61.96.178
3   User4  restricted_access 2024-12-27 12:28:22.159888    192.168.2.30
4   User5      data_download 2024-12-29 13:55:22.159904  172.29.199.158


In [5]:
# Derive calendar features from each event's timestamp:
# hour of day and day of week (pandas numbering: Monday = 0).
event_time = df['timestamp'].dt
df['hour'], df['day_of_week'] = event_time.hour, event_time.dayofweek

# Preview the frame with the two new feature columns
print(" \n Preprocessed Data:")
print(df.head())


 
 Preprocessed Data:
  user_id           activity                  timestamp    ip_addresses  hour  \
0   User1        file_access 2024-12-30 15:08:22.159813    10.163.21.78    15   
1   User2              login 2024-12-25 14:34:22.159840  192.168.157.52    14   
2   User3              login 2024-12-30 13:09:22.159858    10.61.96.178    13   
3   User4  restricted_access 2024-12-27 12:28:22.159888    192.168.2.30    12   
4   User5      data_download 2024-12-29 13:55:22.159904  172.29.199.158    13   

   day_of_week  
0            0  
1            2  
2            0  
3            4  
4            6  


In [6]:
# Select the model inputs: hour of day and day of week of each event.
features = df[['hour', 'day_of_week']]

# Isolation Forest anomaly detection.
# The contamination (expected fraction of anomalies) is drawn at random
# between 0.01 and 0.28. The lower bound is 1, not 0, because scikit-learn
# only accepts contamination in the half-open interval (0, 0.5]; a draw of 0
# would make the estimator raise instead of fitting.
cont = random.randint(1, 28) / 100
print(cont)
model = IsolationForest(contamination=cont, random_state=42)

# fit_predict labels each row -1 (anomaly) or 1 (normal).
df['anomaly'] = model.fit_predict(features)

# Human-readable flag derived from the numeric label.
df['anomaly_flag'] = df['anomaly'].eq(-1).map({True: 'Anomaly', False: 'Normal'})
count = df['anomaly_flag'].value_counts()

# Display results with anomaly flags, followed by the Normal/Anomaly tally.
print("\nAnomaly Detection Results:")
print(df[['user_id', 'activity', 'hour', 'day_of_week', 'anomaly_flag']])
print(count)


0.18

Anomaly Detection Results:
      user_id           activity  hour  day_of_week anomaly_flag
0       User1        file_access    15            0       Normal
1       User2              login    14            2       Normal
2       User3              login    13            0       Normal
3       User4  restricted_access    12            4       Normal
4       User5      data_download    13            6       Normal
..        ...                ...   ...          ...          ...
995   User996              login    18            4       Normal
996   User997  restricted_access    10            6       Normal
997   User998      data_download     8            1       Normal
998   User999        file_access    15            3       Normal
999  User1000      data_download     7            0       Normal

[1000 rows x 5 columns]
anomaly_flag
Normal     820
Anomaly    180
Name: count, dtype: int64


In [7]:
# Weight contributed by each activity type to a user's total risk score
activity_risk = {
    'login': 1,
    'file_access': 2,
    'data_download': 3,
    'restricted_access': 5,
    'logout': 0
}

# Per-event score: look up the weight of the event's activity
df['risk_score'] = df['activity'].map(activity_risk)

# Per-user total: sum the event scores of each user_id
user_risk = (
    df.groupby('user_id')['risk_score']
      .sum()
      .rename('total_risk_score')
      .reset_index()
)

# Show the per-user totals
print("\nUser Risk Scores:")
print(user_risk)



User Risk Scores:
      user_id  total_risk_score
0       User1                 2
1      User10                 1
2     User100                 5
3    User1000                 3
4     User101                 0
..        ...               ...
995   User995                 0
996   User996                 1
997   User997                 5
998   User998                 3
999   User999                 2

[1000 rows x 2 columns]


In [17]:
from scipy.stats import zscore

# Standardise the two time features so they can be combined on one scale
for source_col, z_col in (('hour', 'hour_zscore'), ('day_of_week', 'day_zscore')):
    df[z_col] = zscore(df[source_col])

# Event risk = sum of the absolute z-scores.
# This replaces whatever the 'risk_score' column held before this cell ran.
df['risk_score'] = df['hour_zscore'].abs() + df['day_zscore'].abs()

# Data-driven cut-offs: "High" starts two standard deviations above the mean
# risk score, "Suspicious" starts at 90% of that value.
risk_mean = df['risk_score'].mean()
risk_std = df['risk_score'].std()
threshold = risk_mean + 2 * risk_std
suspicious_threshold = 0.9 * threshold

# Map a single risk score to a risk level using the module-level cut-offs
def classify_risk(score):
    """Return 'High', 'Suspicious' or 'Low' for one risk score.

    Reads the globals ``threshold`` and ``suspicious_threshold`` at call time:
    a score strictly above ``threshold`` is 'High', otherwise a score strictly
    above ``suspicious_threshold`` is 'Suspicious', and anything else is 'Low'.
    """
    if score > threshold:
        return 'High'
    return 'Suspicious' if score > suspicious_threshold else 'Low'

# Label every event with its risk level
df['risk_level'] = df['risk_score'].apply(classify_risk)

# Split out the two groups of interest and count them
high_risk_events = df[df['risk_level'] == 'High']
suspicious_events = df[df['risk_level'] == 'Suspicious']
high_risk_count = len(high_risk_events)
suspicious_count = len(suspicious_events)

# Columns shown in both report tables
report_columns = ['user_id', 'activity', 'timestamp', 'ip_addresses', 'risk_score', 'risk_level']

# Report: high-risk events, suspicious events, then the cut-offs used
print("\nHigh-Risk Events Detected:")
print(high_risk_events[report_columns])
print(f"\nTotal High-Risk Events: {high_risk_count}")

print("\nSuspicious Events Detected:")
print(suspicious_events[report_columns])
print(f"\nTotal Suspicious Events: {suspicious_count}")

print(f"\nDynamic Risk Threshold (Z-score based): {threshold}")
print(f"Suspicious Threshold (10% below threshold): {suspicious_threshold}")

# Keep the high-risk cut-off under a second name
zs = threshold

31      User32
34      User35
282    User283
305    User306
479    User480
497    User498
665    User666
681    User682
724    User725
781    User782
832    User833
835    User836
858    User859
939    User940
951    User952
Name: user_id, dtype: object

High-Risk Events Detected:
     user_id           activity                  timestamp     ip_addresses  \
31    User32              login 2024-12-29 00:42:22.160325   192.168.51.135   
34    User35  restricted_access 2024-12-29 00:02:22.160370  192.168.195.234   
282  User283      data_download 2024-12-29 00:24:22.167052    172.24.15.125   
305  User306      data_download 2024-12-29 00:18:22.167406  192.168.164.214   
479  User480  restricted_access 2024-12-30 00:30:22.170346     10.114.25.69   
497  User498  restricted_access 2024-12-29 00:29:22.170626      10.43.6.144   
665  User666  restricted_access 2024-12-30 00:25:22.173227   192.168.38.209   
681  User682             logout 2024-12-30 00:21:22.173461    172.24.79.199   
724  Us

In [34]:
import plotly.graph_objects as go
import numpy as np
import pandas as pd

def plot_scrollable_user_risk(events_df, threshold, tt):
    """Display an interactive Plotly chart of risk scores with two cut-off lines.

    Parameters
    ----------
    events_df
        Table indexed by column name; its 'user_id' column supplies the x
        values and its 'risk_score' column the y values.
    threshold
        Numeric level drawn as the red dashed "At Risk" line.
    tt
        Numeric level drawn as the cyan dashed "Suspicious" line.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    user_ids = events_df['user_id']
    n_points = len(events_df)

    # Risk score of every event, as a thin blue line with small markers
    traces = [
        go.Scatter(
            x=user_ids,
            y=events_df['risk_score'],
            mode='lines+markers',
            name='User Risk Score',
            line=dict(color='blue', width=1),
            marker=dict(size=3, color='blue'),
        )
    ]

    # Horizontal reference lines: one constant y value repeated per x position
    reference_lines = (
        (threshold, f'At Risk ({threshold:.2f})', 'red'),
        (tt, f'Suspicious ({tt:.2f})', 'cyan'),
    )
    for level, label, colour in reference_lines:
        traces.append(
            go.Scatter(
                x=user_ids,
                y=[level] * n_points,
                mode='lines',
                name=label,
                line=dict(color=colour, dash='dash'),
            )
        )

    fig = go.Figure()
    for trace in traces:
        fig.add_trace(trace)

    # Layout: categorical x axis with a range slider so the chart can be scrolled
    fig.update_layout(
        title='User v/s Risk Scores ',
        xaxis_title='User ID',
        yaxis_title='Risk Score',
        xaxis=dict(
            tickangle=90,
            rangeslider=dict(visible=True),
            type='category',
        ),
        # Leave headroom of 2 above the largest score
        yaxis=dict(range=[0, events_df['risk_score'].max() + 2]),
        legend=dict(x=0.02, y=1.1),
        hovermode="x unified",
        height=500,
    )

    fig.show()

# Cut-offs for the chart: "At Risk" is mean + 2 standard deviations of the
# risk score, "Suspicious" is 90% of that level.
risk_mean, risk_std = df['risk_score'].mean(), df['risk_score'].std()
threshold = risk_mean + 2 * risk_std
tt = 0.9 * threshold

# Draw the scrollable chart for all events
plot_scrollable_user_risk(events_df=df, threshold=threshold, tt=tt)
