In [26]:
# pip install faker

In [25]:
# pip install pycountry

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1


In [27]:
from faker import Faker
import random
import datetime
import csv
import pycountry

In [78]:
# Seed both sources of randomness (the stdlib `random` module and Faker) so
# that a fresh "Restart Kernel -> Run All" regenerates exactly the same data.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Shared Faker instance used by every generator function below.
fake = Faker()

def generate_users(num_users):
    """Build ``num_users`` synthetic user records with deliberately messy fields.

    Each record is a dict with the keys ``user_id`` (1-based), ``username``,
    ``email``, ``first_name``, ``last_name``, ``created_at``, ``department``
    and ``role``.

    Intentional irregularities:
      * roughly 10% of users have an empty ``last_name``;
      * both name parts are randomly upper-cased, lower-cased, or left as
        Faker produced them;
      * ``created_at`` is a date-only string rendered in one of four layouts
        picked at random per user.

    Args:
        num_users: Number of records to generate.

    Returns:
        A list of user dicts, in ``user_id`` order.
    """
    date_patterns = ('%m/%d/%Y', '%m-%d-%Y', '%Y/%m/%d', '%Y-%m-%d')  # date-only layouts
    case_transforms = {'upper': str.upper, 'lower': str.lower}
    records = []

    for idx in range(num_users):
        given = fake.first_name()
        # The probability draw happens first; a surname is only generated
        # when the draw succeeds.
        if random.random() < 0.9:
            family = fake.last_name()
        else:
            family = ""

        # 'mixed' has no entry in the table, so the names are left untouched.
        transform = case_transforms.get(random.choice(['upper', 'lower', 'mixed']))
        if transform is not None:
            given = transform(given)
            family = transform(family)

        # Keep only the calendar date, then render it with a random layout.
        signup_day = fake.date_time_between(start_date='-5y', end_date='now').date()
        created_at = signup_day.strftime(random.choice(date_patterns))

        records.append({
            'user_id': idx + 1,
            'username': fake.user_name(),
            'email': fake.email(),
            'first_name': given,
            'last_name': family,
            'created_at': created_at,
            'department': random.choice(['IT', 'Sales', 'Marketing', 'Engineering', 'HR']),
            'role': random.choice(['Employee', 'Manager', 'Analyst', 'Admin'])
        })
    return records

def generate_devices(users):
    """Create between one and three synthetic devices for every user.

    Each device is a dict with the keys ``device_id`` (1-based, unique across
    all users), ``user_id`` (copied from the owning user), ``device_name``,
    ``ip_address``, ``mac_address``, ``os``, ``model`` and ``region``.

    ``device_name`` is "<type> <model> <number>"; the model part is omitted
    about 30% of the time, in which case the name is "<type> <number>".

    Args:
        users: Iterable of user dicts; only ``user['user_id']`` is read.

    Returns:
        A list of device dicts, grouped by user in input order.
    """
    devices = []
    device_types = ["Laptop", "Desktop", "Tablet", "Phone", "Server", "Router", "Firewall"]
    device_models = ["Pro", "Air", "Plus", "X", "Z", "Gamer", "Workstation"]  # Example models
    for user in users:
        num_devices = random.randint(1, 3)
        for _ in range(num_devices):
            device_type = random.choice(device_types)
            # 70% chance that the name includes a model word.
            device_model = random.choice(device_models) if random.random() < 0.7 else ""
            device_number = random.randint(100, 999)
            # Join only the non-empty parts: str.strip() trims the ends only,
            # so an empty model used to leave a double space inside the name.
            name_parts = (device_type, device_model, str(device_number))
            device_name = " ".join(part for part in name_parts if part)
            device = {
                'device_id': len(devices) + 1,
                'user_id': user['user_id'],
                'device_name': device_name,
                'ip_address': fake.ipv4_private(),
                'mac_address': fake.mac_address(),
                'os': random.choice(['Windows', 'macOS', 'Linux', 'Chrome OS']),
                # Drawn independently of device_type, so the two may disagree.
                'model': random.choice(['Laptop', 'Desktop', 'Mobile']),
                'region': fake.city()
            }
            devices.append(device)
    return devices

def generate_events(users, devices, num_events):
    """Generate ``num_events`` synthetic security/activity events.

    Each event is a dict with the keys ``event_id`` (1-based), ``user_id``,
    ``device_id``, ``event_type``, ``timestamp`` (a datetime object, not a
    formatted string), ``details``, ``location``, ``bytes_transferred``,
    ``severity``, ``source_ip`` and ``destination_ip`` (may be ``None``).

    ``details`` is a human-readable sentence whose wording depends on
    ``event_type``. ``malware_detected`` events always get severity 'High'.
    ``network_connection`` events always carry a destination IP.

    Args:
        users: Non-empty sequence of user dicts (``user_id`` and ``username``
            are read).
        devices: Non-empty sequence of device dicts (``device_id`` and
            ``device_name`` are read).
        num_events: Number of events to generate.

    Returns:
        A list of event dicts, in ``event_id`` order.
    """
    events = []
    event_types = ['login', 'logout', 'file_access', 'malware_detected', 'network_connection', 'software_update', 'vpn_connection']
    file_extensions = ['.txt', '.pdf', '.docx', '.exe', '.zip', '.log', '.csv']
    malware_names = ['Trojan', 'Virus', 'Ransomware', 'Spyware', 'Adware', 'Keylogger', 'Rootkit']

    for i in range(num_events):
        # NOTE(review): user and device are drawn independently, so the device
        # may belong to a different user than the one on the event - confirm
        # this is an intended anomaly.
        user = random.choice(users)
        device = random.choice(devices)
        event = {
            'event_id': i + 1,
            'user_id': user['user_id'],
            'device_id': device['device_id'],
            'event_type': random.choice(event_types),
            'timestamp': fake.date_time_between(start_date='-1y', end_date='now'),
            'details': "",
            'location': fake.city(),
            # 80% small transfers, 20% large ones.
            'bytes_transferred': random.randint(0, 100000) if random.random() < 0.8 else random.randint(100000, 100000000),
            'severity': random.choice(['Low', 'Medium', 'High']),
            'source_ip': fake.ipv4_public(),
            'destination_ip': fake.ipv4_public() if random.random() < 0.5 else None
        }

        event_type = event['event_type']
        if event_type == 'login':
            event['details'] = f"User {user['username']} logged in from {event['location']} using device {device['device_name']}"
        elif event_type == 'logout':
            event['details'] = f"User {user['username']} logged out from {event['location']} using device {device['device_name']}"
        elif event_type == 'file_access':
            # Faker inserts the '.' between name and extension itself, so the
            # leading dot must be dropped to avoid names like "report..pdf".
            extension = random.choice(file_extensions).lstrip('.')
            file_name = fake.file_name(extension=extension)
            event['details'] = f"User {user['username']} accessed file: {file_name} on device {device['device_name']}"
        elif event_type == 'malware_detected':
            malware_name = random.choice(malware_names)
            event['details'] = f"{malware_name} detected on device {device['device_name']} in {event['location']}"
            event['severity'] = 'High'  # Malware is usually high severity
        elif event_type == 'network_connection':
            connection_type = random.choice(['Incoming', 'Outgoing'])
            port = random.randint(1024, 65535)  # Non-privileged port range
            # A connection needs a peer: backfill the destination when the
            # 50/50 draw above left it empty, otherwise the details would
            # read "connection to None:<port>".
            if event['destination_ip'] is None:
                event['destination_ip'] = fake.ipv4_public()
            event['details'] = f"{connection_type} connection to {event['destination_ip']}:{port} from {event['source_ip']}"
        elif event_type == 'software_update':
            software_name = random.choice(['Operating System', 'Antivirus', 'Web Browser'])
            version = f"{random.randint(0, 5)}.{random.randint(0, 10)}.{random.randint(0, 20)}"
            event['details'] = f"{software_name} updated to version {version} on device {device['device_name']}"
        elif event_type == 'vpn_connection':
            vpn_location = fake.city()
            event['details'] = f"User {user['username']} connected to VPN server in {vpn_location}"
        events.append(event)
    return events

def write_to_csv(data, filename, header, encoding='utf-8'):
    """Write a list of dict rows to ``filename`` as CSV with a header line.

    Args:
        data: Iterable of dicts; each dict's keys must be listed in ``header``
            (``csv.DictWriter`` raises ``ValueError`` on unknown keys).
        filename: Destination path; an existing file is overwritten.
        header: Column names, in the order they should appear in the file.
        encoding: Text encoding of the output file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding=encoding) as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header)
        writer.writeheader()
        writer.writerows(data)

In [79]:
# Column order for each output file.
users_header = ['user_id', 'username', 'email', 'first_name', 'last_name', 'created_at', 'department', 'role']
devices_header = ['device_id', 'user_id', 'device_name', 'ip_address', 'mac_address', 'os', 'model', 'region']
events_header = ['event_id', 'user_id', 'device_id', 'event_type', 'timestamp', 'details', 'location', 'bytes_transferred', 'severity', 'source_ip', 'destination_ip']

# Generate in dependency order: devices are built from users, and events
# reference both users and devices.
num_users = 500
users_data = generate_users(num_users)

num_devices = 2000  # not passed to generate_devices, which takes only the users
devices_data = generate_devices(users_data)

num_events = 100000
events_data = generate_events(users_data, devices_data, num_events)

# Persist each dataset to its own CSV file in the working directory.
write_to_csv(data=users_data, filename='users.csv', header=users_header)
write_to_csv(data=devices_data, filename='devices.csv', header=devices_header)
write_to_csv(data=events_data, filename='events.csv', header=events_header)

print(f"{num_events} events generated with anomalies.")


100000 events generated with anomalies.


In [80]:
# Example: collect every device record that belongs to a single user.
user_id_to_find = 1
user_devices = list(filter(lambda d: d['user_id'] == user_id_to_find, devices_data))
print(f"\nDevices for user {user_id_to_find}:")
print(user_devices)


Devices for user 1:
[{'device_id': 1, 'user_id': 1, 'device_name': 'Laptop Plus 608', 'ip_address': '192.168.13.228', 'mac_address': 'd6:2a:bf:bc:fc:ed', 'os': 'macOS', 'model': 'Laptop', 'region': 'Timothytown'}, {'device_id': 2, 'user_id': 1, 'device_name': 'Firewall Air 180', 'ip_address': '192.168.189.29', 'mac_address': '78:80:9a:7f:54:0c', 'os': 'Linux', 'model': 'Laptop', 'region': 'Klinebury'}]
