In [54]:
import random
import json
import string
from collections import defaultdict
from faker import Faker
import time
from datetime import datetime, timedelta
import pandas as pd

In [56]:
# Load the subscriber list from a CSV in the working directory. A later cell
# reads its "msisdn" column to build the pool of calling/sending numbers.
df = pd.read_csv("NumPhones.csv")

In [58]:
# French-locale Faker instance (not referenced by the visible cells of this notebook).
fake = Faker('fr_FR')

# Generation parameters
TOTAL_RECORDS = 4000   # total number of records produced per batch
ERROR_RATIO = 0.05     # fraction of TOTAL_RECORDS that gets deliberately corrupted
# Share of each record type in a batch; the sms count is computed as the remainder.
DISTRIBUTION = {
    "voice": 0.6,
    "data": 0.3,
    "sms": 0.1
}
# NOTE: `technologies` is intentionally not defined in this cell. It is defined
# in the following cell next to `cell_ids`; a second, shorter list here was
# always overwritten before use and only obscured which values are in effect.

In [60]:
# Generation window: first and last second covered by generated timestamps.
start_date = datetime(year=2025, month=4, day=1)
end_date = datetime(year=2025, month=4, day=30, hour=23, minute=59, second=59)

# Simulated cell IDs (city names are used as identifiers)
cell_ids = [
    "Casablanca",
    "Fès",
    "Tanger",
    "Marrakech",
    "Meknès",
    "Agadir",
    "Rabat",
    "Oujda",
    "Salé",
    "Nador",
    "Tiznit",
    "Sidi Kacem",
    "Al Hoceïma",
]

# Valid technologies
technologies = [
    "2G",
    "3G",
    "4G",
    "5G",
    "LTE",
]

In [62]:
# Per-phone timestamp of the most recently generated record. A phone that has
# not been seen yet defaults to start_date; the lambda looks start_date up at
# the moment of the first access, not when this cell runs.
last_seen = defaultdict(lambda:start_date)

In [64]:
def get_next_timestamp(phone):
    """Advance and return the event clock of ``phone``.

    The phone's previous timestamp (``start_date`` on first use) is moved
    forward by a random gap of 30 to 172800 seconds (up to two days) and
    capped at ``end_date``. The new value is stored in ``last_seen`` so that
    successive timestamps for the same phone never go backwards.

    Returns:
        str: the new timestamp in ISO-8601 form with a trailing "Z".
    """
    gap = timedelta(seconds=random.randint(30, 172800))
    # Never step past the end of the generation window.
    moment = min(last_seen[phone] + gap, end_date)
    last_seen[phone] = moment
    return f"{moment.isoformat()}Z"

In [66]:
def generate_phone_callee():
    """Return a random 12-digit number string: the prefix "2127" plus 8 digits."""
    subscriber_part = random.randint(10000000, 99999999)
    return f"2127{subscriber_part}"
# Originating numbers come from the "msisdn" column of the loaded CSV.
num_phones = df["msisdn"].tolist()

# Small fixed pool of ten randomly generated destination numbers.
num_phones_callee = []
for _ in range(10):
    num_phones_callee.append(generate_phone_callee())

In [79]:
# NOTE(review): disabled draft kept as a bare string literal; nothing in the
# visible notebook calls generate_timestamp (the record generators use
# get_next_timestamp). If it is ever re-enabled, note that it assigns
# `random_seconds` but reads `random_second`, which would raise NameError.
'''def generate_timestamp():
    delta = end_date - start_date
    random_seconds = random.randint(0,int(delta.total_seconds()))
    return (start_date + timedelta(seconds=random_second)).isoformat() + "Z"'''

'def generate_timestamp():\n    delta = end_date - start_date\n    random_seconds = random.randint(0,int(delta.total_seconds()))\n    return (start_date + timedelta(seconds=random_second)).isoformat() + "Z"'

In [68]:
def generate_voice():
    """Build one synthetic voice-call record.

    The caller is drawn from ``num_phones`` and the callee from
    ``num_phones_callee``; the timestamp advances the caller's own clock.
    Duration is a whole number of seconds between 5 and 600 inclusive.

    Returns:
        dict: keys record_type, timestamp, caller_id, callee_id,
        duration_sec, cell_id, technology (in that order).
    """
    caller = random.choice(num_phones)
    callee = random.choice(num_phones_callee)

    # Fields are filled in the same order as they appear in the emitted record.
    call = {"record_type": "voice"}
    call["timestamp"] = get_next_timestamp(caller)
    call["caller_id"] = caller
    call["callee_id"] = callee
    call["duration_sec"] = random.randint(5, 600)
    call["cell_id"] = random.choice(cell_ids)
    call["technology"] = random.choice(technologies)
    return call

In [70]:
def generate_sms():
    """Build one synthetic SMS record.

    The sender is drawn from ``num_phones`` and the receiver from
    ``num_phones_callee``; the timestamp advances the sender's own clock.

    Returns:
        dict: keys record_type, timestamp, sender_id, receiver_id,
        cell_id, technology (in that order).
    """
    sender = random.choice(num_phones)
    receiver = random.choice(num_phones_callee)
    sent_at = get_next_timestamp(sender)
    serving_cell = random.choice(cell_ids)
    radio = random.choice(technologies)

    return {
        "record_type": "sms",
        "timestamp": sent_at,
        "sender_id": sender,
        "receiver_id": receiver,
        "cell_id": serving_cell,
        "technology": radio,
    }

In [72]:
def generate_data():
    """Build one synthetic data-session record.

    The user is drawn from ``num_phones`` and the timestamp advances that
    user's own clock. Volume is a float in [1, 500] MB rounded to two
    decimals; session duration is drawn from [30, 1000] seconds and rounded
    to an integer.

    Returns:
        dict: keys record_type, timestamp, user_id, data_volume_mb,
        session_duration_sec, cell_id, technology (in that order).
    """
    subscriber = random.choice(num_phones)
    # Keyword arguments are evaluated left to right, so the random draws
    # happen in the same order as the fields are listed.
    return dict(
        record_type="data",
        timestamp=get_next_timestamp(subscriber),
        user_id=subscriber,
        data_volume_mb=round(random.uniform(1, 500), 2),
        session_duration_sec=round(random.uniform(30, 1000)),
        cell_id=random.choice(cell_ids),
        technology=random.choice(technologies),
    )

In [68]:
def corrupt_record(record):
    """Return a corrupted shallow copy of ``record``; the input is left untouched.

    Voice records (``record_type == "voice"``):
      * with probability 1/2 ``duration_sec`` is made negative (if present);
      * then ``callee_id`` is either blanked or gets one random ASCII letter
        inserted at a random position.

    Any other record: either one randomly chosen key is removed, or
    ``record_type`` is replaced by an unsupported service name.

    The function tolerates incomplete input (missing ``record_type``,
    ``duration_sec`` or ``callee_id``, or an empty dict), so it can safely be
    applied to a record that has already been corrupted once.

    Args:
        record (dict): a flat record as produced by the generate_* functions.

    Returns:
        dict: the corrupted copy.
    """
    corrupted = record.copy()

    if corrupted.get("record_type") == "voice":
        # The coin is always tossed so the random sequence does not depend on
        # which fields happen to be present.
        if random.choice([True, False]) and "duration_sec" in corrupted:
            # -abs() keeps an already-negative duration invalid instead of
            # flipping it back to a valid positive value.
            corrupted["duration_sec"] = -abs(corrupted["duration_sec"])
        if random.choice([True, False]):
            corrupted["callee_id"] = ""
        else:
            callee = str(corrupted.get("callee_id", ""))
            letter = random.choice(string.ascii_letters)
            # randint is inclusive: index == len(callee) appends the letter.
            index = random.randint(0, len(callee))
            corrupted["callee_id"] = callee[:index] + letter + callee[index:]
    elif corrupted and random.choice([True, False]):
        # Only reachable when there is at least one key to remove.
        key_to_remove = random.choice(list(corrupted.keys()))
        del corrupted[key_to_remove]
    else:
        corrupted["record_type"] = random.choice(["Television","Lan","Cloud","Optics","DSL"])
    return corrupted

In [74]:
def corrupt_record_forced(record):
    """Return a corrupted shallow copy of ``record`` that always differs from it.

    One corruption is picked at random among those that would actually change
    this particular record:

      * ``missing_field``     - drop ``timestamp``, the originating-party id or
                                ``record_type``;
      * ``invalid_type``      - set ``record_type`` to an unsupported value;
      * ``negative_duration`` - negate ``duration_sec`` /
                                ``session_duration_sec``;
      * ``dummy_msisdn``      - replace the first three characters of the
                                originating-party id with "999" (the id
                                becomes a string).

    The originating-party id is ``caller_id`` for voice, ``sender_id`` for sms
    and ``user_id`` for data records. Corruptions that would be a no-op (for
    example a negative duration on an sms record, which has no duration field)
    are excluded up front, so every call yields a modified record.

    NOTE(review): this assumes the downstream validation treats ``sender_id``
    and ``user_id`` the same way as ``caller_id`` - confirm against the
    consumer's rules.

    Args:
        record (dict): a flat record as produced by the generate_* functions.

    Returns:
        dict: the corrupted copy; ``record`` itself is not modified.
    """
    corrupted = record.copy()

    # Field names differ per record type; use whichever one is present.
    id_field = next(
        (k for k in ("caller_id", "sender_id", "user_id") if k in corrupted), None
    )
    duration_field = next(
        (k for k in ("duration_sec", "session_duration_sec") if k in corrupted), None
    )

    removable = [k for k in ("timestamp", id_field, "record_type") if k in corrupted]
    # Exclude the current value so that "invalid_type" always changes something.
    invalid_types = [
        t for t in ("DSL", "Cloud", "Television", "")
        if t != corrupted.get("record_type")
    ]
    duration = corrupted.get(duration_field)

    # Only offer corruptions that are guaranteed to modify this record.
    applicable = []
    if removable:
        applicable.append("missing_field")
    applicable.append("invalid_type")
    if isinstance(duration, (int, float)) and duration > 0:
        applicable.append("negative_duration")
    if id_field is not None and not str(corrupted[id_field]).startswith("999"):
        applicable.append("dummy_msisdn")

    corruption_type = random.choice(applicable)

    if corruption_type == "missing_field":
        corrupted.pop(random.choice(removable))

    elif corruption_type == "invalid_type":
        corrupted["record_type"] = random.choice(invalid_types)

    elif corruption_type == "negative_duration":
        corrupted[duration_field] = -abs(duration)

    elif corruption_type == "dummy_msisdn":
        # Overwrite the leading three characters with a dummy "999" prefix.
        corrupted[id_field] = "999" + str(corrupted[id_field])[3:]

    return corrupted


In [76]:
def generate_records():
    """Generate one batch of TOTAL_RECORDS synthetic records.

    The batch holds voice, data and sms records, in that order, in the
    proportions given by DISTRIBUTION (sms takes the remainder so that the
    counts always add up to TOTAL_RECORDS). Afterwards
    ``int(TOTAL_RECORDS * ERROR_RATIO)`` *distinct* records are replaced by
    corrupted copies.

    Each call starts from a clean per-phone clock, so running the function
    again yields a batch whose timestamps start from ``start_date`` once more
    instead of continuing from where the previous call stopped.

    Returns:
        list[dict]: the generated records.
    """
    # Reset the per-phone clocks; otherwise state left by an earlier call
    # pushes every timestamp of this batch towards end_date.
    last_seen.clear()

    num_voice = int(TOTAL_RECORDS * DISTRIBUTION["voice"])
    num_data = int(TOTAL_RECORDS * DISTRIBUTION["data"])
    num_sms = TOTAL_RECORDS - num_voice - num_data

    records = [generate_voice() for _ in range(num_voice)]
    records += [generate_data() for _ in range(num_data)]
    records += [generate_sms() for _ in range(num_sms)]

    # Sample without replacement so that no record is corrupted twice and the
    # number of corrupted records matches the requested ratio. Clamping keeps
    # this valid for an empty batch or an out-of-range ERROR_RATIO.
    num_errors = max(0, min(len(records), int(TOTAL_RECORDS * ERROR_RATIO)))
    for idx in random.sample(range(len(records)), num_errors):
        records[idx] = corrupt_record_forced(records[idx])

    return records

In [82]:
records = generate_records()

# Show only a small sample: printing the whole batch floods the notebook
# output without adding information.
PREVIEW_COUNT = 10
for record in records[:PREVIEW_COUNT]:
    print(record)
 

{'record_type': 'voice', 'timestamp': '2025-04-01T21:13:53Z', 'caller_id': 212696281155, 'callee_id': '212764336755', 'duration_sec': 474, 'cell_id': 'Al Hoceïma', 'technology': '4G'}
{'record_type': 'voice', 'timestamp': '2025-04-05T10:49:37Z', 'caller_id': 212616729119, 'callee_id': '212741705738', 'duration_sec': 275, 'cell_id': 'Meknès', 'technology': '4G'}
{'record_type': 'voice', 'timestamp': '2025-04-10T08:22:13Z', 'caller_id': 212660261167, 'callee_id': '212742826308', 'duration_sec': 258, 'cell_id': 'Rabat', 'technology': '3G'}
{'record_type': 'voice', 'timestamp': '2025-04-02T18:04:54Z', 'caller_id': 212684712277, 'callee_id': '212727614241', 'duration_sec': 573, 'cell_id': 'Agadir', 'technology': '5G'}
{'record_type': 'voice', 'timestamp': '2025-04-08T23:44:55Z', 'caller_id': 212611107024, 'callee_id': '212741705738', 'duration_sec': 205, 'cell_id': 'Tanger', 'technology': '3G'}
{'record_type': 'voice', 'timestamp': '2025-04-08T19:35:29Z', 'caller_id': 212643423227, 'callee_

In [78]:
from kafka import KafkaProducer

In [80]:
# Kafka producer shared by the sending cell below; it is given two bootstrap
# broker addresses.
# NOTE(review): the broker addresses are hard-coded - consider moving them to
# the configuration cell or an environment variable.
producer = KafkaProducer(bootstrap_servers=['192.168.0.181:9092', '192.168.0.135:9092'])

In [82]:
from tqdm import tqdm

In [92]:
records = generate_records()

KAFKA_TOPIC = 'voice'     # all record types (voice, data, sms) are published to this topic
SEND_DELAY_SEC = 0.36     # pause after each message to throttle the stream

# Delivery errors are reported asynchronously by the producer; collect them
# so that failures are not silently ignored.
failed_sends = []

for record in tqdm(records):
    msg = json.dumps(record,indent=2,ensure_ascii=False).encode('utf-8')
    producer.send(KAFKA_TOPIC,value=msg).add_errback(failed_sends.append)
    time.sleep(SEND_DELAY_SEC)

# Block until every buffered message has been delivered or has failed.
producer.flush()

if failed_sends:
    print(f"❌ {len(failed_sends)} of {len(records)} messages failed; first error: {failed_sends[0]!r}")
else:
    print("✅ All messages sent.")

100%|██████████| 4000/4000 [24:07<00:00,  2.76it/s]

✅ All messages sent.



