In [6]:
import random
import pandas as pd
from faker import Faker
from datetime import datetime, timedelta

# Faker instance with the Indian-English locale, used for passenger names
fake = Faker("en_IN")

# Seed both random sources so that "Restart Kernel -> Run All" regenerates the
# exact same dataset. The stdlib `random` module drives the bookings and the
# injected dirty values; Faker keeps its own generator for the names.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Lookup values the generator samples from
airlines = [
    "Air India", "IndiGo", "SpiceJet", "Go First", "Vistara", "AirAsia India",
    "Alliance Air", "Star Air", "TruJet", "Akasa Air"
]
cities = [
    "Delhi", "Mumbai", "Chennai", "Kolkata", "Bangalore", "Hyderabad",
    "Ahmedabad", "Pune", "Jaipur", "Lucknow", "Cochin", "Goa", "Patna", "Indore"
]
classes = ["Economy", "Business", "Premium Economy"]

# Row accumulators: one dict per passenger and one dict per flight booking
passenger_data = []
flight_booking_data = []

# Create 200 passengers, each with 1 to 3 flight bookings. A share of the
# booking fields is deliberately corrupted ("dirty data") further down.
for passenger_id in range(1, 201):
    passenger_name = fake.name()  # Generate a fake Indian name
    travel_class = random.choice(classes)

    passenger_data.append({
        "Passenger_ID": passenger_id,
        "Passenger_Name": passenger_name,
        "Class": travel_class
    })

    # For each passenger, generate random flight bookings
    num_bookings = random.randint(1, 3)  # Each passenger can have 1 to 3 bookings

    for _ in range(num_bookings):
        airline = random.choice(airlines)
        source, destination = random.sample(cities, 2)  # Two distinct picks, so source != destination

        # Booking date: 2018-01-01 plus 0..1825 days, i.e. 2018-01-01 through 2022-12-31
        booking_date = datetime(2018, 1, 1) + timedelta(days=random.randint(0, 1825))

        # Departure: 1 to 30 days after booking, at a time of day between 05:00 and 21:59
        travel_date = booking_date + timedelta(days=random.randint(1, 30))
        departure_time = travel_date + timedelta(hours=random.randint(5, 21), minutes=random.randint(0, 59))

        # Arrival: departure plus a flight time between 1h00m and 7h59m
        duration = timedelta(hours=random.randint(1, 7), minutes=random.randint(0, 59))
        arrival_time = departure_time + duration

        # Clean values for the row; the corruptions below may replace them
        travel_value = departure_time.strftime("%Y-%m-%d %H:%M:%S")
        arrival_value = arrival_time.strftime("%Y-%m-%d %H:%M:%S")

        # Introduce "dirty data": null dates, truncated airline names, negative prices
        if random.random() < 0.1:  # 10% chance of null value in booking date
            booking_date = None
        if random.random() < 0.1:  # 10% chance of null travel date; arrival is nulled with it
            travel_value = None
            arrival_value = None
        if random.random() < 0.1:  # 10% chance of corrupt airline name
            # Keep a non-empty prefix that is strictly shorter than the full name
            airline = airline[:random.randint(1, len(airline) - 1)]
        if random.random() < 0.05:  # 5% chance of negative price
            price = -random.randint(3000, 15000)
        else:
            price = random.randint(3000, 15000)

        # Randomly switch the booking and travel dates to dd/mm/YYYY strings.
        # random.random() is evaluated first so the draw happens even for nulled dates.
        if random.random() < 0.05 and booking_date is not None:  # 5% chance of incorrect date format
            booking_date = booking_date.strftime("%d/%m/%Y")
        if random.random() < 0.05 and travel_value is not None:  # 5% chance of incorrect travel date format
            # Write the reformatted date into the row value itself; previously the
            # reformatted string was computed but never reached the output.
            travel_value = travel_date.strftime("%d/%m/%Y")

        flight_booking_data.append({
            "Booking_ID": len(flight_booking_data) + 1,  # Unique, sequential Booking ID
            "Booking_Date": booking_date,
            "Airline": airline,
            "Travel_Date": travel_value,
            "Arrival_Date": arrival_value,
            "Source": source,
            "Destination": destination,
            "Price": price,
            "Passenger_ID": passenger_id  # Foreign key reference to Passenger_ID
        })

# Keep only the first 500 generated bookings (unchanged when fewer exist)
flight_booking_data = flight_booking_data[:500]

# Turn both row lists into DataFrames in one step
passenger_df, flight_booking_df = map(pd.DataFrame, (passenger_data, flight_booking_data))

In [7]:
# Preview the first three passenger rows
passenger_df.head(n=3)

Unnamed: 0,Passenger_ID,Passenger_Name,Class
0,1,Odika Rama,Premium Economy
1,2,Ryan Jha,Business
2,3,Megha Keer,Economy


In [9]:
# Preview the first 50 booking rows
flight_booking_df.head(n=50)

Unnamed: 0,Booking_ID,Booking_Date,Airline,Travel_Date,Arrival_Date,Source,Destination,Price,Passenger_ID
0,1,2021-01-09 00:00:00,SpiceJet,2021-01-29 14:49:00,2021-01-29 19:47:00,Pune,Delhi,7996,1
1,2,2021-04-27 00:00:00,Akasa Air,2021-04-28 09:08:00,2021-04-28 13:16:00,Lucknow,Delhi,9011,1
2,3,2022-10-18 00:00:00,Akasa Air,2022-11-16 11:19:00,2022-11-16 14:16:00,Hyderabad,Patna,10179,1
3,4,2020-12-07 00:00:00,TruJet,2020-12-14 19:19:00,2020-12-15 01:54:00,Delhi,Chennai,7452,2
4,5,2021-04-02 00:00:00,SpiceJet,2021-04-26 21:44:00,2021-04-27 03:21:00,Patna,Indore,9741,2
5,6,2020-01-26 00:00:00,Akasa Air,2020-02-18 15:32:00,2020-02-18 21:02:00,Ahmedabad,Jaipur,6563,2
6,7,31/12/2018,Vistara,2019-01-12 06:19:00,2019-01-12 11:25:00,Hyderabad,Bangalore,5310,3
7,8,2022-12-09 00:00:00,Alliance Air,2022-12-27 05:43:00,2022-12-27 13:03:00,Ahmedabad,Bangalore,12992,3
8,9,2018-06-25 00:00:00,TruJet,2018-07-10 18:29:00,2018-07-11 02:04:00,Hyderabad,Bangalore,9156,3
9,10,2019-07-17 00:00:00,IndiGo,2019-07-29 15:53:00,2019-07-29 16:55:00,Kolkata,Chennai,3975,4


In [10]:
# Optional export: write each table to its own CSV file, without the index column
for frame, csv_name in (
    (passenger_df, "passengers_dirty.csv"),
    (flight_booking_df, "flight_bookings_dirty.csv"),
):
    frame.to_csv(csv_name, index=False)