# Task 1: Generate the raw dataset using fixed rules


In [64]:
import numpy as np

seed_value = int("812")
n = 320
rng = np.random.default_rng(seed_value)
tickets = []

for i in range(1, n+1):
    
    ticket_id = f"T{seed_value}-{i:04d}"
    route = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"][(i + seed_value) % 5]
    day = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][(i + seed_value) % 7]    
    days_to_departure = 1 + ((i * 3 + seed_value) % 60)    
    class_value = ["economy", "premium", "business"][(i * 2 + seed_value) % 3]    
    base = 120 + (days_to_departure * -1.5)
    route_adj = [140, 220, 60, 180, 80][(i + seed_value) % 5]
    class_adj = [0, 80, 220][(i * 2 + seed_value) % 3]
    noise = rng.normal(0, 25)
    price_usd = round(base + route_adj + class_adj + noise, 2)
    
    if i % 28 == 0:
        price_usd = ""
    if i % 45 == 0:        
        if price_usd != "":
            price_usd = price_usd * -1
    if i % 37 == 0:
        class_value = class_value.upper()

    tickets.append({
        "ticket_id": ticket_id,
        "route": route,
        "day": day,
        "days_to_departure": days_to_departure,
        "class": class_value,
        "price_usd": price_usd
    })


print("Total records:", len(tickets))
print("First five records:")
tickets[:5]

Total records: 320
First five records:


[{'ticket_id': 'T812-0001',
  'route': 'DXB-SIN',
  'day': 'Tue',
  'days_to_departure': 36,
  'class': 'premium',
  'price_usd': 298.26},
 {'ticket_id': 'T812-0002',
  'route': 'MAD-ROM',
  'day': 'Wed',
  'days_to_departure': 39,
  'class': 'economy',
  'price_usd': 156.11},
 {'ticket_id': 'T812-0003',
  'route': 'NYC-LAX',
  'day': 'Thu',
  'days_to_departure': 42,
  'class': 'business',
  'price_usd': 429.67},
 {'ticket_id': 'T812-0004',
  'route': 'LHR-JFK',
  'day': 'Fri',
  'days_to_departure': 45,
  'class': 'premium',
  'price_usd': 413.07},
 {'ticket_id': 'T812-0005',
  'route': 'SFO-SEA',
  'day': 'Sat',
  'days_to_departure': 48,
  'class': 'economy',
  'price_usd': 117.02}]

In [65]:
type(tickets[0]['price_usd'])

float

# Task 2: Validate and clean records with core Python
Identify invalid records (missing/non-numeric price_usd, or negative prices). Build cleaned_tickets with only valid records and normalized lowercase class.

After cleaning, confirm cleaned count and verify no invalid prices remain. Show two cleaned records.



In [66]:
cleaned_records = []
invalid_records = []

for r in tickets:
    price = r.get('price_usd')

    if isinstance(price, (int, float)) and price > 0:
        r['class']= r['class'].lower()
        cleaned_records.append(r)        
    else:
        invalid_records.append(r)

In [67]:
invalid_records

[{'ticket_id': 'T812-0028',
  'route': 'NYC-LAX',
  'day': 'Mon',
  'days_to_departure': 57,
  'class': 'premium',
  'price_usd': ''},
 {'ticket_id': 'T812-0045',
  'route': 'SFO-SEA',
  'day': 'Thu',
  'days_to_departure': 48,
  'class': 'business',
  'price_usd': -325.05},
 {'ticket_id': 'T812-0056',
  'route': 'DXB-SIN',
  'day': 'Mon',
  'days_to_departure': 21,
  'class': 'economy',
  'price_usd': ''},
 {'ticket_id': 'T812-0084',
  'route': 'LHR-JFK',
  'day': 'Mon',
  'days_to_departure': 45,
  'class': 'business',
  'price_usd': ''},
 {'ticket_id': 'T812-0090',
  'route': 'SFO-SEA',
  'day': 'Sun',
  'days_to_departure': 3,
  'class': 'business',
  'price_usd': -400.3},
 {'ticket_id': 'T812-0112',
  'route': 'MAD-ROM',
  'day': 'Mon',
  'days_to_departure': 9,
  'class': 'premium',
  'price_usd': ''},
 {'ticket_id': 'T812-0135',
  'route': 'SFO-SEA',
  'day': 'Wed',
  'days_to_departure': 18,
  'class': 'business',
  'price_usd': -379.28},
 {'ticket_id': 'T812-0140',
  'route': 

In [68]:
cleaned_records

[{'ticket_id': 'T812-0001',
  'route': 'DXB-SIN',
  'day': 'Tue',
  'days_to_departure': 36,
  'class': 'premium',
  'price_usd': 298.26},
 {'ticket_id': 'T812-0002',
  'route': 'MAD-ROM',
  'day': 'Wed',
  'days_to_departure': 39,
  'class': 'economy',
  'price_usd': 156.11},
 {'ticket_id': 'T812-0003',
  'route': 'NYC-LAX',
  'day': 'Thu',
  'days_to_departure': 42,
  'class': 'business',
  'price_usd': 429.67},
 {'ticket_id': 'T812-0004',
  'route': 'LHR-JFK',
  'day': 'Fri',
  'days_to_departure': 45,
  'class': 'premium',
  'price_usd': 413.07},
 {'ticket_id': 'T812-0005',
  'route': 'SFO-SEA',
  'day': 'Sat',
  'days_to_departure': 48,
  'class': 'economy',
  'price_usd': 117.02},
 {'ticket_id': 'T812-0006',
  'route': 'DXB-SIN',
  'day': 'Sun',
  'days_to_departure': 51,
  'class': 'business',
  'price_usd': 450.43},
 {'ticket_id': 'T812-0007',
  'route': 'MAD-ROM',
  'day': 'Mon',
  'days_to_departure': 54,
  'class': 'premium',
  'price_usd': 171.18},
 {'ticket_id': 'T812-0008

In [69]:
len(tickets)

320

In [70]:
len(cleaned_records)

302

In [71]:
if all( isinstance(p['price_usd'], (int, float)) and p['price_usd'] >0 and p['class'] == p['class'].lower() 
        for p in cleaned_records):
  print('All are valid')

else:
    print("Some records are invalid")

All are valid


In [72]:
cleaned_records[:2]

[{'ticket_id': 'T812-0001',
  'route': 'DXB-SIN',
  'day': 'Tue',
  'days_to_departure': 36,
  'class': 'premium',
  'price_usd': 298.26},
 {'ticket_id': 'T812-0002',
  'route': 'MAD-ROM',
  'day': 'Wed',
  'days_to_departure': 39,
  'class': 'economy',
  'price_usd': 156.11}]

# Task 3: Convert to NumPy for analysis
Create NumPy arrays for prices and days. Compute mean and standard deviation of prices. Compute total revenue per day and ticket counts per day using vectorized operations (no loops). Validate daily totals sum to overall total revenue.

In [73]:
array_prices = np.array([ a['price_usd'] for a in cleaned_records])
array_prices

array([298.26, 156.11, 429.67, 413.07, 117.02, 450.43, 171.18, 145.16,
       429.25, 315.81, 311.3 , 407.13, 302.  , 343.47, 362.25, 402.66,
       146.76, 473.9 , 427.17, 139.61, 471.15, 249.56, 174.63, 544.85,
       176.67, 216.58, 341.88, 310.02, 417.53, 348.68, 210.88, 491.68,
       445.63, 200.72, 472.81, 220.67, 236.22, 560.76, 234.28, 187.23,
       347.76, 308.51, 263.  , 314.45, 139.73, 345.12, 328.21, 178.7 ,
       482.02, 238.95, 204.02, 556.98, 264.94, 366.37, 314.7 , 292.71,
       324.5 , 355.56, 180.93, 378.26, 395.66,  85.41, 431.78, 229.97,
       186.61, 432.11, 223.6 , 312.27, 446.5 , 300.48, 364.66, 362.79,
       343.44, 186.01, 430.29, 386.03, 136.61, 500.18, 211.57, 218.34,
       211.49, 227.44, 387.08, 249.66, 220.19, 335.26, 189.26, 490.77,
       382.06, 126.82, 456.71, 224.68, 226.47, 518.32, 194.72, 230.49,
       341.42, 294.66, 255.72, 332.85, 289.82, 109.89, 388.8 , 358.67,
       173.62, 459.7 , 223.68, 518.5 , 254.44, 290.17, 371.21, 266.67,
      

In [74]:
array_days = np.array([ c['day'] for c in cleaned_records])
array_days

array(['Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed',
       'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri',
       'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun',
       'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed',
       'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Fri', 'Sat',
       'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Tue',
       'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu',
       'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat',
       'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Tue',
       'Wed', 'Thu', 'Fri', 'Sat', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri',
       'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun',
       'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Tue', 'Wed',
       'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri',
       'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun',
       'Mon', 'Tue',

In [75]:
mean_price =np.mean(array_prices)
mean_price

np.float64(308.82480132450337)

In [76]:
std_price = np.std(array_prices)
std_price

np.float64(111.69676125398355)

In [77]:
unique_days, day_indices = np.unique(array_days, return_inverse = True)
unique_days

array(['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed'], dtype='<U3')

In [78]:
total_revenue_per_day = np.bincount(day_indices, weights = array_prices)
total_revenue_per_day

array([14076.09, 10186.55, 13751.09, 13309.84, 14280.82, 13870.25,
       13790.45])

In [79]:
matched_day = dict(zip(unique_days, total_revenue_per_day))
matched_day

{np.str_('Fri'): np.float64(14076.089999999998),
 np.str_('Mon'): np.float64(10186.55),
 np.str_('Sat'): np.float64(13751.09),
 np.str_('Sun'): np.float64(13309.839999999998),
 np.str_('Thu'): np.float64(14280.82),
 np.str_('Tue'): np.float64(13870.250000000002),
 np.str_('Wed'): np.float64(13790.449999999999)}

In [80]:
day_indices

array([5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5,
       6, 4, 0, 2, 3, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 0,
       2, 3, 1, 5, 6, 4, 0, 2, 3, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3,
       1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 5, 6, 4, 0, 2, 1, 5, 6,
       4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 5, 6, 4, 0,
       2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 4, 0, 2, 3,
       5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5,
       6, 4, 0, 2, 3, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 3, 1, 5, 6, 4, 0,
       2, 3, 1, 5, 6, 4, 0, 2, 3, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3,
       1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 6, 4, 0, 2, 3, 1, 5, 6,
       4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 5, 6, 4, 0,
       2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 2, 3, 1, 5, 6, 4, 0, 2, 3,
       5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5,
       6, 4, 0, 2, 3, 5, 6, 4, 0, 2, 3, 5, 6, 4, 0,

In [81]:
ticket_counts = np.bincount(day_indices)
ticket_counts

array([45, 33, 45, 44, 45, 45, 45])

In [82]:
result = dict(zip(unique_days, ticket_counts))
result

{np.str_('Fri'): np.int64(45),
 np.str_('Mon'): np.int64(33),
 np.str_('Sat'): np.int64(45),
 np.str_('Sun'): np.int64(44),
 np.str_('Thu'): np.int64(45),
 np.str_('Tue'): np.int64(45),
 np.str_('Wed'): np.int64(45)}

In [83]:
for day,value in matched_day.items():
    print(str(day),':', round(value,2))

Fri : 14076.09
Mon : 10186.55
Sat : 13751.09
Sun : 13309.84
Thu : 14280.82
Tue : 13870.25
Wed : 13790.45


In [84]:
overall_revenue = np.sum(array_prices)
overall_revenue

np.float64(93265.09000000001)

In [85]:
sum_total_revenue_per_day= np.sum(total_revenue_per_day)
sum_total_revenue_per_day

np.float64(93265.08999999998)

In [86]:
assert np.isclose(overall_revenue, sum_total_revenue_per_day), "They are not equal"
print("Both are equal")

Both are equal


# Task 4: Identify high-price tickets
Define high-price tickets as above the 90th percentile of prices. Compute threshold and count. Verify all selected prices are >= threshold.

In [87]:
threshold = np.percentile(array_prices, 90)
threshold

np.float64(473.791)

In [88]:
high_price_ticket = array_prices >= threshold 
high_price_ticket

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False,  True, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False,

In [89]:
high_price_tickets = array_prices[high_price_ticket]
high_price_tickets

array([473.9 , 544.85, 491.68, 560.76, 482.02, 556.98, 500.18, 490.77,
       518.32, 518.5 , 474.04, 522.45, 495.17, 493.94, 540.53, 553.31,
       476.84, 504.84, 505.66, 488.96, 500.87, 492.93, 493.97, 476.92,
       476.02, 527.79, 482.86, 498.59, 557.76, 508.31, 487.86])

In [90]:
count = np.sum(high_price_ticket)
count

np.int64(31)

In [91]:
t =np.all(high_price_tickets >= threshold)
print(t)

True


# Task 5: Produce a final report
Create a report dictionary with keys:

total_tickets
cleaned_tickets
mean_price
std_price
daily_totals
high_price_count
Print a readable report and include at least one explicit validation statement.



In [92]:
report = {'total_tickets':len(tickets),
          'cleaned_tickets':len(cleaned_records),
          'mean_price': mean_price,
          'std_price': std_price,
          'daily_totals': {str(day): round(float(value),2) for day, value in matched_day.items()},
          'high_price_count':count
         }


In [93]:
report

{'total_tickets': 320,
 'cleaned_tickets': 302,
 'mean_price': np.float64(308.82480132450337),
 'std_price': np.float64(111.69676125398355),
 'daily_totals': {'Fri': 14076.09,
  'Mon': 10186.55,
  'Sat': 13751.09,
  'Sun': 13309.84,
  'Thu': 14280.82,
  'Tue': 13870.25,
  'Wed': 13790.45},
 'high_price_count': np.int64(31)}

In [94]:
string_report= f"""

THIS IS READIBLE REPORT

Total Tickets : {report['total_tickets']}
Cleaned Tickets : { report['cleaned_tickets']}
Price Mean : {report['mean_price']}
Price Std : { report['std_price']}
Daily Totals : { report['daily_totals']}
High Price Count : {report['high_price_count']}
"""

print(string_report)



THIS IS READIBLE REPORT

Total Tickets : 320
Cleaned Tickets : 302
Price Mean : 308.82480132450337
Price Std : 111.69676125398355
Daily Totals : {'Fri': 14076.09, 'Mon': 10186.55, 'Sat': 13751.09, 'Sun': 13309.84, 'Thu': 14280.82, 'Tue': 13870.25, 'Wed': 13790.45}
High Price Count : 31

