In [24]:
# Import dependencies
# import pandera for data validation
# ! pip install pandas Pandera 

import pandas as pd
import pandera as pa
import numpy as np


## Steps for processing the dataset

1. **Reading CSV from `merge_data` folder.**
2. **Delete unwanted columns.**
3. **Splitting the `departure_time` into two columns: `depart_date` and `depart_time`.**
4. **Splitting the `arrival_time` into two columns: `arrival_date` and `arrival_time_only`.**
5. **Checking for missing values and duplicate values.**


In [25]:
# Load the cleaned flight-price CSV from the merge_data folder into a DataFrame.
# The path is relative, so the notebook must be run from the directory that
# contains `Resources/`.
itinerary_details_df = pd.read_csv('Resources/merge_data/cleaned_flight_price_data.csv')

In [26]:
# Show the column labels of the frame as loaded, before any columns are dropped.
itinerary_details_df.columns

Index(['date', 'itinerary_id', 'cabin_class', 'sort_by', 'price_raw', 'price',
       'currency', 'flight_number', 'origin_airport', 'origin_city',
       'origin_country', 'destination_airport', 'destination_city',
       'destination_country', 'departure_time', 'arrival_time',
       'number_of_stops', 'marketing_airline', 'operating_airline',
       'change_allowed', 'cancellation_allowed', 'is_self_transfer',
       'has_flexible_options', 'Score', 'depart_date', 'depart_time',
       'arrival_date', 'arrival_time_only', 'duration_in_hrs'],
      dtype='object')

In [27]:
# Columns that are removed from itinerary_details_df in one pass.
itinerary_details_unwanted = [
    'date',
    'price_raw',
    'cancellation_allowed',
    'sort_by',
    'currency',
    'has_flexible_options',
    'change_allowed',
    'destination_city',
    'destination_country',
    'origin_city',
    'origin_country',
]
# `columns=` already targets the column axis, so no separate `axis` argument is needed.
itinerary_details_df = itinerary_details_df.drop(columns=itinerary_details_unwanted)

In [28]:
itinerary_details_df.columns

Index(['itinerary_id', 'cabin_class', 'price', 'flight_number',
       'origin_airport', 'destination_airport', 'departure_time',
       'arrival_time', 'number_of_stops', 'marketing_airline',
       'operating_airline', 'is_self_transfer', 'Score', 'depart_date',
       'depart_time', 'arrival_date', 'arrival_time_only', 'duration_in_hrs'],
      dtype='object')

In [29]:
# Give every row a sequential surrogate key, trip_id, counting up from 1.
itinerary_details_df = itinerary_details_df.assign(
    trip_id=range(1, len(itinerary_details_df) + 1)
)


In [30]:
# Count duplicated rows in itinerary_details_df.
# trip_id is a per-row sequence number, so comparing on it would make every row
# unique and the count would always be 0; compare on all other columns instead.
itinerary_details_df.duplicated(
    subset=[col for col in itinerary_details_df.columns if col != 'trip_id']
).sum()


0

## Steps for creating the dataset for `flight_df`

1. **Creating the `flight_df`** from the columns:
   - `flight_number`
   - `origin_airport`
   - `destination_airport`


2. **Checking for null values** and **duplicate values**, and **deleting duplicates** from `flight_df`.

3. **Dropping the columns** `origin_airport` and `destination_airport` from `itinerary_details_df`.

4. **Creating a new column `flight_id`** by concatenating the values in the `flight_number` with the string 'FL' in `flight_df`.

5. **Adding a column `flight_id`** to `itinerary_details_df`.

6. **Merging `itinerary_details_df`** with `flight_df` on the `flight_number` column.

7. **Deleting `flight_number`** from `itinerary_details_df`.


In [31]:
# Build flight_df from the three columns selected below:
# flight_number, origin_airport and destination_airport.
flight_df = itinerary_details_df[['flight_number','origin_airport','destination_airport']]

In [32]:
# Report how many rows of flight_df repeat an earlier row. A bare expression in
# the middle of a cell is never displayed, so the count is printed explicitly.
print(f"Duplicate flight rows: {flight_df.duplicated().sum()}")
# Keep only the first occurrence of each distinct row.
flight_df = flight_df.drop_duplicates()

In [33]:
# Add flight_id to flight_df: flight_number rendered as text with an 'FL' prefix.
# assign() returns a new frame rather than writing into flight_df in place;
# flight_df was produced by selecting/filtering another frame, and in-place
# column assignment on such a frame can raise SettingWithCopyWarning.
# NOTE(review): flight_id is derived from flight_number alone, so it is only
# unique if each flight_number has a single origin/destination pair - confirm.
flight_df = flight_df.assign(flight_id='FL' + flight_df['flight_number'].astype(str))

flight_df.head()

Unnamed: 0,flight_number,origin_airport,destination_airport,flight_id
0,6340,YYZ,ATL,FL6340
1,6482,ATL,DFW,FL6482
2,2205,YTZ,YOW,FL2205
3,2662,YYZ,ATL,FL2662
4,2988,YYZ,ATL,FL2988


In [34]:
# Show the column labels of flight_df.
flight_df.columns

Index(['flight_number', 'origin_airport', 'destination_airport', 'flight_id'], dtype='object')

In [35]:
# Remove the two airport columns from itinerary_details_df.
itinerary_details_df = itinerary_details_df.drop(
    columns=['origin_airport', 'destination_airport'],
)

In [36]:
# Show the column labels of itinerary_details_df at this point in the notebook.
itinerary_details_df.columns


Index(['itinerary_id', 'cabin_class', 'price', 'flight_number',
       'departure_time', 'arrival_time', 'number_of_stops',
       'marketing_airline', 'operating_airline', 'is_self_transfer', 'Score',
       'depart_date', 'depart_time', 'arrival_date', 'arrival_time_only',
       'duration_in_hrs', 'trip_id'],
      dtype='object')

In [37]:
# Number of distinct itinerary_id values.
itinerary_details_df['itinerary_id'].nunique()

6520

In [38]:
# Number of distinct trip_id values.
itinerary_details_df['trip_id'].nunique()

11587

In [None]:
# Project the four pricing-related columns into their own frame, then write it
# to CSV without the index.
itinerary_price_df = itinerary_details_df[
    ['trip_id', 'price', 'itinerary_id', 'number_of_stops']
]
itinerary_price_df.to_csv(
    'Resources/clean_data/itinerary_price.csv',
    index=False,
)

0

In [56]:

# Keep only rows that do not repeat an earlier row on every column other than
# trip_id (the first occurrence of each combination is retained).
itinerary_price_unique = itinerary_price_df[
    ~itinerary_price_df.duplicated(
        subset=itinerary_price_df.columns.difference(['trip_id']),
        keep='first',
    )
]

# Write the de-duplicated frame to its own CSV, without the index.
itinerary_price_unique.to_csv('Resources/clean_data/itinerary_price_d.csv', index=False)


In [None]:
# Attach flight_id to every itinerary row.
# flight_df is de-duplicated on (flight_number, origin_airport, destination_airport),
# so a flight_number may occur on several of its rows. Merging the whole frame on
# flight_number alone would then multiply itinerary rows and would also bring the
# airport columns back. Only a de-duplicated (flight_number, flight_id) lookup is
# merged; validate='many_to_one' raises MergeError if the lookup key is not unique.
itinerary_details_df = pd.merge(
    itinerary_details_df,
    flight_df[['flight_number', 'flight_id']].drop_duplicates(),
    on='flight_number',
    how='left',
    validate='many_to_one',
)
itinerary_details_df.head()

## Steps for creating Dataset for cabin_class
1. **Getting unique values from the `cabin_class` column of `itinerary_details_df`.**
2. **Creating the numeric ids for `cabin_class_id`.**
3. **Adding the prefix `cc` to each `cabin_class_id`.**
4. **Creating a `cabin_class_df` DataFrame with the `cabin_class_id` list as the `cabin_class_id` column and the `cabin_class` values as the `cabin_class` column.**
5. **Merging `itinerary_details_df` with `cabin_class_df` on the `cabin_class` column.**


In [None]:
# Distinct values of the cabin_class column.
cabin_class = itinerary_details_df['cabin_class'].unique()
print(cabin_class)
# One id per distinct value: 'cc1', 'cc2', ... The count comes from the data
# rather than a hard-coded 4, so the two columns of the lookup table always have
# the same length whatever number of cabin classes the input contains.
cabin_class_id = ['cc' + str(number) for number in range(1, len(cabin_class) + 1)]
# Lookup table pairing each cabin_class_id with its cabin_class name.
cabin_class_df = pd.DataFrame({'cabin_class_id': cabin_class_id, 'cabin_class': cabin_class})
# Save the lookup table to a CSV file, without the index.
cabin_class_df.to_csv('Resources/clean_data/cabin_class.csv', index=False)



In [None]:
# Save cabin_class_df to a CSV file, without the index.
# NOTE(review): the preceding cell already writes this same file; this cell
# only repeats that write.
cabin_class_df.to_csv('Resources/clean_data/cabin_class.csv', index=False)

In [None]:
# Swap the cabin_class text for its id: join itinerary_details_df to
# cabin_class_df on cabin_class, then drop the now-redundant text column.
itinerary_details_df = (
    itinerary_details_df
    .merge(cabin_class_df, on='cabin_class')
    .drop(columns=['cabin_class'])
)

itinerary_details_df.head()


In [None]:
# Save flight_df to a CSV file, without the index.
flight_df.to_csv('Resources/clean_data/flight.csv', index=False)

In [None]:
# Save itinerary_details_df to a CSV file, without the index.
itinerary_details_df.to_csv('Resources/clean_data/itinerary_details.csv', index=False)