### 1. Importing Libraries

In [1]:
import os
import numpy as np  
import pandas as pd  
from sklearn.model_selection import train_test_split

### 2. Reading the dataset


In [2]:
DATA_DIR = r'C:\Users\Abhinay\Desktop\flight-prediction-sageMaker\data'
PROJECT_DIR = r'C:\Users\Abhinay\Desktop\flight-prediction-sageMaker'
def load_data(name):
    """
    Load data from a CSV file.
    
    Parameters:
    name (str): The name of the CSV file (without extension).
    
    Returns:
    pd.DataFrame: The loaded data as a pandas DataFrame.
    """
    file_path = os.path.join(DATA_DIR, f'{name}.csv')
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Data file {file_path} does not exist.")
    
    return pd.read_csv(file_path)

In [3]:
flights = load_data('flight_price')

In [4]:
flights 

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648


In [5]:
flights.shape

(10683, 11)

In [6]:
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


- the dataset has 10 683 rows and 11 colms
- columns like Date_of_Journey, Dep_Time, Arrival_Time have wrong data types
- 'Route' and 'Total_Stops' have 1 missing value each

### 3. Preliminary Analysis

#### 3.1 check data types  

In [7]:
flights.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [8]:
flights.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

In [9]:
flights['Date_of_Journey'].iloc[0]

'24/03/2019'

In [10]:
flights['Total_Stops'].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', nan, '4 stops'],
      dtype=object)

In [11]:
flights['Additional_Info'].value_counts()

Additional_Info
No info                         8345
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
1 Short layover                    1
Red-eye flight                     1
2 Long layover                     1
Name: count, dtype: int64

In [12]:
flights['Price'].describe()

count    10683.000000
mean      9087.064121
std       4611.359167
min       1759.000000
25%       5277.000000
50%       8372.000000
75%      12373.000000
max      79512.000000
Name: Price, dtype: float64

#### 3.2 Check duplicates

In [13]:
#check duplicate rows
flights.duplicated().sum()

np.int64(220)

In [14]:
import pandas as pd

pd.reset_option('display.max_rows')


In [15]:
(
  flights
  .loc[flights.duplicated (keep=False)]
  .sort_values(['Airline', 'Date_of_Journey', 'Source', 'Destination','Dep_Time','Arrival_Time'])
)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
6321,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
9848,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
572,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
8168,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
1495,Air India,1/04/2019,Kolkata,Banglore,CCU → DEL → COK → BLR,10:00,01:20 02 Apr,15h 20m,2 stops,No info,10408
...,...,...,...,...,...,...,...,...,...,...,...
2870,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,05:45,08:35,2h 50m,non-stop,No check-in baggage included,4273
1535,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,20:30,23:20,2h 50m,non-stop,No check-in baggage included,3873
3711,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,20:30,23:20,2h 50m,non-stop,No check-in baggage included,3873
2634,Vistara,24/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,5403


#### 3.3 Observations

- The type of `Date_of_Journey`, `Dep_Time` and `Arrival_Time` should be changed to DateTime
- The type of `Duration` and `Total_Stops` is mixed, it should be of numeric type.
- There are 220 duplicate rows which need to be removed

### 4. Detailed Analysis

In [16]:
"abhin b   ".strip()

'abhin b'

In [17]:
flights.Airline.unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [18]:
(
  flights
  .Airline
  .str.replace(" Premium economy", "")
  .str.replace(" Business", "")
  .str.title() #making capital of first letter of each word
  .unique()
)

array(['Indigo', 'Air India', 'Jet Airways', 'Spicejet',
       'Multiple Carriers', 'Goair', 'Vistara', 'Air Asia', 'Trujet'],
      dtype=object)

- some of  the entries have inconsistent/ incaccurate values like 
'Jet Airways' nd 'Jet Airways Business' is same

Date_of_Journey

In [19]:
flights.Date_of_Journey

0        24/03/2019
1         1/05/2019
2         9/06/2019
3        12/05/2019
4        01/03/2019
            ...    
10678     9/04/2019
10679    27/04/2019
10680    27/04/2019
10681    01/03/2019
10682     9/05/2019
Name: Date_of_Journey, Length: 10683, dtype: object

Source

In [20]:
flights.Source.unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

Destination

In [21]:
flights.Destination.unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'],
      dtype=object)

- 'New Delhi'  and  'Delhi' are the same city, so 
   we will replace 'New Delhi' with 'Delhi'

In [22]:
(
  flights
  .Destination
  .str.replace("New Delhi", "Delhi")
  .unique()
)

array(['Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Hyderabad'],
      dtype=object)

In [23]:
flights.Route

0                    BLR → DEL
1        CCU → IXR → BBI → BLR
2        DEL → LKO → BOM → COK
3              CCU → NAG → BLR
4              BLR → NAG → DEL
                 ...          
10678                CCU → BLR
10679                CCU → BLR
10680                BLR → DEL
10681                BLR → DEL
10682    DEL → GOI → BOM → COK
Name: Route, Length: 10683, dtype: object

In [24]:
flights.Dep_Time

0        22:20
1        05:50
2        09:25
3        18:05
4        16:50
         ...  
10678    19:55
10679    20:45
10680    08:20
10681    11:30
10682    10:55
Name: Dep_Time, Length: 10683, dtype: object

In [25]:
(
  flights
  .Dep_Time
  .loc[lambda ser: ser.str.contains("[^0-9:]")]
)

Series([], Name: Dep_Time, dtype: object)

In [26]:
pd.to_datetime(flights.Dep_Time)

  pd.to_datetime(flights.Dep_Time)


0       2025-07-09 22:20:00
1       2025-07-09 05:50:00
2       2025-07-09 09:25:00
3       2025-07-09 18:05:00
4       2025-07-09 16:50:00
                ...        
10678   2025-07-09 19:55:00
10679   2025-07-09 20:45:00
10680   2025-07-09 08:20:00
10681   2025-07-09 11:30:00
10682   2025-07-09 10:55:00
Name: Dep_Time, Length: 10683, dtype: datetime64[ns]

In [27]:
pd.to_datetime(flights.Dep_Time).dt.time

  pd.to_datetime(flights.Dep_Time).dt.time


0        22:20:00
1        05:50:00
2        09:25:00
3        18:05:00
4        16:50:00
           ...   
10678    19:55:00
10679    20:45:00
10680    08:20:00
10681    11:30:00
10682    10:55:00
Name: Dep_Time, Length: 10683, dtype: object

Arrival_Time

In [28]:
flights.Arrival_Time


0        01:10 22 Mar
1               13:15
2        04:25 10 Jun
3               23:30
4               21:35
             ...     
10678           22:25
10679           23:20
10680           11:20
10681           14:10
10682           19:15
Name: Arrival_Time, Length: 10683, dtype: object

In [29]:
(
  flights
  .Arrival_Time
  .loc[lambda ser: ser.str.contains("[^0-9:]")]
  .str.split(" ")
)

0        [01:10, 22, Mar]
2        [04:25, 10, Jun]
6        [10:25, 13, Mar]
7        [05:05, 02, Mar]
8        [10:25, 13, Mar]
               ...       
10666    [19:00, 13, Jun]
10667    [20:20, 13, Mar]
10672    [19:00, 28, Jun]
10673    [04:25, 28, May]
10674    [21:20, 13, Mar]
Name: Arrival_Time, Length: 4335, dtype: object

In [30]:
(
  flights
  .Arrival_Time
  .loc[lambda ser: ser.str.contains("[^0-9:]")]
  .str.split(" ", n=1)
)

0        [01:10, 22 Mar]
2        [04:25, 10 Jun]
6        [10:25, 13 Mar]
7        [05:05, 02 Mar]
8        [10:25, 13 Mar]
              ...       
10666    [19:00, 13 Jun]
10667    [20:20, 13 Mar]
10672    [19:00, 28 Jun]
10673    [04:25, 28 May]
10674    [21:20, 13 Mar]
Name: Arrival_Time, Length: 4335, dtype: object

In [31]:
(
  flights
  .Arrival_Time
  .loc[lambda ser: ser.str.contains("[^0-9:]")]
  .str.split(" ", n=1)
  .str.get(1)
  .unique() 
)

array(['22 Mar', '10 Jun', '13 Mar', '02 Mar', '10 May', '04 Mar',
       '13 Jun', '28 May', '19 Mar', '07 May', '02 Jun', '16 Jun',
       '19 May', '16 May', '28 Jun', '02 May', '28 Mar', '19 Jun',
       '04 Apr', '25 Mar', '07 Mar', '25 Jun', '07 Jun', '25 May',
       '13 May', '16 Mar', '22 May', '10 Apr', '04 Jun', '20 May',
       '28 Apr', '25 Apr', '10 Mar', '19 Apr', '13 Apr', '02 Apr',
       '23 Mar', '22 Apr', '11 May', '07 Apr', '03 May', '08 Mar',
       '03 Mar', '05 Mar', '22 Jun', '04 May', '26 May', '16 Apr',
       '26 Jun', '29 May', '29 Jun', '29 Mar', '23 May', '17 Jun'],
      dtype=object)

so there r values other than time which just looks like 
above

Duration

In [32]:
flights.Duration

0        2h 50m
1        7h 25m
2           19h
3        5h 25m
4        4h 45m
          ...  
10678    2h 30m
10679    2h 35m
10680        3h
10681    2h 40m
10682    8h 20m
Name: Duration, Length: 10683, dtype: object

In [33]:
(
  flights
  .Duration
  .loc[lambda ser: ~ser.str.contains("m")]
)

2        19h
18       23h
33       22h
44       12h
53        3h
        ... 
10591    23h
10638    14h
10639    38h
10673    15h
10680     3h
Name: Duration, Length: 1031, dtype: object

In [34]:
(
  flights
  .Duration
  .loc[lambda ser: ~ser.str.contains("h")]
)
flights.iloc[[6474]]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
6474,Air India,6/03/2019,Mumbai,Hyderabad,BOM → GOI → PNQ → HYD,16:50,16:55,5m,2 stops,No info,17327


- the observation of index 6474 is not a valid observation
as it takes more than 5 mins to reach form mumbai to hyd

In [35]:
(    
  flights
  .Duration
  .drop(index=6474)
  .str.split(" ", n=1, expand=True)
  .set_axis(['hours', 'minute'], axis=1)
  .assign(
    hours = lambda df_:
      df_
      .hours
      .str.replace("h", "")
      .astype(int)
      .mul(60),  # convert hours to minutes,
    minute = lambda df_:
       df_
       .minute
       .str.replace("m", "")
       .fillna(0)
       .astype(int)
       
  )
  .sum(axis=1)
  # .isna().sum()
  # .dtypes 
)    

0         170
1         445
2        1140
3         325
4         285
         ... 
10678     150
10679     155
10680     180
10681     160
10682     500
Length: 10682, dtype: int64

In [36]:
(    
  flights
  .Duration
  .drop(index=6474)
  .str.split(" ", n=1, expand=True)
  .set_axis(['hours', 'minute'], axis=1)
  .assign(
    hours = lambda df_:
      df_
      .hours
      .str.replace("h", "")
      .astype(int)
      .mul(60),  # convert hours to minutes,
    minute = lambda df_:
       df_
       .minute
       .str.replace("m", "")
       .fillna(0)
       .astype(int)
       
  )
  .sum(axis=1)
  # .isna().sum()
  # .dtypes 
  .rename("duration_minutes")
  .to_frame()
  .join(flights.Duration.drop(index=6474))
)    

Unnamed: 0,duration_minutes,Duration
0,170,2h 50m
1,445,7h 25m
2,1140,19h
3,325,5h 25m
4,285,4h 45m
...,...,...
10678,150,2h 30m
10679,155,2h 35m
10680,180,3h
10681,160,2h 40m


Total_Stops

In [37]:
flights.Total_Stops

0        non-stop
1         2 stops
2         2 stops
3          1 stop
4          1 stop
           ...   
10678    non-stop
10679    non-stop
10680    non-stop
10681    non-stop
10682     2 stops
Name: Total_Stops, Length: 10683, dtype: object

In [38]:
flights.Total_Stops.unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', nan, '4 stops'],
      dtype=object)

In [39]:
flights.Additional_Info.unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [40]:
flights.loc[flights.Total_Stops.isnull()]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
9039,Air India,6/05/2019,Delhi,Cochin,,09:45,09:25 07 May,23h 40m,,No info,7480


- this row has no info abt total_stops and so there is only 1 row with 
missing value, so i will drop this row  of index 9039

In [41]:
(
  flights
  .Total_Stops
  .str.replace("non-stop", "0")
  .str.replace("stops?","", regex=True)  # Remove "stop" or "stops"
  .drop(index=9039)
  .astype(int)  # Convert to integer type
)

0        0
1        2
2        2
3        1
4        1
        ..
10678    0
10679    0
10680    0
10681    0
10682    2
Name: Total_Stops, Length: 10682, dtype: int64

Additional_Info

In [42]:
flights.Additional_Info.unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [43]:
flights.loc[flights.Additional_Info.isnull()]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price


In [44]:
flights.Additional_Info.isna().sum()

np.int64(0)

### 5. Cleaning Operations

In [45]:
def convert_to_minutes(ser):
  return(    
  ser
  .str.split(" ", n=1, expand=True)
  .set_axis(['hours', 'minute'], axis=1)
  .assign(
    hours = lambda df_:
      df_
      .hours
      .str.replace("h", "")
      .astype(int)
      .mul(60),  # convert hours to minutes,
    minute = lambda df_:
       df_
       .minute
       .str.replace("m", "")
       .fillna(0)
       .astype(int),
    
       
  )
  .sum(axis=1)
  # .isna().sum()
  # .dtypes 
)    

In [None]:
def clean_data(df):
  return(
    df
    .drop(index=[6474, 9039])  # Drop rows with index 6474 and 9039
    .drop_duplicates()  
    .assign(**{
      col: df[col].str.strip()
      for col in df.select_dtypes(include='object').columns      
    })
    .rename(columns=str.lower)  # Convert column names to lowercase
    .assign(
      airline = lambda df_:(
        df_
        .airline
        .str.replace(" Premium economy", "")
        .str.replace(" Business", "")
        .str.title() #making capital of first letter of each word
      ),
      destination = lambda df_:(
        df_
        .destination
        .str.replace("New Delhi", "Delhi")
      ),
      date_of_journey = lambda df_: pd.to_datetime(df_.date_of_journey,dayfirst=True),
      dep_time = lambda df_: pd.to_datetime(df_.dep_time).dt.time,
      arrival_time = lambda df_: pd.to_datetime(df_.arrival_time).dt.time,
      duration = lambda df_: df_.duration.pipe(convert_to_minutes),
      total_stops= lambda df_:(
        df_
        .total_stops
        .str.replace("non-stop", "0")
        .str.replace("stops?","", regex=True)  # Remove "stop" or "stops"
        .astype(int)  # Convert to integer type
      ),
      additional_info = lambda df_: (
        df_
        .additional_info
        .str.replace("No Info", "No info")
      )         
    )           
    .drop(columns='route')
  )


In [47]:
flights_cleaned = clean_data(flights)

ValueError: time data "22:20" doesn't match format "%H:%M:%S", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
flights_cleaned

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-03-24,Banglore,Delhi,22:20:00,01:10:00,170,0,No info,3897
1,Air India,2019-05-01,Kolkata,Banglore,05:50:00,13:15:00,445,2,No info,7662
2,Jet Airways,2019-06-09,Delhi,Cochin,09:25:00,04:25:00,1140,2,No info,13882
3,Indigo,2019-05-12,Kolkata,Banglore,18:05:00,23:30:00,325,1,No info,6218
4,Indigo,2019-03-01,Banglore,Delhi,16:50:00,21:35:00,285,1,No info,13302
...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,2019-04-09,Kolkata,Banglore,19:55:00,22:25:00,150,0,No info,4107
10679,Air India,2019-04-27,Kolkata,Banglore,20:45:00,23:20:00,155,0,No info,4145
10680,Jet Airways,2019-04-27,Banglore,Delhi,08:20:00,11:20:00,180,0,No info,7229
10681,Vistara,2019-03-01,Banglore,Delhi,11:30:00,14:10:00,160,0,No info,12648


In [None]:
flights_cleaned.dtypes

airline                    object
date_of_journey    datetime64[ns]
source                     object
destination                object
dep_time                   object
arrival_time               object
duration                    int64
total_stops                 int64
additional_info            object
price                       int64
dtype: object

### 6. Split the data

In [None]:
flights_final = flights_cleaned.sample(1000)

In [None]:
X = flights_final.drop(columns='price')
y = flights_final.price.copy()

In [None]:
X_, X_test, y_, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.2, random_state=42)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(640, 9) (640,)
(160, 9) (160,)
(200, 9) (200,)


### 7. Export the Subsets

In [None]:
def export_data(X, y, name):
	file_name = f"{name}.csv"
	file_path = os.path.join(PROJECT_DIR, DATA_DIR, file_name)

	X.join(y).to_csv(file_path, index=False)

	return pd.read_csv(file_path).head()

In [None]:
export_data(X_train, y_train, "train")


Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-04-09,Delhi,Cochin,05:55:00,19:15:00,2240,2,No info,7711
1,Air India,2019-06-03,Delhi,Cochin,07:10:00,07:40:00,1470,2,No info,12698
2,Air India,2019-05-15,Kolkata,Banglore,16:45:00,21:05:00,1700,1,No info,7452
3,Vistara,2019-06-15,Chennai,Kolkata,07:05:00,09:20:00,135,0,No info,3687
4,Jet Airways,2019-04-09,Delhi,Cochin,23:05:00,19:00:00,1195,2,No info,9483


In [None]:
export_data(X_val, y_val, "val")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-03-01,Banglore,Delhi,22:00:00,13:20:00,920,1,No info,21130
1,Vistara,2019-06-24,Banglore,Delhi,21:00:00,23:50:00,170,0,No info,4668
2,Jet Airways,2019-03-18,Banglore,Delhi,22:50:00,08:15:00,565,1,No info,13555
3,Jet Airways,2019-06-03,Banglore,Delhi,11:10:00,14:05:00,175,0,In-flight meal not included,5769
4,Multiple Carriers,2019-06-09,Delhi,Cochin,09:15:00,19:00:00,585,1,No info,11622


In [None]:
export_data(X_test, y_test, "test")


Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Vistara,2019-05-03,Banglore,Delhi,07:00:00,09:40:00,160,0,No info,4668
1,Indigo,2019-05-18,Delhi,Cochin,08:30:00,21:00:00,750,1,No info,7081
2,Air India,2019-05-09,Delhi,Cochin,12:30:00,09:25:00,1255,1,No info,7480
3,Jet Airways,2019-03-06,Banglore,Delhi,08:00:00,20:20:00,740,1,No info,16736
4,Jet Airways,2019-03-09,Delhi,Cochin,02:15:00,04:25:00,1570,1,No info,15029
