In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
sns.set()

In [2]:
dataset = pd.read_csv("economy.csv")

In [3]:
pd.set_option('display.max_columns', None)

dataset.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price
0,11-02-2022,SpiceJet,SG,8709,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953
1,11-02-2022,SpiceJet,SG,8157,06:20,Delhi,02h 20m,non-stop,08:40,Mumbai,5953
2,11-02-2022,AirAsia,I5,764,04:25,Delhi,02h 10m,non-stop,06:35,Mumbai,5956
3,11-02-2022,Vistara,UK,995,10:20,Delhi,02h 15m,non-stop,12:35,Mumbai,5955
4,11-02-2022,Vistara,UK,963,08:50,Delhi,02h 20m,non-stop,11:10,Mumbai,5955


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206774 entries, 0 to 206773
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        206774 non-null  object
 1   airline     206774 non-null  object
 2   ch_code     206774 non-null  object
 3   num_code    206774 non-null  int64 
 4   dep_time    206774 non-null  object
 5   from        206774 non-null  object
 6   time_taken  206774 non-null  object
 7   stop        206774 non-null  object
 8   arr_time    206774 non-null  object
 9   to          206774 non-null  object
 10  price       206774 non-null  object
dtypes: int64(1), object(10)
memory usage: 17.4+ MB


In [5]:
dataset.isnull().sum()

date          0
airline       0
ch_code       0
num_code      0
dep_time      0
from          0
time_taken    0
stop          0
arr_time      0
to            0
price         0
dtype: int64

In [6]:
dataset["journey_day"] = pd.to_datetime(dataset.date, format="%d-%m-%Y").dt.day
dataset["journey_month"] = pd.to_datetime(dataset["date"], format = "%d-%m-%Y").dt.month
dataset.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,journey_day,journey_month
0,11-02-2022,SpiceJet,SG,8709,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953,11,2
1,11-02-2022,SpiceJet,SG,8157,06:20,Delhi,02h 20m,non-stop,08:40,Mumbai,5953,11,2
2,11-02-2022,AirAsia,I5,764,04:25,Delhi,02h 10m,non-stop,06:35,Mumbai,5956,11,2
3,11-02-2022,Vistara,UK,995,10:20,Delhi,02h 15m,non-stop,12:35,Mumbai,5955,11,2
4,11-02-2022,Vistara,UK,963,08:50,Delhi,02h 20m,non-stop,11:10,Mumbai,5955,11,2


In [7]:
dataset.drop(["date"], axis = 1, inplace = True)
dataset.head()

Unnamed: 0,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,journey_day,journey_month
0,SpiceJet,SG,8709,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953,11,2
1,SpiceJet,SG,8157,06:20,Delhi,02h 20m,non-stop,08:40,Mumbai,5953,11,2
2,AirAsia,I5,764,04:25,Delhi,02h 10m,non-stop,06:35,Mumbai,5956,11,2
3,Vistara,UK,995,10:20,Delhi,02h 15m,non-stop,12:35,Mumbai,5955,11,2
4,Vistara,UK,963,08:50,Delhi,02h 20m,non-stop,11:10,Mumbai,5955,11,2


In [8]:
# Extracting Hours
dataset["dep_hour"] = pd.to_datetime(dataset["dep_time"]).dt.hour
dataset["dep_min"] = pd.to_datetime(dataset["dep_time"]).dt.minute
dataset.drop(["dep_time"], axis = 1, inplace = True)
dataset.head()

Unnamed: 0,airline,ch_code,num_code,from,time_taken,stop,arr_time,to,price,journey_day,journey_month,dep_hour,dep_min
0,SpiceJet,SG,8709,Delhi,02h 10m,non-stop,21:05,Mumbai,5953,11,2,18,55
1,SpiceJet,SG,8157,Delhi,02h 20m,non-stop,08:40,Mumbai,5953,11,2,6,20
2,AirAsia,I5,764,Delhi,02h 10m,non-stop,06:35,Mumbai,5956,11,2,4,25
3,Vistara,UK,995,Delhi,02h 15m,non-stop,12:35,Mumbai,5955,11,2,10,20
4,Vistara,UK,963,Delhi,02h 20m,non-stop,11:10,Mumbai,5955,11,2,8,50


In [9]:
dataset["arrival_hour"] = pd.to_datetime(dataset["arr_time"]).dt.hour
dataset["arrival_min"] = pd.to_datetime(dataset["arr_time"]).dt.minute
dataset.drop(["arr_time"], axis = 1, inplace = True)
dataset.head()

Unnamed: 0,airline,ch_code,num_code,from,time_taken,stop,to,price,journey_day,journey_month,dep_hour,dep_min,arrival_hour,arrival_min
0,SpiceJet,SG,8709,Delhi,02h 10m,non-stop,Mumbai,5953,11,2,18,55,21,5
1,SpiceJet,SG,8157,Delhi,02h 20m,non-stop,Mumbai,5953,11,2,6,20,8,40
2,AirAsia,I5,764,Delhi,02h 10m,non-stop,Mumbai,5956,11,2,4,25,6,35
3,Vistara,UK,995,Delhi,02h 15m,non-stop,Mumbai,5955,11,2,10,20,12,35
4,Vistara,UK,963,Delhi,02h 20m,non-stop,Mumbai,5955,11,2,8,50,11,10


In [11]:
duration = list(dataset["time_taken"])
for i in range(len(duration)):
    if len(duration[i].split()) != 2:    # Check if duration contains only hour or mins
        if "h" in duration[i]:
            duration[i] = duration[i].strip() + " 0m"   # Adds 0 minute
        else:
            duration[i] = "0h " + duration[i] # Adds 0 hour
dataset.head()


# In[19]:


duration_hours = []
duration_mins = []

for i in range(len(duration)):
    if((duration[i]=='1.03h m') or (duration[i]=='1.02h m')or (duration[i]=='1.01h m')):
        if((duration[i]=='1.03h m')): 
            duration_hours.append(1)
            duration_mins.append(3)
        elif (duration[i]=='1.02h m'):
            duration_hours.append(1)
            duration_mins.append(2)
        elif (duration[i]=='1.01h m'):
            duration_hours.append(1)
            duration_mins.append(3)
    else:
        duration_hours.append(int(duration[i].split(sep = "h")[0]))    # Extract hours from duration
        duration_mins.append(int(duration[i].split(sep = "m")[0].split()[-1]))

# Extracts only minutes from duration
# Add duration_hours and duration_mins list to our dataset df
dataset["Duration_hours"] = duration_hours
dataset["Duration_mins"] = duration_mins
# Drop Duration column from the dataset
dataset.drop(["time_taken"], axis = 1, inplace = True)

dataset.head()

Unnamed: 0,airline,ch_code,num_code,from,stop,to,price,journey_day,journey_month,dep_hour,dep_min,arrival_hour,arrival_min,Duration_hours,Duration_mins
0,SpiceJet,SG,8709,Delhi,non-stop,Mumbai,5953,11,2,18,55,21,5,2,10
1,SpiceJet,SG,8157,Delhi,non-stop,Mumbai,5953,11,2,6,20,8,40,2,20
2,AirAsia,I5,764,Delhi,non-stop,Mumbai,5956,11,2,4,25,6,35,2,10
3,Vistara,UK,995,Delhi,non-stop,Mumbai,5955,11,2,10,20,12,35,2,15
4,Vistara,UK,963,Delhi,non-stop,Mumbai,5955,11,2,8,50,11,10,2,20


In [12]:
dataset["airline"].value_counts()

Vistara      67270
Air India    47996
Indigo       43120
GO FIRST     23177
AirAsia      16098
SpiceJet      9011
StarAir         61
Trujet          41
Name: airline, dtype: int64

In [13]:
Airline = dataset[["airline"]]
Current_Airline_List = Airline['airline']
New_Airline_List = []
for carrier in Current_Airline_List:
    if carrier in ['GO FIRST', 'Indigo', 'Air India', 'SpiceJet',
       'Trujet', 'StarAir', 'Vistara', 'AirAsia']:
        New_Airline_List.append(carrier)
    else:
        New_Airline_List.append('Other')
Airline['airline'] = pd.DataFrame(New_Airline_List)
Airline['airline'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Airline['airline'] = pd.DataFrame(New_Airline_List)


Vistara      67270
Air India    47996
Indigo       43120
GO FIRST     23177
AirAsia      16098
SpiceJet      9011
StarAir         61
Trujet          41
Name: airline, dtype: int64

In [14]:
print(dataset["from"].value_counts())
# As Source is Nominal Categorical data we will perform OneHotEncoding
Source = dataset[["from"]]
Source = pd.get_dummies(Source, drop_first= True) 
# drop_first= True means we drop the first column to prevent multicollinearity
Source.head()

Delhi        43029
Mumbai       41045
Bangalore    35665
Kolkata      32874
Hyderabad    27990
Chennai      26171
Name: from, dtype: int64


Unnamed: 0,from_Chennai,from_Delhi,from_Hyderabad,from_Kolkata,from_Mumbai
0,0,1,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0


In [15]:
# Feature engineering on: Destination
print(dataset["to"].value_counts())
# Renaming destination 'New Delhi' to 'Delhi' - to match with Source
Destination = dataset[["to"]]
Current_Destination_List = Destination['to']
New_Destination_List = []
for value in Current_Destination_List:
    if value in ['New Delhi']:
        New_Destination_List.append('Delhi')
    else:
        New_Destination_List.append(value)
Destination['to'] = pd.DataFrame(New_Destination_List)

# As Destination is Nominal Categorical data we will perform OneHotEncoding
Destination = pd.get_dummies(Destination, drop_first = True)
Destination.head()

Delhi        40654
Mumbai       40118
Bangalore    34914
Kolkata      34777
Hyderabad    29101
Chennai      27210
Name: to, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Destination['to'] = pd.DataFrame(New_Destination_List)


Unnamed: 0,to_Chennai,to_Delhi,to_Hyderabad,to_Kolkata,to_Mumbai
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1
