In [None]:
# Attach Google Drive to the Colab filesystem; the data-loading cell below
# reads its CSV files from underneath this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor,BaggingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_squared_error
from pyspark.sql.functions import col, hour, udf
from pyspark.sql.types import StringType
from datetime import datetime
import warnings
# Install a blanket "ignore" filter: no category, module or message is given,
# so every warning raised from this point on is suppressed.
# NOTE(review): this also hides library deprecation/copy warnings that can
# flag real problems — consider restricting it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# --- Data locations: edit here to repoint the notebook at another folder ---
# Single definition of the Drive folder that holds both input CSV files, so the
# directory is not repeated inside each read_csv call.
PROJECT_DATA_DIR = "/content/drive/MyDrive/Project_2024/DSP_Project"
FLIGHT_FARES_CSV = f"{PROJECT_DATA_DIR}/FlightFarePrediction_data.csv"
AIRPORTS_CSV = f"{PROJECT_DATA_DIR}/airports.csv"

# Cap on how many fare rows are read: only the first FLIGHT_ROWS_LIMIT data
# rows of the CSV are loaded. Set to None to read the whole file.
FLIGHT_ROWS_LIMIT = 1_000_000

# Raw inputs. flightprice_df is the (row-capped) fare table; airports_df is
# loaded in full.
flightprice_df = pd.read_csv(FLIGHT_FARES_CSV, nrows=FLIGHT_ROWS_LIMIT)
airports_df = pd.read_csv(AIRPORTS_CSV)

In [None]:
# Section A
# 1. Understanding the data: print a structural overview of flightprice_df.

# Each entry pairs a heading with a zero-argument callable that writes the
# matching report to stdout. DataFrame.info prints by itself (it returns None),
# so it is used directly rather than being wrapped in print().
overview_reports = [
    # First two rows, as a quick look at the columns and value formats
    ("First 2 rows of the DataFrame:\n",
     lambda: print(flightprice_df.head(2))),
    # Column dtypes and non-null counts
    ("Summary of the DataFrame:\n",
     flightprice_df.info),
    # Central tendency and dispersion of the numeric columns
    ("Descriptive statistics:\n",
     lambda: print(flightprice_df.describe())),
    # Per-column count of missing values
    ("Number of missing values in each column:\n",
     lambda: print(flightprice_df.isnull().sum())),
    # Count of fully duplicated rows
    ("Number of duplicate rows in the DataFrame:\n",
     lambda: print(flightprice_df.duplicated().sum())),
    # (rows, columns)
    ("Shape of the DataFrame (rows, columns):\n",
     lambda: print(flightprice_df.shape)),
]

for heading, emit_report in overview_reports:
    print(heading)
    emit_report()
    print("\n" * 2)  # blank-line spacer after every report

First 2 rows of the DataFrame:

                              legId  searchDate  flightDate startingAirport  \
0  65a8a691c4d77487a439d6f44c219a39  2022-04-17  2022-04-21             LGA   
1  1ad3c9a84f803cc98028ba8c67f35941  2022-04-18  2022-04-24             ATL   

  destinationAirport fareBasisCode travelDuration  elapsedDays  \
0                DFW      UA3NA0BQ        PT5H45M            0   
1                LGA      H3AHZNN1        PT2H22M            0   

   isBasicEconomy  isRefundable  ...  segmentsArrivalTimeEpochSeconds  \
0            True         False  ...           1650551340||1650563100   
1           False         False  ...                       1650853920   

                              segmentsArrivalTimeRaw  \
0  2022-04-21T10:29:00.000-04:00||2022-04-21T12:4...   
1                      2022-04-24T22:32:00.000-04:00   

   segmentsArrivalAirportCode  segmentsDepartureAirportCode  \
0                    ATL||DFW                      LGA||ATL   
1               

In [None]:
# Project the raw fare table onto the 14 columns listed below and keep the
# result as a new DataFrame (column order follows the list).
selected_columns = [
    'flightDate',
    'segmentsAirlineName',
    'segmentsArrivalTimeEpochSeconds',
    'segmentsDepartureTimeEpochSeconds',
    'startingAirport',
    'destinationAirport',
    'travelDuration',
    'isBasicEconomy',
    'isRefundable',
    'isNonStop',
    'baseFare',
    'totalFare',
    'seatsRemaining',
    'totalTravelDistance',
]
flightpredict_df = flightprice_df[selected_columns]

# Last expression of the cell: renders the first five rows.
flightpredict_df.head()

Unnamed: 0,flightDate,segmentsAirlineName,segmentsArrivalTimeEpochSeconds,segmentsDepartureTimeEpochSeconds,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,baseFare,totalFare,seatsRemaining,totalTravelDistance
0,2022-04-21,Delta||Delta,1650551340||1650563100,1650542400||1650554580,LGA,DFW,PT5H45M,True,False,False,110.7,142.6,9,1487.0
1,2022-04-24,American Airlines,1650853920,1650845400,ATL,LGA,PT2H22M,False,False,True,450.23,498.6,1,762.0
2,2022-04-22,Alaska Airlines||Alaska Airlines,1650678600||1650697140,1650671280||1650688200,OAK,LAX,PT7H11M,False,False,False,645.02,717.0,7,1628.0
3,2022-05-18,Delta||Delta,1652914020||1652941920,1652904600||1652923320,LGA,SFO,PT10H22M,False,False,False,440.93,497.6,9,2897.0
4,2022-05-06,JetBlue Airways||JetBlue Airways,1651919400||1651929660,1651899120||1651925100,SFO,BOS,PT8H29M,False,False,False,506.05,567.6,3,2751.0


In [None]:
# Section B: data cleaning

# Step 1 - drop rows that are exact duplicates across all selected columns.
flightpredict_df = flightpredict_df.drop_duplicates()

# Step 2 - impute missing values.
# Numeric columns: replace NaN with that column's median.
numerical_cols = flightpredict_df.select_dtypes(include=[np.number]).columns
median_fill = flightpredict_df[numerical_cols].median()
flightpredict_df[numerical_cols] = flightpredict_df[numerical_cols].fillna(median_fill)

# Object/category columns: replace NaN with that column's most frequent value.
# mode() may return several rows when values tie; .iloc[0] keeps the first row.
categorical_cols = flightpredict_df.select_dtypes(include=[object, 'category']).columns
mode_fill = flightpredict_df[categorical_cols].mode().iloc[0]
flightpredict_df[categorical_cols] = flightpredict_df[categorical_cols].fillna(mode_fill)

# Step 3 - report the state of the frame after cleaning.
report_spacer = "\n" * 3

# Remaining missing values per column
print("Number of missing values in each column after cleaning:\n")
print(flightpredict_df.isnull().sum())
print(report_spacer)

# (rows, columns) once duplicates are gone
print("Shape of the DataFrame after cleaning (rows, columns):\n")
print(flightpredict_df.shape)
print(report_spacer)

# Leading rows of the cleaned frame
print("First 5 rows of the cleaned DataFrame:\n")
print(flightpredict_df.head())

Number of missing values in each column after cleaning:

flightDate                           0
segmentsAirlineName                  0
segmentsArrivalTimeEpochSeconds      0
segmentsDepartureTimeEpochSeconds    0
startingAirport                      0
destinationAirport                   0
travelDuration                       0
isBasicEconomy                       0
isRefundable                         0
isNonStop                            0
baseFare                             0
totalFare                            0
seatsRemaining                       0
totalTravelDistance                  0
dtype: int64




Shape of the DataFrame after cleaning (rows, columns):

(916976, 14)




First 5 rows of the cleaned DataFrame:

   flightDate               segmentsAirlineName  \
0  2022-04-21                      Delta||Delta   
1  2022-04-24                 American Airlines   
2  2022-04-22  Alaska Airlines||Alaska Airlines   
3  2022-05-18                      Delta||Delta   
4  2022-05-0