In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from matplotlib import pyplot as plt
from joblib import dump, load

In [2]:
import pandas as pd
df = pd.read_csv('flight data.csv', on_bad_lines='skip')
df.head(5)

Unnamed: 0,from_airport_code,from_country,dest_airport_code,dest_country,aircraft_type,airline_number,airline_name,flight_number,departure_time,arrival_time,duration,stops,price,currency,co2_emissions,avg_co2_emission_for_this_route,co2_percentage,scan_date
0,ALG,Algeria,AEP,Argentina,Airbus A318|Canadair RJ 1000|Airbus A330|Airbu...,multi,[Air France| Iberia| LATAM],AF1491|AF1491|AF1491|AF1491,2022-04-30 14:30:00,2022-05-01 10:15:00,1425,3,1279.0,USD,1320000.0,1320000.0,0%,2022-04-29 17:52:59
1,ALG,Algeria,AEP,Argentina,Airbus A318|Canadair RJ 1000|Boeing 787|Airbus...,multi,[Air France| Iberia| LATAM],AF1491|AF1491|AF1491|AF1491,2022-04-30 14:30:00,2022-05-01 10:15:00,1425,3,1279.0,USD,1195000.0,1320000.0,-9%,2022-04-29 17:52:59
2,ALG,Algeria,AEP,Argentina,Airbus A320|Airbus A321|Boeing 787|Airbus A320,multi,[Air France| LATAM],AF1855|AF1855|AF1855|AF1855,2022-04-30 12:45:00,2022-05-01 10:15:00,1530,3,1284.0,USD,1248000.0,1320000.0,-5%,2022-04-29 17:52:59
3,ALG,Algeria,AEP,Argentina,Airbus A318|Airbus A320|Boeing 787|Airbus A320,multi,[Air France| LATAM],AF1491|AF1491|AF1491|AF1491,2022-04-30 14:30:00,2022-05-01 10:15:00,1425,3,1290.0,USD,1347000.0,1320000.0,2%,2022-04-29 17:52:59
4,ALG,Algeria,AEP,Argentina,Airbus A321neo|Boeing 777|Airbus A320,multi,[Lufthansa| LATAM],LH1317|LH1317|LH1317,2022-04-30 12:35:00,2022-05-01 10:15:00,1540,2,1347.0,USD,1381000.0,1320000.0,4%,2022-04-29 17:52:59


In [3]:
# Drop duplicate values
df.drop_duplicates(inplace=True)

# Drop rows with missing values
df.dropna(inplace=True)

In [4]:
# remove the airline_name column in  pandas DataFrame that contains square brackets []

df['airline_name'] = df['airline_name'].str.replace('[','').str.replace(']','')
df['airline_name'] = df['airline_name'].str.split('|').str[0]

  df['airline_name'] = df['airline_name'].str.replace('[','').str.replace(']','')


In [5]:
# delete the duplicate flight number in column flight_number behind the strings with a | separator

df['flight_number'] = df['flight_number'].str.split('|').str[0]

In [6]:
# Get the datetime info

df['departure_month'] = pd.to_datetime(df['departure_time']).dt.month

In [7]:
# Check Whether the units are standardized

df['currency'].unique()

array(['USD'], dtype=object)

In [8]:
df['co2_percentage'] = df['co2_percentage'].str.replace('%','')
df.head()

Unnamed: 0,from_airport_code,from_country,dest_airport_code,dest_country,aircraft_type,airline_number,airline_name,flight_number,departure_time,arrival_time,duration,stops,price,currency,co2_emissions,avg_co2_emission_for_this_route,co2_percentage,scan_date,departure_month
0,ALG,Algeria,AEP,Argentina,Airbus A318|Canadair RJ 1000|Airbus A330|Airbu...,multi,Air France,AF1491,2022-04-30 14:30:00,2022-05-01 10:15:00,1425,3,1279.0,USD,1320000.0,1320000.0,0,2022-04-29 17:52:59,4
1,ALG,Algeria,AEP,Argentina,Airbus A318|Canadair RJ 1000|Boeing 787|Airbus...,multi,Air France,AF1491,2022-04-30 14:30:00,2022-05-01 10:15:00,1425,3,1279.0,USD,1195000.0,1320000.0,-9,2022-04-29 17:52:59,4
2,ALG,Algeria,AEP,Argentina,Airbus A320|Airbus A321|Boeing 787|Airbus A320,multi,Air France,AF1855,2022-04-30 12:45:00,2022-05-01 10:15:00,1530,3,1284.0,USD,1248000.0,1320000.0,-5,2022-04-29 17:52:59,4
3,ALG,Algeria,AEP,Argentina,Airbus A318|Airbus A320|Boeing 787|Airbus A320,multi,Air France,AF1491,2022-04-30 14:30:00,2022-05-01 10:15:00,1425,3,1290.0,USD,1347000.0,1320000.0,2,2022-04-29 17:52:59,4
4,ALG,Algeria,AEP,Argentina,Airbus A321neo|Boeing 777|Airbus A320,multi,Lufthansa,LH1317,2022-04-30 12:35:00,2022-05-01 10:15:00,1540,2,1347.0,USD,1381000.0,1320000.0,4,2022-04-29 17:52:59,4


In [9]:
# Change object variables to category type

for col in df.columns:
    # Check if the column is of object type
    if df[col].dtype == 'object':
        # If it is, convert it to category type
        df[col] = df[col].astype('category')

In [10]:
df['co2_percentage'] = df['co2_percentage'].astype('int64')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 847355 entries, 0 to 998865
Data columns (total 19 columns):
 #   Column                           Non-Null Count   Dtype   
---  ------                           --------------   -----   
 0   from_airport_code                847355 non-null  category
 1   from_country                     847355 non-null  category
 2   dest_airport_code                847355 non-null  category
 3   dest_country                     847355 non-null  category
 4   aircraft_type                    847355 non-null  category
 5   airline_number                   847355 non-null  category
 6   airline_name                     847355 non-null  category
 7   flight_number                    847355 non-null  category
 8   departure_time                   847355 non-null  category
 9   arrival_time                     847355 non-null  category
 10  duration                         847355 non-null  int64   
 11  stops                            847355 non-null  in

In [11]:
# Convert text data to numbers

df['from_country'] = df['from_country'].cat.codes
df['dest_country'] = df['dest_country'].cat.codes
df['airline_name'] = df['airline_name'].cat.codes

In [12]:
# Define the X columns and Y columns

X = df[['from_country','dest_country','airline_name','duration','stops','co2_emissions','co2_percentage','departure_month']]
Y = df[['price']]

In [13]:
# Standardization

sc = StandardScaler()
X = sc.fit_transform(X)

In [14]:
# Split the dataset into training data and testing data

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size = 0.2, random_state = 42)

## Decision Tree Model

In [15]:
from sklearn.tree import DecisionTreeRegressor
DTR = DecisionTreeRegressor().fit(Xtrain, Ytrain)
Ypred = DTR.predict(Xtest)

In [16]:
# Evaluate the model's performance
r_squared = r2_score(Ytest, Ypred)
mse = mean_squared_error(Ytest, Ypred)
mae = mean_absolute_error(Ytest, Ypred)

# Print the evaluation metrics
print(f"R-squared: {r_squared:.2f}")
print(f"Mean squared error: {mse:.2f}")
print(f"Mean absolute error: {mae:.2f}")

R-squared: 0.82
Mean squared error: 633280.23
Mean absolute error: 308.73


In [17]:
dump(DTR, 'DecisionTree.joblib')

['DecisionTree.joblib']

## KNN Model

In [18]:
from sklearn.neighbors import KNeighborsRegressor
KN = DecisionTreeRegressor().fit(Xtrain, Ytrain)
Ypred = KN.predict(Xtest)

In [19]:
# Evaluate the model's performance
r_squared = r2_score(Ytest, Ypred)
mse = mean_squared_error(Ytest, Ypred)
mae = mean_absolute_error(Ytest, Ypred)

# Print the evaluation metrics
print(f"R-squared: {r_squared:.2f}")
print(f"Mean squared error: {mse:.2f}")
print(f"Mean absolute error: {mae:.2f}")

R-squared: 0.82
Mean squared error: 636241.06
Mean absolute error: 308.37


In [20]:
dump(KN, 'KNN.joblib')

['KNN.joblib']