In [144]:

# Flight Price Prediction:
# We want to predict the price of a flight based on features such as departure location, destination, airline, and flight duration. This is a regression task that uses features like "from_airport_code", "dest_airport_code", "stops", "airline_name", and "duration" to predict the "price" column.


In [145]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsRegressor


In [146]:
# Load the pre-processed flight records from the project's data directory.
data = pd.read_csv(filepath_or_buffer="../data/PreProcessedFlightData.csv")

In [147]:
# One-hot encode the categorical features: each listed column is replaced by
# one indicator column per distinct value.
categorical_columns = [
    'from_airport_code',
    'from_country',
    'dest_airport_code',
    'dest_country',
    'aircraft_type',
    'airline_number',
    'airline_name',
]
# TODO REMOVED FLIGHT NUMBER
df = pd.get_dummies(data=data, columns=categorical_columns)


In [148]:
# Encode airline_name using label encoding (one integer code per distinct value).
# The raw strings are read from `data`, not `df`: the get_dummies call earlier
# in the notebook already replaced the 'airline_name' column of `df` with
# indicator columns. The previous version read X['airline_name'], but `X` is
# only created in a later cell, so this cell failed on Restart & Run All.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['airline_name'] = label_encoder.fit_transform(data['airline_name'])

In [149]:
# Inspect the integer codes that the label encoder assigned to airline_name.
df['airline_name']

0        844
1        721
2       1080
3        776
4        374
        ... 
5312     761
5313    1021
5314    1127
5315     556
5316      85
Name: airline_name, Length: 5317, dtype: int64

In [150]:
# Standardize the numeric 'duration' feature with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down — consider fitting it on the training rows only.
scaler = StandardScaler()
duration_2d = df['duration'].to_numpy().reshape(-1, 1)  # the scaler expects a 2-D array
df['duration'] = scaler.fit_transform(duration_2d)


In [151]:
# Preview the first rows after encoding and scaling.
df.head()

Unnamed: 0,flight_number,departure_time,arrival_time,duration,stops,price,currency,co2_emissions,avg_co2_emission_for_this_route,co2_percentage,...,airline_name_[WestJet| Qantas],airline_name_[WestJet| easyJet],airline_name_[XiamenAir],airline_name_[XiamenAir| Cathay Pacific],airline_name_[XiamenAir| China Southern],airline_name_[XiamenAir| China Southern| ANA],airline_name_[XiamenAir| KLM],airline_name_[XiamenAir| Qantas],airline_name_[easyJet| WestJet],airline_name
0,LH1238|LH1238,2022-05-07 11:50:00,2022-05-07 19:15:00,-1.519162,1,425.0,USD,227000.0,118000.0,91.0,...,False,False,False,False,False,False,False,False,False,844
1,6E2014|6E2014,2022-05-01 13:25:00,2022-05-01 23:40:00,-1.2602,1,104.0,USD,208000.0,183000.0,13.0,...,False,False,False,False,False,False,False,False,False,721
2,ZH9512,2022-05-30 17:30:00,2022-05-30 20:10:00,-1.953304,0,223.0,USD,164000.0,164000.0,0.0,...,False,False,False,False,False,False,False,False,False,1080
3,LA2376|LA2376|LA2376,2022-05-01 17:50:00,2022-05-02 16:14:00,-0.149709,2,2062.0,USD,883000.0,827000.0,6.0,...,False,False,False,False,False,False,False,False,False,776
4,SN2641|SN2641|SN2641,2022-05-14 07:00:00,2022-05-14 21:15:00,-0.986004,2,363.0,USD,333000.0,216000.0,54.0,...,False,False,False,False,False,False,False,False,False,374


In [152]:
# Select relevant features and target variable.
# Columns that are not used as model inputs are dropped first.
irrelevant_columns = [
    'departure_time', 'flight_number', 'arrival_time', 'scan_date',
    'currency', 'co2_emissions', 'avg_co2_emission_for_this_route',
    'co2_percentage',
]
df.drop(irrelevant_columns, axis=1, inplace=True)

# X is df without the price column (the model inputs).
X = df.drop(['price'], axis=1)

# y is the price column (the regression target). The lowercase name is the one
# the train/test split cell passes to train_test_split; this cell previously
# defined only `Y`, so the split raised NameError on a fresh kernel.
y = df['price']
# Keep the original uppercase name as an alias for backward compatibility.
Y = y

In [153]:
# Display the frame after the unused columns were dropped.
df

Unnamed: 0,duration,stops,price,from_airport_code_ADD,from_airport_code_AEP,from_airport_code_ALG,from_airport_code_ATH,from_airport_code_BOG,from_airport_code_BOM,from_airport_code_BRU,...,airline_name_[WestJet| Qantas],airline_name_[WestJet| easyJet],airline_name_[XiamenAir],airline_name_[XiamenAir| Cathay Pacific],airline_name_[XiamenAir| China Southern],airline_name_[XiamenAir| China Southern| ANA],airline_name_[XiamenAir| KLM],airline_name_[XiamenAir| Qantas],airline_name_[easyJet| WestJet],airline_name
0,-1.519162,1,425.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,844
1,-1.260200,1,104.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,721
2,-1.953304,0,223.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1080
3,-0.149709,2,2062.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,776
4,-0.986004,2,363.0,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5312,1.565535,3,3414.0,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,761
5313,1.893046,2,1429.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1021
5314,1.314190,2,1167.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1127
5315,-1.069786,1,868.0,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,556


In [154]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [155]:
# Create a k-NN regression model (not fitted yet).
k = 3  # Number of neighbours passed to the regressor; tune as needed
knn = KNeighborsRegressor(n_neighbors=k)

In [156]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,duration,stops,from_airport_code_ADD,from_airport_code_AEP,from_airport_code_ALG,from_airport_code_ATH,from_airport_code_BOG,from_airport_code_BOM,from_airport_code_BRU,from_airport_code_CAI,...,airline_name_[WestJet| Qantas],airline_name_[WestJet| easyJet],airline_name_[XiamenAir],airline_name_[XiamenAir| Cathay Pacific],airline_name_[XiamenAir| China Southern],airline_name_[XiamenAir| China Southern| ANA],airline_name_[XiamenAir| KLM],airline_name_[XiamenAir| Qantas],airline_name_[easyJet| WestJet],airline_name
3214,-0.224351,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,227
5273,-0.449800,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,267
1912,0.202175,2,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,374
4899,-0.041554,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,108
3906,-0.562525,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,-0.589944,2,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,590
3772,1.078077,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,699
5191,0.438288,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,730
5226,0.255491,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,880


In [157]:

# Train the k-NN model on the training split.
# NOTE(review): X_train mixes a standardized 'duration', a small integer
# 'stops', boolean indicator columns and an integer-coded 'airline_name'
# (codes in the hundreds to thousands), so the unscaled airline_name code
# likely dominates the neighbour distances — consider scaling or dropping it.
knn.fit(X_train, y_train)

In [158]:
# Make price predictions for the held-out test rows.
y_pred = knn.predict(X_test)

In [160]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the test-set predictions: mean squared error and the coefficient of
# determination (R-squared).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report each metric on its own line.
for label, value in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(label, value)

Mean Squared Error: 2999330.548802562
R-squared: 0.1043682700001004


## Model Evaluation

Results for the Model:

- **Mean Squared Error (MSE):** 2,999,330.55
- **R-squared (R²):** 0.1044


- **Mean Squared Error (MSE):** This is the average of the squared differences between my model's predictions and the actual prices. The MSE of 2,999,330.55 is quite high: its square root (the RMSE) is about 1,732, so a typical prediction is off by roughly 1,732 price units.
  
- *My model's predictions have a mean squared error of approximately 2,999,330.55 (in squared price units). Lower MSE values are better, so a lower MSE would have indicated a better model.*

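- **R-squared (R²):** R-squared measures how well my model explains the variance in the dependent variable (in this case, flight prices) based on the independent variables (features). Its maximum is 1; a value of 0 means the model does no better than always predicting the mean price, and it can even be negative on test data when the model does worse than that. A higher R² is better. 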
  
- *My R² of 0.1044 means that my model explains only about 10.44% of the variance in the flight prices. This suggests that my model's features do not account for much of the price variation, and there's a lot of unexplained variance.*

In summary, my model's MSE is relatively high, indicating significant prediction errors, and the R² is low, indicating that my features don't explain much of the price variation. Trying different models or exploring different features is the best move in this case. 
