<a href="https://colab.research.google.com/github/makhmudov-khondamir/Machine-Learning-Projects/blob/main/Project%3A%20Airfare%20price%20prediction(Updated).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Project: Airfare price prediction (Updated)**
Predicting the future prices of airline tickets.

In [2]:

# Unpack the zipped dataset so the CSV files can be read by the cells below.

import os
import zipfile

# Archive to unpack (resolved relative to the current working directory).
zip_path = 'aviachipta-narxini-bashorat-qilish.zip'

# Folder that receives the archive contents.
new_file_name = '/content/extracted_files'

# Make sure the target folder is present; an already existing one is left as is.
os.makedirs(new_file_name, exist_ok=True)

# Write every archive member into the target folder; the context manager
# closes the archive afterwards.
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=new_file_name)

In [3]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
from xgboost import XGBRegressor

In [4]:
# Load the training data. 'id' is removed right away (without mutating a frame
# in place) so that it never ends up in the feature matrix.
df = pd.read_csv("/content/extracted_files/train_data.csv").drop(columns='id')

# Features are every remaining column except the target, 'price'.
x = df.drop(columns='price')
y = df['price']

# Column names grouped by dtype; the ColumnTransformer further down uses these
# lists to route each column to the matching preprocessing step.
categorical = list(x.select_dtypes(include=['object']).columns)
numerical = list(x.select_dtypes(include=['number']).columns)

# Hold out 20% of the rows for a final check; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Preprocessing: one-hot encode the categorical columns and standardise the
# numerical ones. handle_unknown='ignore' makes a category that was not seen
# during fitting encode as all zeros instead of raising at transform time.
pipelineCat = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
pipelineNum = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

fullpipeline = ColumnTransformer(transformers=[
    ('categorical', pipelineCat, categorical),
    ('numerical', pipelineNum, numerical),
])

# Learn encoder categories and scaler statistics from the training split only,
# then apply the fitted transformer unchanged to the hold-out split.
Xtrain = fullpipeline.fit_transform(x_train)
Xtest = fullpipeline.transform(x_test)

# Baseline: an XGBoost regressor with default hyperparameters, scored with
# shuffled 5-fold cross-validation on the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = XGBRegressor(random_state=42)

# NOTE(review): Xtrain comes from a transformer that was already fitted on the
# whole training split, so each validation fold contributed to the encoder
# categories and scaler statistics. If that matters for the chosen model,
# cross-validate a single Pipeline (preprocessing + model) on x_train instead.
cv_scores = cross_val_score(
    model,
    Xtrain,
    y_train,
    cv=kf,
    scoring='neg_mean_squared_error',
)

# The scorer returns negated MSE per fold: flip the sign of the fold average,
# then take the square root (i.e. the root of the mean fold MSE).
mean_cv_score = -cv_scores.mean()
print(f'Cross-Validation Mean RMSE: {np.sqrt(mean_cv_score)}')

# cross_val_score only fits clones, so `model` itself still has to be fitted.
model.fit(Xtrain, y_train)

# Despite the 'RF' suffix, these hold the hold-out results of the XGBoost model above.
predictionRF = model.predict(Xtest)
mseRF = mean_squared_error(y_test, predictionRF)
rmseRF = np.sqrt(mseRF)
print(f'Test RMSE: {rmseRF}')

from sklearn.model_selection import GridSearchCV

# Hyperparameter search: an exhaustive grid of 2 * 2 * 3 * 2 = 24 XGBoost
# settings, each scored with the same folds (kf) as the baseline model.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 6, 9],
    subsample=[0.8, 1.0],
)

grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,  # run the candidate fits on all available cores
)
grid_search.fit(Xtrain, y_train)

# refit is left at its default (True), so best_estimator_ has already been
# retrained on all of Xtrain with the winning settings.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Score the tuned model on the hold-out split.
best_predictions = best_model.predict(Xtest)
best_mse = mean_squared_error(y_test, best_predictions)
best_rmse = np.sqrt(best_mse)

print(f'Best Model RMSE: {best_rmse}')
print(f'Best Hyperparameters: {best_params}')


Cross-Validation Mean RMSE: 3875.95521670487
Test RMSE: 3771.454832186972
Best Model RMSE: 3555.9087616560564
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 200, 'subsample': 0.8}


**Testing**

In [5]:
# Load the test data. Its 'id' column is kept because the output file needs it.
test_set = pd.read_csv("/content/extracted_files/test_data.csv")

# Reuse the transformer fitted on the training split (transform only, no
# refitting). The ColumnTransformer picks the training feature columns by name,
# so the extra 'id' column is dropped (remainder defaults to 'drop').
preparedX = fullpipeline.transform(test_set)
prediction = best_model.predict(preparedX)

# One row per test id with its predicted price, written without the index.
sol = pd.DataFrame({'id': test_set['id'], 'price': prediction})
sol.to_csv('sol.csv', index=False)