In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder
# Install a blanket "ignore" filter so no warnings are shown by later cells.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")


In [None]:
# Load the flight-price CSV (path is relative to the notebook's working
# directory) into the working DataFrame `df`, then preview the first rows.
df = pd.read_csv("Flight_Price.csv") 
df.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Show the (rows, columns) dimensions of the raw data.
df.shape



In [None]:
# Descriptive statistics for the frame's columns.
df.describe()


In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:

# Count missing values per column.
# NOTE(review): pandas documents isna() and isnull() as aliases, so this
# repeats the check made by the preceding cell.
df.isna().sum()

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna()


In [None]:


# Step 2: Convert Date_of_Journey to Journey_day and Journey_month
# Each source column is parsed exactly once and the parsed Series is reused
# for every derived feature (the parse is the expensive part).
journey_dates = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")
df["Journey_day"] = journey_dates.dt.day
df["Journey_month"] = journey_dates.dt.month
df.drop("Date_of_Journey", axis=1, inplace=True)

# Step 3: Convert Dep_Time into hour and minute
# NOTE(review): no explicit format is passed, so pandas infers it from the
# data — confirm the inferred format fits every row of this column.
departure_times = pd.to_datetime(df["Dep_Time"])
df["Dep_hour"] = departure_times.dt.hour
df["Dep_minute"] = departure_times.dt.minute
df.drop("Dep_Time", axis=1, inplace=True)

# Step 4: Convert Arrival_Time into hour and minute
# NOTE(review): same format-inference caveat as Dep_Time above.
arrival_times = pd.to_datetime(df["Arrival_Time"])
df["Arrival_hour"] = arrival_times.dt.hour
df["Arrival_minute"] = arrival_times.dt.minute
df.drop("Arrival_Time", axis=1, inplace=True)

# Step 5: Convert Duration into total minutes
def convert_duration_to_minutes(x):
    """Convert a duration string such as "2h 50m", "19h" or "45m" to minutes.

    Args:
        x: Duration text; surrounding whitespace is ignored.

    Returns:
        Total minutes as an int, or 0 when the text contains neither an
        'h' nor an 'm' marker.

    Raises:
        ValueError: If the text around the markers is not an integer.
    """
    text = x.strip()
    has_hours = 'h' in text
    has_minutes = 'm' in text

    # Nothing recognisable to parse.
    if not (has_hours or has_minutes):
        return 0

    if has_hours and has_minutes:
        # "<H>h <M>m": everything before 'h' is hours, the rest is minutes.
        hours_part, minutes_part = text.split('h')
        return 60 * int(hours_part.strip()) + int(minutes_part.replace('m', '').strip())

    # Exactly one unit is present; scale the number accordingly.
    unit = 'h' if has_hours else 'm'
    factor = 60 if has_hours else 1
    return int(text.replace(unit, '').strip()) * factor

# Replace the textual Duration column with its length in minutes
df["Duration_mins"] = df["Duration"].map(convert_duration_to_minutes)
df = df.drop(columns=["Duration"])

# Step 6: Label Encode Total_Stops
# `le` is kept as a module-level name because a later cell pickles it.
le = LabelEncoder()
le.fit(df["Total_Stops"])
df["Total_Stops"] = le.transform(df["Total_Stops"])

# Step 7: Drop Route column (too many unique text paths) or optionally encode it
df = df.drop(columns=["Route"])

# Step 8: One-hot encode categorical columns
categorical_columns = ["Airline", "Source", "Destination", "Additional_Info"]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Final check
print(df.shape)
print(df.head())


In [None]:
#1. Price Distribution
# sns.distplot is deprecated in seaborn; histplot is its documented
# replacement. histplot's default statistic is the raw count per bin, which
# also makes the 'Count' y-axis label below accurate.
plt.figure(figsize=(8, 5))
sns.histplot(df['Price'], bins=50, kde=True, color='teal')
plt.title('Distribution of Flight Prices')
plt.xlabel('Price (INR)')
plt.ylabel('Count')
plt.show()


In [None]:
#Price vs Total Stops
# One box per encoded Total_Stops value, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='Total_Stops', y='Price', ax=ax)
ax.set_title('Flight Price vs Total Stops')
ax.set_xlabel('Number of Stops')
ax.set_ylabel('Price (INR)')
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the CSV again into a separate frame so the original 'Airline' text
# column is available (in `df` it has been one-hot encoded by this point).
original_df = pd.read_csv('Flight_Price.csv')

# Box plot of Price for each Airline, with slanted tick labels
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=original_df, x='Airline', y='Price', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Flight Price by Airline')
ax.set_ylabel('Price (INR)')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations of all columns, rendered as an annotated heatmap.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Target is Price; every other column is a predictor
y = df['Price']
X = df.drop(columns=['Price'])

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling (for Linear Regression only): statistics are learned from the
# training split and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the standardised training features.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

y_pred_lr = lr.predict(X_test_scaled)

print("Linear Regression")
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated and absent from recent scikit-learn
# releases, whereas this form works on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lr)))
print("R2 Score:", r2_score(y_test, y_pred_lr))


In [None]:
pip install xgboost


In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees trained on the unscaled features.
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)

print("XGBoost")
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated and absent from recent scikit-learn
# releases, whereas this form works on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_xgb)))
print("R2 Score:", r2_score(y_test, y_pred_xgb))


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest trained on the unscaled features; `rf` and `y_pred_rf` are
# reused by later cells (feature importance, pickling, MLflow logging).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest")
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated and absent from recent scikit-learn
# releases, whereas this form works on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))
print("R2 Score:", r2_score(y_test, y_pred_rf))


In [None]:
import matplotlib.pyplot as plt

# Rank the forest's feature importances from largest to smallest.
importances = rf.feature_importances_
features = X.columns
sorted_idx = importances.argsort()[::-1]

# One bar per feature, labelled with the matching column name.
bar_positions = range(len(importances))
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(bar_positions, importances[sorted_idx], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(features[sorted_idx], rotation=90)
ax.set_title("Feature Importance (Random Forest)")
fig.tight_layout()
plt.show()


In [None]:
import pickle
import os

# Fitted objects to persist, keyed by the file each one is written to.
artifacts = {
    "final_model.pkl": rf,                 # Random Forest model
    "scaler.pkl": scaler,                  # scaler (used for Linear Regression only)
    "label_encoder_total_stops.pkl": le,   # LabelEncoder (for Total_Stops)
}

# Write each object to its own pickle file in the working directory.
for filename, artifact in artifacts.items():
    with open(filename, "wb") as handle:
        pickle.dump(artifact, handle)

print("✅ Pickle files saved successfully.")


In [None]:
import mlflow
import os
from pathlib import Path

# Define a local path where MLflow can safely write
mlflow_tracking_dir = os.path.abspath("mlruns")  # or give a full valid path

# Make sure the directory exists
os.makedirs(mlflow_tracking_dir, exist_ok=True)

# Set the MLflow tracking URI to the local folder.
# Path.as_uri() builds a well-formed file URI on every platform (e.g.
# file:///C:/... on Windows) and percent-encodes special characters, which
# plain "file://" + path concatenation does not.
mlflow.set_tracking_uri(Path(mlflow_tracking_dir).as_uri())


In [None]:
# Imported explicitly so mlflow.sklearn.log_model below does not rely on the
# submodule being exposed by a bare `import mlflow`.
import mlflow.sklearn

# Close any run left open (e.g. by a previous, interrupted execution).
if mlflow.active_run():
    mlflow.end_run()

with mlflow.start_run(run_name="RandomForest_Local"):
    # Log the hyper-parameters from the fitted model itself so the record
    # cannot drift from what was actually trained.
    mlflow.log_param("n_estimators", rf.n_estimators)
    mlflow.log_param("random_state", rf.random_state)

    # RMSE as sqrt(MSE): the `squared=False` shortcut of mean_squared_error
    # is deprecated and absent from recent scikit-learn releases.
    rmse_rf = float(np.sqrt(mean_squared_error(y_test, y_pred_rf)))
    r2_rf = r2_score(y_test, y_pred_rf)

    mlflow.log_metric("rmse", rmse_rf)
    mlflow.log_metric("r2_score", r2_rf)

    # Store the fitted forest as a run artifact.
    mlflow.sklearn.log_model(rf, artifact_path="rf_model")

    print(f"✅ Model logged. RMSE: {rmse_rf:.2f}, R²: {r2_rf:.2f}")


In [39]:
import pickle

# Persist the ordered list of training column names.
feature_names = X.columns.tolist()
with open("features.pkl", "wb") as features_file:
    pickle.dump(feature_names, features_file)
