<a href="https://colab.research.google.com/github/manojDOX/Train-Price-Prediction-MLOPS/blob/main/Train_Price_Prediction_MLOPs_colabfile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# include required library
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import plotly.express as px
import seaborn as sns

In [None]:
# Load the train-ticket dataset from a CSV file shared on Google Drive.

# Drive file identifier of the shared CSV
file_id = "1mpkljxu5o4xenVe9bJu4ZbY1vkUNP8ig"
# Direct-download URL built from the file id, so pandas can fetch the CSV itself
url = f"https://drive.google.com/uc?export=download&id={file_id}"

# Read the CSV into a DataFrame and preview the first rows
df = pd.read_csv(url)
df.head()

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row
df.describe().T

In [None]:
# Summary of the object-dtype (text) columns only, one row per column
df.select_dtypes(include=['object']).describe().T

In [None]:
# Percentage of missing values per column (mean of the boolean null mask, times 100)
df.isnull().mean() * 100

In [None]:
# Check the skewness of the continuous `price` column to choose an imputation strategy.
# The message is derived from the measured value instead of being hard-coded, so it
# cannot claim "< 0.5" when the data says otherwise.
price_skew = df['price'].skew()
if abs(price_skew) < 0.5:
    print(f"skewness : {price_skew:.2f} -> |skew| < 0.5, roughly symmetric, so mean imputation is reasonable")
else:
    print(f"skewness : {price_skew:.2f} -> |skew| >= 0.5, consider median imputation instead of the mean")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# Category frequencies of train_class and fare
train_freq = df['train_class'].value_counts()
fare_freq = df['fare'].value_counts()

# One entry per panel: (axis, counts, title, x-axis label)
panels = [
    (axes[0], train_freq, "Plot 1 : train_class", "Class"),
    (axes[1], fare_freq, "Plot 2 : fare", "Fare"),
]

# Both panels are drawn the same way, so render them in a single loop
for panel_ax, counts, panel_title, x_label in panels:
    panel_ax.bar(counts.index, counts.values)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Frequency")
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Impute missing prices with the column mean.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on `df['price']`: the in-place form operates on an
# intermediate object, emits a FutureWarning on pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is active.
price_mean = df['price'].mean()
df['price'] = df['price'].fillna(price_mean)

In [None]:
# Number of missing values left in the price column
df['price'].isnull().sum()

In [None]:
# Impute missing categorical values with fixed labels.
# Assign the result back instead of using inplace=True on a column selection,
# which does not reliably modify `df` (no-op under pandas Copy-on-Write,
# FutureWarning on pandas 2.x).
# NOTE(review): 'Turista' / 'Promo' are presumably the most frequent categories
# seen in the bar charts - confirm against the value counts.
df['train_class'] = df['train_class'].fillna('Turista')
df['fare'] = df['fare'].fillna('Promo')

In [None]:
# Percentage of missing values per column (mean of the boolean null mask, times 100)
df.isnull().mean() * 100

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Column labels currently present in the frame
df.columns

In [None]:
# Convert both journey timestamp columns to pandas datetimes
for date_col in ('start_date', 'end_date'):
    df[date_col] = pd.to_datetime(df[date_col])


In [None]:
# Re-inspect dtypes after converting start_date / end_date
df.info()

In [None]:
# Convert the 'insert_date' column to pandas datetimes
df['insert_date'] = pd.to_datetime(df['insert_date'])

In [None]:
# Re-inspect dtypes after converting insert_date
df.info()

In [None]:
# Build the feature frame on a copy so `df` itself is not modified here.
# total_hour = (end_date - start_date) expressed in units of one hour.
journey_span = df['end_date'] - df['start_date']
df_demo = df.copy()
df_demo['total_hour'] = journey_span / pd.Timedelta(hours=1)

# Show both timestamps next to the derived duration
df_demo[['start_date', 'total_hour', 'end_date']]

In [None]:
# Join origin and destination into a single space-separated route label
df_demo['route_path'] = df_demo['origin'].str.cat(df_demo['destination'], sep=" ")
df_demo

In [None]:
# Keep only the columns used for modelling.
# .copy() makes df_demo an independent frame: without it, columns assigned to
# df_demo afterwards are written onto a selection of another frame and pandas
# raises SettingWithCopyWarning.
df_demo = df_demo[['train_type','price','train_class','fare','total_hour','route_path']].copy()

In [None]:
# Display the reduced feature frame
df_demo

In [None]:
# Hours between insert_date and start_date, both read from `df`;
# the values line up with df_demo rows through the shared index.
lead_time = df['start_date'] - df['insert_date']
df_demo = df_demo.assign(To_journey_hour=lead_time / pd.Timedelta(hours=1))

In [None]:
# Preview the first rows including the new To_journey_hour column
df_demo.head()

In [None]:
# Capture the column labels of df_demo (a pandas Index, not a Python list) and display them
list_df_demo = df_demo.columns
list_df_demo

In [None]:
# Print the distinct values of each categorical column, one block per column
for column_name in ('train_type', 'train_class', 'fare', 'route_path'):
    print(f"{column_name}:")
    print(df_demo[column_name].unique())
    print()

In [None]:
# Translate the Spanish category labels into English descriptions.

# --- Train Type Translation ---
train_type_map = {
    'MD-AVE': 'Medium Distance + High Speed AVE',
    'MD-LD': 'Medium Distance + Long Distance',
    'ALVIA': 'High Speed + Long Distance Hybrid',
    'REGIONAL': 'Regional',
    'AVE': 'High Speed AVE',
    'INTERCITY': 'Intercity',
    'AVE-MD': 'High Speed AVE + Medium Distance',
    'AVE-LD': 'High Speed AVE + Long Distance',
    'R. EXPRES': 'Regional Express',
    'AVE-TGV': 'International High Speed AVE-TGV',
    'AV City': 'Low Cost High Speed AVE',
    'MD': 'Medium Distance',
    'LD-MD': 'Long Distance + Medium Distance',
    'LD': 'Long Distance',
    'LD-AVE': 'Long Distance + High Speed AVE',
    'TRENHOTEL': 'Night Train (Sleeper)'
}

# --- Train Class Translation ---
train_class_map = {
    'Turista con enlace': 'Economy with Connection',
    'Turista': 'Economy',
    'Preferente': 'First Class',
    'Turista Plus': 'Premium Economy',
    'Cama Turista': 'Economy Sleeper Bed'
}

# --- Fare Translation ---
fare_map = {
    'Flexible': 'Flexible Fare',
    'Promo +': 'Promo Plus',
    'Promo': 'Promo',
    'Adulto ida': 'Adult One-way',
    'Mesa': 'Group Shared Table'
}

# ---- APPLY CHANGES ----
# Series.map(dict) replaces every label that is not a key with NaN. That silently
# drops unexpected categories and wipes the whole column if this cell is executed
# twice (the translated labels are not keys). Falling back to the current value
# keeps unknown labels intact and makes the cell safe to re-run.
for column_name, translation in (
    ('train_type', train_type_map),
    ('train_class', train_class_map),
    ('fare', fare_map),
):
    current_labels = df_demo[column_name]
    df_demo[column_name] = current_labels.map(translation).fillna(current_labels)


In [None]:
# Display the frame after translating the category labels
df_demo

In [None]:
# Binary flag: 1 when the translated train_class is in the luxury list, else 0
luxury_classes = ["First Class", "Premium Economy", "Economy Sleeper Bed"]
luxury_mask = df_demo["train_class"].isin(luxury_classes)
df_demo["is_luxury_class"] = luxury_mask.astype(int)


In [None]:
# Numeric score per translated fare label.
# Labels absent from this dict map to NaN in fare_score.
fare_score_map = {
    "Flexible Fare": 2,
    "Promo Plus": 1,
    "Promo": 0,
    "Adult One-way": 0,
    "Group Shared Table": 0,
}
fare_scores = df_demo["fare"].map(fare_score_map)
df_demo["fare_score"] = fare_scores


In [None]:
# Recover origin and destination from the space-separated route label.
# NOTE(review): only the first two tokens are kept, so a station name that
# itself contains a space would be truncated - confirm none exist in the data.
route_tokens = df_demo["route_path"].str.split(" ")
df_demo["origin"] = route_tokens.str[0]
df_demo["destination"] = route_tokens.str[1]


In [None]:
# Display the frame with all engineered columns
df_demo

In [None]:
from sklearn.preprocessing import LabelEncoder

# Build the modelling table from a copy of df_demo.
# NOTE: this rebinds `df`, so the frame that held start_date / end_date /
# insert_date is no longer reachable under that name; earlier cells that read
# those columns need the CSV to be reloaded before they can be re-run.
df = df_demo.copy()

categorical_cols = ['train_type', 'train_class', 'fare',
                    'origin', 'destination']

# Fit one encoder per column and keep each of them. Refitting a single shared
# encoder would discard every mapping except the last, making it impossible to
# decode the integer codes or to encode new samples consistently.
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col + "_enc"] = le.fit_transform(df[col])  # integer-coded column
    encoders[col] = le                            # remember the fitted mapping

# Drop the original string columns and the route label, then renumber the rows
df = df.drop(columns=categorical_cols + ['route_path']).reset_index(drop=True)
df.head()


In [None]:
# Display the fully encoded modelling table
df

In [None]:
from sklearn.model_selection import train_test_split

# Predictors are every column except the target; the target is price
X = df.drop(columns=['price'])
y = df['price']

# 70% train / 30% test, with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Report the shape of each partition
for caption, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Baseline Random Forest with fixed hyper-parameters
rf = RandomForestRegressor(
    n_estimators=100,   # number of trees
    random_state=42,    # reproducibility
    n_jobs=-1           # use all CPU cores for faster training
)

# Train the model
rf.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = rf.predict(X_test)

# Evaluate performance: RMSE is the square root of the mean squared error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# The labels below previously contained mis-decoded UTF-8 ("ðŸ”¹", "RÂ²");
# they are restored to the intended characters.
print("🔹 Baseline Random Forest Results")
print("RMSE:", rmse)
print("R² Score:", r2)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Test-set predictions from the fitted forest
y_pred = rf.predict(X_test)

# Residual = actual - predicted
residuals = y_test - y_pred

# Residuals against predicted price; the dashed red line marks zero error
resid_fig, resid_ax = plt.subplots(figsize=(8, 6))
resid_ax.scatter(y_pred, residuals, alpha=0.5)
resid_ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
resid_ax.set_xlabel("Predicted Price")
resid_ax.set_ylabel("Residuals (Actual - Predicted)")
resid_ax.set_title("Residual Plot - Random Forest")
plt.show()


In [None]:
import numpy as np

# Hand-built feature vector as a 2D array (one row); values must follow the
# column order of X_train.
sample = np.array([[2.633333, 1043.475, 0, 0, 1, 0, 3, 1, 3]])

# Attach the training column names before predicting. The model was fitted on a
# DataFrame, so predicting on a bare array makes scikit-learn warn that X has
# no valid feature names. Building the frame also fails loudly if the number of
# values does not match the number of training columns.
sample_df = pd.DataFrame(sample, columns=X_train.columns)

# Predict price
predicted_price = rf.predict(sample_df)[0]
print("Predicted Price =", predicted_price)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import randint

# Unfitted template estimator for the search. It has its own name so the fitted
# baseline model stored in `rf` is not replaced by an unfitted one, which would
# make the earlier prediction / residual cells fail when re-run.
rf_search_base = RandomForestRegressor(random_state=42, n_jobs=-1)

# Search space: integer ranges are sampled via scipy's randint, lists are sampled uniformly
param_dist_rf = {
    "n_estimators": randint(100, 1000),
    "max_depth": [None] + list(range(3, 21)),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 10),
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False],
}

# 30 sampled configurations x 5-fold CV, ranked by negative MSE (higher is better)
rf_random = RandomizedSearchCV(
    estimator=rf_search_base,
    param_distributions=param_dist_rf,
    n_iter=30,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=42,
    n_jobs=-1,
    verbose=1
)

rf_random.fit(X_train, y_train)

# Best configuration, refitted on the full training set by the search
best_rf = rf_random.best_estimator_
print("Best RF params:", rf_random.best_params_)

# Prediction on the held-out test set
rf_pred = best_rf.predict(X_test)

# Evaluation: RMSE is the square root of the mean squared error
r2 = r2_score(y_test, rf_pred)
rmse = mean_squared_error(y_test, rf_pred) ** 0.5

# Label restored from mis-decoded UTF-8 ("RÂ²")
print("RF R² :", r2)
print("RF RMSE :", rmse)
