In [1]:
# pip install flask joblib xgboost numpy pandas
# !pip install --upgrade scikit-learn

## 3. Predictive Modeling with XGBoost & Flask API
3.1 Training the Model
Train an XGBoost model on combined data (or separately if needed). For example, here we use features from all tables.

In [2]:
import pandas as pd
import xgboost as xgb
import joblib
import psycopg2
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# ✅ PostgreSQL Connection Details
DB_CONFIG = {
    "host": "localhost",
    "port": 5433,
    "database": "nyc",
    "user": "postgres",
    "password": "password"
}

TABLES = ["yellow_tripdata_2024_01", "yellow_tripdata_2024_02", "green_tripdata_2024_01", "green_tripdata_2024_02"]

# ✅ Fetch Data from PostgreSQL
def fetch_data():
    conn = psycopg2.connect(**DB_CONFIG)
    dfs = []

    for table in TABLES:
        query = f"""
        SELECT 
            trip_distance, passenger_count, extra, mta_tax, tip_amount, tolls_amount, 
            improvement_surcharge, total_amount, fare_amount,
            (EXTRACT(EPOCH FROM (dropoff_datetime - pickup_datetime)) / 60) AS trip_duration
        FROM public.{table} 
        WHERE trip_distance > 0 AND passenger_count > 0 
        AND fare_amount > 0 AND total_amount > 0
        AND dropoff_datetime > pickup_datetime
        LIMIT 20000;
        """
        
        df = pd.read_sql(query, conn)
        dfs.append(df)

    conn.close()
    return pd.concat(dfs, ignore_index=True)

# ✅ Load & Prepare Data
data = fetch_data()
print(f"✅ Loaded {len(data)} rows from PostgreSQL.")

# ✅ Feature Engineering
data["fare_per_mile"] = data["fare_amount"] / data["trip_distance"]
data["fare_per_passenger"] = data["fare_amount"] / data["passenger_count"]
data["tip_ratio"] = data["tip_amount"] / data["total_amount"]
data["toll_ratio"] = data["tolls_amount"] / data["total_amount"]

# **New Features for Trip Duration Prediction**
data["duration_per_mile"] = data["trip_duration"] / data["trip_distance"]
data["duration_per_passenger"] = data["trip_duration"] / data["passenger_count"]

# ✅ Handle Missing & Infinite Values
data.replace([float("inf"), float("-inf")], 0, inplace=True)
data.fillna(0, inplace=True)

# ✅ Feature Selection
FEATURES = ["trip_distance", "passenger_count", "extra", "mta_tax", "tip_amount", 
            "tolls_amount", "improvement_surcharge", "total_amount",
            "fare_per_mile", "fare_per_passenger", "tip_ratio", "toll_ratio",
            "duration_per_mile", "duration_per_passenger"]

# ✅ Train Fare Model
TARGET_FARE = "fare_amount"
X_train_fare, X_test_fare, y_train_fare, y_test_fare = train_test_split(
    data[FEATURES], data[TARGET_FARE], test_size=0.2, random_state=42
)

dtrain_fare = xgb.DMatrix(X_train_fare, label=y_train_fare, feature_names=FEATURES)
dtest_fare = xgb.DMatrix(X_test_fare, label=y_test_fare, feature_names=FEATURES)

params = {
    "objective": "reg:squarederror",
    "max_depth": 6,
    "eta": 0.1,
    "eval_metric": "rmse"
}

model_fare = xgb.train(params, dtrain_fare, num_boost_round=100, evals=[(dtest_fare, "Test")])

# ✅ Train Trip Duration Model
TARGET_DURATION = "trip_duration"
X_train_duration, X_test_duration, y_train_duration, y_test_duration = train_test_split(
    data[FEATURES], data[TARGET_DURATION], test_size=0.2, random_state=42
)

dtrain_duration = xgb.DMatrix(X_train_duration, label=y_train_duration, feature_names=FEATURES)
dtest_duration = xgb.DMatrix(X_test_duration, label=y_test_duration, feature_names=FEATURES)

model_duration = xgb.train(params, dtrain_duration, num_boost_round=100, evals=[(dtest_duration, "Test")])

# ✅ Evaluate Models
print(f"🔍 Fare RMSE: {mean_squared_error(y_test_fare, model_fare.predict(dtest_fare)) ** 0.5:.2f}")
print(f"🔍 Duration RMSE: {mean_squared_error(y_test_duration, model_duration.predict(dtest_duration)) ** 0.5:.2f}")

# ✅ Save Models
joblib.dump(model_fare, "xgb_fare_model.pkl")
joblib.dump(model_duration, "xgb_duration_model.pkl")

print("✅ Both models trained and saved.")

  df = pd.read_sql(query, conn)


✅ Loaded 80000 rows from PostgreSQL.
[0]	Test-rmse:14.34511
[1]	Test-rmse:13.05229
[2]	Test-rmse:11.90952
[3]	Test-rmse:10.88808
[4]	Test-rmse:10.00448
[5]	Test-rmse:9.16092
[6]	Test-rmse:8.45025
[7]	Test-rmse:7.82489
[8]	Test-rmse:7.25259
[9]	Test-rmse:6.78137
[10]	Test-rmse:6.33399
[11]	Test-rmse:5.97024
[12]	Test-rmse:5.63094
[13]	Test-rmse:5.33502
[14]	Test-rmse:5.08749
[15]	Test-rmse:4.86215
[16]	Test-rmse:4.65641
[17]	Test-rmse:4.49565
[18]	Test-rmse:4.35585
[19]	Test-rmse:4.24227
[20]	Test-rmse:4.12823
[21]	Test-rmse:4.03106
[22]	Test-rmse:3.95986
[23]	Test-rmse:3.88690
[24]	Test-rmse:3.83002
[25]	Test-rmse:3.76017
[26]	Test-rmse:3.71062
[27]	Test-rmse:3.66425
[28]	Test-rmse:3.62690
[29]	Test-rmse:3.59729
[30]	Test-rmse:3.56771
[31]	Test-rmse:3.53250
[32]	Test-rmse:3.51240
[33]	Test-rmse:3.49768
[34]	Test-rmse:3.47432
[35]	Test-rmse:3.45432
[36]	Test-rmse:3.44651
[37]	Test-rmse:3.44572
[38]	Test-rmse:3.44320
[39]	Test-rmse:3.43723
[40]	Test-rmse:3.44429
[41]	Test-rmse:3.42535
[4

## 3.2 Exposing the Model via Flask
Create a simple Flask app (can be run from a separate Jupyter cell or as a standalone script):



In [3]:
from flask import Flask, request, jsonify
import joblib
import numpy as np
import xgboost as xgb
import threading

app = Flask(__name__)

# ✅ Load Both Models
model_fare = joblib.load("xgb_fare_model.pkl")
model_duration = joblib.load("xgb_duration_model.pkl")

FEATURES = ["trip_distance", "passenger_count", "extra", "mta_tax", "tip_amount", 
            "tolls_amount", "improvement_surcharge", "total_amount",
            "fare_per_mile", "fare_per_passenger", "tip_ratio", "toll_ratio",
            "duration_per_mile", "duration_per_passenger"]

@app.route('/predict', methods=['POST'])
def predict():
    try:
        data = request.get_json()

        # ✅ Ensure All Expected Fields Exist
        missing_fields = [f for f in FEATURES if f not in data]
        if missing_fields:
            return jsonify({"error": f"Missing fields: {missing_fields}"}), 400

        # ✅ Convert Data to XGBoost DMatrix
        features = np.array([[data[f] for f in FEATURES]])  # Ensure correct feature order
        dinput = xgb.DMatrix(features, feature_names=FEATURES)  # ✅ Convert to DMatrix

        # ✅ Make Predictions
        fare_pred = model_fare.predict(dinput)[0]
        duration_pred = model_duration.predict(dinput)[0]

        return jsonify({
            "fare_prediction": float(fare_pred),
            "duration_prediction": float(duration_pred)
        })

    except Exception as e:
        return jsonify({"error": str(e)}), 500
    
# ✅ Run Flask in a Background Thread
def run_flask():
    app.run(host="0.0.0.0", port=5000, debug=False, use_reloader=False)

threading.Thread(target=run_flask, daemon=True).start()
print("✅ Flask API started on http://localhost:5000")


✅ Flask API started on http://localhost:5000
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.0.101:5000
Press CTRL+C to quit
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2025 21:41:04] "POST /predi