In [None]:
from flask import Flask, request, jsonify
import numpy as np
import pandas as pd
import lightgbm as lgb
import json
import warnings

# Notebook-wide settings: suppress every warning and let pandas show all columns.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

app = Flask(__name__)

# ─── 0) Static assets, read once when this cell/module runs ──────────────────────
# Reference tables (only transactions/oil get their 'date' parsed at read time;
# holidays are converted later inside create_holiday_features).
stores       = pd.read_csv('stores.csv')
transactions = pd.read_csv('transactions.csv', parse_dates=['date'])
oil          = pd.read_csv('oil.csv',      parse_dates=['date'])
holidays     = pd.read_csv('holidays_events.csv')

# Trained LightGBM booster used by the /predict endpoint.
MODEL        = lgb.Booster(model_file='lgb_model.txt')

# Column schema saved next to the model. dict.fromkeys keeps the first
# occurrence of every name in its original position, so accidental duplicates
# in the JSON file are dropped without reordering anything.
with open('feature_columns.json','r') as f:
    EXPECTED_COLS = list(dict.fromkeys(json.load(f)))


# ─── 1) Feature‐engineering Helpers ──────────────────────────────────────────────

def create_date_features(df):
    """Add calendar-derived columns to ``df`` in place and return it.

    Every new column is computed from the datetime column ``df['date']``.

    Columns written (dtype in parentheses):
      month, day_of_month, week_of_month, week_of_year, quarter (int8),
      day_of_year (int16), year (int32),
      day_of_week (int8, 1 = Monday ... 7 = Sunday),
      is_wknd (int8, ``weekday // 4`` -> 1 for Friday, Saturday AND Sunday),
      is_month/quarter/year_start/end flags (int8),
      season (int8: 0 = Dec-Feb, 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov).

    NOTE(review): ``is_wknd`` treats Friday as weekend; presumably this matches
    the features the model was trained on -- confirm before changing it.
    """
    when = df['date'].dt

    # (column name, raw values, target dtype) in the order columns are appended.
    specs = (
        ('month',            when.month,                    'int8'),
        ('day_of_month',     when.day,                      'int8'),
        ('day_of_year',      when.dayofyear,                'int16'),
        ('week_of_month',    (when.day - 1) // 7 + 1,       'int8'),
        ('week_of_year',     when.isocalendar().week,       'int8'),
        ('day_of_week',      when.dayofweek + 1,            'int8'),
        ('year',             when.year,                     'int32'),
        ('is_wknd',          when.weekday // 4,             'int8'),
        ('quarter',          when.quarter,                  'int8'),
        ('is_month_start',   when.is_month_start,           'int8'),
        ('is_month_end',     when.is_month_end,             'int8'),
        ('is_quarter_start', when.is_quarter_start,         'int8'),
        ('is_quarter_end',   when.is_quarter_end,           'int8'),
        ('is_year_start',    when.is_year_start,            'int8'),
        ('is_year_end',      when.is_year_end,              'int8'),
    )
    for column, values, dtype in specs:
        df[column] = values.astype(dtype)

    # Months not listed in any condition (3, 4, 5) fall through to the default 1.
    month = df['month']
    season = np.select(
        [month.isin([12, 1, 2]), month.isin([6, 7, 8]), month.isin([9, 10, 11])],
        [0, 2, 3],
        default=1,
    )
    df['season'] = season.astype('int8')
    return df

def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode the object/category columns of ``df``.

    Args:
        df: input frame; its object- and category-typed columns are expanded
            with ``pd.get_dummies``.
        nan_as_category: forwarded as ``dummy_na``; when true an extra
            ``<col>_nan`` indicator is produced for each encoded column.

    Returns:
        ``(encoded, new_columns)`` where ``encoded`` is the expanded frame with
        spaces in column names replaced by underscores, and ``new_columns``
        lists the column names that were not present in ``df``.
    """
    names_before = set(df.columns)
    categorical = list(df.select_dtypes(['object', 'category']).columns)

    encoded = pd.get_dummies(df, columns=categorical, dummy_na=nan_as_category)
    encoded.columns = encoded.columns.str.replace(' ', '_')

    added = [name for name in encoded.columns if name not in names_before]
    return encoded, added

def create_oil_features(df, oil):
    """Attach an ``oil_above_70`` flag (int8) to every row of ``df``.

    The ``dcoilwtico`` price series from ``oil`` is cleaned (zeros treated as
    missing, gaps interpolated, leading gaps back-filled), left-joined onto
    ``df`` by ``date`` and turned into a 0/1 indicator for ``price >= 70``.
    Rows whose date has no entry in ``oil`` get 0, because the comparison
    against a missing price is false.

    Args:
        df: frame with a datetime ``date`` column.
        oil: frame with ``date`` and ``dcoilwtico`` columns. It is NOT
            modified; if its ``date`` column is not datetime-typed the
            conversion happens on a private copy.

    Returns:
        A new frame: ``df`` plus the ``oil_above_70`` column.
    """
    oil_dates = oil['date']
    if not pd.api.types.is_datetime64_any_dtype(oil_dates):
        oil_dates = pd.to_datetime(oil_dates)

    oil_series = (
        oil.assign(date=oil_dates)          # new frame; caller's table stays untouched
           .set_index('date')['dcoilwtico']
           .replace(0, np.nan)
           .interpolate()
           .bfill()                         # fillna(method='bfill') is the deprecated spelling
           .rename('dcoilwtico_interpolated')
           .reset_index()
    )

    merged = df.merge(oil_series, on='date', how='left')
    merged['oil_above_70'] = (merged['dcoilwtico_interpolated'] >= 70).astype('int8')
    return merged.drop(columns='dcoilwtico_interpolated')

def create_holiday_features(df, holidays):
    """Join holiday / event information onto ``df`` and derive holiday flags.

    Steps:
      1. Relocate transferred holidays: a ``Holiday`` row flagged
         ``transferred`` is re-dated to the date of the matching ``Transfer``
         row. Rows are paired by position (i-th transferred holiday with the
         i-th Transfer row) -- assumes both appear in the same order in the
         table; TODO confirm against holidays_events.csv.
      2. Keep every other row except the originals of transferred holidays and
         the ``Transfer`` rows themselves.
      3. Normalise descriptions (strip ``-``, ``+`` and digits, drop the
         ``"Puente "`` prefix) and fold ``Additional`` / ``Bridge`` into
         ``Holiday``.
      4. Split into work days, events and national / regional / local holidays
         and left-join each onto ``df`` (regional by ``date`` + ``state``,
         local by ``date`` + ``city`` when those columns exist).
      5. Add binary flags and one-hot encode the holiday name columns.

    NOTE(review): step 1 changes which dates are flagged as holidays compared
    with the previous implementation, which left transferred holidays on their
    original date and discarded the celebrated date. Confirm that the training
    pipeline applied the same relocation.

    NOTE(review): the joins are plain left merges; if a date carries several
    events or several distinct holiday names, the matching rows of ``df`` are
    repeated once per match.

    Args:
        df: feature frame with a datetime ``date`` column and, optionally,
            ``state`` and ``city`` columns.
        holidays: table with ``date``, ``type``, ``locale``, ``locale_name``
            and ``description`` columns, plus an optional ``transferred``
            column. It is not modified.

    Returns:
        ``(df_with_features, work_day)`` where ``work_day`` holds the rows of
        the cleaned holiday table whose type is ``"Work Day"``.
    """
    core_cols = ['date', 'description', 'type', 'locale', 'locale_name']

    h = holidays.copy()
    h['date'] = pd.to_datetime(h['date'])
    h['description'] = h['description'].fillna('').astype(str)

    # 'transferred' can arrive as bool or as text depending on how the CSV was
    # read, so compare its string form. A missing column means nothing moved.
    if 'transferred' in h.columns:
        was_moved = h['transferred'].astype(str).str.strip().str.lower().eq('true')
        h = h.drop(columns='transferred')
    else:
        was_moved = pd.Series(False, index=h.index)

    # 1) transferred holidays: keep their description/locale, take the new date
    moved_from = h.loc[was_moved & (h['type'] == 'Holiday'), core_cols].reset_index(drop=True)
    moved_to = h.loc[h['type'] == 'Transfer', 'date'].reset_index(drop=True)
    n_pairs = min(len(moved_from), len(moved_to))
    tr = moved_from.iloc[:n_pairs].copy()
    tr['date'] = moved_to.iloc[:n_pairs].to_numpy()

    # 2) base holidays (exclude Transfer rows and the originals that were moved)
    base = h[~was_moved & (h['type'] != 'Transfer')].drop_duplicates()

    # 3) stack rows; skip the concat when nothing was transferred so an empty
    #    frame cannot influence the resulting column dtypes
    if tr.empty:
        holidays_full = base.reset_index(drop=True)
    else:
        holidays_full = pd.concat([base, tr], axis=0, ignore_index=True)

    # 4) clean-up
    holidays_full['description'] = (
        holidays_full['description']
        .str.replace(r"[-\d+]", "", regex=True)
        .str.replace("Puente ", "", regex=False)
    )
    holidays_full['type'] = holidays_full['type'].replace(
        {'Additional': 'Holiday', 'Bridge': 'Holiday'})

    # 5) split work_day / events / holidays
    is_work_day = holidays_full['type'] == 'Work Day'
    work_day = holidays_full[is_work_day]
    evt_table = holidays_full[~is_work_day]

    is_event = evt_table['type'] == 'Event'
    events = (evt_table[is_event]
              .drop(columns=['type', 'locale', 'locale_name'])
              .rename(columns={'description': 'events'}))
    hol_tbl = evt_table[~is_event].drop(columns='type')

    # 6) merge national/regional/local (filtering an empty table simply yields
    #    empty lookups with the right columns)
    regional = (hol_tbl[hol_tbl['locale'] == 'Regional']
                .rename(columns={'locale_name': 'state', 'description': 'holiday_regional'})
                .drop(columns='locale').drop_duplicates())
    national = (hol_tbl[hol_tbl['locale'] == 'National']
                .rename(columns={'description': 'holiday_national'})
                .drop(columns=['locale', 'locale_name']).drop_duplicates())
    local = (hol_tbl[hol_tbl['locale'] == 'Local']
             .rename(columns={'description': 'holiday_local', 'locale_name': 'city'})
             .drop(columns='locale').drop_duplicates())

    df = df.merge(national, how='left', on='date')
    if 'state' in df:
        df = df.merge(regional, how='left', on=['date', 'state'])
    else:
        df = df.assign(holiday_regional=np.nan)
    if 'city' in df:
        df = df.merge(local, how='left', on=['date', 'city'])
    else:
        df = df.assign(holiday_local=np.nan)

    # 7) work_day & events one-hots
    if work_day.empty:
        df = df.assign(IsWorkDay=np.nan)
    else:
        # Explicit join key; identical (date, type) pairs are collapsed so they
        # cannot repeat rows of df.
        work_flags = (work_day[['date', 'type']]
                      .rename(columns={'type': 'IsWorkDay'})
                      .drop_duplicates())
        df = df.merge(work_flags, how='left', on='date')

    if 'events' in events.columns and not events.empty:
        events = events.copy()
        event_names = events['events'].fillna('').astype(str)
        # Every description mentioning 'futbol' is collapsed into one category.
        events['events'] = np.where(
            event_names.str.contains('futbol', na=False), 'Futbol', event_names)
        evt_enc, evt_cat = one_hot_encoder(events)
        df = df.merge(evt_enc, how='left', on='date')
        if evt_cat:
            # Unmatched rows come back as NaN after the left join; store the
            # indicators as plain 0/1 integers.
            df[evt_cat] = df[evt_cat].fillna(0).astype('int8')

    # 8) binary flags & local-string flags
    df['holiday_national_binary'] = df['holiday_national'].notna().astype('int8')
    df['holiday_local_binary'] = df['holiday_local'].notna().astype('int8')
    df['holiday_regional_binary'] = df['holiday_regional'].notna().astype('int8')
    df['national_independence'] = df['holiday_national'].isin([
        'Batalla de Pichincha', 'Independencia de Cuenca',
        'Independencia de Guayaquil', 'Primer Grito de Independencia',
    ]).astype('int8')

    df['holiday_local'] = df['holiday_local'].fillna('').astype(str)
    for flag, needle in (('local_cantonizacio', 'Cantonizacio'),
                         ('local_fundacion', 'Fundacion'),
                         ('local_independencia', 'Independencia')):
        df[flag] = df['holiday_local'].str.contains(needle, na=False).astype('int8')

    # final one-hot of the holiday name columns (missing names become '')
    hol_cols = ["holiday_national", "holiday_regional", "holiday_local"]
    existing = [c for c in hol_cols if c in df]
    if existing:
        for c in existing:
            df[c] = df[c].fillna('').astype(str)
        hol_enc, _ = one_hot_encoder(df[existing], nan_as_category=False)
        df = pd.concat([df.drop(columns=existing), hol_enc], axis=1)

    return df, work_day

def preprocess_full(raw, stores, transactions, oil, holidays_df):
    """Turn raw request rows into the engineered feature frame.

    Pipeline: parse ``date`` -> left-join store attributes on ``store_nbr`` ->
    holiday features -> calendar features -> oil flag -> ``workday`` and
    ``wageday`` flags -> cast categorical columns.

    Args:
        raw: frame with at least ``date`` and ``store_nbr`` columns. It is not
            modified; the date conversion is done on a new frame.
        stores: store attribute table keyed by ``store_nbr``.
        transactions: accepted for interface compatibility; not used here.
        oil: oil price table passed to ``create_oil_features``.
        holidays_df: holiday table passed to ``create_holiday_features``.

    Returns:
        The feature frame. ``workday`` is 1 when the row is neither a
        national/regional/local holiday nor a Saturday/Sunday, and is forced to
        1 on dates listed as "Work Day" in the holiday table. ``wageday`` is 1
        on the 15th and on the last day of the month.
    """
    parsed = raw.assign(date=pd.to_datetime(raw['date']))
    df = parsed.merge(stores, on='store_nbr', how='left')
    df, work_day = create_holiday_features(df, holidays_df)
    df = create_date_features(df)
    df = create_oil_features(df, oil)

    is_day_off = (
        (df['holiday_national_binary'] == 1) |
        (df['holiday_regional_binary'] == 1) |
        (df['holiday_local_binary'] == 1) |
        (df['day_of_week'].isin([6, 7]))
    )
    df['workday'] = (~is_day_off).astype('int8')
    # Officially declared compensation work days override the rule above.
    df.loc[df['date'].isin(work_day['date']), 'workday'] = 1

    df['wageday'] = ((df['is_month_end'] == 1) | (df['day_of_month'] == 15)).astype('int8')

    # Only cast the columns that are actually present; a column missing from
    # the store table must not raise here.
    for c in ['family', 'city', 'state', 'type', 'cluster', 'oil_above_70']:
        if c in df:
            df[c] = df[c].astype('category')
    return df

# ─── 2) The /predict endpoint ────────────────────────────────────────────────────

@app.route('/predict', methods=['POST'])
def predict():
    """Score one sales record posted as JSON.

    Expected body: a JSON object with ``id``, ``date``, ``store_nbr``,
    ``family`` and ``onpromotion``.

    Returns:
        200 with ``{"id", "date", "predicted_sales"}`` on success, where
        ``predicted_sales`` is ``expm1`` of the model output.
        400 with ``{"error": ...}`` when the body is not a JSON object, a
        required field is missing, ``id`` is not integer-like or ``date``
        cannot be parsed as a single date.
    """
    required = ('id', 'date', 'store_nbr', 'family', 'onpromotion')

    # silent=True -> None instead of an exception for a missing/invalid body
    p = request.get_json(silent=True)
    if not isinstance(p, dict):
        return jsonify({'error': 'request body must be a JSON object'}), 400

    missing = [key for key in required if key not in p]
    if missing:
        return jsonify({'error': 'missing required fields', 'fields': missing}), 400

    try:
        out_id = int(p['id'])
        parsed_date = pd.to_datetime(p['date'])
    except (TypeError, ValueError, OverflowError):
        return jsonify({'error': "'id' must be an integer and 'date' a valid date"}), 400
    # NaT (e.g. null) and list-like results are not a single usable date.
    if not isinstance(parsed_date, pd.Timestamp):
        return jsonify({'error': "'id' must be an integer and 'date' a valid date"}), 400

    raw = pd.DataFrame([{key: p[key] for key in required}])
    features = preprocess_full(raw, stores, transactions, oil, holidays)

    # Read the echoed date before aligning to the saved schema, so the response
    # does not depend on 'date' being listed in EXPECTED_COLS.
    out_date = features['date'].iloc[0]

    df_feat = features.reindex(columns=EXPECTED_COLS, fill_value=0)

    X_pred = df_feat.drop(['id', 'date', 'sales'], axis=1, errors='ignore').copy()
    for c in ['city', 'state', 'type', 'family', 'cluster', 'oil_above_70', 'IsWorkDay']:
        if c in X_pred:
            X_pred[c] = X_pred[c].astype('category')

    # The model output is inverted with expm1 (i.e. treated as log1p(sales)).
    y_log1p = MODEL.predict(X_pred, validate_features=False)
    y_pred = float(np.expm1(y_log1p)[0])

    return jsonify({
        'id': out_id,
        'date': str(out_date),
        'predicted_sales': y_pred
    })

if __name__ == '__main__':
    import os

    # The server listens on every interface, so the interactive debugger must
    # not be on by default: it allows arbitrary code execution from any client
    # that can reach the port. Opt in explicitly with FLASK_DEBUG=1.
    debug_enabled = os.environ.get('FLASK_DEBUG', '0') == '1'

    # disable the reloader inside notebooks
    app.run(host='0.0.0.0', port=8000, debug=debug_enabled, use_reloader=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8000
 * Running on http://192.168.1.113:8000
Press CTRL+C to quit
127.0.0.1 - - [11/May/2025 21:52:17] "POST /predict HTTP/1.1" 200 -


In [14]:
import lightgbm as lgb
import numpy as np
import pandas as pd

# ─── Offline check: score an already-engineered frame `x` ───────────────────
# `x` must already exist in the notebook session; its columns look like:
#    id   date     store_nbr  family   sales  onpromotion  city  state  type  ...
# ─────────────────────────────────────────────────────────────

# 1) Set the identifiers aside so each prediction can be reported next to them
ids, dates = x['id'].values, x['date'].values

# 2) Model inputs = every column except the identifiers and the target
X_pred = x.drop(columns=['id', 'date', 'sales']).copy()

# 3) Categorical inputs are cast to pandas 'category' before scoring;
#    names that are absent from the frame are skipped
cat_cols = ['city','state','type','family','cluster','oil_above_70','IsWorkDay']
for c in cat_cols:
    if c not in X_pred.columns:
        continue
    X_pred[c] = X_pred[c].astype('category')

# 4) Booster loaded from the saved model file
model = lgb.Booster(model_file='lgb_model.txt')

# 5) Raw model output (inverted with expm1 below); feature-name validation is
#    switched off
y_log1p = model.predict(X_pred, validate_features=False)

# 6) Back to the sales scale
y_pred = np.expm1(y_log1p)

# 7a) Plain NumPy array of predicted sales
print("Predicted sales array:", y_pred, sep="\n")

# 7b) One line per record: id, date and the prediction rounded to 2 decimals
print("\nDetailed output:")
for _id, _date, sale in zip(ids, dates, y_pred):
    print(f"id: {_id}, date: {_date}, predicted_sales: {sale:.2f}")


Predicted sales array:
[4.46368169]

Detailed output:
id: 3000888, date: 2017-08-16T00:00:00.000000000, predicted_sales: 4.46
