In [1]:
import polars as pl
import pandas as pd
from xgboost import XGBRegressor, XGBRFRegressor, plot_importance
import lightgbm as lgb
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.ensemble import VotingRegressor
import pickle
import math
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"

today= datetime.now()

In [82]:
df=pl.read_parquet("C:\\Users\\smishra14\\OneDrive - Stryker\\data\\Endoscopy.parquet")
df=df[['CatalogNumber','SALES_DATE','`Act Orders Rev','`Fcst Stat Prelim Rev','L2 Stat Final Rev']]
df1=df.filter(pl.col('SALES_DATE')<today-relativedelta(months=1))
df1=df1.group_by(['CatalogNumber','SALES_DATE']).sum()
df1=df1.sort('SALES_DATE',descending=False)
df1=df1.with_columns(pl.col('CatalogNumber').str.replace_all('/','-'))
df2=df1.filter(pl.col('SALES_DATE')<=today-relativedelta(months=13))
df2=df2.group_by('CatalogNumber').sum()
df2=df2.filter(pl.col('`Act Orders Rev')>0)
cc=df1.filter(pl.col('SALES_DATE')>today-relativedelta(months=13)).group_by('CatalogNumber').sum().sort('`Act Orders Rev',descending=True)
cc=cc.filter(pl.col('`Act Orders Rev')>12)  # total act orders more than 12 in last 12 months
cc=cc.join(df2[['CatalogNumber']],on='CatalogNumber',how='inner') #act orders in last 12 months

def preprocess(df):
    df=df.with_columns(month=pl.col('SALES_DATE').dt.month())
    df=df.with_columns(days=(pl.col('SALES_DATE')-datetime(year=2021,month=1,day=1)).dt.total_days())
    df=df.with_columns(lag12=pl.col('`Act Orders Rev').shift(12).over('CatalogNumber'))
    df=df.with_columns(pl.when(pl.col('lag12').is_null()).then(pl.col('`Act Orders Rev')).otherwise(pl.col('lag12')).alias('lag12'))
    return df

def fcpreprocess1(df,mon,cat):
    fdf=pl.DataFrame({'CatalogNumber':[cat]*mon,'SALES_DATE':pl.date_range(df['SALES_DATE'].max()+relativedelta(months=1),df['SALES_DATE'].max()+relativedelta(months=mon), "1mo",eager=True)})
    df=df.with_columns(actwfc=pl.col('`Act Orders Rev'))
    fdf=fdf.with_columns(month=pl.col('SALES_DATE').dt.month())
    fdf=fdf.with_columns(days=(pl.col('SALES_DATE')-datetime(year=2021,month=1,day=1)).dt.total_days())
    fdf=pl.concat([df,fdf],how='diagonal_relaxed')
    fdf=fdf.with_columns(lag12=pl.col('actwfc').shift(12).over('CatalogNumber'))
    fdf=fdf.with_columns(pl.when(pl.col('lag12').is_null()).then(pl.col('`Act Orders Rev')).otherwise(pl.col('lag12')).alias('lag12'))
    return fdf

def fcpreprocess(df,mon,cat,pred):
    df3=df.filter(pl.col('SALES_DATE')>df['SALES_DATE'].max()-relativedelta(months=mon)).with_columns(actwfc=pred,pred=pred)
    df=pl.concat([df.filter(pl.col('SALES_DATE')<=df['SALES_DATE'].max()-relativedelta(months=mon)),df3],how='diagonal_relaxed')
    fdf=pl.DataFrame({'CatalogNumber':[cat]*mon,'SALES_DATE':pl.date_range(df['SALES_DATE'].max()+relativedelta(months=1),df['SALES_DATE'].max()+relativedelta(months=mon), "1mo",eager=True)})
    fdf=fdf.with_columns(month=pl.col('SALES_DATE').dt.month())
    fdf=fdf.with_columns(days=(pl.col('SALES_DATE')-datetime(year=2021,month=1,day=1)).dt.total_days())
    fdf=pl.concat([df,fdf],how='diagonal_relaxed')
    fdf=fdf.with_columns(lag12=pl.col('actwfc').shift(12).over('CatalogNumber'))
    fdf=fdf.with_columns(pl.when(pl.col('lag12').is_null()).then(pl.col('`Act Orders Rev')).otherwise(pl.col('lag12')).alias('lag12'))
    return fdf

def pplot1(df,cat):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df['SALES_DATE'].to_list(),y=df['`Act Orders Rev'].to_list(),mode='lines',name='Act Orders',text=cat))
    fig.add_trace(go.Scatter(x=df['SALES_DATE'].to_list(),y=df['pred'].to_list(),mode='lines',name='XGBoost'))
    return fig

Tree based models cannot capture trends outside training data
https://medium.com/@simon.peter.mueller/overcoming-the-limitations-of-tree-based-models-in-time-series-forecasting-c2c5bd71a8f1

Hence below combined both tree based XGBoost and Linear booster 

In [69]:
pp=preprocess(df1)
for i in cc['CatalogNumber'][:1000]:
    p1=pp.filter(pl.col('CatalogNumber')==i).to_pandas()
    mod1=XGBRegressor()
    mod2=XGBRegressor(booster='gblinear')
    #mod2=LGBRegressor(linear_tree= True)
    pipe=VotingRegressor([('xgbm',mod1),('lgbm',mod2)])
    pipe.fit(X=p1[['month','days','lag12']],y=p1['`Act Orders Rev'])
    with open(f'models/{i}.pkl','wb') as f:
        pickle.dump(pipe,f)

In [None]:
fin_df=pl.DataFrame()
for i in cc['CatalogNumber'][:1000]:
    df2=df1.filter(pl.col('CatalogNumber')==i)
    df2=fcpreprocess1(df2,12,i)
    with open(f'models/{i}.pkl','rb') as f:
        pr=pickle.load(f)
        md=df1['SALES_DATE'].max()
        for t in range(5):
            pred=pr.predict(df2.filter(pl.col('SALES_DATE')>md)[['month','days','lag12']].to_pandas())
            md=df2['SALES_DATE'].max()
            df2=fcpreprocess(df2,12,i,pred)
    df2=df2.filter(pl.col('SALES_DATE')<=df2['SALES_DATE'].max()-relativedelta(months=12))
    fin_df=pl.concat([fin_df,df2])

In [None]:
df.write_csv('demantrastat.csv')
fin_df.write_csv('forecast.csv')

In [None]:
page = []
for i in cc['CatalogNumber'][:30]:
    df2=df1.filter(pl.col('CatalogNumber')==i)
    df2=fcpreprocess1(df2,12,i)
    with open(f'models/{i}.pkl','rb') as f:
        pr=pickle.load(f)
        md=df1['SALES_DATE'].max()
        for t in range(5):
            pred=pr.predict(df2.filter(pl.col('SALES_DATE')>md)[['month','days','lag12']].to_pandas())
            md=df2['SALES_DATE'].max()
            df2=fcpreprocess(df2,12,i,pred)
    page.append(pplot1(df2,i))
rows = math.ceil(len(page) / 2)
fig1 = make_subplots(rows=rows, cols=2,subplot_titles=[i for i in cc['CatalogNumber'][:20]])
for i, chart in enumerate(page):
    fig1.add_traces([chart.data[0],chart.data[1]],rows=math.floor((i+2)/2), cols=i%2+1)
fig1.update_layout(height=2900,showlegend=False)
fig1.show()

In [None]:
import polars as pl
from datetime import datetime
#df=pl.read_parquet('C:\\Users\\smishra14\\OneDrive - Stryker\\data\\APAC.parquet')
df.filter(pl.col('SALES_DATE')>datetime(2022,4,1)).filter(pl.col('SALES_DATE')<datetime(2026,4,1)).write_csv('C:\\Users\\smishra14\\OneDrive - Stryker\\data\\APAC.csv')

In [11]:
df.filter(pl.col('SALES_DATE')>datetime(2024,3,1)).filter(pl.col('SALES_DATE')<datetime(2026,1,1)).filter(pl.col('Region')=='South Pacific').write_csv('C:\\Users\\smishra14\\OneDrive - Stryker\\data\\SSP1.csv')
df.filter(pl.col('SALES_DATE')>datetime(2024,3,1)).filter(pl.col('SALES_DATE')<datetime(2026,1,1)).filter(pl.col('Region')=='Japan').write_csv('C:\\Users\\smishra14\\OneDrive - Stryker\\data\\Japan1.csv')

## Without Lag

In [83]:
def pplot(df,pred,cat):
    pdf=df.filter(pl.col('CatalogNumber')==cat)
    prd=pl.DataFrame({'SALES_DATE':pl.date_range(today.replace(day=1),today.replace(day=1)+relativedelta(months=len(pred)-1), "1mo",eager=True),'XGBoost':pred})
    pdf=pdf.with_columns(pl.col('SALES_DATE').cast(pl.Date))
    pdf=pdf.join(prd,on='SALES_DATE', how="full", coalesce=True)
    pdf=pdf.sort('SALES_DATE',descending=False)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=pdf['SALES_DATE'].to_list(),y=pdf['`Act Orders Rev'].to_list(),mode='lines',name='Act Orders',text=cat))
    fig.add_trace(go.Scatter(x=pdf['SALES_DATE'].to_list(),y=pdf['XGBoost'].to_list(),mode='lines',name='XGBoost'))
    return fig

def preprocess(df):
    df=df.with_columns(month=pl.col('SALES_DATE').dt.month())
    df=df.with_columns(days=(pl.col('SALES_DATE')-datetime(year=2021,month=1,day=1)).dt.total_days())
    return df

In [85]:
pp=preprocess(df1)
for i in cc['CatalogNumber'][:1000]:
    p1=pp.filter(pl.col('CatalogNumber')==i).to_pandas()
    mod1=XGBRegressor()
    mod2=XGBRegressor(booster='gblinear')
    #mod2=LGBRegressor(linear_tree= True)
    pipe=VotingRegressor([('xgbm',mod1),('lgbm',mod2)])
    pipe.fit(X=p1[['month','days']],y=p1['`Act Orders Rev'])
    with open(f'models/wolag/{i}.pkl','wb') as f:
        pickle.dump(pipe,f)

In [None]:
fin_df=pl.DataFrame()
df3=preprocess(df1) #correct this to generate 5 year forecast
for i in cc['CatalogNumber'][:1000]:
    df2=df3.filter(pl.col('CatalogNumber')==i)
    with open(f'models/wolag/{i}.pkl','rb') as f:
        pr=pickle.load(f)
        md=df1['SALES_DATE'].max()
        for t in range(5):
            pred=pr.predict(df2.filter(pl.col('SALES_DATE')>md)[['month','days']].to_pandas())
            md=df2['SALES_DATE'].max()
            df2=fcpreprocess(df2,12,i,pred)  #correct this to generate 5 year forecast
    fin_df=pl.concat([fin_df,df2])

In [86]:
fdf=pl.DataFrame({'SALES_DATE':pl.date_range(today.replace(day=1),today.replace(day=1)+relativedelta(months=60), "1mo",eager=True)})
fp=preprocess(fdf)
page = []
for i in cc['CatalogNumber'][:30]:
    with open(f'models/wolag/{i}.pkl','rb') as f:
        pr=pickle.load(f)
    pred=pr.predict(fp[['month','days']].to_pandas())
    page.append(pplot(df1,pred,i))
rows = math.ceil(len(page) / 2)
fig1 = make_subplots(rows=rows, cols=2,subplot_titles=[i for i in cc['CatalogNumber'][:20]])
for i, chart in enumerate(page):
    fig1.add_traces([chart.data[0],chart.data[1]],rows=math.floor((i+2)/2), cols=i%2+1)
fig1.update_layout(height=2900,showlegend=False)
fig1.show()

In [None]:
def preprocess(df):
    df=df.with_columns(lag3=pl.col('`Act Orders Rev').shift(3).over('CatalogNumber'))
    df=df.with_columns(pl.when(pl.col('lag3').is_null()).then(pl.col('`Act Orders Rev')).otherwise(pl.col('lag3')).alias('lag3'))
    df=df.with_columns(lag4=pl.col('`Act Orders Rev').shift(4).over('CatalogNumber'))
    df=df.with_columns(pl.when(pl.col('lag4').is_null()).then(pl.col('`Act Orders Rev')).otherwise(pl.col('lag4')).alias('lag4'))
    df=df.with_columns(lag5=pl.col('`Act Orders Rev').shift(5).over('CatalogNumber'))
    df=df.with_columns(pl.when(pl.col('lag5').is_null()).then(pl.col('`Act Orders Rev')).otherwise(pl.col('lag5')).alias('lag5'))
    df=df.with_columns(lag6=pl.col('`Act Orders Rev').shift(6).over('CatalogNumber'))
    df=df.with_columns(pl.when(pl.col('lag6').is_null()).then(pl.col('`Act Orders Rev')).otherwise(pl.col('lag6')).alias('lag6'))
    df=df.with_columns(lag12=pl.col('`Act Orders Rev').shift(12).over('CatalogNumber'))
    df=df.with_columns(pl.when(pl.col('lag12').is_null()).then(pl.col('`Act Orders Rev')).otherwise(pl.col('lag12')).alias('lag12'))
    df=df.with_columns(month=pl.col('SALES_DATE').dt.month())
    return df

In [None]:
import llama_cpp
llm = llama_cpp.llama.Llama. .from_pretrained(repo_id="Qwen/Qwen2.5-3B-Instruct-GGUF",filename="*q8_0.gguf",verbose=False)

SyntaxError: invalid syntax (131121084.py, line 2)

In [2]:
from llama_cpp import Llama
llm = Llama(
      model_path="gguf\\qwen2.5-3b-instruct-q8_0.gguf",
      # n_gpu_layers=-1, # Uncomment to use GPU acceleration
      # seed=1337, # Uncomment to set a specific seed
      # n_ctx=2048, # Uncomment to increase the context window
)

llama_model_loader: loaded meta data with 26 key-value pairs and 435 tensors from gguf\qwen2.5-3b-instruct-q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = qwen2.5-3b-instruct
llama_model_loader: - kv   3:                            general.version str              = v0.1-v0.1
llama_model_loader: - kv   4:                           general.finetune str              = qwen2.5-3b-instruct
llama_model_loader: - kv   5:                         general.size_label str              = 3.4B
llama_model_loader: - kv   6:                          qwen2.block_count u32              = 36
llama_model_loader: - kv   7:               

In [6]:
mes=llm.create_chat_completion(messages = [
          {"role": "system", "content": ""},
          {
              "role": "user",
              "content": "Who won the world series in 2020"
          }
      ],stream=0)
mes

Llama.generate: 23 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =    1043.26 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    44 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    7699.05 ms /    45 tokens


{'id': 'chatcmpl-13f5dabb-7ba1-4137-8f1f-7f8b597b385f',
 'object': 'chat.completion',
 'created': 1742747224,
 'model': 'gguf\\qwen2.5-3b-instruct-q8_0.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': 'The New York Yankees won the World Series in 2020. They defeated the Los Angeles Dodgers in 4 games, the fewest games needed to win the World Series since 1908.'},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 24, 'completion_tokens': 43, 'total_tokens': 67}}