In [None]:
pip install downcast



In [None]:
import pandas as pd
import pickle
import numpy as np
import random 
from downcast import reduce
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import time
from sklearn.preprocessing import LabelEncoder

In [None]:
# Colab only: mount Google Drive so the CSV inputs and the pickled model
# referenced under /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory on the mounted Drive that holds the three input CSVs.
# Kept in one place so the location only has to be edited once.
DATA_DIR='/content/drive//My Drive/CS-1'

# Raw inputs; function_1 / function_2 read calendar_ and sell_prices_ as notebook-level globals.
calendar_=pd.read_csv(DATA_DIR+'/calendar.csv')
sales_train_evaluation_=pd.read_csv(DATA_DIR+'/sales_train_evaluation.csv')
sell_prices_=pd.read_csv(DATA_DIR+'/sell_prices.csv')

* Randomly select a single data point (one item/store row) for computation

In [None]:
#https://datatofish.com/random-rows-pandas-dataframe/
# Draw one random row and renumber it from 0, discarding its original row label.
sales=sales_train_evaluation_.sample().reset_index(drop=True)
sales

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,...,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,FOODS_2_273_CA_4_evaluation,FOODS_2_273,FOODS_2,FOODS,CA_4,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,2,3,0,0,0,0,0,2,0,0,2,0,3,1,3,1,2,1,0,1,1,1,0,0,3,0,1,0,0,0,0,6,0


# Function-1

In [None]:
def function_1(x,model_path='/content/drive//My Drive/CS-1/lgb_model.pkl'):
    """Build features for the given sales rows and forecast two 28-day windows.

    Parameters
    ----------
    x : pandas.DataFrame
        Wide sales frame holding the identifier columns 'id', 'item_id',
        'dept_id', 'cat_id', 'store_id', 'state_id'; every other column is
        treated as a day column named 'd_<n>'. The visible caller passes a
        frame that already contains d_1 ... d_1941.
        NOTE: columns d_1942 ... d_1969 are added to ``x`` IN PLACE; a later
        notebook cell relies on this and strips them from the caller's frame.
    model_path : str, optional
        Location of the pickled, already trained regressor. Defaults to the
        path the notebook has always used.

    Returns
    -------
    (pred_valid, pred_test) : tuple of pandas.DataFrame
        Each has an 'id' column plus 'F1' ... 'F28'. ``pred_valid`` covers
        d_1914-d_1941 (ids renamed from 'evaluation' to 'validation');
        ``pred_test`` covers d_1942-d_1969.

    Notes
    -----
    Reads the notebook-level globals ``calendar_`` and ``sell_prices_``.
    Columns are created in a fixed order because the feature frame is handed
    to the model as-is; do not reorder the assignments below.
    """
    id_cols=['id','item_id','dept_id','cat_id','store_id','state_id']
    event_cols=['event_name_1','event_type_1','event_name_2','event_type_2']

    # Placeholder (NaN, float16) columns for the days we need to forecast.
    # Deliberately written into the caller's frame -- see the docstring.
    for day in range(1942,1970):
        x['d_'+str(day)]=np.full(len(x),np.nan,dtype=np.float16)

    # Wide -> long: one row per (id, day), then attach calendar and price information.
    df=pd.melt(x,id_vars=id_cols,var_name='d',value_name='demand')
    df=pd.merge(df,calendar_,on='d',how='left')
    df=pd.merge(df,sell_prices_,on=['item_id','store_id','wm_yr_wk'],how='left')

    # Weeks without a price get the mean price of the same id.
    df['sell_price']=df['sell_price'].fillna(df.groupby('id')['sell_price'].transform('mean'))

    # Object columns -> category dtype.
    for name in df.select_dtypes(include='object').columns:
        df[name]=df[name].astype('category')

    # Days without an event get the explicit label 'no_event'.
    # Assigning the result back (instead of a chained in-place fillna) also works
    # when pandas Copy-on-Write is active.
    for name in event_cols:
        df[name]=df[name].astype('category').cat.add_categories('no_event').fillna('no_event')

    # 'is_weekend': 1 when wday <= 2 (assumes the calendar numbers the weekend days 1 and 2).
    df['is_weekend']=(df['wday']<=2).astype(np.int8)

    # 'month_day': day of the month, i.e. the third '-' separated field of 'date'.
    df['month_day']=df['date'].astype(str).str.split('-').str[2].astype(np.int8)

    # 'month_week_number': week of the month (days 1-7 -> 1, 8-14 -> 2, ...).
    #https://stackoverflow.com/questions/3806473/python-week-number-of-the-month
    df['month_week_number']=((df['month_day']-1)//7+1).astype(np.int8)

    # 'events_per_day': how many of the two event slots are filled (0, 1 or 2).
    has_event_1=df['event_type_1']!='no_event'
    has_event_2=df['event_type_2']!='no_event'
    df['events_per_day']=has_event_1.astype(np.int8)+has_event_2.astype(np.int8)

    # Lag features of 'demand' per id; rows without enough history get 0.
    #https://stackoverflow.com/questions/20410312/how-to-create-a-lagged-data-structure-using-pandas-dataframe
    for lag in (28,35,42,49,56,63,70):
        df['lag_'+str(lag)]=df.groupby(['id'])['demand'].shift(lag).fillna(0)

    # Rolling MEDIAN of 'demand' per id; NaN results are replaced by 0.
    #https://stackoverflow.com/questions/13996302/python-rolling-functions-for-groupby-object
    for window in (7,14,28,35,42):
        rolled=df.groupby(['id'])['demand'].transform(lambda s,w=window: s.rolling(w,center=False).median())
        df['rolling_median_'+str(window)]=rolled.fillna(0)

    # Integer-encode the categorical columns into '<name>_' and drop the originals.
    # NOTE(review): the encoder is re-fit on this call's rows only, so the codes match
    # the ones seen during training only if the same label set is present -- confirm.
    #https://www.mygreatlearning.com/blog/label-encoding-in-python/
    encoder=LabelEncoder()
    for name in event_cols+id_cols:
        df[name+'_']=encoder.fit_transform(df[name])
    df=df.drop(event_cols+id_cols,axis=1)

    # Numeric day index taken from 'd' ('d_1914' -> 1914).
    df['day']=df['d'].astype(str).str.split('_').str[1].astype(np.int16)

    # 'd', 'date' and 'weekday' are covered by 'day', 'month'/'year'/'month_day' and 'wday';
    # 'demand' is the target and must not be fed to the model.
    df=df.drop(['d','date','weekday','demand'],axis=1)
    df=reduce(df)

    # Keep only days after d_1000.
    df=df[df['day']>1000]

    # Validation window: d_1914 ... d_1941. Test window: d_1942 ... d_1969.
    x_valid=df.loc[df['day'].between(1914,1941)]
    x_test=df.loc[df['day']>=1942]

    # SECURITY: pickle.load can execute arbitrary code -- only point model_path at a trusted file.
    with open(model_path,'rb') as f:
        model=pickle.load(f)

    pred_valid=pd.DataFrame()
    pred_test=pd.DataFrame()
    pred_valid['id']=x['id']
    pred_test['id']=x['id']

    # One prediction column per forecast day: F1 ... F28.
    for step,day in enumerate(range(1914,1942),start=1):
        pred_valid['F'+str(step)]=model.predict(x_valid[x_valid['day']==day])
    pred_valid['id']=pred_valid['id'].apply(lambda s: s.replace('evaluation','validation'))

    for step,day in enumerate(range(1942,1970),start=1):
        pred_test['F'+str(step)]=model.predict(x_test[x_test['day']==day])

    return pred_valid,pred_test


In [None]:
# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer (it measures elapsed wall-clock time).
start=time.perf_counter()
prediction_valid,prediction_test=function_1(sales)
elapsed=time.perf_counter()-start
print("Time spent for Predictions: ", np.round(elapsed,2))

Time spent for Predictions:  2.37


* Forecast Sales from Day 1914 to 1941 

In [None]:
# First frame returned by function_1: forecasts for d_1914-d_1941 in columns F1..F28.
prediction_valid

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_2_273_CA_4_validation,0.981293,0.913018,1.136423,0.505705,0.954719,0.533876,0.331676,1.0282,1.913008,2.125701,1.133325,3.221623,2.078078,1.364402,1.898234,1.772512,1.922363,1.697718,1.725335,2.126977,1.745593,1.728166,1.301806,1.329687,1.132609,0.658532,1.267908,1.002004


* Forecast Sales from Day 1942 to 1969

In [None]:
# Second frame returned by function_1: forecasts for d_1942-d_1969 in columns F1..F28.
prediction_test

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_2_273_CA_4_evaluation,0.715647,0.704247,0.869178,0.902847,0.578938,0.853775,0.718523,0.44532,0.803831,1.00666,0.414762,0.737411,0.787071,0.654264,0.768275,0.387881,1.002529,0.790984,0.649645,0.970832,0.569661,0.876275,0.8009,1.013956,0.82274,0.711885,1.094165,0.233124


# Function-2

In [None]:
# Inspect `sales` again: function_1 added the NaN placeholder columns d_1942-d_1969 to it in place.
sales

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,...,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941,d_1942,d_1943,d_1944,d_1945,d_1946,d_1947,d_1948,d_1949,d_1950,d_1951,d_1952,d_1953,d_1954,d_1955,d_1956,d_1957,d_1958,d_1959,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,FOODS_2_273_CA_4_evaluation,FOODS_2_273,FOODS_2,FOODS,CA_4,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,3,0,1,0,0,0,0,6,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Remove the forecast placeholder columns (d_1942-d_1969) that function_1 added to `sales`.
# Dropping by name instead of by position keeps this cell idempotent: running it a
# second time can no longer strip 28 days of real sales history.
sales=sales.drop(columns=['d_'+str(i) for i in range(1942,1970)],errors='ignore')

In [None]:
def function_2(x,y_test,model_path='/content/drive//My Drive/CS-1/lgb_model.pkl'):
    """Forecast d_1914-d_1941 for the given rows and return the RMSE against ``y_test``.

    Parameters
    ----------
    x : pandas.DataFrame
        Wide sales frame holding the identifier columns 'id', 'item_id',
        'dept_id', 'cat_id', 'store_id', 'state_id'; every other column is
        treated as a day column named 'd_<n>'. The visible caller passes the
        history up to d_1913. The frame is copied, so the caller's data is
        left untouched.
    y_test : array-like
        True sales for d_1914-d_1941, either a flat sequence of 28 values or
        one row of 28 values per row of ``x`` (the visible caller passes a
        nested list with a single row).
    model_path : str, optional
        Location of the pickled, already trained regressor. Defaults to the
        path the notebook has always used.

    Returns
    -------
    float
        Root mean squared error of the predictions over the 28 days.

    Notes
    -----
    Reads the notebook-level globals ``calendar_`` and ``sell_prices_``.
    Columns are created in a fixed order because the feature frame is handed
    to the model as-is; do not reorder the assignments below.
    """
    id_cols=['id','item_id','dept_id','cat_id','store_id','state_id']
    event_cols=['event_name_1','event_type_1','event_name_2','event_type_2']

    # Work on a private copy: the caller typically passes a slice of a larger frame.
    x=x.copy()

    # Placeholder (NaN, float16) columns for d_1914 ... d_1941. True labels exist for
    # this range, which is why it is used to measure the RMSE.
    for day in range(1914,1942):
        x['d_'+str(day)]=np.full(len(x),np.nan,dtype=np.float16)

    # Wide -> long: one row per (id, day), then attach calendar and price information.
    df=pd.melt(x,id_vars=id_cols,var_name='d',value_name='demand')
    df=pd.merge(df,calendar_,on='d',how='left')
    df=pd.merge(df,sell_prices_,on=['item_id','store_id','wm_yr_wk'],how='left')

    # Weeks without a price get the mean price of the same id.
    df['sell_price']=df['sell_price'].fillna(df.groupby('id')['sell_price'].transform('mean'))

    # Object columns -> category dtype.
    for name in df.select_dtypes(include='object').columns:
        df[name]=df[name].astype('category')

    # Days without an event get the explicit label 'no_event'.
    # Assigning the result back (instead of a chained in-place fillna) also works
    # when pandas Copy-on-Write is active.
    for name in event_cols:
        df[name]=df[name].astype('category').cat.add_categories('no_event').fillna('no_event')

    # 'is_weekend': 1 when wday <= 2 (assumes the calendar numbers the weekend days 1 and 2).
    df['is_weekend']=(df['wday']<=2).astype(np.int8)

    # 'month_day': day of the month, i.e. the third '-' separated field of 'date'.
    df['month_day']=df['date'].astype(str).str.split('-').str[2].astype(np.int8)

    # 'month_week_number': week of the month (days 1-7 -> 1, 8-14 -> 2, ...).
    #https://stackoverflow.com/questions/3806473/python-week-number-of-the-month
    df['month_week_number']=((df['month_day']-1)//7+1).astype(np.int8)

    # 'events_per_day': how many of the two event slots are filled (0, 1 or 2).
    has_event_1=df['event_type_1']!='no_event'
    has_event_2=df['event_type_2']!='no_event'
    df['events_per_day']=has_event_1.astype(np.int8)+has_event_2.astype(np.int8)

    # Lag features of 'demand' per id; rows without enough history get 0.
    #https://stackoverflow.com/questions/20410312/how-to-create-a-lagged-data-structure-using-pandas-dataframe
    for lag in (28,35,42,49,56,63,70):
        df['lag_'+str(lag)]=df.groupby(['id'])['demand'].shift(lag).fillna(0)

    # Rolling MEDIAN of 'demand' per id; NaN results are replaced by 0.
    #https://stackoverflow.com/questions/13996302/python-rolling-functions-for-groupby-object
    for window in (7,14,28,35,42):
        rolled=df.groupby(['id'])['demand'].transform(lambda s,w=window: s.rolling(w,center=False).median())
        df['rolling_median_'+str(window)]=rolled.fillna(0)

    # Integer-encode the categorical columns into '<name>_' and drop the originals.
    # NOTE(review): the encoder is re-fit on this call's rows only, so the codes match
    # the ones seen during training only if the same label set is present -- confirm.
    #https://www.mygreatlearning.com/blog/label-encoding-in-python/
    encoder=LabelEncoder()
    for name in event_cols+id_cols:
        df[name+'_']=encoder.fit_transform(df[name])
    df=df.drop(event_cols+id_cols,axis=1)

    # Numeric day index taken from 'd' ('d_1914' -> 1914).
    df['day']=df['d'].astype(str).str.split('_').str[1].astype(np.int16)

    # 'd', 'date' and 'weekday' are covered by 'day', 'month'/'year'/'month_day' and 'wday';
    # 'demand' is the target and must not be fed to the model.
    df=df.drop(['d','date','weekday','demand'],axis=1)
    df=reduce(df)

    # Keep only days after d_1000.
    df=df[df['day']>1000]

    # Evaluation rows: d_1914 ... d_1941 (day > 1913).
    x_test=df.loc[df['day']>1913]

    # SECURITY: pickle.load can execute arbitrary code -- only point model_path at a trusted file.
    with open(model_path,'rb') as f:
        model=pickle.load(f)

    pred_test=model.predict(x_test)

    # melt() emits all rows for d_1914 first, then all rows for d_1915, and so on.
    # Flattening a (rows, 28) label matrix column by column reproduces that order, so the
    # comparison is also correct when more than one row is passed. For a single row this
    # is identical to the previous broadcasting behaviour.
    y_true=np.asarray(y_test,dtype=float)
    if y_true.ndim==2:
        y_true=y_true.ravel(order='F')
    rmse=np.sqrt(((pred_test-y_true)**2).mean())

    return rmse


In [None]:
#https://stackoverflow.com/questions/5478351/python-time-measure-function
# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer (it measures elapsed wall-clock time).
start=time.perf_counter()
# Features: everything except the last 28 day columns; labels: those last 28 columns.
rmse=function_2(sales.iloc[:,:-28],sales.iloc[:,-28:].values.tolist())
elapsed=time.perf_counter()-start
print("Time spent: {}".format(np.round(elapsed,2)))
print("Rmse: {}".format(np.round(rmse,3)))

Time spent: 2.07
Rmse: 1.343
