In [None]:
import math
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression,LinearRegression,BayesianRidge, Lasso
from statistics import mean
from math import sqrt
import tensorflow as tf
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import LSTM, Bidirectional
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import Input, layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

import datetime
import warnings
from tqdm import tqdm
from pathlib import Path
import time
from copy import deepcopy
import os

In [None]:
# Read the calendar table from "calendar.csv" in the working directory and display it.
calendar = pd.read_csv("calendar.csv")
calendar

In [None]:
# Read the sales table from "sales_train_validation.csv" and display it.
sales = pd.read_csv("sales_train_validation.csv")
sales

In [None]:
# Read the price table from "sell_prices.csv" and display it.
prices = pd.read_csv("sell_prices.csv")
prices

In [None]:
def get_memory_usage():
    """Return the current process's memory usage in GiB, rounded to 2 decimals.

    Takes the first field of ``psutil.Process(os.getpid()).memory_info()``
    (the resident set size in bytes, per the psutil documentation) and divides
    it by 2**30.

    Raises:
        ImportError: if ``psutil`` is not installed.
    """
    # psutil is not part of the notebook's import cell; importing it here keeps
    # the function callable without touching the rest of the notebook.
    import psutil

    return np.round(psutil.Process(os.getpid()).memory_info()[0] / 2.**30, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format ``num`` as a short string using binary (1024-based) prefixes.

    ``num`` is divided by 1024 until its magnitude drops below 1024 or the
    prefixes up to 'Zi' are exhausted, in which case 'Yi' is used.

    Args:
        num: The quantity to format (e.g. a byte count); may be negative.
        suffix: Text appended after the prefix.

    Returns:
        A string such as ``"1.5KiB"``.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, prefixes[position], suffix)
        num /= 1024.0
        position += 1
    # Anything still >= 1024 after 'Zi' is reported in 'Yi'.
    return "%.1f%s%s" % (num, 'Yi', suffix)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns are reassigned on ``df`` itself, and that same object is returned.
    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched.

    Integer columns go to the first of int8/int16/int32/int64 whose limits
    contain ``[min, max]`` (limits inclusive, so e.g. a maximum of 127 still
    fits int8). Float columns go to float16, then float32, otherwise float64.
    Columns whose min/max are NaN fail every range check: floats end up as
    float64 and integers are left as they are.

    Caution: the float check looks at range only, not precision, so values may
    be rounded when stored as float16/float32.

    Args:
        df: DataFrame to shrink (modified in place).
        verbose: When true, print the resulting size and the percentage saved.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                for int_type in (np.int8, np.int16, np.int32, np.int64):
                    limits = np.iinfo(int_type)
                    # Inclusive bounds: a value equal to the limit is representable.
                    if c_min >= limits.min and c_max <= limits.max:
                        df[col] = df[col].astype(int_type)
                        break
            else:
                for float_type in (np.float16, np.float32):
                    limits = np.finfo(float_type)
                    if c_min >= limits.min and c_max <= limits.max:
                        df[col] = df[col].astype(float_type)
                        break
                else:
                    # Out of float32 range, or min/max is NaN.
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
def merge_by_concat(df1, df2, merge_on):
    """Append to ``df1`` the columns that a left merge with ``df2`` would add.

    Only the key columns of ``df1`` take part in the merge; the columns that
    come from ``df2`` are then concatenated column-wise onto ``df1``.

    Args:
        df1: Left frame; its columns are kept as they are.
        df2: Right frame supplying the extra columns.
        merge_on: List of key column names present in both frames.

    Returns:
        A new DataFrame with ``df1``'s columns followed by the non-key columns
        of ``df2``.
    """
    merged_gf = df1[merge_on]
    merged_gf = merged_gf.merge(df2, on=merge_on, how='left')
    new_columns = [col for col in list(merged_gf) if col not in merge_on]
    added = merged_gf[new_columns]
    # merge() returns a fresh 0..n-1 index while concat(axis=1) aligns on index
    # labels. A left merge keeps the left row order, so when the row count is
    # unchanged the rows correspond one-to-one and df1's index can be reused.
    # If df2 has duplicate keys the row counts differ and the frames are
    # concatenated as they are.
    if len(added) == len(df1):
        added.index = df1.index
    df1 = pd.concat([df1, added], axis=1)
    return df1

In [None]:
# Downcast the numeric columns of the three loaded tables.
# reduce_mem_usage reassigns columns on the frame it receives and returns that
# same object, so price_df / calendar_df / sales_df refer to the same objects
# as prices / calendar / sales rather than to independent copies.
price_df = reduce_mem_usage(prices)
calendar_df = reduce_mem_usage(calendar)
sales_df = reduce_mem_usage(sales)

In [None]:
# Display the price table after the memory reduction step.
price_df


In [None]:
def transform(data):
    """Fill missing event labels and label-encode the categorical columns of ``data``.

    The columns are reassigned on ``data`` itself and the same object is
    returned. Each categorical column gets its own freshly fitted
    ``LabelEncoder``; the encoders are not kept.

    Args:
        data: DataFrame containing the event columns and the id columns
            listed below.

    Returns:
        The same ``data`` object with the listed columns replaced by integer codes.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the result back: calling fillna(inplace=True) on data[feature]
        # is a chained in-place operation, which pandas warns about and which
        # does not update `data` under Copy-on-Write.
        data[feature] = data[feature].fillna('unknown')

    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1',
           'event_name_2', 'event_type_2']
    for feature in cat:
        encoder = LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])

    return data



In [None]:
# Load a previously saved frame from the local pickle file 'data_small.pkl'.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle('data_small.pkl')

In [None]:
# Display the frame loaded from the pickle file.
data 

In [None]:
# Read "sample_submission.csv" and display it.
submission = pd.read_csv("sample_submission.csv")
submission 

In [None]:
# Print the column / dtype / memory summary of `data`.
data.info()

In [None]:
# LabelEncoder is looked up by transform() when it is called, so this import
# has to run before the call below.
from sklearn.preprocessing import LabelEncoder


# transform() modifies `data` in place and returns the same object.
data = transform(data)

In [None]:
# Show the first rows of `data` after the transform step.
data.head()

In [None]:
# Read 'data_small.pkl' a second time into a separate variable and print its
# column / dtype summary.
# NOTE(review): presumably this is for comparing against `data`, which has
# been passed through transform() by this point; confirm.
# NOTE(review): the variable name is the same as the standard-library `pickle`
# module; it would shadow that module if it were imported in this notebook.
pickle = pd.read_pickle('data_small.pkl')
pickle.info()

In [None]:
# New frame without the "d", "wday", "month" and "year" columns; `data` itself is not modified.
training_data = data.drop(columns = ["d", "wday", "month", "year"])

In [None]:
# Build a list of datetime.date objects for the first 1913 calendar rows
# (the same count as the d_1..d_1913 columns selected from sales_df).
date_index = calendar_df['date']
dates = date_index[0:1913]
# pd.to_datetime accepts both 'YYYY-MM-DD' strings and values that are already
# datetimes, so this cell still works when re-run after calendar_df["date"]
# has been converted to datetime further down the notebook.
dates_list = pd.to_datetime(dates, format='%Y-%m-%d').dt.date.tolist()


In [None]:
# Display the calendar table.
calendar_df

In [None]:
# Replace missing values in the four event columns with the string "Unknown".
# Note the capitalisation differs from the 'unknown' filler used by the
# transform helpers in this notebook.
calendar_df[["event_name_1", "event_type_1", "event_name_2", "event_type_2"]] = calendar_df[["event_name_1", "event_type_1", "event_name_2", "event_type_2"]].fillna("Unknown")

In [None]:
# Print the column / dtype / memory summary of the calendar table.
calendar_df.info()

In [None]:
# Build "<item_id>_<store_id>" for every row of sales_df.
items = sales_df["item_id"]
stores = sales_df["store_id"]
# Element-wise string concatenation replaces the per-row .iloc loop; it yields
# the same strings in the same order.
item_store_list = (items + "_" + stores).tolist()
    

In [None]:
# Attach the combined item/store key to the sales table.
sales_df['item_store_id'] = item_store_list

# Transpose the d_1..d_1913 columns so that each row is one day and each
# column is one item/store series, labelled by the combined key.
DF_Sales = sales_df.loc[:, 'd_1':'d_1913'].transpose()
DF_Sales.columns = sales_df['item_store_id'].to_numpy()

# Index the rows by calendar date and convert that index to datetimes.
DF_Sales = DF_Sales.set_index([dates_list])
DF_Sales.index = pd.to_datetime(DF_Sales.index)
DF_Sales.head()

In [None]:
# Print the column / dtype / memory summary of the transposed sales frame.
DF_Sales.info()

In [None]:
# Select one series by column position, index it by date and plot it.
index = 10
y = DF_Sales.iloc[:, index].to_frame().set_index([dates_list])
y.index = pd.to_datetime(y.index)
TS_selected = y  # same object as y, not a copy

ax = y.plot(figsize=(30, 9), color='red')
ax.set_facecolor('lightgrey')
plt.title(label='Sales Demand Selected Time Series Over Time', fontsize=23)
plt.xlabel(xlabel='Date', fontsize=21)
plt.ylabel(ylabel='Sales Demand', fontsize=21)
plt.xticks(fontsize=21)
plt.yticks(fontsize=21)
plt.legend(fontsize=20)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder


def calendartransform(data):
    """Fill missing event labels and label-encode the calendar categoricals of ``data``.

    The columns are reassigned on ``data`` itself and the same object is
    returned. Listed columns that are absent from ``data`` are skipped, so the
    function also works on frames that carry the event columns but no
    'weekday' column.

    Args:
        data: DataFrame holding some or all of the event columns and 'weekday'.

    Returns:
        The same ``data`` object with the present columns replaced by integer codes.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'weekday']
    for feature in nan_features:
        if feature in data.columns:
            # Assign back instead of fillna(inplace=True) on data[feature]: the
            # chained in-place form is warned about by pandas and does not
            # update `data` under Copy-on-Write.
            data[feature] = data[feature].fillna('unknown')
    for feature in features:
        if feature in data.columns:
            encoder = LabelEncoder()
            data[feature] = encoder.fit_transform(data[feature])
    return data

def salestransform(data):
    """Label-encode the id columns of a sales frame.

    Each of 'item_id', 'dept_id', 'cat_id', 'store_id' and 'state_id' is
    replaced on ``data`` by the codes from a freshly fitted ``LabelEncoder``.

    Returns:
        The same ``data`` object.
    """
    for column in ('item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'):
        data[column] = LabelEncoder().fit_transform(data[column])
    return data
        
def pricestransform(data):
    """Label-encode the 'item_id' and 'store_id' columns of a price frame.

    Each column is replaced on ``data`` by the codes from a freshly fitted
    ``LabelEncoder``.

    Returns:
        The same ``data`` object.
    """
    for column in ('item_id', 'store_id'):
        data[column] = LabelEncoder().fit_transform(data[column])
    return data

In [None]:
# Convert the calendar dates to datetimes so they can be matched against the
# DatetimeIndex of DF_Sales.
calendar_df["date"] = pd.to_datetime(calendar_df["date"])

# Calendar columns carried into the merged frame.
sub_cal = calendar_df[[
    "date", "wm_yr_wk",
    "event_name_1", "event_type_1", "event_name_2", "event_type_2",
    "snap_CA", "snap_TX", "snap_WI",
]]

# Left join: every DF_Sales row (one per day) picks up its calendar attributes.
sales_calendar_df = DF_Sales.merge(sub_cal, how="left", left_index=True, right_on="date")

In [None]:
# Display the sales frame joined with the calendar columns.
sales_calendar_df

In [None]:
# Display the sales table (it now includes the 'item_store_id' column).
sales_df

In [None]:
# Display the price table.
price_df

In [None]:
# Build "<item_id>_<store_id>" for every row of price_df.
# Note: this rebinds item_store_list, items and stores, which previously held
# the values derived from sales_df.
items = price_df["item_id"]
stores = price_df["store_id"]
# Element-wise string concatenation replaces the per-row .iloc loop; it yields
# the same strings in the same order.
item_store_list = (items + "_" + stores).tolist()
    

In [None]:
# Attach the combined item/store key to the price table and display it.
price_df['item_store_id'] = item_store_list
price_df

In [None]:
# Display the sales table.
sales_df

In [None]:
# The 'sell_price' column as a Series (transposing a Series returns the Series itself).
DF_Price = price_df['sell_price']
DF_Price

In [None]:
# Copy of the sales table without the id columns listed below; sales_df itself
# is not modified. Display the result.
new_sales_df = sales_df.drop(columns = ["id", "item_id", "store_id", "dept_id", "cat_id", "state_id"])
new_sales_df

In [None]:
# Re-index the merged frame by its "date" values.
# Note: this rebinds the name `index`, which an earlier cell used for a column position.
index = sales_calendar_df["date"]
sales_calendar = sales_calendar_df.set_index(index)

In [None]:
# Display the date-indexed frame.
sales_calendar

In [None]:
# calendartransform works on the frame it is given and returns that same
# object, so processed_df and sales_calendar are one object.
# NOTE(review): calendartransform lists 'weekday' among the columns it
# encodes, but sales_calendar is built from sub_cal, which does not select
# 'weekday'; confirm this call succeeds on a fresh kernel.
processed_df = calendartransform(sales_calendar)


In [None]:
# Display the result of calendartransform on sales_calendar.
processed_df

In [None]:
# Weekday names for the first 1913 calendar rows, joined onto the merged
# sales/calendar frame by index label.
days = calendar_df["weekday"]
filterdays = days.iloc[0:1913]
merged = sales_calendar_df.merge(filterdays, how="inner", left_index=True, right_index=True)

In [None]:
# calendartransform returns the frame it was given, so `processed` and `merged`
# are the same object; the in-place drop below therefore also removes "date"
# and "wm_yr_wk" from `merged`.
# NOTE(review): because of that aliasing, re-running this cell without first
# rebuilding `merged` will presumably fail in drop(), the columns already
# being gone.
processed = calendartransform(merged)
processed.drop(columns = ["date", "wm_yr_wk"], inplace = True)
processed