In [None]:
import warnings
warnings.filterwarnings('ignore')

## 1. Data Ingestion

### 1.1 Load Cluster Dataset

In [None]:
%%spark
import pandas as pd
# Read the sales CSV through Spark and pull it into pandas. No schema
# inference is requested on the reader, so every column is cast explicitly
# below.
df = spark.read.format('csv').option('header','true').load("/mnt/resource/o9_spark_temp/jhub/5300/martinfabbri_5300/Sales.csv")
#df = spark.read.format('csv').option('header','true').load('/mnt/resource/o9_spark_temp/jhub/6017/aabhaschandra_6017/Sales.csv')
sales = df.toPandas()
sales['Date'] = pd.to_datetime(sales['Date'])
sales['StoreId'] = sales['StoreId'].astype(int)
sales['WeeklySales'] = sales['WeeklySales'].astype(float)
sales['Department'] = sales['Department'].astype(int)
# Parse the flag textually: astype(bool) on string data maps every non-empty
# value -- including "False" -- to True, which would corrupt both the column
# and any later join that uses IsHoliday as a key.
sales['IsHoliday'] = (
    sales['IsHoliday'].astype(str).str.strip().str.lower().isin(['true', '1'])
)

sales.head()

In [None]:
%%spark
# Build Spark's summary table for the DataFrame currently bound to `df`
# and print it to the cell output.
sales_summary = df.summary()
sales_summary.show()

### 1.2 Load EKG Dataset

In [None]:
%%spark
from o9_ibpl_magics import spark_ibpl

# IBPL query selecting the store id, type and size attributes.
store_query = 'select ([Store].[Store_ID] * [Store].[Type] * [Store].[Size]) on column;'
df = spark_ibpl(store_query, spark)

# Rename the id column so it matches the `StoreId` key used by the other
# tables, then bring the result into pandas with an integer key.
stores_df = df.withColumnRenamed("StoreStoreID", "StoreId")
stores = stores_df.toPandas()
stores = stores.astype({'StoreId': int})
stores.head()

### 1.3 Load External Dataset - Azure Blob Storage

In [None]:
%%spark
# Read the external features table directly from its blob-storage URL and
# normalise the two join keys (datetime `Date`, integer `StoreId`).
csv_path = "https://o9demostorage.blob.core.windows.net/o9demodata/Features.csv"
features = pd.read_csv(csv_path, encoding='utf8').assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    StoreId=lambda frame: frame['StoreId'].astype(int),
)
features.head()

## 2. Data Processing

### 2.1 Merging Dataframes

In [None]:
%%spark
# Left-join the features and then the store attributes onto the sales rows,
# and replace every missing value with 0.
df = (
    sales
    .merge(features, on=['StoreId', 'Date', 'IsHoliday'], how='left')
    .merge(stores, on=['StoreId'], how='left')
    .fillna(0)
)

# Fahrenheit -> Celsius. NOTE(review): this runs after fillna(0), so rows that
# had no temperature are converted from 0 as well -- confirm that is intended.
df['Temperature'] = df['Temperature'].sub(32).mul(5.).div(9.)

# Integer-encode the store type; `types` keeps the original labels.
types_encoded, types = pd.factorize(df['StoreType'])
df['Type'] = types_encoded

df.head()

### 2.2 Remove Duplicates 

In [None]:
%%spark
# Report how many fully duplicated rows exist, then keep only the first
# occurrence of each.
duplicate_count = df.duplicated().sum()
print('training_data duplicated:{}'.format(duplicate_count))
df = df.drop_duplicates()

### 2.3 Feature Engineering

In [None]:
%%spark
# Per-column overview: dtype, number of nulls and percentage of nulls.
# Built with pd.concat because DataFrame.append was removed in pandas 2.0.
null_counts = df.isnull().sum()
tab_info = pd.concat([
    pd.DataFrame(df.dtypes).T.rename(index={0: 'column Type'}),
    pd.DataFrame(null_counts).T.rename(index={0: 'null values (nb)'}),
    pd.DataFrame(null_counts / df.shape[0] * 100).T.rename(index={0: 'null values (%)'}),
])
tab_info

In [None]:
%%spark
# Total WeeklySales per Date across all rows. Despite the "average" in the
# variable names, the aggregation below is a sum.
df_average_sales_week = (
    df[['Date', 'WeeklySales']]
    .groupby('Date', as_index=False)
    .sum()
)
# Same totals ranked from the largest to the smallest.
df_average_sales = df_average_sales_week.sort_values(by='WeeklySales', ascending=False)
# Date-indexed single-column frame used as the model's time series.
ts = df_average_sales_week.set_index('Date')

## 3. Model Training

In [None]:
%%spark
from sklearn.linear_model import LinearRegression

def fit_ar_model(ts, orders):
    """Fit a linear autoregressive model on a chosen set of lags.

    Row ``i`` of the design matrix holds ``ts[i - lag]`` for each ``lag`` in
    ``orders`` and the regression target is ``ts[i]``. Only rows with
    ``i >= max(orders)`` are used, since earlier rows lack a full lag history;
    rows whose first lag value is NaN are skipped as well.

    Parameters
    ----------
    ts : pandas DataFrame with a single column, or a Series
        The time series, ordered in time.
    orders : array-like of int
        Lags (in rows) used as regressors.

    Returns
    -------
    tuple
        ``(coef_, intercept_)`` of the fitted ``LinearRegression``, with the
        shapes sklearn produces for the given target.

    Raises
    ------
    ValueError
        If ``ts`` has more than one column or is not longer than the largest
        lag.
    """
    lags = np.asarray(orders).ravel()
    n_obs = len(ts)
    Y = ts.values
    series = np.asarray(Y, dtype=float).ravel()
    if series.size != n_obs:
        raise ValueError("fit_ar_model expects a single-column time series")

    max_lag = int(np.max(lags))
    if n_obs <= max_lag:
        raise ValueError(
            "time series has %d rows but the largest lag is %d; "
            "no training rows are available" % (n_obs, max_lag)
        )

    # Build the (rows x lags) matrix in one indexing step. The result is always
    # 2-D, so a model with a single lag works the same as one with several.
    rows = np.arange(max_lag, n_obs)
    X = series[rows[:, None] - lags[None, :]]
    Y = Y[max_lag:]
    mask = ~np.isnan(X[:, 0])

    lin_reg = LinearRegression()
    lin_reg.fit(X[mask], Y[mask])
    print(lin_reg.coef_, lin_reg.intercept_)
    print('Score factor: %.2f' % lin_reg.score(X[mask], Y[mask]))
    return lin_reg.coef_, lin_reg.intercept_
    
def predict_ar_model(ts, orders, coef, intercept):
    """Compute in-sample predictions of a fitted autoregressive model.

    For each row ``i >= max(orders)`` the prediction is
    ``sum(coef[k] * ts[i - orders[k]]) + intercept``; earlier rows have no
    complete lag history and are returned as NaN.

    Parameters
    ----------
    ts : pandas DataFrame with a single column, or a Series
        The time series, ordered in time.
    orders : array-like of int
        Lags (in rows) the model was fitted on.
    coef : array-like
        One weight per lag; any shape that flattens to ``len(orders)`` values
        is accepted (for example a ``(1, k)`` array).
    intercept : float or length-1 array-like
        Model intercept.

    Returns
    -------
    numpy.ndarray
        1-D float array with ``len(ts)`` entries.

    Raises
    ------
    ValueError
        If ``ts`` has more than one column.
    """
    lags = np.asarray(orders).ravel()
    series = np.asarray(ts.values, dtype=float).ravel()
    if series.size != len(ts):
        raise ValueError("predict_ar_model expects a single-column time series")

    # Flatten the parameters so every prediction is a plain scalar. Mixing
    # length-1 arrays with scalar NaN in one list is rejected by recent NumPy.
    weights = np.asarray(coef, dtype=float).ravel()
    bias = float(np.asarray(intercept, dtype=float).ravel()[0])

    predictions = np.full(series.size, np.nan)
    max_lag = int(np.max(lags))
    if series.size > max_lag:
        rows = np.arange(max_lag, series.size)
        lagged = series[rows[:, None] - lags[None, :]]
        predictions[max_lag:] = lagged.dot(weights) + bias
    return predictions

In [None]:
%%spark
import numpy as np
# Lags used as regressors: 1, 6 and 52 rows back in the series.
# NOTE(review): presumably 52 targets yearly seasonality on weekly data -- confirm.
orders = np.asarray([1, 6, 52])
coef, intercept = fit_ar_model(ts=ts, orders=orders)

## 4. Inference

In [None]:
%%spark
# Run the fitted model over the series and align the predictions with the
# original index so they can be compared against `ts`.
pred_values = predict_ar_model(ts, orders, coef, intercept)
pred = pd.DataFrame(data=pred_values, index=ts.index)
pred.tail()