### Train a classifier to determine product seasonality

In [None]:
#import necessary libraries
from azureml.core import Workspace, Dataset
from azureml.data.datapath import DataPath
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Establish workspace from the environment and retrieve the defined AggregatedProductSeasonality dataset

In [None]:
# Connect to the Azure ML workspace described by the local configuration.
ws = Workspace.from_config()

# Fetch the registered dataset by name and materialize it as a pandas data frame.
dataset = Dataset.get_by_name(ws, name='AggregatedProductSeasonality')
prod_df = dataset.to_pandas_dataframe()

# Reshape from long to wide format: one row per (ProductId, Seasonality) pair and
# one column per TransactionDateId holding the matching TransactionItemsCount.
index_columns = ['ProductId', 'Seasonality', 'TransactionDateId']
prod_prep_df = (
    prod_df.set_index(index_columns)['TransactionItemsCount']
    .unstack()                   # innermost index level (TransactionDateId) becomes the columns
    .rename_axis(None, axis=1)   # drop the leftover name on the column axis
    .reset_index()               # ProductId and Seasonality come back as the leading columns
    .fillna(0)                   # missing values in the pivoted frame become 0
)

### Isolate features and prediction classes. Standardize features by removing the mean and scaling to unit variance.

In [None]:
# Everything from the third column onward is used as the feature matrix; the
# 'Seasonality' column supplies the class labels.
X = prod_prep_df.iloc[:, 2:].values
y = prod_prep_df['Seasonality'].values

# Standardize every feature column before running PCA.
standardizer = StandardScaler()
X_scale = standardizer.fit_transform(X)

# Perform dimensionality reduction using Principal Components Analysis and two target components.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scale)

# Rescale each principal component with a MinMaxScaler (default feature range).
component_scaler = MinMaxScaler()
principal_components = component_scaler.fit_transform(principal_components)

# Put the two components next to the class label of each product.
component_frame = pd.DataFrame(data=principal_components, columns=['pc1', 'pc2'])
pca_df = pd.concat([component_frame, prod_prep_df[['Seasonality']]], axis=1)

### Visualize the products data mapped to the two principal components

Display the products data frame in two dimensions (mapped to the two principal components).

Note the clear separation of clusters.

In [None]:
# Scatter plot of the products in the plane of the two principal components,
# one color per Seasonality value.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

targets = [1, 2, 3]
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    # Keep only the rows belonging to the current Seasonality value.
    season_rows = pca_df[pca_df['Seasonality'] == target]
    ax.scatter(season_rows['pc1'], season_rows['pc2'], c=color, s=1)

# The legend is created before the guide lines are drawn, so its three labels
# are assigned, in order, to the three scatter series above.
ax.legend(['All Season Products', 'Summer Products', 'Winter Products'])

# Two dotted yellow guide lines across the plot, given by their left and right y values.
for y_left, y_right in [(0.77, 1.0), (0.37, 0.6)]:
    ax.plot([-0.05, 1.05], [y_left, y_right], linestyle=':', linewidth=1, color='y')
ax.grid()

plt.show()
plt.close()

In [None]:
# Redo the Principal Components Analysis, this time with twenty dimensions.

def col_name(x):
    """Return the feature column name for index x, zero-padded to two digits (3 -> 'f03')."""
    return f'f{x:02}'

component_count = 20
pca = PCA(n_components=component_count)
principal_components = pca.fit_transform(X_scale)
principal_components = MinMaxScaler().fit_transform(principal_components)

# One column per principal component, named f00 .. f19.
feature_names = [col_name(i) for i in range(component_count)]
X = pd.DataFrame(data=principal_components, columns=feature_names)

# Two views of the same features: one keyed by product, one carrying the class label.
pca_df = pd.concat([X, prod_prep_df[['ProductId']]], axis=1)
pca_automl_df = pd.concat([X, prod_prep_df[['Seasonality']]], axis=1)

# Keep only the first 4500 rows of the features, the labels and the labelled frame.
row_limit = 4500
X = X[:row_limit]
y = prod_prep_df['Seasonality'][:row_limit]
pca_automl_df = pca_automl_df[:row_limit]

### Register the PCA dataframe as a dataset with AML Studio

In [None]:
# Register the pca_automl_df dataset with the Azure Machine Learning workspace for AutoML use in the next task.
# Due to the distributed nature, we must first persist the data to storage to be read by a registered dataset.
local_path = 'pca.parquet'
pca_automl_df.to_parquet(local_path)
pca_datastore = ws.get_default_datastore()

# overwrite=True replaces any copy of the file uploaded by an earlier run of this cell.
pca_datastore.upload_files(files=[local_path], target_path='data', overwrite=True)
pca_ds = Dataset.Tabular.from_parquet_files(pca_datastore.path('data/pca.parquet'))

# create_new_version=True keeps this cell re-runnable: when a dataset named
# 'pcadata' is already registered, a new version is registered under that name
# instead of the registration being rejected.
pca_ds = pca_ds.register(
    workspace=ws,
    name='pcadata',
    description='data for automl',
    create_new_version=True,
)


### Train ensemble of trees classifier (using XGBoost)

In [None]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

# Train an XGBoost classifier with its default hyperparameters.
# NOTE(review): the plotting cell treats Seasonality as 1/2/3; some XGBoost
# releases appear to require class labels starting at 0 - confirm against the
# installed version before upgrading.
model = XGBClassifier()
model.fit(X_train, y_train)

In [None]:
# Predict a class for every row of the held-out test set.
y_pred = model.predict(X_test)

# Score the predictions against the true test labels and report the result as a percentage.
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % (accuracy_pct,))