In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType
from pyspark.sql.functions import ceil,col,split,lit,sum,bround
from pyspark.sql import functions as sf
from pyspark.sql import Window
from pyspark.sql.functions import min, max
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import HiveContext
import pyspark.sql.functions as F
import pyspark.sql.functions as f
from pyspark.sql.functions import when
from pyspark.sql.functions import desc, regexp_replace
from pyspark.sql.functions import *

from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql.functions import *
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics 
from sklearn.metrics import accuracy_score


In [None]:
# NOTE(review): `sc` is never defined in this notebook; it is presumably the
# SparkContext injected by the PySpark kernel/shell — confirm before a fresh run.
sqlContext = HiveContext(sc)

# Inventory extract, read as a header-less comma-separated file, so the columns
# arrive with Spark's positional names (_c0, _c1, ...).
# NOTE(review): 'location' looks like a redacted placeholder path — confirm.
inventory_raw_data = sqlContext.read \
     .format('com.databricks.spark.csv') \
     .options(header='false', delimiter=',') \
     .load('location')

# FR03 extracts: one DataFrame per DC, each read with the same options as above.
fr03_raw_data_dc01 = sqlContext.read \
     .format('com.databricks.spark.csv') \
     .options(header='false', delimiter=',') \
     .load('location')

# (the reads between dc01 and dc30 are elided in this listing)
...

fr03_raw_data_dc30 = sqlContext.read \
     .format('com.databricks.spark.csv') \
     .options(header='false', delimiter=',') \
     .load('location')

# Stack the per-DC FR03 extracts into a single Spark DataFrame.
# Each DC appears exactly once: the previous list contained fr03_raw_data_dc02
# twice, which duplicated every DC02 row in the union.
# NOTE(review): if the second dc02 entry was a typo for another DC, add that frame here.
fr03_frames_by_dc = [
    fr03_raw_data_dc01,
    fr03_raw_data_dc02,
    fr03_raw_data_dc03,
    fr03_raw_data_dc04,
    fr03_raw_data_dc05,
    fr03_raw_data_dc07,
    fr03_raw_data_dc09,
    fr03_raw_data_dc10,
    fr03_raw_data_dc20,
    fr03_raw_data_dc30,
]

# Explicit loop instead of reduce(): the wildcard import of pyspark.sql.functions
# at the top of the notebook can rebind the name `reduce` (PySpark 3.5+ ships its
# own column-level reduce), so this cell must not depend on which one is in scope.
# union() is positional, like the unionAll alias it replaces.
fr03_raw_data = fr03_frames_by_dc[0]
for dc_frame in fr03_frames_by_dc[1:]:
    fr03_raw_data = fr03_raw_data.union(dc_frame)

In [None]:
# Inventory: keep the first 44 positional columns (_c0 .. _c43) and give each a
# descriptive name. The renames between _c1 and _c42 are elided in this listing.
inventory_raw_data = inventory_raw_data.select(col("_c0"), col("_c1"), col("_c2"),col("_c3"),col("_c4"), col("_c5"),col("_c6"),col("_c7"),col("_c8"), col("_c9"), col("_c10"), col("_c11"), col("_c12"), col("_c13"), col("_c14"), col("_c15"), col("_c16"), col("_c17"), col("_c18"), col("_c19"), col("_c20"), col("_c21"), col("_c22"), col("_c23"), col("_c24"),col("_c25"), col("_c26"), col("_c27"), col("_c28"), col("_c29"), col("_c30"), col("_c31"), col("_c32"), col("_c33"), col("_c34"), col("_c35"), col("_c36"), col("_c37"), col("_c38"), col("_c39"), col("_c40"), col("_c41"), col("_c42"), col("_c43"))\
.withColumnRenamed("_c0", "DC") \
.withColumnRenamed("_c1", "WH") \
...
.withColumnRenamed("_c42", "Vendor_Ti") \
.withColumnRenamed("_c43", "Vendor_Hi") \

In [None]:
# FR03: keep the first 29 positional columns (_c0 .. _c28) and rename them.
# Every FR03 name carries a `_fr03` suffix here, which keeps DC/WH distinct from
# the inventory columns of the same base name. The renames between _c3 and _c28
# are elided in this listing.
fr03_raw_data = fr03_raw_data.select(col("_c0"), col("_c1"), col("_c2"),col("_c3"),col("_c4"), col("_c5"),col("_c6"),col("_c7"),col("_c8"), col("_c9"), col("_c10"), col("_c11"), col("_c12"), col("_c13"), col("_c14"), col("_c15"), col("_c16"), col("_c17"), col("_c18"), col("_c19"), col("_c20"), col("_c21"), col("_c22"), col("_c23"), col("_c24"),col("_c25"), col("_c26"), col("_c27"), col("_c28"))\
.withColumnRenamed("_c0", "DC_fr03") \
.withColumnRenamed("_c1", "WH_fr03") \
.withColumnRenamed("_c2", "LOCATION_fr03") \
.withColumnRenamed("_c3", "Slot_Type_fr03")\
...
.withColumnRenamed("_c28", "Date_fr03")\

In [None]:
# Subset of the renamed FR03 columns that the rest of the notebook works with
# (the column list between DC_fr03 and Date_fr03 is elided in this listing).
fr03_raw_data_select = fr03_raw_data.select(col("DC_fr03"), ... col("Date_fr03"))

In [None]:
# Inventory: the CSV was read without a schema, so cast the columns to their
# intended types one by one (casts between DC and Vendor_Hi are elided in this listing).
inventory_raw_data = inventory_raw_data.withColumn("DC",inventory_raw_data.DC.cast(IntegerType()))\
...
.withColumn("Vendor_Hi",inventory_raw_data.Vendor_Hi.cast(IntegerType()))\

In [None]:
# FR03
fr03_raw_data_select = fr03_raw_data_select.withColumn("DC_fr03",fr03_raw_data.DC_fr03.cast(IntegerType()))\
...
.withColumn("CoorZ",fr03_raw_data_select.CoorZ.cast(IntegerType()))\

In [None]:
# Replace missing PRODUCT_CYCLE_CLASS values with the explicit label
# "No_Cycle_Class"; non-null values pass through unchanged.
cycle_class = F.col("PRODUCT_CYCLE_CLASS")
inventory_raw_data = inventory_raw_data.withColumn(
    "PRODUCT_CYCLE_CLASS",
    F.when(cycle_class.isNull(), F.lit("No_Cycle_Class")).otherwise(cycle_class),
)

In [None]:
# Left-join each inventory row to its FR03 slot record. A match requires the
# same slot address, DC and warehouse (the list of conditions is AND-ed by
# Spark); inventory rows with no match keep null FR03 columns.
slot_match = [
    inventory_raw_data["Slot_Address"] == fr03_raw_data_select["LOCATION_fr03"],
    inventory_raw_data["DC"] == fr03_raw_data_select["DC_fr03"],
    inventory_raw_data["WH"] == fr03_raw_data_select["WH_fr03"],
]
inventory_fr03_raw_data = (
    inventory_raw_data
    .join(fr03_raw_data_select, on=slot_match, how="left")
    .distinct()  # collapse rows that are identical across every column
)



In [None]:
# Collect the joined Spark DataFrame to the driver as a pandas DataFrame;
# all cells after this one work on the pandas copy.
inventory_fr03_df = inventory_fr03_raw_data.toPandas()

## Data Preprocessing

In [None]:
# Keep only the records that matched an FR03 location (non-null DC_fr03).
# Records without location information are usually in the pharmacy rather than
# the major inventory, so they are dropped.
has_fr03_location = inventory_fr03_df['DC_fr03'].notna()
inventory_fr03_df = inventory_fr03_df[has_fr03_location]

In [None]:
# Strip the columns that hold sensitive information before any further analysis.
sensitive_columns = ["Senior", "Buyer_Name"]
inventory_fr03_df = inventory_fr03_df.drop(sensitive_columns, axis="columns")

In [None]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
inventory_fr03_df.info()

In [None]:
# Count the null values in each column.
inventory_fr03_df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
inventory_fr03_df.duplicated().sum()

## K-Means Clustering

In [None]:
# Report how many rows have no MVTOT_AVG_SHIP value, then drop them.
# The count used to be a bare expression in the middle of the cell, so the
# notebook computed it and silently discarded it; print it explicitly.
# (MVTOT_AVG_SHIP is one of the k-means features selected in the next cell.)
missing_avg_ship = inventory_fr03_df["MVTOT_AVG_SHIP"].isnull().sum()
print('Rows missing MVTOT_AVG_SHIP:', missing_avg_ship)
inventory_fr03_df = inventory_fr03_df.dropna(subset=['MVTOT_AVG_SHIP'])

In [None]:
# Feature matrix for the k-means cells that follow: MVTOT_AVG_SHIP, CoorX, CoorY and DC.
# NOTE(review): the features are used unscaled and DC looks like an identifier
# rather than a magnitude — confirm this is intended, since k-means is distance-based.
X = inventory_fr03_df[["MVTOT_AVG_SHIP","CoorX","CoorY","DC"]]

In [None]:
# Elbow method for choosing k: fit k-means for k = 1..10 and record each model's
# WCSS (Within-Cluster Sum of Squares, exposed by scikit-learn as `inertia_`).
cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42).fit(X)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k; the "elbow" of the curve suggests a reasonable k.
fig, ax = plt.subplots()
ax.plot(cluster_counts, wcss)
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

# Reference:
# https://www.analyticsvidhya.com/blog/2021/01/in-depth-intuition-of-k-means-clustering-algorithm-in-machine-learning/

In [None]:
# Final clustering with k = 4 (presumably read off the elbow plot — confirm).
# Fitting and then reading `labels_` yields the cluster index of every row of X.
kmeans = KMeans(n_clusters=4, init="k-means++", random_state=42)
y_kmeans = kmeans.fit(X).labels_

In [None]:
# Attach each row's k-means cluster index as the Predicted_Class column;
# it is the target variable for the classifiers trained later.
inventory_fr03_df = inventory_fr03_df.assign(Predicted_Class=y_kmeans)

## Multi-class Logistic Model

In [None]:
# One-hot encode Product_Super_Category: the column is replaced by one
# indicator column per category, named Product_Super_Category_<value>.
data_df = pd.get_dummies(inventory_fr03_df,columns=['Product_Super_Category'])

In [None]:
# List the column names, including the indicator columns created by the encoding.
data_df.columns

In [None]:
# Predictors: the one-hot Product_Super_Category indicators plus two raw columns.
# Target: the k-means cluster index stored in Predicted_Class.
category_indicator_cols = [
    'Product_Super_Category_BAKERY',
    'Product_Super_Category_DAIRY',
    'Product_Super_Category_DELI',
    'Product_Super_Category_EDIBLE GROCERY',
    'Product_Super_Category_FROZEN',
    'Product_Super_Category_GEN MDSE',
    'Product_Super_Category_H&BC',
    'Product_Super_Category_MEAT',
    'Product_Super_Category_NATURE PLACE',
    'Product_Super_Category_NON-EDIBLE GROCERY',
    'Product_Super_Category_PRODUCE',
    'Product_Super_Category_SEAFOOD',
    'Product_Super_Category_Z-SUPER CATEGORY',
]
raw_feature_cols = ['Vendor#', 'Qty_in_Slot']

X = data_df[category_indicator_cols + raw_feature_cols]
y = data_df['Predicted_Class']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=123,
)

In [None]:
# Multinomial logistic regression with 5-fold cross-validation,
# fitted on the training split and scored on the held-out split.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version.
logit_cv = LogisticRegressionCV(random_state=123, cv=5, multi_class='multinomial')
model1 = logit_cv.fit(X_train, y_train)
preds = model1.predict(X_test)

# Show the estimator's tunable parameters (nothing was tuned in this example;
# everything apart from the arguments above is left at its default).
params = model1.get_params()
print(params)

In [None]:
# Overall accuracy of the logistic model on the held-out split.
logit_accuracy = accuracy_score(y_test, preds)
print('Accuracy Score:', logit_accuracy)

# Per-class classification report for the same predictions.
class_report = classification_report(y_test, preds)
print(class_report)

In [None]:
# Confusion matrix for the logistic model, stored in `confmtrx`.
# Rows are the true cluster labels (Predicted_Class from k-means) and columns
# are the labels predicted by the model.
# The labels are derived from the data rather than hard-coded as 0..3, so the
# table still builds if a cluster is absent from this split or if k changes.
cluster_labels = sorted(set(y_test) | set(preds))
confmtrx = confusion_matrix(y_test, preds, labels=cluster_labels)

pd.DataFrame(
    confmtrx,
    index=cluster_labels,
    columns=[f'predicted_{label}' for label in cluster_labels],
)

In [None]:
# Print the fitted model parameters (uncomment the lines below to inspect them)
# print('Intercept: \n', model1.intercept_)
# print('Coefficients: \n', model1.coef_)

## Random Forest

In [None]:
# Random forest classifier with 100 trees (RandomForestClassifier is imported
# at the top of the notebook).
# random_state is fixed so the forest, and every metric derived from it, is
# reproducible across runs; 123 matches the seed used for the split.
clf = RandomForestClassifier(n_estimators=100, random_state=123)

# Train on the training split.
clf.fit(X_train, y_train)

# Predict the cluster label of every row in the held-out split.
y_pred = clf.predict(X_test)

In [None]:
# Feature importances from the fitted forest, labelled by feature name and
# ordered from most to least important.
importances = pd.Series(clf.feature_importances_, index=X_train.columns)
feature_imp = importances.sort_values(ascending=False)
feature_imp

In [None]:
# Horizontal bar chart of the feature importances, most important first.
fig, ax = plt.subplots()
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)

# Label the chart so it can be read on its own.
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")

# No legend: the bars carry no labelled artists, so plt.legend() only produced
# a "no artists with labels" warning and an empty legend.
plt.show()

In [None]:
# Overall accuracy and per-class report for the random forest on the held-out split.
forest_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy Score:', forest_accuracy)

class_report = classification_report(y_test, y_pred)
print(class_report)

In [None]:
# Confusion matrix for the random forest, stored in `confmtrx`.
# Rows are the true cluster labels (Predicted_Class from k-means) and columns
# are the labels predicted by the forest.
# The labels are derived from the data rather than hard-coded as 0..3, so the
# table still builds if a cluster is absent from this split or if k changes.
cluster_labels = sorted(set(y_test) | set(y_pred))
confmtrx = confusion_matrix(y_test, y_pred, labels=cluster_labels)

pd.DataFrame(
    confmtrx,
    index=cluster_labels,
    columns=[f'predicted_{label}' for label in cluster_labels],
)