In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from DataManipulation import DataManipulation
from Estimators.XGBoost import XGBoost
from Logging import Logging
from Transformers.FilterDepartment import FilterDepartment
from Transformers.ImputePrice import ImputePrice
from Transformers.LagFeature import LagFeature
from Transformers.LogTransformation import LogTransformation
from Transformers.MonthlyAggregate import MonthlyAggregate
from Transformers.NegativeSales import NegativeSales

import pandas as pd
import findspark

In [3]:
def initialize_session(name):
    """Create the notebook's SparkSession on a local[*] master.

    Args:
        name: Application name handed to the session builder.

    Returns:
        Whatever ``getOrCreate()`` returns for the configured builder.
    """
    # Driver bind address and UI port are fixed for this notebook.
    settings = (
        ("spark.driver.bindAddress", "localhost"),
        ("spark.ui.port", "4050"),
    )
    builder = SparkSession.builder.master("local[*]").appName(name)
    for key, value in settings:
        builder = builder.config(key, value)
    return builder.getOrCreate()

In [4]:
# Run findspark before the session is created.
# NOTE(review): pyspark is already imported in the imports cell, so this call
# presumably only matters for environment setup (e.g. SPARK_HOME) — confirm.
findspark.init()
spark = initialize_session("Assignment")
# NOTE(review): Spark 3.0 renamed this key to
# "spark.sql.execution.arrow.pyspark.enabled" and deprecated the old name —
# confirm which Spark version this notebook targets before changing it.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
# The log line below is emitted after the session object already exists.
log = Logging.getLogger()
log.info("Initializing session")

# Load the input frame through the project's DataManipulation helper; `data`
# is reused later for the train/test split.
data = DataManipulation()
df = data.get_data()

In [5]:
# Round-trip a three-row pandas frame through Spark.
# NOTE(review): looks like a smoke test of createDataFrame; none of the other
# visible cells read pandas_df / spark_df — confirm before removing.
pandas_df = pd.DataFrame({"Letters":["X", "Y", "Z"]})
spark_df = spark.createDataFrame(pandas_df)

# Register the Spark DataFrame in the session catalog as temp view 'spark_df'
spark_df.createOrReplaceTempView('spark_df')

# Print the rows to the cell output.
spark_df.show()

+-------+
|Letters|
+-------+
|      X|
|      Y|
|      Z|
+-------+



In [6]:
# Disabled alternative (presumably restricts the data to store "WI_1"):
# df = data.filter_store(df, "WI_1")

# Project transformer used as the first stage of the preprocessing pipeline.
# NOTE(review): presumably keeps only rows whose dept_id equals "FOODS_1";
# inputCol carries the value and filterCol the column name — confirm against
# Transformers/FilterDepartment.
filterDepartment = FilterDepartment(inputCol="FOODS_1", filterCol="dept_id")

In [7]:
# Stage objects for the preprocessing pipeline. The project transformers are
# defined in the Transformers package; their exact semantics are not visible
# in this notebook.
imputePrice = ImputePrice()
negativeSales = NegativeSales(column="sales")

# Keyed on store / department / year / month, with one aggregate function
# name per measure column.
aggregate = MonthlyAggregate(
    columns=["store_id", "dept_id", "year", "month"],
    expressions={
        "sales": "sum",
        "sell_price": "avg",
        "event_name_1": "count",
        "event_name_2": "count",
        "snap_WI": "sum",
    },
)
logTransformation = LogTransformation(inputCols=["sales"])

# Lag offsets 1 through 11 of the "sales" target, partitioned by store and
# department and ordered by (year, month).
lagFeatures = LagFeature(
    partitionBy=["store_id", "dept_id"],
    orderBy=["year", "month"],
    lags=list(range(1, 12)),
    target="sales",
)

# Index store_id and year into the *_index columns used as model features.
storeIndexer = StringIndexer(inputCol="store_id", outputCol="store_id_index")
yearIndexer = StringIndexer(inputCol="year", outputCol="year_index")

In [8]:
log.info("Initiating pipeline")

# Fit the preprocessing stages on df and apply them to the same frame.
# Stages execute in the order listed.
transformed = (
    Pipeline(
        stages=[
            filterDepartment,
            imputePrice,
            negativeSales,
            aggregate,
            logTransformation,
            lagFeatures,
            storeIndexer,
            yearIndexer,
        ]
    )
    .fit(df)
    .transform(df)
)

In [9]:
# Split the pipeline output into a training frame and an evaluation frame.
# NOTE(review): the split rule lives in DataManipulation.train_test_split and
# is not visible here — confirm it is time-ordered so lag features cannot leak.
train, test = data.train_test_split(transformed)

# XGBoost

In [10]:
# Feature columns for the model: the two indexed columns, month, the
# aggregated event/price columns, and lag_1 .. lag_11.
# NOTE(review): assumes LagFeature names its outputs "lag_<n>" — confirm.
inputColumns = [
    "store_id_index",
    "month",
    "year_index",
    "event_name_1",
    "event_name_2",
    "sell_price",
] + ["lag_{}".format(lag) for lag in range(1, 12)]

# Fit the project's XGBoost estimator on the training split, with "sales" as
# the label column.
xgbModel = XGBoost(inputCols=inputColumns, labelCol="sales").fit(train)

Training XGBoost
score:                                                                                                                 
2.9916431909900405                                                                                                     
score:                                                                                                                 
6.3183759718282895                                                                                                     
score:                                                                                                                 
2.9946315615338612                                                                                                     
score:                                                                                                                 
3.0001932511570852                                                                                                     
score:                 

In [12]:
# Score the evaluation split with the fitted model.
pred = xgbModel.transform(test)
# DataFrame.show() prints the rows itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
pred.show(10)

+----------+------------------+
|prediction|            actual|
+----------+------------------+
| 3.8714511|3.7662640906519957|
|  3.847605| 3.991447598003803|
| 3.9086847|3.8987251815894934|
| 3.9006307| 4.073571728304925|
| 3.9226177| 3.770557474850995|
|  3.888113| 3.922154325231059|
| 3.7979376|   3.9703933720796|
| 3.8614612| 4.006380458549693|
| 3.8469942|  4.14674801363064|
| 3.9204493|4.1285285037974395|
+----------+------------------+
only showing top 10 rows

None


In [17]:
# Persist the fitted model under the relative path "XGBoostBestModel".
# NOTE(review): whether save() overwrites or fails on an existing path depends
# on the project's XGBoost model class — confirm before re-running this cell.
xgbModel.save("XGBoostBestModel")

# Random Forest