### Analyse search terms on the e-commerce web server


##### Load the search term data set for the e-commerce web server and run analytic queries on it.


In [None]:
# Import spark

In [1]:
from pyspark.sql import SparkSession

In [None]:
# Start session

In [2]:
# Reuse an already-running session if there is one; otherwise start one under this app name.
spark = (
    SparkSession.builder
    .appName("Analyse search terms on the e-commerce web server")
    .getOrCreate()
)

In [None]:
# Load the csv into a spark dataframe

In [3]:
# Treat the first CSV line as column names and let Spark infer each column's type.
df = spark.read.options(header=True, inferSchema=True).csv("searchterms.csv")

In [None]:
# The number of rows and columns

In [7]:
# Count the rows first, then report the total followed by the column names.
row_count = df.count()
print(f"Number of rows: {row_count}, columns below:")
print(df.columns)

Number of rows: 10000, columns below:
['day', 'month', 'year', 'searchterm']


In [None]:
# Top 5 rows

In [8]:
# Preview the first five rows of the search-term data.
df.show(n=5)

+---+-----+----+--------------+
|day|month|year|    searchterm|
+---+-----+----+--------------+
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021| mobile latest|
| 12|   11|2021|   tablet wifi|
| 12|   11|2021|laptop 14 inch|
| 12|   11|2021|     mobile 5g|
+---+-----+----+--------------+
only showing top 5 rows



In [None]:
# The datatype of the column searchterm

In [9]:
# Print the schema of every column; the `searchterm` entry shows that column's datatype.
df.printSchema()

root
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- searchterm: string (nullable = true)



In [None]:
# How many times was the term `gaming laptop` searched?

In [12]:
from pyspark.sql.functions import lower

# Case-insensitive substring match: lower-case each term, then count rows containing the phrase.
term_lowered = lower(df["searchterm"])
df.where(term_lowered.contains("gaming laptop")).count()

499

In [None]:
# Top 5 most frequently used search terms

In [18]:
from pyspark.sql.functions import col

# Tally rows per search term, rank by that tally (largest first), and display the top five.
term_counts = df.groupBy("searchterm").count()
term_counts.orderBy(col("count").desc()).show(5)

+-------------+-----+
|   searchterm|count|
+-------------+-----+
|mobile 6 inch| 2312|
|    mobile 5g| 2301|
|mobile latest| 1327|
|       laptop|  935|
|  tablet wifi|  896|
+-------------+-----+
only showing top 5 rows



In [None]:
# The pretrained sales forecasting model

In [21]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

# A saved, already-fitted model is restored through the model class
# (LinearRegressionModel), which therefore has to be imported here;
# importing only the LinearRegression estimator leaves this name undefined.
model = LinearRegressionModel.load("/models/sales_prediction.model")

In [None]:
# Using the sales forecast model, predict the sales for the year of 2023.

In [24]:
def predict(year):
    """Show the sales forecast for a single year.

    Builds a one-row DataFrame holding ``year``, assembles it into the
    ``features`` vector column, runs the notebook-level ``model`` on it and
    displays the resulting ``prediction`` column with ``show()``.

    Args:
        year: The year to forecast sales for (e.g. ``2023``).

    Returns:
        None. The prediction is printed as a one-row table.
    """
    assembler = VectorAssembler(inputCols=["year"], outputCol="features")
    # Only `year` feeds the feature vector, so no placeholder label column is built.
    year_df = spark.createDataFrame([(year,)], ["year"])
    # Descriptive names instead of `_` / `__`, which IPython uses for output history.
    features_df = assembler.transform(year_df).select("features", "year")
    predictions = model.transform(features_df)
    predictions.select("prediction").show()

In [23]:
# Forecast sales for 2023 with the loaded model.
predict(year=2023)

+------------------+
|        prediction|
+------------------+
|175.16564294006457|
+------------------+

