<p style="text-align:center">
        <img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/assets/logos/SN_web_lightmode.png" width="300" alt="Skills Network Logo">
</p>


### Analyse search terms on the e-commerce web server


##### In this assignment you will download the search term data set for the e-commerce web server and run analytic queries on it.


In [None]:
# Install spark

In [1]:
!pip install pyspark
!pip install findspark

import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression



In [None]:
# Start session

In [2]:
# Create (or reuse) the Spark session. getOrCreate() keeps this cell safe to re-run:
# a bare SparkContext() raises if a context is already active in the kernel.
# Building the session first also lets the application name apply to the context it creates.
spark = SparkSession.builder.appName("Ecommerce ML predictions").getOrCreate()

# Expose the session's underlying Spark context under the name `sc`
sc = spark.sparkContext

25/07/20 07:33:29 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/20 07:33:31 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Download The search term dataset from the below url
!wget 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv'

--2025-07-20 07:33:58--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104, 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 233457 (228K) [text/csv]
Saving to: ‘searchterms.csv’


2025-07-20 07:33:58 (31.8 MB/s) - ‘searchterms.csv’ saved [233457/233457]



In [None]:
# Load the csv into a spark dataframe

In [6]:
# Read the downloaded CSV: the first line is the header and column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('searchterms.csv')
)

                                                                                

In [None]:
# Print the number of rows and columns

In [13]:
# Row count triggers a Spark job; the column count comes from the schema alone.
row_count = df.count()
column_count = len(df.columns)
print(f"The database has {row_count} rows and {column_count} columns")

The database has 10000 rows and 4 columns


In [None]:
# Print the top 5 rows

In [14]:
# Preview the first five rows of the search term data.
df.show(n=5)

+---+-----+----+--------------+
|day|month|year|    searchterm|
+---+-----+----+--------------+
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021| mobile latest|
| 12|   11|2021|   tablet wifi|
| 12|   11|2021|laptop 14 inch|
| 12|   11|2021|     mobile 5g|
+---+-----+----+--------------+
only showing top 5 rows



In [None]:
# Find out the data type of the column `searchterm`.

In [15]:
# Print each column's name, inferred type and nullability; the `searchterm` row answers the question.
df.printSchema()

root
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- searchterm: string (nullable = true)



In [None]:
# How many times was the term `gaming laptop` searched?

In [20]:
# Count the rows whose search term is exactly "gaming laptop".
is_gaming_laptop = df["searchterm"] == "gaming laptop"
df.where(is_gaming_laptop).count()

499

In [None]:
# Print the top 5 most frequently used search terms.

In [24]:
# Tally how often each search term occurs, rank by frequency (highest first), show the top five.
term_counts = df.groupby('searchterm').count()
ranked_terms = term_counts.sort('count', ascending=False)
ranked_terms.show(5)



+-------------+-----+
|   searchterm|count|
+-------------+-----+
|mobile 6 inch| 2312|
|    mobile 5g| 2301|
|mobile latest| 1327|
|       laptop|  935|
|  tablet wifi|  896|
+-------------+-----+
only showing top 5 rows



                                                                                

In [26]:
# The pretrained sales forecasting model is available at  the below url
!wget 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz'

--2025-07-20 07:45:47--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104, 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1490 (1.5K) [application/x-tar]
Saving to: ‘model.tar.gz’


2025-07-20 07:45:47 (13.0 MB/s) - ‘model.tar.gz’ saved [1490/1490]



In [32]:
!tar -tvf model.tar.gz

drwxr-sr-x jupyterlab/resources 0 2022-03-16 04:54 sales_prediction.model/
drwxr-sr-x jupyterlab/resources 0 2022-03-16 04:54 sales_prediction.model/metadata/
-rw-r--r-- jupyterlab/resources 512 2022-03-16 04:54 sales_prediction.model/metadata/part-00000
-rw-r--r-- jupyterlab/resources  12 2022-03-16 04:54 sales_prediction.model/metadata/.part-00000.crc
-rw-r--r-- jupyterlab/resources   0 2022-03-16 04:54 sales_prediction.model/metadata/_SUCCESS
-rw-r--r-- jupyterlab/resources   8 2022-03-16 04:54 sales_prediction.model/metadata/._SUCCESS.crc
drwxr-sr-x jupyterlab/resources   0 2022-03-16 04:54 sales_prediction.model/data/
-rw-r--r-- jupyterlab/resources 1909 2022-03-16 04:54 sales_prediction.model/data/part-00000-1db9fe2f-4d93-4b1f-966b-3b09e72d664e-c000.snappy.parquet
-rw-r--r-- jupyterlab/resources    0 2022-03-16 04:54 sales_prediction.model/data/_SUCCESS
-rw-r--r-- jupyterlab/resources   24 2022-03-16 04:54 sales_prediction.model/data/.part-00000-1db9fe2f-4d93-4b1f-966b-3b09e72d66

In [None]:
# Load the sales forecast model.
# Take a screenshot of the code and name it loadmodel.jpg.

In [33]:
# Load the saved linear regression model from disk.
# Requires the sales_prediction.model directory (the top-level folder inside model.tar.gz)
# to be present in the current working directory.
from pyspark.ml.regression import LinearRegressionModel
model = LinearRegressionModel.load('sales_prediction.model')

In [None]:
# Using the sales forecast model, predict the sales for the year of 2023.

In [36]:
# Forecast sales for a single target year with the pretrained regression model.
FORECAST_YEAR = 2023

# Pack the numeric `year` column into the vector column "features" that is passed to the model.
assembler = VectorAssembler(inputCols=["year"], outputCol="features")

# Use a dedicated name for the one-row input frame so the search term DataFrame `df`
# loaded earlier is not overwritten (re-running earlier cells keeps working).
# No placeholder label column is needed: transform() only reads the features column.
forecast_input_df = spark.createDataFrame([(FORECAST_YEAR,)], ["year"])

transformed_df = assembler.transform(forecast_input_df).select('features')

# transform() appends a "prediction" column holding the model output.
predictions = model.transform(transformed_df)

predictions.select('features', 'prediction').show()

+--------+------------------+
|features|        prediction|
+--------+------------------+
|[2023.0]|175.16564294006457|
+--------+------------------+

