A notebook that shows how to explore a dataset and extract handy subsets of data to work more effectively on your project.
Use this notebook as a blueprint for exploring a new dataset:
1. (Optional) - Convert the dataset into a proper data format (JSON)
2. Learn the structure of the dataset and learn how to navigate it
3. In case of huge datasets, extract just a proper amount of data to use during development
4. Learn how to explore a dataset using pandas DataFrames and matplotlib to extract and display meaningful insights rather than just reading the raw data.

### Objective
Master the data you are using for your project: be able to interpret, manipulate and extract subsets of data from the original dataset.

In [None]:
# Fix SSL certificate issues on macOS
import os
import certifi
# Point both environment variables at certifi's CA bundle path.
_ca_bundle = certifi.where()
for _env_var in ('SSL_CERT_FILE', 'REQUESTS_CA_BUNDLE'):
    os.environ[_env_var] = _ca_bundle

In [None]:
# Install matplotlib in the current kernel (using %pip ensures it installs in the correct environment)
%pip install matplotlib

In [None]:
# To better display graphs into notebooks use the following:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Import required packages
import json
import pandas as pd

In [None]:
# Read the first line of the "data/meta_Electronics.jsonl" file to check its content
# Open with an explicit encoding so the read does not depend on the platform's
# default locale encoding (the files written later in this notebook use UTF-8).
with open('../../data/meta_Electronics.jsonl', 'r', encoding='utf-8') as f:
    # Only the first line is read and parsed; the rest of the file is left untouched.
    first_line = json.loads(f.readline())

In [None]:
# Display the object deserialized from the first line of the "data/meta_Electronics.jsonl" file.
# A bare expression on the last line of a cell is rendered as the cell output.
first_line

In [None]:
# Filter the items that have available date in year 2022 or later ('Date First Available' field inside the 'details' metadata)

def filter_out_data(data: dict, min_year: int = 2022) -> bool:
    """Tell whether an item must be excluded because it is older than ``min_year``.

    The year is taken from the last four characters of
    ``data['details']['Date First Available']``.
    NOTE(review): this assumes the value is a string ending with a four-digit
    year (e.g. "March 3, 2022") -- confirm against the dataset.

    Args:
        data: Deserialized item metadata (one line of the JSONL file).
        min_year: First year to keep; items with an earlier year are excluded.

    Returns:
        True when the item's year is earlier than ``min_year`` (exclude it),
        False otherwise (keep it).

    Raises:
        KeyError: 'details' or 'Date First Available' is missing.
        TypeError: the looked-up value cannot be indexed or sliced (e.g. None).
        ValueError: the last four characters are not an integer.
    """
    # Errors are deliberately not handled here: the loop that calls this
    # function uses them to route items without a usable date to a separate file.
    year = int(data['details']['Date First Available'][-4:])
    return year < min_year

In [None]:
# Create new JSONL files with filtered data, applying the filter function defined above.
# Items from 2022 or later go to "..._2022_2023.jsonl"; items whose date cannot be
# read go to "..._2022_2023_no_date.jsonl"; older items are dropped.
# The output files are opened in "w" mode so that re-running this cell rebuilds them
# from scratch instead of appending duplicate records to the previous run's output.
with open("../../data/meta_Electronics.jsonl", "r", encoding="utf-8") as fp, open(
    "../../data/meta_Electronics_2022_2023.jsonl", "w", encoding="utf-8"
) as fp_out, open(
    "../../data/meta_Electronics_2022_2023_no_date.jsonl", "w", encoding="utf-8"
) as fp_out_no_date:
    for i, line in enumerate(fp, start=1):
        data = json.loads(line.strip())
        try:
            exclude = filter_out_data(data)
        except (KeyError, TypeError, ValueError):
            # Missing or unparsable 'Date First Available': keep the item aside.
            # Only the lookup/parsing errors are caught, so Ctrl-C and write
            # failures are no longer silently swallowed.
            json.dump(data, fp_out_no_date)
            fp_out_no_date.write("\n")
        else:
            if not exclude:
                json.dump(data, fp_out)
                fp_out.write("\n")
        # Lightweight progress indicator; the files are flushed when the
        # `with` block closes them.
        if i % 10000 == 0:
            print(f"Processed {i} lines")

In [None]:
# Split the new datasets into categories: create one dataset containing items with main category and
# another dataset containing items without main category

# Function to filter items without main category
def filter_category(data: dict) -> bool:
    """Tell whether an item has no main category.

    Args:
        data: Deserialized item metadata (one line of the JSONL file).

    Returns:
        True when "main_category" is None or the key is absent, False otherwise.
    """
    # .get() treats a missing key like an explicit null instead of raising
    # KeyError, and `is None` is the idiomatic identity check for None.
    return data.get("main_category") is None

# Route every item of the 2022+ dataset to one of two files depending on whether
# it has a main category. Outputs are opened in "w" mode so that re-running the
# cell rebuilds them instead of appending duplicate records.
with open(
    "../../data/meta_Electronics_2022_2023.jsonl", "r", encoding="utf-8"
) as fp, open(
    "../../data/meta_Electronics_2022_2023_with_category.jsonl",
    "w",
    encoding="utf-8",
) as fp_out, open(
    "../../data/meta_Electronics_2022_2023_no_category.jsonl",
    "w",
    encoding="utf-8",
) as fp_out_no_category:
    for line in fp:
        data = json.loads(line.strip())
        # filter_category() returns True for items WITHOUT a main category.
        target = fp_out_no_category if filter_category(data) else fp_out
        json.dump(data, target)
        target.write("\n")

### Pandas

In [None]:
# Product distribution by main category
df = pd.read_json(
    "../../data/meta_Electronics_2022_2023_with_category.jsonl", lines=True
)
# A bare `df.head()` in the middle of a cell is evaluated but never rendered
# (only the last expression is shown), so display it explicitly.
# `display` is injected into the namespace by the IPython kernel.
display(df.head())
df["main_category"].value_counts().plot(kind="bar")

In [None]:
# Keep only the items with more than 100 ratings (strict comparison: an item
# with exactly 100 ratings is not kept), then plot how many remain per category.
has_many_ratings = df["rating_number"].gt(100)
df_ratings_100 = df.loc[has_many_ratings]
df_ratings_100["main_category"].value_counts().plot.bar()

In [None]:
# Histogram of the average rating over the 0-5 range, split into 50 bins
df_ratings_100["average_rating"].plot.hist(bins=50, range=(0, 5))

In [None]:
# Draw 1000 random items from 'df_ratings_100'. The seed (20) is an arbitrary
# choice; fixing it makes the sample reproducible across runs.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 20
df_sample_1000 = df_ratings_100.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)

In [None]:
# Histogram of the average rating (0-5, 50 bins) within the 1000-item sample
df_sample_1000["average_rating"].plot.hist(bins=50, range=(0, 5))

In [None]:
# Histogram of the price (0-500, 100 bins) within the 1000-item sample
df_sample_1000["price"].plot.hist(bins=100, range=(0, 500))

In [None]:
# Bar chart of the number of products per main category within the 1000-item sample
df_sample_1000["main_category"].value_counts().plot.bar()

In [None]:
# Persist both DataFrames (ratings-filtered dataset and its 1000-item sample)
# as JSONL: one JSON record per line.
for frame, out_path in (
    (
        df_ratings_100,
        "../../data/meta_Electronics_2022_2023_with_category_ratings_100.jsonl",
    ),
    (
        df_sample_1000,
        "../../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl",
    ),
):
    frame.to_json(out_path, orient="records", lines=True)

In [None]:
# Reload the two JSONL files written above into DataFrames
ratings_100_path = (
    "../../data/meta_Electronics_2022_2023_with_category_ratings_100.jsonl"
)
sample_1000_path = (
    "../../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl"
)
df_ratings_100 = pd.read_json(ratings_100_path, lines=True)
df_sample_1000 = pd.read_json(sample_1000_path, lines=True)

In [None]:
# Link the df_ratings_100 dataset back to the original Electronics.jsonl, which contains all
# user reviews, to create a new JSONL file with all details for the filtered items only.
# It uses parent_asin as the unique identifier to match items between the two datasets.
# Copy every review whose parent_asin belongs to df_ratings_100 into a new file.
# The output is opened in "w" mode so that re-running the cell rebuilds the file
# instead of appending the same reviews a second time.
with open("../../data/Electronics.jsonl", "r", encoding="utf-8") as fp, open(
    "../../data/Electronics_2022_2023_with_category_ratings_100.jsonl",
    "w",
    encoding="utf-8",
) as fp_out:
    # A set gives constant-time membership tests for each review line.
    id_list = set(df_ratings_100["parent_asin"].values)
    for i, line in enumerate(fp, start=1):
        data = json.loads(line.strip())
        if data["parent_asin"] in id_list:
            json.dump(data, fp_out)
            fp_out.write("\n")
        # Lightweight progress indicator; the file is flushed when the
        # `with` block closes it.
        if i % 100000 == 0:
            print(f"Processed {i} lines")

In [None]:
# Now enhance the sample of 1000 items dataset with all details from the Electronics_2022_2023_with_category_ratings_100.jsonl file
# we created in the previous step, since df_sample_1000 is a subset of df_ratings_100.
# Copy every review whose parent_asin belongs to df_sample_1000 into a new file.
# The output is opened in "w" mode so that re-running the cell rebuilds the file
# instead of appending the same reviews a second time.
with open(
    "../../data/Electronics_2022_2023_with_category_ratings_100.jsonl",
    "r",
    encoding="utf-8",
) as fp, open(
    "../../data/Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl",
    "w",
    encoding="utf-8",
) as fp_out:
    # A set gives constant-time membership tests for each review line.
    id_list = set(df_sample_1000["parent_asin"].values)
    for i, line in enumerate(fp, start=1):
        data = json.loads(line.strip())
        if data["parent_asin"] in id_list:
            json.dump(data, fp_out)
            fp_out.write("\n")
        # Lightweight progress indicator; the file is flushed when the
        # `with` block closes it.
        if i % 100000 == 0:
            print(f"Processed {i} lines")

In [None]:
# Load the reviews written for the 1000-item sample into a DataFrame
df_final = pd.read_json(
    "../../data/Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl",
    lines=True,
)

In [None]:
# Preview the first rows of df_final (rendered as the cell output)
df_final.head()