# Specify and set the filepath structure for the project data to work with

## 1. Import libraries

In [1]:
# Import necessary libraries
import os
import requests
import gzip
import shutil
import pandas as pd
from pandas import json_normalize
import json

# Optional: to suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")


## 2. Introductory examples for the os library

In [None]:
# Demo of os.getcwd(), os.path.join() and os.path.exists()

# Step 1: the "data" folder that sits next to the current working directory
working_dir = os.getcwd()
path = os.path.join(working_dir, "..", 'data')
folder_found = os.path.exists(path)  # True when the folder or file is present
print(f"The path {path} exists:\n{folder_found}\n")

# Step 2: extend the same path down to one of the downloaded archives
path = os.path.join(path, '_original', 'All_Beauty.jsonl.gz')
archive_found = os.path.exists(path)
print(f"The path {path} exists:\n{archive_found}")

In [None]:
# Demo of os.pardir and os.path.abspath()

# Relative form: the working directory followed by the parent-directory marker
relative_parent = os.path.join(os.getcwd(), os.pardir)
path = relative_parent
print(path)

# Absolute form: the same location after os.path.abspath() resolved the marker
path = os.path.abspath(relative_parent)
print(path)

## 3. Create project folder structure and download data

In [14]:
# Get the current notebook working directory ("./notebooks") so that the "./../data" folder and its subfolders can be created:
path = os.path.join(os.getcwd(), os.pardir) # relative path of the parent folder of the working directory
path = os.path.abspath(path) # absolute path of the parent directory for the data folder
path = os.path.join(path, 'data') # path variable for "data" folder within the local github project on your machine

# Subfolders of "./data" that this project expects to exist.
_SUBFOLDERS = (
    '_original',
    'json_files',
    'csv_transformed',
    'data_clean',
    'json_normalized',
    'embeddings_output',
    'embeddings_dim_reduction',
    os.path.join('text_analysis', 'user_vectors'),
    os.path.join('text_analysis', 'product_vectors'),
    'cos_similarity',
)

# Archives to fetch into "./data/_original": target file name -> URL.
# NOTE(review): the last entry is a Google Drive "file/d/<id>" link, which
# presumably serves an HTML page rather than the archive itself. The gzip
# check in _download_gz() rejects such a response instead of saving it under a
# ".gz" name - confirm the direct download link for this file.
_DOWNLOADS = {
    'All_Beauty.jsonl.gz': 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/All_Beauty.jsonl.gz',
    'meta_All_Beauty.jsonl.gz': 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_All_Beauty.jsonl.gz',
    'df_user_embeddings_BERT.csv.gz': 'https://drive.google.com/file/d/1a_HQWlphn-wqm6MD5IDPTuzY_-Q7bw6A',
}

# Every gzip stream starts with these two bytes.
_GZIP_MAGIC = b'\x1f\x8b'

_MANUAL_EXTRACT_HINT = (
    "Please extract the .gz files in './data/_original' and "
    "\nmove the JSON files to './data/json_files' and "
    "\nmove the embedding CSV files to './data/embeddings_output' "
    "\nfor this notebook to work properly!"
)


def ExtractAndMoveUnzipped(source, destination, suffixes=None):
    """Decompress the ``.gz`` files found in ``source`` into ``destination``.

    Each ``name.ext.gz`` in ``source`` is written to ``destination/name.ext``.
    The archive itself stays in ``source``. A file that already exists in
    ``destination`` is left untouched, so the function can be re-run safely.

    Args:
        source: Folder that contains the ``.gz`` archives.
        destination: Existing folder that receives the decompressed files.
        suffixes: Optional extension (or iterable of extensions) the
            decompressed file name must end with, e.g. ``('.jsonl',)``.
            ``None`` (the default) extracts every ``.gz`` file.

    Returns:
        None. An archive that cannot be decompressed is reported with
        ``print`` and skipped; no partial output file is left behind.
    """
    import zlib  # local import: only needed to recognise corrupt deflate data

    if isinstance(suffixes, str):
        suffixes = (suffixes,)
    elif suffixes is not None:
        suffixes = tuple(suffixes)

    for filename in sorted(os.listdir(source)):
        if not filename.endswith('.gz'):
            continue
        unzipped_name = filename[:-3]  # strip the ".gz" extension
        if suffixes is not None and not unzipped_name.endswith(suffixes):
            continue

        target = os.path.join(destination, unzipped_name)
        if os.path.exists(target):
            continue  # already extracted by an earlier run

        # Decompress into a temporary name first so that an interrupted or
        # failed extraction never looks like a finished file.
        partial = target + '.part'
        try:
            with gzip.open(os.path.join(source, filename), 'rb') as f_in:
                with open(partial, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
        except (OSError, EOFError, zlib.error) as err:
            if os.path.exists(partial):
                os.remove(partial)
            print(f"Could not extract '{filename}': {err}")
            print(_MANUAL_EXTRACT_HINT)
            continue
        os.replace(partial, target)


def _download_gz(url, target, timeout=60):
    """Download ``url`` to ``target``; return True only if a gzip file was saved."""
    filename = os.path.basename(target)
    failure = f" Failed to download '{filename}'. \nPlease download manually to './data/_original' folder."
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print(f"{failure}\n({err})")
        return False

    content = response.content
    # A 200 answer is not enough: some hosts answer with an HTML page, which
    # must not be stored as if it were the archive.
    if response.status_code != 200 or not content.startswith(_GZIP_MAGIC):
        print(failure)
        return False

    with open(target, 'wb') as file:
        file.write(content)
    print(f" File '{filename}' downloaded successfully")
    return True


def _create_project_structure(data_path):
    """Create the data folders, download missing archives and extract them."""
    # Every folder is handled on every run; exist_ok makes re-runs harmless.
    for subfolder in _SUBFOLDERS:
        os.makedirs(os.path.join(data_path, subfolder), exist_ok=True)

    source = os.path.join(data_path, '_original')
    for filename, url in _DOWNLOADS.items():
        target = os.path.join(source, filename)
        if not os.path.exists(target):
            _download_gz(url, target)

    # JSON-lines archives go to "json_files", the embedding CSV to "embeddings_output".
    ExtractAndMoveUnzipped(source, os.path.join(data_path, 'json_files'), suffixes=('.jsonl',))
    ExtractAndMoveUnzipped(source, os.path.join(data_path, 'embeddings_output'), suffixes=('.csv',))


# In a notebook or a script __name__ is "__main__", so the setup runs as usual.
# The guard only keeps a plain import of this code from touching disk or network.
if __name__ == "__main__":
    _create_project_structure(path)

## 4. Create a flattened CSV from the downloaded JSON files

Convert the nested JSON datasets of the item descriptions and user ratings into flattened CSV files.

In [None]:
def flatten_jsonl_to_csv(json_file, csv_file):
    """Flatten a JSON-lines file of nested records and save it as CSV.

    Args:
        json_file: Path of the input file with one JSON document per line.
            Blank lines are ignored.
        csv_file: Path of the CSV file to write; its folder is created if needed.

    Returns:
        The flattened pandas DataFrame that was written to ``csv_file``.
    """
    # The encoding is given explicitly so reading does not depend on the
    # platform default.
    with open(json_file, 'r', encoding='utf-8') as file:
        nested_data = [json.loads(line) for line in file if line.strip()]

    df = json_normalize(nested_data)
    folder = os.path.dirname(csv_file)
    if folder:
        os.makedirs(folder, exist_ok=True)
    df.to_csv(csv_file, index=False)
    return df


def convert_jsonl_to_csv(json_file, csv_file):
    """Load a JSON-lines file with ``pd.read_json`` and save it as CSV.

    Args:
        json_file: Path of the input file with one JSON document per line.
        csv_file: Path of the CSV file to write; its folder is created if needed.

    Returns:
        The pandas DataFrame that was written to ``csv_file``.
    """
    df = pd.read_json(json_file, lines=True)
    folder = os.path.dirname(csv_file)
    if folder:
        os.makedirs(folder, exist_ok=True)
    df.to_csv(csv_file, index=False)
    return df


# get the relative path of the parent folder of the working directory
# and return the absolute path of the parent directory for the data folder
path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
path = os.path.join(path, 'data') # path variable for "data" folder within the local github project on your machine

# All input and output locations are derived from `path`, so the cell works on
# every operating system and independently of relative-path quirks.
json_file_user = os.path.join(path, "json_files", "All_Beauty.jsonl")
csv_user = os.path.join(path, 'json_normalized', 'normalized_user.csv')
json_file_meta = os.path.join(path, "json_files", "meta_All_Beauty.jsonl")
csv_meta = os.path.join(path, 'csv_transformed', 'meta.csv')

nothing_to_do = True

# The two conversions are independent of each other, so both are checked on
# every run instead of only the first missing one.
if not os.path.exists(csv_user):
    nothing_to_do = False
    if os.path.exists(json_file_user):
        # Flatten nested json file of 'user' data
        df_user = flatten_jsonl_to_csv(json_file_user, csv_user)
        print(f"Nested JSON file has been flattened and saved as \n'{csv_user}'.")
    else:
        print(f"Cannot create '{csv_user}':\nthe input file '{json_file_user}' is missing.")

if not os.path.exists(csv_meta):
    nothing_to_do = False
    if os.path.exists(json_file_meta):
        df_meta = convert_jsonl_to_csv(json_file_meta, csv_meta)
        print(f"JSON file has been converted to CSV and saved as '{csv_meta}'.")
    else:
        print(f"Cannot create '{csv_meta}':\nthe input file '{json_file_meta}' is missing.")

if nothing_to_do:
    print("Most likely the corresponding folder structure and csv file already exists.\nIn case it does not please repeat the previous steps or adapt the folder structure manually")