# Specify and set the filepath structure for the project data to work with

## 1. Import libraries

In [1]:
# Import necessary libraries
import os
import requests
import gzip
import shutil
import pandas as pd
from pandas import json_normalize
import json

# Optional: to suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")


## 2. Introductory examples for the os library

In [None]:
# Demonstrate os.getcwd(), os.path.join() and os.path.exists()

# "<working directory>/../data"; os.path.exists() is True when the folder or file is present
path = os.path.join(os.getcwd(), "..", "data")
found = os.path.exists(path)
print(f"The path {path} exists:\n{found}\n")

# extend the same path down to one of the archives inside "_original"
path = os.path.join(path, '_original', 'All_Beauty.jsonl.gz')
found = os.path.exists(path)
print(f"The path {path} exists:\n{found}")

In [None]:
# Demonstrate os.pardir and os.path.abspath()

# relative form: the working directory followed by the parent-directory marker
parent_relative = os.path.join(os.getcwd(), os.pardir)
path = parent_relative
print(path)

# absolute form: the same location with the marker resolved away
path = os.path.abspath(parent_relative)
print(path)

## 3. Create project folder structure and download data

In [4]:
'''___FUNCTIONS___
=================='''

def VerifyOrMakePath(WorkingDirectory, Subfolder):
    '''
    Checks if a subfolder exists below a directory and creates it if it does not.
    Missing intermediate folders (including WorkingDirectory itself) are created too.
    Requires the os library to work.

    Parameters
    ----------
    WorkingDirectory : str (e.g.: os.getcwd())
        Base directory. It is used as given and never split or modified.
    Subfolder : str OR List of str (e.g.: ['foo','bar'])
        A single folder name, or the parts of a nested folder. A comma
        separated string ('foo,bar') is treated like ['foo','bar'].
        Whitespace around each part is dropped. Empty input creates nothing.

    Returns
    -------
    None
    '''
    if not Subfolder:  # no subfolder to be created
        return

    # Normalise both accepted forms to one flat list of path parts.
    raw_items = Subfolder if isinstance(Subfolder, (list, tuple)) else [Subfolder]
    parts = []
    for item in raw_items:
        # commas inside a part keep acting as separators (historic behaviour)
        parts.extend(piece.strip() for piece in str(item).split(","))
    parts = [piece for piece in parts if piece]
    if not parts:  # only whitespace / separators were given
        return

    # Only the subfolder parts are joined; the base directory is never passed
    # through a comma split, so a comma in WorkingDirectory is harmless.
    full_path = os.path.join(WorkingDirectory, *parts)

    if not os.path.exists(full_path):
        # makedirs also creates missing parents; exist_ok covers the case that
        # the folder appears between the check and the creation.
        os.makedirs(full_path, exist_ok=True)


def ExtractAndMoveUnzipped(Source, Destination, FileList="ALL", FileEnding=".gz" ):
    '''
    Extract gz files from a source folder and place the extracted files in a destination folder.
    Can be specified by FileList and FileEnding. Default will extract all gz files.
    The compressed originals stay in Source.
    Requires the os, shutil and gzip libraries to work (all part of the Python standard library).

    Parameters
    ----------
    Source : str
        Folder containing the compressed files.
    Destination : str
        Folder receiving the extracted files. It is created if missing.
    FileList : list of strings OR str
        File names (relative to Source) to consider. "ALL" uses every entry
        of Source; any other single string is treated as one file name.
    FileEnding : str
        Only file names ending with this text are extracted.

    Returns
    -------
    None
    '''
    if isinstance(FileList, str):
        # "ALL" -> whole folder; any other string -> exactly that one file
        FileList = os.listdir(Source) if FileList == "ALL" else [FileList]

    os.makedirs(Destination, exist_ok=True)

    for filename in FileList:
        archive = os.path.join(Source, filename)

        if not os.path.isfile(archive):
            print(f"Please verify the {filename} file exists in\n{Source}")
            continue
        if not filename.endswith(FileEnding):
            continue

        # Drop only the ".gz" compression suffix, so "x.jsonl.gz" becomes "x.jsonl"
        # even when FileEnding is a longer filter such as "jsonl.gz".
        if filename.lower().endswith(".gz"):
            extracted_name = filename[:-3]
        else:
            extracted_name = filename + ".extracted"
        extracted = os.path.join(Destination, extracted_name)

        try:
            # decompress straight into the destination; no separate move step needed
            with gzip.open(archive, 'rb') as f_in, open(extracted, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        except (OSError, EOFError) as error:
            # not a valid or complete gzip file (or not writable): remove the partial output
            if os.path.exists(extracted):
                os.remove(extracted)
            print(f"File {filename} could not be unzipped: {error}")
            continue

        print(f"File {filename} has been unzipped and the extracted file was moved to {Destination}")


def DownloadDataFiles(dctFilenameUrl, DownloadFolder, Timeout=(10, 300)):
    '''Download data files from the internet provided in a dictionary to a specified folder.
    The dictionary key is the filename and the value contains the download url.
    Requires the os and requests libraries to work.

    Each file is streamed to "<filename>.part" and renamed once the transfer
    finished, so an interrupted download never leaves a file that looks complete.
    A failing download is reported and the remaining files are still processed.

    Parameters
    ----------
    dctFilenameUrl : dict (e.g.: {"key1FileName": 'value1Url', "key2FileName": 'value2Url'})
    DownloadFolder : str (e.g.: os.getcwd())
    Timeout : float OR tuple (connect seconds, read seconds)
        Passed to requests so that a stalled connection cannot block forever.

    Returns
    -------
    None
    '''
    for filename, url in dctFilenameUrl.items():
        target = os.path.join(DownloadFolder, filename)
        partial = target + ".part"
        failure_message = f" Failed to download {filename}. \nPlease download manually to {DownloadFolder}."

        try:
            # stream=True avoids holding a whole (possibly very large) file in memory
            with requests.get(url, stream=True, timeout=Timeout) as response:
                if response.status_code != 200:
                    print(failure_message)
                    continue
                with open(partial, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        file.write(chunk)
            os.replace(partial, target)
        except requests.RequestException:
            # network problem: drop the incomplete file and go on with the next one
            if os.path.exists(partial):
                os.remove(partial)
            print(failure_message)
            continue
        except OSError:
            # file-system problem: clean up, then surface the error as before
            if os.path.exists(partial):
                os.remove(partial)
            raise

        print(f" File {filename} dowloaded sucessfully")



'''___MAIN___
================='''

# Locate the "data" folder next to the notebook folder: take the working directory
# ("./notebooks"), step up to its parent as an absolute path, then append "data".
path = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'data')

# Folder layout to create below "data".
# A nested folder is written as a list holding the parts of its path.
lstSubfolders = [
    "_original", "json_files", "csv_transformed", "data_clean", "json_normalized",
    "embeddings_output", "embeddings_dim_reduction", "text_analysis",
    ["text_analysis", "user_vectors"], ["text_analysis", "product_vectors"],
    "cos_similarity",
]

# Make sure every folder of the layout is present
for ListItem in lstSubfolders:
    VerifyOrMakePath(path, ListItem)

# Downloads land here and are unpacked from here afterwards
source = os.path.join(path, '_original')

# File name -> download url
# NOTE(review): the drive.google.com "/file/d/<id>" entries look like viewer links
# rather than direct file downloads -- confirm that they deliver the actual data.
dctDownloads = {
    "All_Beauty.jsonl.gz":
        'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/All_Beauty.jsonl.gz',
    "meta_All_Beauty.jsonl.gz":
        'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_All_Beauty.jsonl.gz',
    "df_user_embeddings_BERT.csv.gz":
        'https://drive.google.com/file/d/1a_HQWlphn-wqm6MD5IDPTuzY_-Q7bw6A',
    "df_user_embeddings_BERT_merged.csv.gz":
        'https://drive.google.com/file/d/1szxv2o2GR-wIIPpgZowsHvE4hGtEJzQ2',
    "merged_user_meta_df.csv":
        'https://drive.google.com/file/d/1KUm8JMC8L5VrLVKNnJ7i9jDTQtXH85cQ',
}

# Fetch everything into "./data/_original" (skipped when that folder is missing)
if os.path.exists(source):
    DownloadDataFiles(dctDownloads, source)

# Unpack the archives: jsonl archives go to "json_files", csv archives to
# "embeddings_output". A target folder that does not exist is skipped.
for target_folder, ending in (('json_files', 'jsonl.gz'), ('embeddings_output', 'csv.gz')):
    candidate = os.path.join(path, target_folder)
    if os.path.exists(candidate):
        destination = candidate
        ExtractAndMoveUnzipped(source, destination, FileEnding=ending)

KeyboardInterrupt: 

## 4. Create a flattened CSV from the downloaded JSON files

Convert the nested JSON datasets of the items' descriptions and the users' ratings into flattened CSV files

In [None]:
# Resolve the absolute path of the parent of the working directory and append
# "data": the data folder within the local github project on your machine
path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
path = os.path.join(path, 'data')

# Inputs and outputs are all derived from `path`, so the cell works on every
# operating system and does not depend on Windows-style relative paths.
json_file_user = os.path.join(path, "json_files", "All_Beauty.jsonl")
csv_user = os.path.join(path, "json_normalized", "normalized_user.csv")
json_file_meta = os.path.join(path, "json_files", "meta_All_Beauty.jsonl")
csv_meta = os.path.join(path, "csv_transformed", "meta.csv")

# Both conversions are checked independently, so one run can produce both files.
all_csv_present = True

if not os.path.exists(csv_user):
    all_csv_present = False
    if os.path.exists(json_file_user):
        # Flatten nested json file of 'user' data: one JSON document per line
        nested_data = []
        with open(json_file_user, 'r', encoding='utf-8') as file:
            for line in file:
                if line.strip():  # tolerate blank lines
                    nested_data.append(json.loads(line))

        df_user = json_normalize(nested_data)
        os.makedirs(os.path.dirname(csv_user), exist_ok=True)
        df_user.to_csv(csv_user, index=False)

        print(f"Nested JSON file has been flattened and saved as \n'{csv_user}'.")
    else:
        print(f"Please verify the file exists:\n{json_file_user}")

if not os.path.exists(csv_meta):
    all_csv_present = False
    if os.path.exists(json_file_meta):
        df_meta = pd.read_json(json_file_meta, lines=True)
        # Save the DataFrame as a CSV file
        os.makedirs(os.path.dirname(csv_meta), exist_ok=True)
        df_meta.to_csv(csv_meta, index=False)

        print(f"JSON file has been converted to CSV and saved as '{csv_meta}'.")
    else:
        print(f"Please verify the file exists:\n{json_file_meta}")

if all_csv_present:
    print("Most likely the corresponding folder structure and csv file already exists.\nIn case it does not please repeat the previous steps or adapt the folder structure manually")