In [12]:
import requests
import pandas as pd
import pyarrow.parquet as pq

# Fungsi untuk mendownload baris secara langsung
def download_rows(offset, length):
    url = f"https://datasets-server.huggingface.co/rows?dataset=garythung%2Ftrashnet&config=default&split=train&offset={offset}&length={length}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        # Cetak struktur respons JSON untuk memeriksa isinya
        print("Response JSON Structure:", data)
        # Periksa kunci 'rows' di dalam data yang diterima
        if 'rows' in data:
            return data['rows']
        else:
            raise KeyError("'rows' key not found in the response JSON")
    else:
        response.raise_for_status()

# Fungsi untuk mendapatkan daftar split dari dataset
def get_splits():
    url = "https://datasets-server.huggingface.co/splits?dataset=garythung%2Ftrashnet"
    response = requests.get(url)
    if response.status_code == 200:
        splits = response.json()
        return splits
    else:
        response.raise_for_status()

# Fungsi untuk mendapatkan daftar file Parquet dari dataset
def get_parquet_files():
    url = "https://huggingface.co/api/datasets/garythung/trashnet/parquet/default/train"
    response = requests.get(url)
    if response.status_code == 200:
        parquet_files = response.json()
        # Pastikan bahwa kita mengakses list dengan indeks numerik
        return parquet_files
    else:
        response.raise_for_status()

# Fungsi untuk membaca data dari file Parquet
def read_parquet_file(file_url):
    df = pd.read_parquet(file_url)
    return df

# Contoh Penggunaan
if __name__ == "__main__":
    try:
        # Mendapatkan daftar file Parquet
        parquet_files = get_parquet_files()
        print("Parquet files:", parquet_files)
        
        # Baca data dari file Parquet pertama
        if parquet_files:
            first_file_url = parquet_files[0]['url']  # Akses URL file Parquet pertama dengan benar
            print(f"Reading data from {first_file_url}")
            df = read_parquet_file(first_file_url)
            print("Dataframe shape:", df.shape)
            print(df.head())
        
        # Mendapatkan daftar split
        splits = get_splits()
        print("Splits available:", splits)
        
        # Mendownload 100 baris data dari dataset sebagai contoh
        # Jika ingin mendownload menggunakan API rows (tidak direkomendasikan jika ukuran terlalu besar)
        # rows = download_rows(0, 100)
        # df_rows = pd.DataFrame(rows)
        # print("Dataframe shape from rows API:", df_rows.shape)
    
    except KeyError as e:
        print(f"Key error: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")


Parquet files: ['https://huggingface.co/api/datasets/garythung/trashnet/parquet/default/train/0.parquet', 'https://huggingface.co/api/datasets/garythung/trashnet/parquet/default/train/1.parquet', 'https://huggingface.co/api/datasets/garythung/trashnet/parquet/default/train/2.parquet', 'https://huggingface.co/api/datasets/garythung/trashnet/parquet/default/train/3.parquet']
An error occurred: string indices must be integers, not 'str'
