<a href="https://colab.research.google.com/github/nsikak-akpakpan/nakpakpan-repo/blob/master/h5_harvester.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prompt: as a python developer, create python code to download file at https://zenodo.org/records/10782800/files/mae_experiments.h5?download=1, extract mae_expreiments.h5 and create a dataframe.

import requests
import os
import h5py
import pandas as pd

def download_and_extract_h5(url, filename="mae_experiments.h5"):
    """
    Downloads an .h5 file from a given URL, extracts its contents, and creates a Pandas DataFrame.
    """

    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise an exception for bad status codes

        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded {filename} successfully.")


        with h5py.File(filename, 'r') as hf:
            # Access the data within the HDF5 file
            # Assuming the relevant data is stored in a dataset named 'data' within a group named 'experiment1'
            # Adapt this line as needed to correspond to the actual organization of your data
            try:
                data_group_key = list(hf.keys())[0]  # Get the first key (group)
                dataset_key = list(hf[data_group_key].keys())[0] # Get first key (dataset) within the group
                data = hf[data_group_key][dataset_key][:]  # Or any other path to your dataset

                df = pd.DataFrame(data)
                print("DataFrame created successfully.")
                return df
            except (KeyError, IndexError):
                print("Error: Could not find the expected dataset within the h5 file.")
                return None

    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return None
    except OSError as e:
        print(f"Error writing file: {e}")
        return None
    except Exception as e:
      print(f"An unexpected error occurred: {e}")
      return None

# Example usage
url = "https://zenodo.org/records/10782800/files/mae_experiments.h5?download=1"
df = download_and_extract_h5(url)

if df is not None:
  print(df.head()) # Display the first 5 rows



Downloaded mae_experiments.h5 successfully.
DataFrame created successfully.
                       0
0         b'L-Carnitine'
1   b'L-Acetylcarnitine'
2  b'Propionylcarnitine'
3    b'C4-acylcarnitine'
4    b'C5-acylcarnitine'


In [3]:
df

Unnamed: 0,0
0,b'L-Carnitine'
1,b'L-Acetylcarnitine'
2,b'Propionylcarnitine'
3,b'C4-acylcarnitine'
4,b'C5-acylcarnitine'
5,b'Hexanoylcarnitine'
6,b'Octanoylcarnitine'
7,b'Decanoylcarnitine'
8,b'Dodecanoylcarnitine'
9,b'Tetradecanoylcarnitine'


In [10]:
# prompt: as a python developer, create python code to download file at https://zenodo.org/records/10782800/files/mae_mae.rds?download=1, extract mae_mae.rds into dataframe.

#!pip install pyreadr
import requests
import pandas as pd
import io

def download_and_extract_rds(url):
    """
    Downloads an .rds file from a given URL and attempts to extract it into a Pandas DataFrame.
    """
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Use a BytesIO object to treat the response content as a file-like object
        try:
          # Attempt to read as RDS using pyreadr
          import pyreadr
          #result = pyreadr.read_r(os.PathLike())
          #result = pyreadr.read_r(response.content)
          result = pyreadr.read_r(io.BytesIO(response.content))
          # Assuming the .rds file contains a single data frame
          df = result[None]
          print("DataFrame created successfully using pyreadr.")
          return df
        except ImportError:
          print("Error: pyreadr is not installed. Please install it using 'pip install pyreadr'.")
          return None
        except Exception as e:
          print(f"Error reading .rds file using pyreadr: {e}")
          return None

    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

# Example usage
url = "https://zenodo.org/records/10782800/files/mae_mae.rds?download=1"
df = download_and_extract_rds(url)

if df is not None:
    print(df.head())
    print(df.info())


Error reading .rds file using pyreadr: expected str, bytes or os.PathLike object, not BytesIO


In [11]:
# prompt: fix "Error reading .rds file using pyreadr: expected str, bytes or os.PathLike object, not BytesIO" in the above code

import requests
import os
import h5py
import pandas as pd
import io
import pyreadr

# ... (rest of your existing code for download_and_extract_h5)

def download_and_extract_rds(url, filename="temp.rds"):
    """
    Downloads an .rds file from a given URL and attempts to extract it into a Pandas DataFrame.
    """
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()

        # Save the content to a temporary file
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        try:
            # Read the .rds file from the temporary file
            result = pyreadr.read_r(filename)
            df = result[None]
            print("DataFrame created successfully using pyreadr.")
            return df
        except ImportError:
            print("Error: pyreadr is not installed. Please install it using 'pip install pyreadr'.")
            return None
        except Exception as e:
            print(f"Error reading .rds file using pyreadr: {e}")
            return None
        finally:
            # Remove the temporary file
            os.remove(filename)

    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
# Example usage
url = "https://zenodo.org/records/10782800/files/mae_mae.rds?download=1"
df = download_and_extract_rds(url)

if df is not None:
    print(df.head())
    print(df.info())

Error reading .rds file using pyreadr: The file contains an unrecognized object


In [12]:
# prompt: as a python developer, create python code to download file at https://zenodo.org/records/10782800/files/mae_mae.rds?download=1 as temp.rds

import requests
import os

def download_file(url, filename):
    """Downloads a file from a given URL and saves it to the specified filename."""
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise an exception for bad status codes

        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded {filename} successfully.")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Example usage
url = "https://zenodo.org/records/10782800/files/mae_mae.rds?download=1"
filename = "temp.rds"
download_file(url, filename)


Downloaded temp.rds successfully.


In [16]:
# prompt: as a python developer, read and convert temp.rds into a dataframe

#!pip install pyreadr

import pandas as pd
import pyreadr

# Assuming 'temp.rds' is in the current working directory or provide the full path
result = pyreadr.read_r('/content/mae_mae.rds')

# Assuming the .rds file contains a single data frame named 'df'
df = result[None]  # Or result[list(result.keys())[0]] if you're unsure of the name
print(df.head())
print(df.info())


LibrdataError: The file contains an unrecognized object

In [18]:
import pyreadr

def extract_rds(file_path):
    """
    Extracts data from an RDS file and returns it as a Pandas DataFrame.

    Args:
        file_path (str): The path to the RDS file.

    Returns:
        pandas.DataFrame: The extracted data as a DataFrame, or None if an error occurs.
    """
    try:
        result = pyreadr.read_r(file_path)
        # RDS files typically contain a single object, accessible by the key None
        data = result[None]
        return data
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
         print(f"An error occurred: {e}")
         return None

# Example usage:
file_path = '/content/temp.rds'
df = extract_rds(file_path)

if df is not None:
    print(df.head())

An error occurred: The file contains an unrecognized object


In [17]:
!pip install rds2py

Collecting rds2py
  Downloading rds2py-0.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)
Collecting biocframe (from rds2py)
  Downloading BiocFrame-0.6.2-py3-none-any.whl.metadata (12 kB)
Collecting biocutils>=0.1.5 (from rds2py)
  Downloading biocutils-0.2.2-py3-none-any.whl.metadata (3.5 kB)
Collecting genomicranges>=0.4.9 (from rds2py)
  Downloading GenomicRanges-0.6.2-py3-none-any.whl.metadata (10 kB)
Collecting summarizedexperiment>=0.4.1 (from rds2py)
  Downloading SummarizedExperiment-0.5.3-py3-none-any.whl.metadata (4.9 kB)
Collecting singlecellexperiment>=0.4.1 (from rds2py)
  Downloading SingleCellExperiment-0.5.7-py3-none-any.whl.metadata (4.8 kB)
Collecting multiassayexperiment (from rds2py)
  Downloading MultiAssayExperiment-0.5.0-py3-none-any.whl.metadata (5.9 kB)
Collecting iranges>=0.4.2 (from genomicranges>=0.4.9->rds2py)
  Downloading IRanges-0.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Collecting 

In [22]:
import pyreadr
result = pyreadr.read_r('/content/mae_mae.rds') # also works for RData
print(result)
# done!
# result is a dictionary where keys are the name of objects and the values python
# objects. In the case of Rds there is only one object with None as key
df = result[None]

LibrdataError: The file contains an unrecognized object