<a href="https://colab.research.google.com/github/mauro-nievas-offidani/test/blob/main/data_download_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MAIN PARAMS

In [1]:
### COMPLETE WITH YOUR GITHUB ACCOUNT DETAILS AND RUN ALL.
### SECURITY: anything typed below is saved with the notebook. Prefer leaving both
### values blank and providing them through the GITHUB_USERNAME / GITHUB_TOKEN
### environment variables; never share or commit a notebook holding a real token.

import os

github_username = ''
github_token = ''

# Blank values fall back to the environment so the token does not have to be
# stored in the notebook itself. Values typed above always take precedence.
github_username = github_username or os.environ.get('GITHUB_USERNAME', '')
github_token = github_token or os.environ.get('GITHUB_TOKEN', '')

### You must have access to the private repository https://github.com/mauro-nievas-offidani/scientific-resources

# OPTIONAL

In [2]:
# Base folder where the repository is cloned and the dataset is stored.
main_folder = '/content'

# Download target, kept as a sub-folder of main_folder.
data_directory = f'{main_folder}/dataset'

# Value forwarded to MasterDownloader as max_resource_amount.
resource_amount_per_data_source = 5

In [3]:
### OPTIONAL
### Leave as '' if you have no credentials file.
### NOTE(review): presumably the downloader then runs without PubMed/Elsevier
### credentials - confirm in MasterDownloader.

path_to_pubmed_api_json = ''

### path_to_pubmed_api_json (str): Path to a JSON file containing a dict with the keys below:
###   'email': The email address of your PubMed account.
###   'token' (optional, recommended): The API token of your PubMed account.
###   'elsevier_api_key' (optional, recommended): The API key of your Elsevier account.

# GitHub Setup

In [4]:
import os
import subprocess
import sys

class GitHubAccount:
    """Clone the fixed 'scientific-resources' repository and bootstrap it.

    The repository is cloned over HTTPS with the username and Personal Access
    Token embedded in the URL. The token is masked in everything this class
    prints.

    NOTE: because the credentials are part of the clone URL, git keeps them in
    the checkout's remote configuration; treat the cloned folder as sensitive.
    """

    def __init__(self, username: str, token: str, email: str = None, name: str = None,
                 drive_folder: str = '/content'):
        """
        Initialize a GitHub account handler for the fixed repo:
        'scientific-resources'.

        Creates ``drive_folder`` if it is missing and, when both ``email`` and
        ``name`` are given, sets them as the global git identity.

        Args:
            username (str): GitHub username.
            token (str): GitHub Personal Access Token.
            email (str, optional): Email for git config. Defaults to None.
            name (str, optional): Name for git config. Defaults to None.
            drive_folder (str, optional): Base folder to clone repo. Defaults to '/content'.
        """
        self.username = username
        self.token = token
        self.email = email
        self.name = name
        self.base_folder = drive_folder
        self.repo_name = "scientific-resources"
        self.repo_url = f"https://{self.username}:{self.token}@github.com/mauro-nievas-offidani/{self.repo_name}.git"
        # Stored as an absolute path: clone_repo() changes the working
        # directory, which would invalidate a path relative to the old one.
        self.project_path = os.path.abspath(os.path.join(self.base_folder, self.repo_name))

        os.makedirs(self.base_folder, exist_ok=True)

        if self.email and self.name:
            self._configure_git()

    def _redact(self, text):
        """Return ``text`` with the access token replaced by '***'."""
        # An empty token must be skipped: str.replace('', ...) would insert
        # the mask between every character.
        if text and self.token:
            return text.replace(str(self.token), '***')
        return text

    def _run_command(self, command, cwd=None):
        """Run a command and capture its output.

        Args:
            command (str | list): A shell command line (run through the shell)
                or an argument list (run directly, without a shell, so that
                arguments containing spaces or shell metacharacters are safe).
            cwd (str, optional): Working directory for the command.

        Returns:
            str | None: Stripped stdout on success, or None when the command
            exits with a non-zero status or cannot be started. Failures are
            printed with the access token masked.
        """
        use_shell = isinstance(command, str)
        shown = command if use_shell else ' '.join(str(part) for part in command)
        try:
            result = subprocess.run(command, shell=use_shell, check=True, text=True, cwd=cwd,
                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            return result.stdout.strip()
        except subprocess.CalledProcessError as e:
            print(f"Error running command: {self._redact(shown)}")
            print(self._redact(e.stderr))
            return None
        except OSError as e:
            # Raised when the executable (or cwd) does not exist; report it the
            # same way as a failing command instead of crashing the caller.
            print(f"Error running command: {self._redact(shown)}")
            print(self._redact(str(e)))
            return None

    def _configure_git(self):
        """Configure git global username and email."""
        email_result = self._run_command(['git', 'config', '--global', 'user.email', str(self.email)])
        name_result = self._run_command(['git', 'config', '--global', 'user.name', str(self.name)])
        # _run_command returns None on failure; do not claim success in that case.
        if email_result is None or name_result is None:
            print("Git global config could not be set.")
        else:
            print("Git global config set.")

    def clone_repo(self):
        """Clone the fixed repo if it doesn't exist and set working directory.

        After the checkout is in place, the process working directory is
        changed to it, dependencies from ``requirements.txt`` are installed
        (if the file exists) and ``setup.sh`` is executed (if the file exists).

        Raises:
            FileNotFoundError: If the checkout folder does not exist after the
                clone attempt (for example wrong credentials or no access).
        """
        if not os.path.exists(self.project_path):
            print(f"Cloning repo {self.repo_name} into {self.base_folder} ...")
            self._run_command(['git', 'clone', self.repo_url, self.project_path])
        else:
            print(f"Repo already exists at {self.project_path}, skipping clone.")

        # A failed clone leaves no folder behind; fail with an actionable
        # message instead of an unexplained error from os.chdir.
        if not os.path.isdir(self.project_path):
            raise FileNotFoundError(
                f"Could not clone {self.repo_name}: {self.project_path} does not exist. "
                "Check github_username / github_token and that the account has access "
                "to the repository."
            )

        os.chdir(self.project_path)
        print(f"Working directory set to: {self.project_path}")

        # Install dependencies if requirements.txt exists
        requirements_path = os.path.join(self.project_path, 'requirements.txt')
        if os.path.exists(requirements_path):
            print("Installing dependencies from requirements.txt ...")
            # Use the running interpreter's pip so packages land in the
            # environment this kernel actually imports from.
            self._run_command([sys.executable, '-m', 'pip', 'install', '-r', requirements_path],
                              cwd=self.project_path)

        # Run setup.sh if exists
        setup_sh = os.path.join(self.project_path, 'setup.sh')
        if os.path.exists(setup_sh):
            print("Running setup.sh ...")
            # Run the script as a whole in one shell so that variables, `cd`
            # and multi-line constructs carry over between its lines.
            self._run_command(['sh', setup_sh], cwd=self.project_path)

In [5]:
# Build the account helper from the notebook-level settings, then clone the
# repository (or reuse an existing checkout) and switch into it.
gha = GitHubAccount(
    username=github_username,
    token=github_token,
    email=None,
    name=None,
    drive_folder=main_folder,
)
gha.clone_repo()

Cloning repo scientific-resources into /content ...
Working directory set to: /content/scientific-resources
Installing dependencies from requirements.txt ...
Running setup.sh ...


# Data Download

In [6]:
from modules.full_data_download import MasterDownloader

In [7]:
# Create the downloader from the settings defined in the parameter cells.
md = MasterDownloader(
    main_directory=data_directory,
    max_resource_amount=resource_amount_per_data_source,
    path_to_pubmed_api_dict=path_to_pubmed_api_json,
)

In [None]:
# Start the download with the MasterDownloader instance configured in the previous cell.
# NOTE(review): presumably writes into data_directory and is limited by
# resource_amount_per_data_source - confirm in modules.full_data_download.
md.download_data()