<a href="https://colab.research.google.com/github/monjurkuet/yt-crawler/blob/main/move_transcript_to_google_drive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sshtunnel paramiko==3.4.0

In [None]:
import os

# Transfer settings: where transcripts live on the SFTP server, where they
# should land on the mounted Google Drive, and which files to pick up.
sftp_source_directory = '/home/administrator/Downloads/testranscript'
local_drive_download_path = '/content/drive/MyDrive/AI/transcripts/test'
file_extension_filter = '.json'

# Make sure the destination folder exists before anything is written to it.
os.makedirs(local_drive_download_path, exist_ok=True)

# Echo the effective settings, one "label: value" line each.
print("\n".join(
    f"{label}: {value}"
    for label, value in (
        ("SFTP Source Directory", sftp_source_directory),
        ("Local Download Path", local_drive_download_path),
        ("File Extension Filter", file_extension_filter),
        ("Local download directory created/ensured at", local_drive_download_path),
    )
))

In [None]:
from google.colab import userdata
from google.colab.userdata import SecretNotFoundError # Import specific exception
import logging

# Logging setup: INFO and above, timestamped, tagged with the logger name.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('ConfigManager')

class ConfigManager:
    """Process-wide configuration holder (singleton).

    Each setting is looked up in the environment variables first and, when
    absent there, in the Colab secrets store. Settings that cannot be found
    are stored as ``None`` unless a default is applied in ``_load_config``.
    """
    _instance = None

    # Attribute names exposed by get_config(), in output order.
    _CONFIG_KEYS = (
        'API_KEY',
        'SSH_HOST',
        'SSH_USERNAME',
        'SSH_PRIVATEKEY_PATH',
        'LOCAL_PORT',
        'REMOTE_MYSQL_HOST',
        'REMOTE_MYSQL_PORT',
        'DATABASE_NAME',
        'DATABASE_PASSWORD',
        'SFTP_HOST',
        'SFTP_PORT',
        'SFTP_USERNAME',
        'SFTP_PASSWORD',
        'LOCAL_SFTP_PORT',
    )

    def __new__(cls):
        """Return the shared instance, creating and loading it on first use.

        Raises:
            Exception: whatever ``_load_config`` raises; in that case nothing
                is cached and the next call retries the load.
        """
        if cls._instance is None:
            instance = super().__new__(cls)
            # Load BEFORE publishing the instance: if loading fails, a
            # half-initialized object must not be cached as the singleton.
            instance._load_config()
            cls._instance = instance
        return cls._instance

    def _get_config_value(self, key):
        """Look up ``key`` in the environment, then in Colab secrets.

        Args:
            key: Name of the environment variable / Colab secret.

        Returns:
            The environment value when the variable is set (even if empty),
            otherwise the Colab secret, otherwise ``None`` when the secret is
            missing or its retrieval fails.
        """
        value = os.environ.get(key)
        if value is not None:
            return value

        try:
            return userdata.get(key)
        except SecretNotFoundError:
            # Expected for optional secrets; keep it at debug level so normal
            # fallback scenarios do not produce warnings.
            logger.debug(
                "Colab Secret '%s' not found, will use fallback or default if available.",
                key,
            )
            return None
        except Exception as e:
            # Best effort: any other retrieval problem is reported and the
            # value is treated as unset.
            logger.warning(
                "Unexpected error retrieving config key '%s' from Colab secrets: %s",
                key,
                e,
            )
            return None

    def _load_config(self):
        """Populate every configuration attribute on this instance."""
        logger.info("Loading configuration...")

        # YouTube API Credentials
        self.API_KEY = self._get_config_value('API_KEY')

        # SSH Tunnel & MySQL Database Credentials
        self.SSH_HOST = self._get_config_value('SSH_HOST')
        self.SSH_USERNAME = self._get_config_value('SSH_USERNAME') or 'administrator'  # Default to 'administrator'
        self.SSH_PRIVATEKEY_PATH = self._get_config_value('SSH_PRIVATEKEY_PATH')  # Path to private key
        self.LOCAL_PORT = 3307  # Local port for MySQL SSH tunnel
        self.REMOTE_MYSQL_HOST = '127.0.0.1'
        self.REMOTE_MYSQL_PORT = 3306
        self.DATABASE_NAME = self._get_config_value('DATABASE_NAME')
        self.DATABASE_PASSWORD = self._get_config_value('DATABASE_PASSWORD')

        # SFTP Server Credentials (formerly FTP)
        # Use SFTP_* if available, otherwise fall back to FTP_* for compatibility
        self.SFTP_HOST = self._get_config_value('SFTP_HOST') or self._get_config_value('FTP_HOST')
        self.SFTP_PORT = 22  # SFTP typically uses port 22
        self.SFTP_USERNAME = self._get_config_value('SFTP_USERNAME') or 'administrator'  # Default to 'administrator'
        self.SFTP_PASSWORD = self._get_config_value('SFTP_PASSWORD') or self._get_config_value('FTP_PASSWORD')
        self.LOCAL_SFTP_PORT = 2121  # Local port for SFTP SSH tunnel
        logger.info("Configuration loading complete.")

    def get_config(self):
        """Return all loaded settings as a ``{name: value}`` dictionary.

        The result includes the password entries (``DATABASE_PASSWORD``,
        ``SFTP_PASSWORD``), so avoid printing or logging it wholesale.
        """
        return {key: getattr(self, key) for key in self._CONFIG_KEYS}

# Smoke test: build the singleton and show the connection-related settings.
if __name__ == '__main__':
    config = ConfigManager()
    print(
        f"ConfigManager initialized. SSH Host: {config.SSH_HOST}, "
        f"SFTP Host: {config.SFTP_HOST}, SFTP Port: {config.SFTP_PORT}, "
        f"Local SFTP Tunnel Port: {config.LOCAL_SFTP_PORT}. "
        "Missing values will be None or default."
    )

In [27]:
import logging
import warnings
from sshtunnel import SSHTunnelForwarder, BaseSSHTunnelForwarderError
import paramiko

# --- Logging Configuration ---
# Silence paramiko's UserWarnings, then set up timestamped INFO-level logging.
warnings.filterwarnings('ignore', module='paramiko', category=UserWarning)
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('SFTPConnector')

# --- SFTP Connector Class ---
class SFTPConnector:
    """Owns the SSH tunnel through which the SFTP endpoint is reached.

    The constructor reads its settings from ``ConfigManager`` and prepares
    (but does not start) an ``SSHTunnelForwarder`` that forwards
    ``127.0.0.1:<local_sftp_port>`` to ``<sftp_host>:<sftp_port>`` via the
    SSH host on port 22.
    """

    def __init__(self):
        """Read the configuration and build the (not yet started) tunnel.

        Raises:
            ValueError: if SSH_HOST, SSH_PRIVATEKEY_PATH or SFTP_HOST is unset.
            Exception: any error from ``SSHTunnelForwarder`` is logged and
                re-raised.
        """
        self.config = ConfigManager()  # shared, unified configuration
        self.ssh_tunnel = None
        self.logger = logger

        # Copy the settings this connector needs onto the instance.
        settings = self.config
        self.ssh_host = settings.SSH_HOST
        self.ssh_username = settings.SSH_USERNAME
        self.ssh_privatekey_path = settings.SSH_PRIVATEKEY_PATH
        self.sftp_host = settings.SFTP_HOST
        self.sftp_port = settings.SFTP_PORT
        self.local_sftp_port = settings.LOCAL_SFTP_PORT

        # The tunnel cannot be described without these three values.
        required = (
            ('SSH_HOST', self.ssh_host),
            ('SSH_PRIVATEKEY_PATH', self.ssh_privatekey_path),
            ('SFTP_HOST', self.sftp_host),
        )
        missing_configs = [name for name, value in required if not value]
        if missing_configs:
            error_msg = f"Missing critical SSH/SFTP configuration for tunnel: {', '.join(missing_configs)}."
            self.logger.error(error_msg)
            raise ValueError(error_msg)

        gateway = (self.ssh_host, 22)  # SSH host on the standard SSH port
        remote_endpoint = (self.sftp_host, self.sftp_port)
        local_endpoint = ('127.0.0.1', self.local_sftp_port)
        try:
            self.ssh_tunnel = SSHTunnelForwarder(
                gateway,
                ssh_username=self.ssh_username,
                ssh_pkey=self.ssh_privatekey_path,
                remote_bind_address=remote_endpoint,
                local_bind_address=local_endpoint,
            )
            self.logger.info("SFTPConnector initialized successfully with tunnel configuration.")
        except Exception as e:
            self.logger.critical(f"Error initializing SSHTunnelForwarder for SFTP: {e}")
            raise

    def start_tunnel(self):
        """Start the SSH tunnel.

        Returns:
            True when the tunnel started; False when no tunnel object exists
            or starting failed (the failure is logged and an active tunnel is
            stopped again).
        """
        tunnel = self.ssh_tunnel
        if tunnel is None:
            self.logger.error("SSH tunnel not initialized. Cannot start.")
            return False

        try:
            self.logger.info(f"Attempting to start SSH tunnel for SFTP to {self.sftp_host}:{self.sftp_port} via {self.ssh_host} on local port {self.local_sftp_port}...")
            tunnel.start()
            self.logger.info("SSH tunnel for SFTP started successfully.")
        except (paramiko.SSHException, BaseSSHTunnelForwarderError) as e:
            # Known SSH/tunnel failures: report with troubleshooting hints.
            self.logger.error(f"Failed to start SSH tunnel for SFTP: {e}. Check SSH key path, SSH host, and remote firewall rules.")
            if tunnel.is_active:
                tunnel.stop()
            return False
        except Exception as e:
            # Anything else is unexpected; still leave no tunnel running.
            self.logger.critical(f"An unexpected error occurred while starting SFTP SSH tunnel: {e}")
            if tunnel and tunnel.is_active:
                tunnel.stop()
            return False
        return True

    def stop_tunnel(self):
        """Stop the tunnel when it is running; otherwise just log that fact."""
        tunnel = self.ssh_tunnel
        if not (tunnel and tunnel.is_active):
            self.logger.info("SSH tunnel for SFTP is not active or not initialized.")
            return

        self.logger.info("Stopping SSH tunnel for SFTP...")
        tunnel.stop()
        self.logger.info("SSH tunnel for SFTP stopped.")

In [None]:
from tqdm.notebook import tqdm

import posixpath  # remote SFTP paths always use forward slashes

# Handles are initialized to None so the `finally` clause can tell which
# resources were actually opened.
sftp_connector = None
transport = None
sftp_client = None

# Re-define these variables so this cell can run on its own. They mirror the
# values set in the earlier configuration cell (cell 9aa2a250).
sftp_source_directory = '/home/administrator/Downloads/testranscript'
local_drive_download_path = '/content/drive/MyDrive/AI/transcripts/test'
file_extension_filter = '.json'

try:
    # The destination may not exist yet if the configuration cell was skipped.
    os.makedirs(local_drive_download_path, exist_ok=True)

    # 1. Build the connector and open the SSH tunnel.
    sftp_connector = SFTPConnector()
    if not sftp_connector.start_tunnel():
        raise RuntimeError("Failed to start SFTP tunnel for download.")

    # Connection details for the local end of the tunnel.
    local_sftp_port = sftp_connector.local_sftp_port
    sftp_username = sftp_connector.ssh_username  # Use ssh_username for SFTP authentication
    private_key_path = sftp_connector.ssh_privatekey_path

    # 2. Open a transport through the tunnel and authenticate with the key.
    transport = paramiko.Transport(('127.0.0.1', local_sftp_port))
    # PKey.from_path detects the key type (RSA, Ed25519, ECDSA, ...) instead
    # of assuming RSA, so non-RSA private keys work too.
    private_key = paramiko.PKey.from_path(private_key_path)
    transport.connect(username=sftp_username, pkey=private_key)

    # 3. Create an SFTPClient instance
    sftp_client = paramiko.SFTPClient.from_transport(transport)
    print("🎉 SFTP client established successfully for download operation.")

    # 4. List the remote directory and keep only the wanted extension.
    try:
        sftp_json_files = [
            name
            for name in sftp_client.listdir(sftp_source_directory)
            if name.endswith(file_extension_filter)
        ]
    except IOError as io_err:
        print(f"Error: Directory '{sftp_source_directory}' might not exist or is inaccessible on SFTP server: {io_err}")
        sftp_json_files = []  # Treat an unreadable directory as empty

    if not sftp_json_files:
        print(f"No {file_extension_filter} files found to download in {sftp_source_directory}.")
    else:
        print(f"\nAttempting to download {len(sftp_json_files)} {file_extension_filter} files...")
        downloaded_count = 0

        # 5./6. Download each file; one failure must not stop the others.
        for filename in tqdm(sftp_json_files, desc="Downloading files"):
            remote_file_path = posixpath.join(sftp_source_directory, filename)
            local_file_path = os.path.join(local_drive_download_path, filename)
            # Download to a temporary name and rename on success, so a failed
            # transfer never leaves a truncated file under the final name.
            partial_path = local_file_path + '.part'

            try:
                sftp_client.get(remote_file_path, partial_path)
                os.replace(partial_path, local_file_path)
                downloaded_count += 1
            except Exception as e:
                print(f"Error downloading '{filename}': {e}")
                try:
                    os.remove(partial_path)
                except OSError:
                    # Nothing was written, or cleanup is impossible; the
                    # download error has already been reported above.
                    pass
        print(f"\nCompleted download. Total files downloaded: {downloaded_count}/{len(sftp_json_files)}")

except Exception as e:
    print(f"An error occurred during SFTP download operation: {e}")

finally:
    # 7. Close everything that was opened, innermost resource first.
    if sftp_client:
        try:
            sftp_client.close()
            print("SFTP client closed.")
        except Exception as e:
            print(f"Error closing SFTP client: {e}")
    if transport:
        try:
            transport.close()
            print("Paramiko Transport closed.")
        except Exception as e:
            print(f"Error closing Paramiko Transport: {e}")
    if sftp_connector:
        sftp_connector.stop_tunnel()
