# Download PDFs from Google Drive and other URLs

In [1]:
import os
from urllib.parse import parse_qs, urlparse

import gdown
import pandas as pd
import requests


In [2]:
def _drive_file_id(pdf_url):
    """Return the file ID embedded in a Google Drive share URL.

    Two link shapes are recognised:
    ``.../file/d/<id>[/view...]`` (ID is the path segment after ``d``) and
    ``...?id=<id>`` (e.g. ``open?id=`` or ``uc?id=`` links).

    Raises:
        ValueError: if neither shape yields a file ID.
    """
    parsed = urlparse(pdf_url)
    segments = [segment for segment in parsed.path.split('/') if segment]
    if 'd' in segments:
        position = segments.index('d') + 1
        if position < len(segments):
            return segments[position]

    # parse_qs drops blank values, so "?id=" falls through to the error below.
    ids = parse_qs(parsed.query).get('id')
    if ids:
        return ids[0]

    raise ValueError('could not find a Google Drive file ID in the URL')


def download_and_rename_pdfs(pdf_urls, download_folder, timeout=30):
    """Download every PDF in ``pdf_urls`` into ``download_folder``.

    Google Drive links (any URL containing ``drive.google.com``) are fetched
    with ``gdown`` and saved as ``<file_id>.pdf``. Every other URL is fetched
    with ``requests`` and saved under the last component of its URL path.
    Files that already exist in ``download_folder`` are skipped without any
    network request, so the function can be re-run cheaply after an
    interruption.

    Failures are reported with ``print`` and do not stop the loop; nothing is
    raised for a single bad URL.

    Args:
        pdf_urls: iterable of URL strings. Entries that are not non-empty
            strings (e.g. NaN from an empty CSV cell) are skipped.
        download_folder: destination directory; created if missing.
        timeout: seconds passed to ``requests.get`` for non-Drive URLs so a
            stalled server cannot hang the run. ``None`` disables it.

    Returns:
        None
    """
    os.makedirs(download_folder, exist_ok=True)

    for pdf_url in pdf_urls:
        if not isinstance(pdf_url, str) or not pdf_url.strip():
            print(f"Skipped invalid URL entry: {pdf_url!r}")
            continue
        pdf_url = pdf_url.strip()

        try:
            if "drive.google.com" in pdf_url:
                # Handle Google Drive links
                file_id = _drive_file_id(pdf_url)
                download_link = f'https://drive.google.com/uc?id={file_id}'
                filename = f'{file_id}.pdf'
                pdf_path = os.path.join(download_folder, filename)

                if os.path.exists(pdf_path):
                    print(f"Skipped download (already exists): {pdf_path}")
                    continue

                # Some gdown releases signal failure by returning None rather
                # than raising, so check the result before claiming success.
                saved_to = gdown.download(download_link, output=pdf_path, quiet=False)
                if saved_to is None:
                    raise RuntimeError('gdown could not retrieve the file')
                print(f'Downloaded: {filename}')
            else:
                # Handle other links: the file name is the last path component.
                filename = os.path.basename(urlparse(pdf_url).path)
                if not filename:
                    # Without this, the path would be the folder itself and
                    # the exists-check below would report a bogus "skipped".
                    raise ValueError('URL path does not end in a file name')
                pdf_path = os.path.join(download_folder, filename)

                # Check before fetching so existing files cost no bandwidth.
                if os.path.exists(pdf_path):
                    print(f"Skipped download (already exists): {pdf_path}")
                    continue

                response = requests.get(pdf_url, timeout=timeout)
                response.raise_for_status()  # Check for successful response

                with open(pdf_path, "wb") as pdf_file:
                    pdf_file.write(response.content)

                print(f"Downloaded and saved: {pdf_path}")

        except Exception as e:
            # Best effort: report the failure and move on to the next URL.
            print(f"Error downloading {pdf_url}: {e}")

# Read the CSV file that lists the PDFs to fetch.
CSV_PATH = '/Users/carboni/Downloads/test_combined.csv'  # Replace with your CSV file path
URL_COLUMN = 'Media URL'

df = pd.read_csv(CSV_PATH)

# Empty cells are read as NaN; drop them so only real URL values are passed on.
pdf_urls = df[URL_COLUMN].dropna().tolist()

download_folder = "pdf_download"
download_and_rename_pdfs(pdf_urls, download_folder)


Downloaded and saved: pdf_download/Anno.1_n.01_45000033259_OUTPUT.o.pdf


Downloading...
From: https://drive.google.com/uc?id=162kEOLsT1ZKgkaS-brxJBDfmyV6vxH7_
To: /Users/carboni/Documents/UNIGE/pynotebook/sourcesVC/Raoul - SA/pdf_download/162kEOLsT1ZKgkaS-brxJBDfmyV6vxH7_.pdf
100%|██████████| 3.16M/3.16M [00:00<00:00, 7.99MB/s]


Downloaded: 162kEOLsT1ZKgkaS-brxJBDfmyV6vxH7_.pdf


Downloading...
From: https://drive.google.com/uc?id=1riIeomXJrLon7XQiSl4Jy1K7xNbDYher
To: /Users/carboni/Documents/UNIGE/pynotebook/sourcesVC/Raoul - SA/pdf_download/1riIeomXJrLon7XQiSl4Jy1K7xNbDYher.pdf
100%|██████████| 3.18M/3.18M [00:00<00:00, 6.31MB/s]


Downloaded: 1riIeomXJrLon7XQiSl4Jy1K7xNbDYher.pdf


Downloading...
From: https://drive.google.com/uc?id=1NEyfsSc-EoSEDAL4xiXeJjDrqxzuvvKw
To: /Users/carboni/Documents/UNIGE/pynotebook/sourcesVC/Raoul - SA/pdf_download/1NEyfsSc-EoSEDAL4xiXeJjDrqxzuvvKw.pdf
100%|██████████| 3.30M/3.30M [00:00<00:00, 7.33MB/s]


Downloaded: 1NEyfsSc-EoSEDAL4xiXeJjDrqxzuvvKw.pdf
Downloaded and saved: pdf_download/Ano.1_n.01_45000033188_Output.o.pdf
Downloaded and saved: pdf_download/Ano.1_n.02_45000033188_Output.o.pdf


KeyboardInterrupt: 