# Integrate Google Drive

In [6]:
def get_size_format(b, factor=1024, suffix="B"):
    """
    Render a byte count as a human-readable string with a unit prefix.

    The value is divided by `factor` until it drops below `factor`,
    stepping through the prefixes "", K, M, G, T, P, E, Z. Anything that is
    still too large after Z is reported with the Y prefix.
    e.g:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'
    """
    prefixes = ("", "K", "M", "G", "T", "P", "E", "Z")
    value = b
    position = 0
    while position < len(prefixes):
        if value < factor:
            return f"{value:.2f}{prefixes[position]}{suffix}"
        # Move on to the next larger unit.
        value = value / factor
        position += 1
    return f"{value:.2f}Y{suffix}"

def list_files(items):
    """given items returned by Google Drive API, prints them in a tabular way"""
    if not items:
        # empty drive
        print('No files found.')
        return []
    rows = []
    for item in items:
        # get the File ID
        id = item["id"]
        # get the name of file
        name = item["name"]
        try:
            # parent directory ID
            parents = item["parents"]
        except:
            # has no parrents
            parents = "N/A"
        try:
            # get the size in nice bytes format (KB, MB, etc.)
            size = get_size_format(int(item["size"]))
        except:
            # not a file, may be a folder
            size = "N/A"
        # get the Google Drive type of file
        mime_type = item["mimeType"]
        # get last modified date time
        modified_time = item["modifiedTime"]
        # append everything to the list
        rows.append((id, name, parents, size, mime_type, modified_time))
    # table = tabulate(rows, headers=["ID", "Name", "Parents", "Size", "Type", "Modified Time"])
    # print(table)
    return rows


## Create service

In [3]:
import os.path
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload
# from tabulate import tabulate

# If modifying these scopes, delete the file token.json.
SCOPES = ["https://www.googleapis.com/auth/drive"]

def gdrive_service(credential_path="../.keys/drive_desktop.json", token_path="token.json"):
    """Build an authenticated Google Drive v3 client.

    Args:
        credential_path: OAuth client-secrets JSON used when an interactive
            login is required.
        token_path: file that caches the user's access and refresh tokens.
            It is created or overwritten whenever new or refreshed
            credentials are obtained. Delete it after changing ``SCOPES``.

    Returns:
        The Drive ``Resource`` returned by ``build("drive", "v3", ...)``, or
        ``None`` if building the client raised ``HttpError`` (the error is
        printed in that case).
    """
    creds = None
    # Reuse cached tokens from a previous authorization, if any.
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(credential_path, SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run.
        with open(token_path, "w") as token:
            token.write(creds.to_json())

    try:
        return build("drive", "v3", credentials=creds)
    except HttpError as error:
        # TODO(developer) - Handle errors from drive API.
        print(f"An error occurred: {error}")
        # Explicit so callers can see that failure yields None.
        return None

In [17]:
# Build the Drive client once; the cells below pass `service` to the helper functions.
service = gdrive_service()


In [23]:
# NOTE(review): this only displays the Resource object returned by about() (see the
# output below); presumably about().get(fields=...).execute() is needed to fetch
# actual account details — TODO confirm against the Drive API reference.
service.about()

<googleapiclient.discovery.Resource at 0x7fd8219b0fa0>

## List all files

In [13]:
def search_file_or_folder(service, name):
    """Search the user's Google Drive for items whose name equals `name`.

    Args:
        service: Drive v3 client exposing ``files().list(...)``.
        name: exact file or folder name to look for.

    Returns:
        The list of matching file resources (dicts with id, name, mimeType,
        size, parents, modifiedTime where available), or ``None`` when
        nothing matches. The number of matches is printed.
    """
    # Backslashes and single quotes must be escaped inside a Drive query
    # string literal; otherwise a name such as "it's.pdf" yields a malformed
    # query.
    escaped_name = name.replace("\\", "\\\\").replace("'", "\\'")
    query = f"name='{escaped_name}'"
    results = service.files().list(
        q=query,
        fields="nextPageToken, files(id, name, mimeType, size, parents, modifiedTime)",
    ).execute()
    items = results.get('files', [])
    if not items:
        print('No files found.')
        return None
    print('Search result:')
    list_files(items)
    print(len(items))
    return items

# Look up a file by exact name; `afile` holds the list of matches (or None if nothing matched).
afile = search_file_or_folder(service, 'finalreport.pdf')

Search result:
1


In [16]:
afile

[{'mimeType': 'application/pdf',
  'parents': ['0AIcsLxwAQS3zUk9PVA'],
  'size': '1886160',
  'id': '10QNEaFza_gRJFsoB0KstD2kjSsIguVlA',
  'name': 'finalreport.pdf',
  'modifiedTime': '2023-12-17T22:59:58.000Z'}]

In [107]:
def gdrive_folder(service):
    """List all folders in the user's Google Drive.

    Follows ``nextPageToken`` until the listing is exhausted, so drives with
    more than one page (100 entries) of folders are no longer truncated. The
    collected items are passed to ``list_files``.

    Args:
        service: Drive v3 client exposing ``files().list(...)``.
    """
    print("Folders:")
    items = []
    page_token = None
    while True:
        results = service.files().list(
            q="mimeType='application/vnd.google-apps.folder'",
            pageSize=100,
            pageToken=page_token,
            fields="nextPageToken, files(id, name, mimeType, size, parents, modifiedTime)",
        ).execute()
        items.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    list_files(items)

def gdrive_file(service):
    """List all non-folder files in the user's Google Drive.

    Follows ``nextPageToken`` until the listing is exhausted, so drives with
    more than one page (100 entries) of files are no longer truncated. The
    collected items are passed to ``list_files``.

    Args:
        service: Drive v3 client exposing ``files().list(...)``.
    """
    print("Files:")
    items = []
    page_token = None
    while True:
        results = service.files().list(
            q="mimeType!='application/vnd.google-apps.folder'",
            pageSize=100,
            pageToken=page_token,
            fields="nextPageToken, files(id, name, mimeType, size, parents, modifiedTime)",
        ).execute()
        items.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    list_files(items)

In [109]:
def get_files_or_folders_by_parent(service, parent_id):
    """Report the non-trashed direct children of the folder `parent_id`.

    All result pages are fetched (``nextPageToken`` is followed), so folders
    with more entries than one page are handled completely. If any children
    are found they are passed to ``list_files``; otherwise a message is
    printed.

    Args:
        service: Drive v3 client exposing ``files().list(...)``.
        parent_id: ID of the folder whose children are listed.
    """
    query = f"'{parent_id}' in parents and trashed = false"
    items = []
    page_token = None
    while True:
        results = service.files().list(
            q=query,
            pageToken=page_token,
            fields="nextPageToken, files(id, name, mimeType, size, parents, modifiedTime)",
        ).execute()
        items.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    if not items:
        print('The folder is empty or not found or not a folder.')
    else:
        print('The folder contents:')
        list_files(items)

In [110]:
import csv
def read_csv_file(file_path):
    """Read a CSV file and return its rows as a list of dictionaries.

    Args:
        file_path: path of a CSV file whose first row is the header.

    Returns:
        One dict per data row, keyed by the header names.
    """
    # newline='' lets the csv module do its own newline handling, which keeps
    # line breaks embedded in quoted fields intact (as the csv docs advise).
    with open(file_path, 'r', newline='') as file:
        return list(csv.DictReader(file))

def write_files_to_csv(all_files, output_path='drive_files.csv'):
    """Write Drive file metadata to a CSV snapshot.

    Args:
        all_files: iterable of file-resource dicts. ``id``, ``name``,
            ``mimeType`` and ``modifiedTime`` are required; ``size`` and
            ``parents`` are optional and written as ``N/A`` when absent.
        output_path: destination CSV path (defaults to ``drive_files.csv``
            in the working directory, which is what the comparison helpers
            are given in ``main``).

    Raises:
        KeyError: if an entry lacks one of the required keys.
    """
    with open(output_path, 'w', newline='') as csv_handle:
        writer = csv.writer(csv_handle)
        writer.writerow(["id", "name", "mimeType", "size", "parents", "modifiedTime"])
        # The loop variable is deliberately not named `file`, which previously
        # shadowed the open file handle.
        for entry in all_files:
            writer.writerow([
                entry['id'],
                entry['name'],
                entry['mimeType'],
                entry.get('size', 'N/A'),
                # Items without a parent folder have no "parents" key.
                entry.get('parents', 'N/A'),
                entry['modifiedTime'],
            ])

In [111]:
def list_files_recursive(service, folder_id):
    """Collect every non-folder file below `folder_id`, descending into subfolders.

    Each folder listing is read page by page (``nextPageToken`` is
    followed), so folders holding more entries than a single page are no
    longer silently truncated.

    Args:
        service: Drive v3 client exposing ``files().list(...)``.
        folder_id: ID of the folder to walk.

    Returns:
        A flat list of file-resource dicts (folders themselves are not
        included). A subfolder's files appear at the position where the
        subfolder was encountered.
    """
    # Query to get all non-trashed files and folders directly under this folder.
    query = f"'{folder_id}' in parents and trashed = false"

    all_files = []
    page_token = None
    while True:
        results = service.files().list(
            q=query,
            pageToken=page_token,
            fields="nextPageToken, files(id, name, mimeType, size, parents, modifiedTime)",
        ).execute()
        for item in results.get('files', []):
            if item['mimeType'] != 'application/vnd.google-apps.folder':
                all_files.append(item)
            else:
                # Folder: gather its files via a recursive call.
                all_files.extend(list_files_recursive(service, item['id']))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    return all_files

In [112]:
def check_delete(service, folder_id, csv_file):
    """Report files recorded in the CSV snapshot that are no longer in the folder tree.

    The current contents of `folder_id` are fetched recursively and compared
    by file ID with the rows of `csv_file`. Rows whose ID is missing from the
    current listing are passed to ``list_files`` in the order they appear in
    the CSV, so the report is deterministic (it was previously driven by set
    iteration order).

    Args:
        service: Drive v3 client.
        folder_id: ID of the folder tree to inspect.
        csv_file: path of a snapshot CSV with at least an ``id`` column.
    """
    all_files = list_files_recursive(service, folder_id)
    csv_files = read_csv_file(csv_file)

    current_ids = {file['id'] for file in all_files}
    # Keyed by ID so duplicate CSV rows collapse to one entry (the last row wins).
    snapshot_by_id = {row['id']: row for row in csv_files}

    deleted_files_list = [
        row for file_id, row in snapshot_by_id.items() if file_id not in current_ids
    ]
    # print the deleted files
    print("Deleted files:")
    list_files(deleted_files_list)
    

In [113]:
def check_rename(service, folder_id, csv_file):
    """Print the files whose name differs between the CSV snapshot and Drive.

    The printed dictionary maps each file ID present in both the snapshot
    and the current recursive listing of `folder_id` to a
    ``(snapshot_name, current_name)`` pair, for the IDs where the two names
    differ.
    """
    # Current id -> name mapping taken from Drive.
    current_names = {}
    for entry in list_files_recursive(service, folder_id):
        current_names[entry['id']] = entry['name']

    # id -> name mapping as recorded in the snapshot.
    snapshot_names = {}
    for row in read_csv_file(csv_file):
        snapshot_names[row['id']] = row['name']

    renamed_files = {}
    for file_id, old_name in snapshot_names.items():
        if file_id not in current_names:
            # Not in Drive any more; that is not a rename.
            continue
        new_name = current_names[file_id]
        if old_name != new_name:
            renamed_files[file_id] = (old_name, new_name)

    print("Renamed files:")
    print(renamed_files)

    

In [114]:
import os
from docx import Document
import io

def _download_bytes(request):
    """Run a Drive media request to completion and return the downloaded bytes."""
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)
    done = False
    while not done:
        _status, done = downloader.next_chunk()
    return buffer.getvalue()


def extract_doc_content(files, service):
    """Save the text of .docx files and Google Docs to ``text_data/<name>.txt``.

    For every entry in `files` the metadata is fetched from Drive, then:
      * .docx files are downloaded and their paragraphs joined with spaces;
      * Google Docs are exported as ``text/plain``;
      * legacy ``application/msword`` (.doc) files are skipped, since
        python-docx only reads the .docx format;
      * anything else is skipped.

    The ``text_data`` directory is created on demand and the output is
    written as UTF-8.

    Args:
        files: iterable of dicts that each contain at least an ``id`` key.
        service: Drive v3 client.
    """
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    for file in files:
        file_id = file['id']
        # Get the file's metadata
        file_metadata = service.files().get(fileId=file_id).execute()
        mime_type = file_metadata['mimeType']

        if mime_type == docx_mime:
            # A .docx file: download it and read it with python-docx.
            payload = _download_bytes(service.files().get_media(fileId=file_id))
            document = Document(io.BytesIO(payload))
            content = ' '.join(paragraph.text for paragraph in document.paragraphs)
        elif mime_type == 'application/vnd.google-apps.document':
            # A Google Docs file: export it as plain text.
            payload = _download_bytes(
                service.files().export_media(fileId=file_id, mimeType='text/plain')
            )
            content = payload.decode('utf-8')
        elif mime_type == 'application/msword':
            # Binary .doc is not a .docx package; handing it to Document()
            # would raise and abort the whole run.
            print(f"Skipping file '{file_metadata['name']}' as legacy .doc files are not supported")
            continue
        else:
            # Not a word-processing document, skip it
            print(f"Skipping file '{file_metadata['name']}' as it's not a word file")
            continue

        # Create a .txt file with the content of the document. Path
        # separators in the Drive name are replaced so the file always lands
        # directly inside text_data/.
        file_name = os.path.splitext(file_metadata['name'])[0] + '.txt'
        file_name = file_name.replace('/', '_').replace(os.sep, '_')
        os.makedirs('text_data', exist_ok=True)
        with open(os.path.join('text_data', file_name), 'w', encoding='utf-8') as f:
            f.write(content)
        print(f'The content of the file has been saved to text_data/{file_name}')

# Main

In [115]:
def main():
    """Run the demo workflow against the Drive folder named 'CV'.

    Steps: authenticate, locate the folder, show its direct children, list
    every file below it recursively, then compare the current names with the
    ``drive_files.csv`` snapshot. The snapshot must already exist; it is
    produced by the (normally disabled) ``write_files_to_csv`` step.
    """
    service = gdrive_service()
    if service is None:
        # gdrive_service already printed the error.
        return
    #gdrive_folder(service)
    #gdrive_file(service)
    #print("\n\n\n")
    # search_file_or_folder returns a list of matching resources (or None),
    # not an ID, so pick the first match that is actually a folder.
    matches = search_file_or_folder(service, 'CV') or []
    folder = next(
        (m for m in matches if m.get('mimeType') == 'application/vnd.google-apps.folder'),
        None,
    )
    if folder is None:
        print("No folder named 'CV' found.")
        return
    folder_id = folder['id']
    print("\n\n\n")
    get_files_or_folders_by_parent(service, folder_id)
    print("\n\n\n")
    print("Recursive list of files under the specified folder:")
    all_files = list_files_recursive(service, folder_id)
    list_files(all_files)
    # write the file data to a CSV file (enable to refresh the snapshot)
    #write_files_to_csv(all_files)
    print("\n\n\n")
    #extract_doc_content(all_files, service)
    print("\n\n\n")
    #check_delete(service, folder_id, 'drive_files.csv')
    print("\n\n\n")
    check_rename(service, folder_id, 'drive_files.csv')

    

# Entry point: call main() when this code runs as the top-level module
# (the output captured below this cell shows that it ran here).
if __name__ == "__main__":
    main()

Search result:
ID                                 Name    Parents                  Size    Type                                Modified Time
---------------------------------  ------  -----------------------  ------  ----------------------------------  ------------------------
1JqkD75WXrok6iY4qEafzZtNcgHhdmEIS  CV      ['0AHU8XzNXkA9AUk9PVA']  N/A     application/vnd.google-apps.folder  2023-11-07T14:26:33.945Z




The folder contents:
ID                                            Name                   Parents                                Size      Type                                                                     Modified Time
--------------------------------------------  ---------------------  -------------------------------------  --------  -----------------------------------------------------------------------  ------------------------
1Q5hgY9JUtaj31TCCD12LQre-a2_qTFXm             CV_NguyenAnhKhoa1.pdf  ['1JqkD75WXrok6iY4qEafzZtNcgHhdmEIS']  35.51KB   application/pdf      