**This notebook contains a script that downloads signed documents from their AWS S3 links and saves them to Google Drive.**

**Import Libraries**

In [29]:
import pandas as pd
import numpy as np
import requests
import os
import gspread
import gspread_dataframe as gd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

**Authenticate Google Drive**

In [39]:
# Run PyDrive's local-webserver OAuth flow (the cell output shows a browser being opened for consent).
gauth = GoogleAuth()
gauth.LocalWebserverAuth()
# Module-level Drive client; upload_file() further down reads this global.
drive = GoogleDrive(gauth)

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=1017190226189-f1d5s7cpjrj54u2rqk1ufh9pevguqoap.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


**Authenticate Gspread**

**Local Library**

In [114]:
def import_worksheet(google_spreadsheet, google_worksheet):
    """
    Loads one tab of a google spreadsheet into a dataframe and trims empty data.
    
    Args:
        google_spreadsheet: A string holding the name of the google spreadsheet to open.
        google_worksheet: A string holding the name of the worksheet (tab) to read.
        
    Returns:
        A dataframe with the worksheet data, without rows that are entirely null
        and without columns whose label contains 'Unnamed'.
    """
    
    #authenticate gspread and open the requested tab
    client = gspread.oauth()
    spreadsheet = client.open(google_spreadsheet)
    sheet = spreadsheet.worksheet(google_worksheet)
    
    #read the tab and discard rows in which every cell is null
    sheet_df = gd.get_as_dataframe(sheet).dropna(how='all')
    
    #keep only the columns whose label does not contain 'Unnamed'
    #(presumably the placeholder given to header-less columns - confirm)
    named_columns = [label for label in sheet_df.columns if 'Unnamed' not in label]
    
    return sheet_df[named_columns]


def export_worksheet(google_spreadsheet, google_worksheet, target_df):
    """
    Writes a dataframe into one tab of a google spreadsheet.
    
    Args:
        google_spreadsheet: A string holding the name of the google spreadsheet to open.
        google_worksheet: A string holding the name of the worksheet (tab) to write to.
        target_df: A dataframe containing the data to load onto the worksheet.
    
    Returns:
        None
    """
    
    #authenticate gspread and locate the destination tab
    client = gspread.oauth()
    sheet = client.open(google_spreadsheet).worksheet(google_worksheet)
    
    #push the dataframe contents onto the tab
    gd.set_with_dataframe(sheet, target_df)


def import_download_df():
    """
    Imports the applicants and signrequest status sheets and merges them on email.
    
    Args:
        N/A
        
    Returns:
        download_df: A dataframe with one row per applicant (left join on
            "Email Address" == "email") holding the applicant's name and email
            plus the contract / code of conduct document names and urls. The
            frame is an independent copy, so callers may add columns to it in place.
    """
    
    spreadsheet_name = "SRS Cohort 2021 Contracting"
    
    #import signrequest update sheet
    signrequests_df = import_worksheet(spreadsheet_name, "Status Updates")

    #import student contract data
    student_df = import_worksheet(spreadsheet_name, "Applicants")
    
    #columns taken from each sheet, and the columns kept after the merge
    student_columns = ["First Name", "Last Name", "Email Address"]
    document_columns = ["contract document", "contract document url", "code of conduct document", "code of conduct document url"]
    signrequests_columns = ["email"] + document_columns
    download_columns = student_columns + document_columns

    #merge signrequests onto students; applicants without a signrequest keep NaN document fields
    download_df = pd.merge(student_df[student_columns], signrequests_df[signrequests_columns],
                           how="left", left_on="Email Address", right_on="email")
    
    #return an explicit copy: the column subset is mutated in place by the
    #create_* helpers, which otherwise triggers pandas' SettingWithCopyWarning
    return download_df[download_columns].copy()


def create_folder_ids(download_df):
    """
    Adds the google drive folder id columns to the dataframe, in place.
    
    Each document name is looked up in a fixed name -> folder id table; names
    that are not in the table produce NaN.
    
    Args:
        download_df: A dataframe with "contract document" and "code of conduct document"
            columns. It gains "contract_folder_id" and "code_of_conduct_folder_id".
        
    Returns:
        _: None
    """
    
    #document name -> id of the folder the document is saved in
    folder_id_by_document = {
        "Durban Contract": "1R2OKfgQQBbHJHyOTZEwxnaEmfVHUbKEj",
        "CPT Contract": "135N7dMe099Xmo3NOCdzEGchgfkbjnfnv",
        "JHB Contract": "1rfGuq3hJZOmqhSzo57OO9Ar0gI0XICwk",
        "Durban Code of Conduct": "11H2eHLbVIVBXkVk5h00BT3fd2uj3LtYJ",
        "CPT Code of Conduct": "19G_zlCBjkF50uqg_s98p7F5WOuYO7oGE",
        "JHB Code of Conduct": "1kekVkH_jYo-GrA624QAcPApp8LN91y5Y",
    }
    
    #(source document column, folder id column to create), contract first
    column_pairs = (
        ("contract document", "contract_folder_id"),
        ("code of conduct document", "code_of_conduct_folder_id"),
    )
    
    for document_column, folder_column in column_pairs:
        download_df[folder_column] = download_df[document_column].map(folder_id_by_document)


def create_file_names(download_df):
    """
    Adds the file name columns for both documents to the dataframe, in place.
    
    Names have the form "<first name> <last name> <suffix>", with surrounding
    whitespace stripped from the first and last name.
    
    Args:
        download_df: A dataframe with "First Name" and "Last Name" columns. It
            gains "contract_file_name" and "code_of_conduct_file_name".
        
    Returns:
        _: None
    """
    
    #"<first> <last>" with surrounding whitespace removed from each part
    first_names = download_df["First Name"].str.strip()
    last_names = download_df["Last Name"].str.strip()
    full_names = first_names + " " + last_names
    
    #(file name column to create, suffix identifying the document), contract first
    name_columns = (
        ("contract_file_name", "Student-Contract"),
        ("code_of_conduct_file_name", "Student-Code-of-Conduct"),
    )
    
    for name_column, suffix in name_columns:
        download_df[name_column] = full_names + " " + suffix


def create_status_columns(download_df):
    """
    Creates the empty status columns on the input dataframe, in place.
    
    The columns start out all-NaN but use the object dtype, because text
    statuses such as 'Saved' are written into them later; writing text into a
    float64 NaN column is an incompatible-dtype assignment that newer pandas
    versions warn about or reject.
    
    Args:
        download_df: A dataframe which gains the "contract status" and
            "code of conduct status" columns.
        
    Returns:
        _: None
    """
    
    for status_column in ("contract status", "code of conduct status"):
        #index is reused so the new column lines up with every existing row
        download_df[status_column] = pd.Series(np.nan, index=download_df.index, dtype=object)

    
def subset_download_df(download_df):
    """
    Selects the tracking columns from the download dataframe, in a fixed order.
    
    Args:
        download_df: A dataframe containing at least the student, contract and
            code of conduct columns listed below.
        
    Returns:
        A dataframe holding only those columns: student details first, then the
        contract columns, then the code of conduct columns.
    """
    
    #columns grouped by what they describe
    student_columns = ["First Name", "Last Name", "Email Address"]
    contract_columns = ["contract_file_name", "contract status", "contract_folder_id"]
    code_of_conduct_columns = ["code_of_conduct_file_name", "code of conduct status", "code_of_conduct_folder_id"]
    
    return download_df[student_columns + contract_columns + code_of_conduct_columns]
    

def create_download_df():
    """
    Builds the dataframe that tracks every file to be downloaded.
    
    Args:
        N/A
        
    Returns:
        A dataframe with the student details plus file name, status and folder id
        columns for both the contract and the code of conduct.
    """
    merged_df = import_download_df()
    
    #each step adds its columns to merged_df in place, in this order
    for add_columns in (create_folder_ids, create_file_names, create_status_columns):
        add_columns(merged_df)
    
    return subset_download_df(merged_df)
    
    
def get_download_df():
    """
    Returns the download tracking dataframe, building it on first use.
    
    The "Downloaded Documents" worksheet is read first; only when it holds no
    data is the dataframe created from scratch and exported to that worksheet.
    
    Args:
        N/A
        
    Returns:
        A dataframe containing all the information regarding downloading files.
    """
    
    spreadsheet_name = "SRS Cohort 2021 Contracting"
    worksheet_name = "Downloaded Documents"
    
    #reuse the tracking sheet when it already has content
    tracking_df = import_worksheet(spreadsheet_name, worksheet_name)
    if not tracking_df.empty:
        return tracking_df
    
    #otherwise build the tracking data and store it on the sheet
    tracking_df = create_download_df()
    export_worksheet(spreadsheet_name, worksheet_name, tracking_df)
    return tracking_df


def add_download_links(download_df):
    """
    Returns a copy of the download dataframe with the document urls merged in.
    
    Only signrequests where the contract or the code of conduct is "signed"
    contribute links; other students get NaN urls (left join on
    "Email Address" == "email").
    
    The input is not modified. Link columns left over from an earlier call are
    dropped before merging, so the function can be applied to its own output
    without producing "_x"/"_y" suffixed duplicate columns.
    
    Args:
        download_df: A dataframe containing all the information regarding downloading
            files; it must have an "Email Address" column.
        
    Returns:
        A new dataframe: download_df plus the "email", "contract document url" and
        "code of conduct document url" columns.
    """
    
    link_columns = ["email", "contract document url", "code of conduct document url"]
    
    #get download links
    links_df = import_worksheet("SRS Cohort 2021 Contracting", "Status Updates")

    #keep only rows where at least one of the two documents has been signed
    is_signed = (links_df["contract status"] == "signed") | (links_df["code of conduct status"] == "signed")
    links_df = links_df.loc[is_signed, link_columns]
    
    #discard stale link columns so a repeated call replaces them instead of duplicating them
    base_df = download_df.drop(columns=link_columns, errors="ignore")
    
    return pd.merge(base_df, links_df, how='left', left_on="Email Address", right_on="email")
    
    
def download_file(file_link, file_name, timeout=60):
    """
    Downloads a file from a link and stores it as a pdf in the downloads folder.
    
    Args:
        file_link: A string representing the link to the file.
        file_name: A string representing the name of the file (without extension).
        timeout: Seconds to wait for the server before giving up.
        
    Returns:
        file_path: A string representing a path to the downloaded file, or None
            when the request fails or the server answers with an error status.
    """
    
    file_path = "../downloads_folder/" + file_name + ".pdf"
    
    try:
        response = requests.get(file_link, allow_redirects=True, timeout=timeout)
        #an HTTP error body must not be saved (and later uploaded) as if it were the pdf
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Download failed for {file_name}: {error}")
        return None
    
    #make sure the target folder exists so a fresh checkout can run the notebook
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    
    #context manager closes the handle before the file is uploaded and deleted
    with open(file_path, 'wb') as pdf_file:
        pdf_file.write(response.content)
    
    return file_path


def upload_file(file_name, file_path, folder_id):
    """
    Uploads a local pdf to a folder on the google shared drive.
    
    Uses the module-level `drive` client created in the authentication cell.
    
    Args:
        file_name: A string representing the title the file gets on google drive.
        file_path: A string representing a path to the local file.
        folder_id: A string representing the ID of the folder the file is to be uploaded to.
        
    Returns:
        _: None
    """
    
    drive_file = drive.CreateFile({
        'title': file_name,
        'mimeType': 'application/pdf',
        'parents': [{
            'kind': 'drive',
            'teamDriveId': '0ALGjY-PCeStEUk9PVA',
            'id': folder_id
        }]
    })
    
    drive_file.SetContentFile(file_path)
    
    try:
        drive_file.Upload(param={'supportsTeamDrives': True})
    finally:
        #SetContentFile keeps the local file open on the `content` attribute;
        #close it so the file can be deleted afterwards (an open handle blocks
        #os.remove on Windows) and no handle is leaked when the upload fails
        content = getattr(drive_file, 'content', None)
        if content is not None:
            content.close()


def delete_local_file(file_path):
    """
    Removes a local file when it is present.
    
    Args:
        file_path: A string representing a path to the file.
    
    Returns:
        _: True when the path existed and was removed, False when there was
            nothing at that path.
    """
    
    #nothing to delete
    if not os.path.exists(file_path):
        return False
    
    os.remove(file_path)
    return True


def upload_to_gdrive(file_link, file_name, folder_id):
    """
    Uploads a file to google drive directly from a link.
    
    The file is downloaded to a temporary local path, uploaded, and the local
    copy is removed again even when the upload raises.
    
    Args:
        file_link: A string representing the link to the file.
        file_name: A string representing the name of the file.
        folder_id: A string representing the ID of the folder the file is to be uploaded to.
        
    Returns:
        _: True when the file was downloaded and uploaded. False when any input
            is missing (None, NaN, empty or blank) or the download yields no file.
    """
    
    #empty worksheet cells arrive as NaN, which is a truthy float, so a plain
    #`not value` check lets them through; require real, non-blank strings
    for value in (file_link, file_name, folder_id):
        if not isinstance(value, str) or not value.strip():
            return False
    
    file_path = download_file(file_link, file_name)
    if not file_path:
        return False
    
    try:
        upload_file(file_name, file_path, folder_id)
    finally:
        #always clean up the temporary download
        delete_local_file(file_path)
    
    #success reflects the upload, not whether the local copy was still there to delete
    return True
    

def bulk_gdrive_upload(download_df):
    """
    Bulk uploads contracts and codes of conduct to google drive.
    
    For every row both documents are attempted, contract first. A successful
    upload sets that row's status column to 'Saved'; a failed one prints the
    file name. The dataframe is updated in place.
    
    Args:
        download_df: A dataframe containing the url, file name, folder id and
            status columns for both documents.
    
    Returns:
        N/A
    """
    
    #(url column, file name column, folder id column, status column) per document
    document_columns = (
        ('contract document url', 'contract_file_name', 'contract_folder_id', 'contract status'),
        ('code of conduct document url', 'code_of_conduct_file_name', 'code_of_conduct_folder_id', 'code of conduct status'),
    )
    
    #status columns that are entirely empty are float64 NaN; make them object
    #dtype so that writing the text 'Saved' is not an incompatible-dtype assignment
    for *_, status_column in document_columns:
        if status_column in download_df.columns:
            download_df[status_column] = download_df[status_column].astype(object)
    
    #NOTE(review): rows whose status is already 'Saved' are uploaded again - confirm whether they should be skipped
    for index, row in download_df.iterrows():
        for url_column, name_column, folder_column, status_column in document_columns:
            if upload_to_gdrive(row[url_column], row[name_column], row[folder_column]):
                #address the row by its index label rather than by file name, so two
                #students sharing a name are not both marked (assumes unique index labels)
                download_df.loc[index, status_column] = 'Saved'
            else:
                print(f"The following file could not be uploaded: {row[name_column]}")


In [105]:
# Load the tracking frame from the "Downloaded Documents" worksheet; if that sheet is empty it is built and exported first.
download_df = get_download_df()


In [106]:
# Attach the document urls from the "Status Updates" worksheet.
# NOTE(review): this rebinds download_df to the merged result, so re-running this cell
# on its own feeds the already-merged frame back into add_download_links.
download_df = add_download_links(download_df)


In [115]:
#bulk_gdrive_upload(download_df)


In [112]:
# Preview the first rows of the tracking frame.
# NOTE(review): the rendered output contains student names and e-mail addresses - clear outputs before sharing.
download_df.head()

Unnamed: 0,First Name,Last Name,Email Address,contract_file_name,contract status,contract_folder_id,code_of_conduct_file_name,code of conduct status,code_of_conduct_folder_id,email,contract document url,code of conduct document url
0,Avishkar,Motheelal,avimothe020@student.wethinkcode.co.za,Avishkar Motheelal Student-Contract,Saved,1R2OKfgQQBbHJHyOTZEwxnaEmfVHUbKEj,Avishkar Motheelal Student-Code-of-Conduct,Saved,11H2eHLbVIVBXkVk5h00BT3fd2uj3LtYJ,avimothe020@student.wethinkcode.co.za,https://signrequest-pro.s3.amazonaws.com/pdfs/...,https://signrequest-pro.s3.amazonaws.com/pdfs/...
1,Angelique,Abrahams,anabraha020@student.wethinkcode.co.za,Angelique Abrahams Student-Contract,,135N7dMe099Xmo3NOCdzEGchgfkbjnfnv,Angelique Abrahams Student-Code-of-Conduct,,19G_zlCBjkF50uqg_s98p7F5WOuYO7oGE,anabraha020@student.wethinkcode.co.za,,https://signrequest-pro.s3.amazonaws.com/pdfs/...
2,Gugu,Zitha,gzitha020@student.wethinkcode.co.za,Gugu Zitha Student-Contract,,1rfGuq3hJZOmqhSzo57OO9Ar0gI0XICwk,Gugu Zitha Student-Code-of-Conduct,,1kekVkH_jYo-GrA624QAcPApp8LN91y5Y,gzitha020@student.wethinkcode.co.za,https://signrequest-pro.s3.amazonaws.com/pdfs/...,https://signrequest-pro.s3.amazonaws.com/pdfs/...
3,Sicelo,Ntombana,sntomban020@student.wethinkcode.co.za,Sicelo Ntombana Student-Contract,,1rfGuq3hJZOmqhSzo57OO9Ar0gI0XICwk,Sicelo Ntombana Student-Code-of-Conduct,,1kekVkH_jYo-GrA624QAcPApp8LN91y5Y,sntomban020@student.wethinkcode.co.za,https://signrequest-pro.s3.amazonaws.com/pdfs/...,https://signrequest-pro.s3.amazonaws.com/pdfs/...
4,Paballo,Sithole,pabsitho020@student.wethinkcode.co.za,Paballo Sithole Student-Contract,,1rfGuq3hJZOmqhSzo57OO9Ar0gI0XICwk,Paballo Sithole Student-Code-of-Conduct,,1kekVkH_jYo-GrA624QAcPApp8LN91y5Y,pabsitho020@student.wethinkcode.co.za,https://signrequest-pro.s3.amazonaws.com/pdfs/...,https://signrequest-pro.s3.amazonaws.com/pdfs/...
