In [46]:
import pandas as pd
import numpy as np

import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.alert import Alert


import time
import os

from tqdm import tqdm
import requests

In [47]:
def requirements(dic_file, progress):
    """Prepare the output folder and return the catalogue rows still to scrape.

    Parameters
    ----------
    dic_file : str
        Path to the catalogue CSV. It must contain a ``CODE ISIN`` column.
    progress : str
        Path to the progress log. Each line has the form ``<isin> : <link>``.

    Returns
    -------
    pandas.DataFrame
        The catalogue from the last logged ISIN (inclusive) to the end. When
        the progress log is missing or holds no entries, the whole catalogue
        is returned. A ``Morningstar`` column filled with ``'Not Found'`` is
        added when the CSV does not already have one.

    Raises
    ------
    ValueError
        If the last logged ISIN does not appear in the catalogue.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('../data/morningstar', exist_ok=True)

    dic_cat = pd.read_csv(dic_file)

    if "Morningstar" not in dic_cat.columns:
        dic_cat['Morningstar'] = 'Not Found'

    # First run: nothing has been logged yet, so there is nothing to skip.
    if not os.path.exists(progress):
        return dic_cat

    with open(progress, 'r') as f:
        # Blank lines carry no ISIN and must not be mistaken for the last entry.
        logged = [line for line in f if line.strip()]

    if not logged:
        return dic_cat

    # Only the ISIN (text before the first colon) of the last entry matters.
    last_isin = logged[-1].split(':')[0].strip()

    # Work with positions rather than index labels so the slice below is
    # correct whatever index the frame carries.
    positions = np.flatnonzero((dic_cat['CODE ISIN'] == last_isin).to_numpy())
    if positions.size == 0:
        raise ValueError(
            f"Last ISIN in {progress!r} ({last_isin!r}) was not found in {dic_file!r}"
        )

    # The last logged ISIN is included again, so it will be re-scraped.
    return dic_cat.iloc[int(positions[0]):]

In [48]:
def login(mail, password=None):
    """Open Chrome, sign in to the Morningstar document library and return the driver.

    Parameters
    ----------
    mail : str
        Account e-mail typed into the sign-in form.
    password : str, optional
        Account password. When omitted, ``mail`` is typed into the password
        field as well, which is what this function has always done.

    Returns
    -------
    selenium.webdriver.Chrome
        A driver that has gone through the sign-in form and has been sent back
        to the document search page. The caller owns it and should ``quit()``
        it when finished.

    Raises
    ------
    Exception
        Any Selenium error raised while driving the page is propagated after
        the browser has been closed.
    """
    if password is None:
        # Backward compatibility: earlier callers relied on mail == password.
        password = mail

    driver = webdriver.Chrome()
    try:
        driver.get("https://doc.morningstar.com/Fund.aspx?u=ALL#")

        # The page is expected to raise a JavaScript alert on load.
        Alert(driver).accept()

        login_button = driver.find_element(By.XPATH, '/html/body/div[10]/div[3]/div[1]/div[2]/div/div/form/input')
        login_button.click()

        mail_input = driver.find_element(By.XPATH, '/html/body/div/ctrsi-signin-component/div/div/div[2]/main/section/div/div[2]/div/div/form/label[1]/input')
        mail_input.send_keys(mail)

        password_input = driver.find_element(By.XPATH, '/html/body/div/ctrsi-signin-component/div/div/div[2]/main/section/div/div[2]/div/div/form/label[2]/div[2]/input')
        password_input.send_keys(password)

        sign = driver.find_element(By.XPATH, '/html/body/div/ctrsi-signin-component/div/div/div[2]/main/section/div/div[2]/div/div/form/div/button[2]/span')
        sign.click()

        # Wait up to 10 s for this element to appear after the form is submitted.
        nav = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[10]/ul/li[3]/div/table/tbody/tr/td[1]')))
        nav.click()

        driver.get('https://doc.morningstar.com/Fund.aspx?u=ALL')
    except Exception:
        # Do not leave an orphaned Chrome process behind when sign-in fails.
        driver.quit()
        raise
    return driver

In [52]:
def download_classic(driver, link, isin, timeout=60):
    """Fetch ``link`` with the browser's cookies and save it as ``<isin>.pdf``.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser whose cookies are copied into the HTTP session.
    link : str
        URL of the document to download.
    isin : str
        Used to build the output file name ``../data/morningstar/<isin>.pdf``.
    timeout : float or tuple, optional
        Passed to ``requests`` so a stalled server cannot block forever.

    Returns
    -------
    None
        The file is written only when the server answers with status 200;
        any other status is ignored.
    """
    headers = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
        }

    # The context manager closes the session's connections on exit.
    with requests.Session() as s:
        s.headers.update(headers)

        # Copy the browser's cookies into the plain HTTP session.
        for cookie in driver.get_cookies():
            s.cookies.set(cookie['name'], cookie['value'])

        response = s.get(link, timeout=timeout)
        if response.status_code == 200:
            with open(f"../data/morningstar/{isin}.pdf", 'wb') as f:
                f.write(response.content)

    return None
        
        
def download_from_progress(progress, mail="piron85023@lucvu.com", timeout=60):
    """Download every document listed in the progress log that is not on disk yet.

    Parameters
    ----------
    progress : str
        Path to the progress log. Each line has the form ``<isin> : <link>``;
        a link of ``Not Found`` means there is nothing to download.
    mail : str, optional
        Account e-mail handed to ``login``. Defaults to the address that was
        previously hard-coded here.
    timeout : float or tuple, optional
        Passed to ``requests`` so a stalled server cannot block forever.

    Returns
    -------
    selenium WebDriver
        The driver opened by ``login``. The caller is responsible for closing it.
    """
    driver = login(mail)

    # A set gives O(1) membership tests and can be updated as files arrive.
    downloaded = set(os.listdir('../data/morningstar'))

    with open(progress, 'r') as f:
        lines = f.readlines()

    headers = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
        }

    # One session for the whole run: connections are reused and the browser's
    # cookies are copied once instead of once per line.
    with requests.Session() as s:
        s.headers.update(headers)
        for cookie in driver.get_cookies():
            s.cookies.set(cookie['name'], cookie['value'])

        for line in tqdm(lines, desc="Downloading..."):
            # Split on the first colon only; the link itself contains colons.
            isin, sep, link = line.partition(':')
            if not sep:
                # Blank or malformed line: nothing to download.
                continue
            isin = isin.strip()
            link = link.strip()

            filename = f"{isin}.pdf"
            if filename in downloaded or link == "Not Found":
                continue

            response = s.get(link, timeout=timeout)
            if response.status_code == 200:
                with open(f"../data/morningstar/{filename}", 'wb') as pdf:
                    pdf.write(response.content)
                # The log can list the same ISIN more than once; remember it
                # so it is not fetched again during this run.
                downloaded.add(filename)
    return driver

In [50]:
def scrap(mail, dic_file, progress):
    """Search each remaining ISIN and log the link of its French KID document.

    Parameters
    ----------
    mail : str
        Account e-mail handed to ``login``.
    dic_file : str
        Path to the catalogue CSV, handed to ``requirements``.
    progress : str
        Path to the progress log. It is read by ``requirements`` to find where
        to resume and is appended to here, one ``<isin> : <link>`` line per
        catalogue row.

    Returns
    -------
    selenium WebDriver
        The driver opened by ``login``. The caller is responsible for closing it.
    """
    # Work on a copy: the frame returned by `requirements` may be a slice of
    # the full catalogue, and rows are assigned into it below.
    dic_cat = requirements(dic_file, progress).copy()

    driver = login(mail)

    for idx, record in tqdm(dic_cat.iterrows(), total=dic_cat.shape[0], desc="Scraping..."):

        isin = record["CODE ISIN"]
        search_box = driver.find_element(By.XPATH, '//*[@id="SearchInput"]')
        search_box.clear()
        search_box.send_keys(isin + Keys.RETURN)

        # NOTE(review): the rows are read straight after submitting the search,
        # with no explicit wait for the result table to refresh — confirm the
        # page updates synchronously.
        tr_elements = driver.find_elements(By.XPATH, '/html/body/div[10]/div[3]/div[3]/table/tbody[2]/tr')

        # When no rows come back the existing "Morningstar" value is left as is.
        if len(tr_elements) > 0:
            link = "Not Found"
            for tr in tr_elements:
                row_text = tr.text  # read once; each access is a browser round trip
                if ("KID" in row_text) and ("Français" in row_text):
                    link = tr.find_element(By.XPATH, 'td[5]/a[2]').get_attribute("href")
                    break
            dic_cat.loc[idx, "Morningstar"] = link

        # Append after every row so an interrupted run can resume. The log is
        # the file named by `progress`, the same one `requirements` reads.
        with open(progress, 'a') as f:
            f.write(f"{isin} : {dic_cat.loc[idx, 'Morningstar']}\n")

    return driver

In [51]:
# Run the scraping pass over the catalogue; `scrap` returns the open WebDriver,
# which is kept in `driver`.
# NOTE(review): the account e-mail is hard-coded in this cell — consider
# reading it from an environment variable instead.
driver = scrap("piron85023@lucvu.com", './dic_cat.csv', 'progress.txt')

Scraping...: 100%|██████████| 3671/3671 [1:25:10<00:00,  1.39s/it]  


In [53]:
# Download the documents listed in progress.txt. The call is the cell's last
# expression, so the WebDriver it returns is echoed as the cell output.
download_from_progress('progress.txt')

Downloading...: 100%|██████████| 6000/6000 [46:19<00:00,  2.16it/s]  


<selenium.webdriver.chrome.webdriver.WebDriver (session="bc3bb79d1104aaaeca8a570fe7e82f57")>

In [6]:
def check_availability(isin):
    """Tell whether the document for ``isin`` has already been downloaded.

    Parameters
    ----------
    isin : str
        Identifier used as the file stem.

    Returns
    -------
    bool
        ``True`` when ``../data/morningstar/<isin>.pdf`` exists as a file,
        ``False`` otherwise (including when the folder itself is missing).
    """
    # A direct path check avoids listing the whole folder on every call.
    return os.path.isfile(os.path.join('../data/morningstar', f"{isin}.pdf"))

In [5]:
import streamlit as st
from streamlit_gsheets import GSheetsConnection
import os
import pandas as pd
import gspread 
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.http import MediaFileUpload
import io
from googleapiclient.errors import HttpError
import json

In [6]:
# Build a Drive v3 client from the service-account key file.
scope = ['https://www.googleapis.com/auth/drive']
credentials = service_account.Credentials.from_service_account_file(
    '../arcanum-424010-9c87392d2327.json',
    scopes=scope,
)
drive = build('drive', 'v3', credentials=credentials)

# Collect every file whose parent is the given folder, one page at a time.
files, page_token = [], None
while True:
    response = drive.files().list(
        q='"1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI" in parents',
        spaces="drive",
        fields="nextPageToken, files(name, parents, id)",
        pageToken=page_token,
    ).execute()
    files += response.get("files", [])
    # A missing nextPageToken means this was the last page.
    page_token = response.get("nextPageToken")
    if page_token is None:
        break

# (name, id, parents) for each file found.
available_files = [
    (entry.get("name"), entry.get("id"), entry.get("parents"))
    for entry in files
]
available_files

[('BE0058182792.pdf',
  '1S_UORlhzlgeMEMOGz4eU5ArE-Q0BWnr-',
  ['1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI']),
 ('BE0057451271.pdf',
  '1d9po9EKqoSg7ytDjv2WfKlzLjVWAl9JX',
  ['1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI']),
 ('AT0000A1TB59.pdf',
  '1loCN7SsoYA8PaXzQdSneGLn50TjFf_pD',
  ['1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI']),
 ('FR0013523263.pdf',
  '1hyHRcDzYdFIMXujBIkFGdrZ7vYTTbKEk',
  ['1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI']),
 ('LU1900065811.pdf',
  '15F5Kl6o72rjhwU87TU0kHylgB_NaqLCw',
  ['1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI']),
 ('FR001400MCQ6.pdf',
  '1vUlyAYctLfkio3sKjuQrIxtq9pTiaxW_',
  ['1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI']),
 ('FR001400KTO9.pdf',
  '1rOTcY6TfkyupnuoGZNNgUerk3Mhhalh5',
  ['1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI']),
 ('FR001400HRI1.pdf',
  '1JKVw7WSxJx1_IRVDJgcR3_RXuDP_m_0W',
  ['1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI']),
 ('FR0014002V29.pdf',
  '1DKH5_FDTX3ywluabdFnfCIrohM7FfsCB',
  ['1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI']),
 ('FR0014002U46.pdf',
  '1ifZyXt7eb1YAPSHmLS5PDG6BX4b67-we',
  ['1cTvPKQ0

In [7]:
# Metadata for a new Drive item named "test" with the folder MIME type.
# No "parents" key is given here.
file_metadata = {
    "name": "test",
    "mimeType": "application/vnd.google-apps.folder",
}

# Create the folder and ask the API to return only its ID.
# pylint: disable=maybe-no-member
file = drive.files().create(body=file_metadata, fields="id").execute()
print(f'Folder ID: "{file.get("id")}".')

Folder ID: "1ZaKP2sulq1SD4XYHgMp1UCSsF4GHfpt1".


In [8]:
from googleapiclient.http import MediaFileUpload

# NOTE(review): `file_metadata` is built here but never passed to the request
# below, and "parents" is a plain string rather than a list — confirm whether
# this cell was meant to create a new file in that folder.
file_metadata = {
    'name' : 'AT0000A0SE25.pdf',
    'parents': '1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI'
}

# Local PDF to send as the media body of the request.
media_content = MediaFileUpload('../test.pdf', mimetype='application/pdf')

# Send the PDF as new content for the Drive file with this ID. The recorded
# run of this cell failed with HttpError 404 ("File not found") for that ID.
file = drive.files().update(fileId='1d_KHfjZOcJU3C9STlMVhW7W6MD5cndpm', media_body=media_content).execute()
print(file)

HttpError: <HttpError 404 when requesting https://www.googleapis.com/upload/drive/v3/files/1d_KHfjZOcJU3C9STlMVhW7W6MD5cndpm?alt=json&uploadType=media returned "File not found: 1d_KHfjZOcJU3C9STlMVhW7W6MD5cndpm.". Details: "[{'message': 'File not found: 1d_KHfjZOcJU3C9STlMVhW7W6MD5cndpm.', 'domain': 'global', 'reason': 'notFound', 'location': 'fileId', 'locationType': 'parameter'}]">

In [9]:
# Look up Drive files by exact name; the raw response is kept in `file`.
file = drive.files().list(q=f'name="FR0014002V29.pdf"').execute()

In [10]:
# Show the ID of the first entry in the lookup response (assumes at least one match).
file.get('files')[0].get('id')

'1DKH5_FDTX3ywluabdFnfCIrohM7FfsCB'

In [11]:
# Fetch the permission list of the Drive item with this ID (the same ID the
# listing query uses as the parent folder).
permissions = drive.permissions().list(fileId='1cTvPKQ0MDJPRR9eve2i4llw7OqkreapI').execute()

In [12]:
# Display the permissions response fetched above.
permissions

{'kind': 'drive#permissionList',
 'permissions': [{'id': '16674238332565384703',
   'type': 'user',
   'kind': 'drive#permission',
   'role': 'writer'},
  {'id': 'anyoneWithLink',
   'type': 'anyone',
   'kind': 'drive#permission',
   'role': 'reader',
   'allowFileDiscovery': False},
  {'id': '05579292566633221446',
   'type': 'user',
   'kind': 'drive#permission',
   'role': 'owner'}]}