In [1]:
import tabula, camelot, os
import pandas as pd
import numpy as np

In [3]:
def convert_table(df):
    """Move the column labels of ``df`` into the first data row.

    The column labels become row 0, the remaining rows follow in index
    order, and the columns are renamed to ``0..n-1``.

    The work is done on a copy, so the DataFrame passed in is left untouched
    (the previous implementation added a ``-1`` row to the caller's frame).

    Args:
        df: table whose column labels should be treated as data.

    Returns:
        A new DataFrame with a fresh ``0..m`` row index and integer columns.
    """
    out = df.copy()
    # Label -1 sorts before the default 0..m-1 labels, so after sort_index()
    # the former header sits on the first row.
    out.loc[-1] = out.columns
    out = out.sort_index()
    out.columns = np.arange(len(out.columns))
    return out.reset_index(drop=True)

def find_row(df, text):
    """Return the position of the first row whose column ``0`` contains ``text``.

    Args:
        df: DataFrame with a column labelled ``0`` holding strings.
        text: literal substring to look for.

    Returns:
        The zero-based position of the first matching row, or ``None`` when
        no row contains ``text``.
    """
    # str.find yields the match offset, -1 when absent, and NaN for
    # non-string cells; NaN is mapped to -1 so it counts as "absent".
    offsets = df[0].str.find(text).fillna(-1)
    # An offset of 0 is a valid hit (text at the very start of the cell),
    # hence ">= 0" rather than "> 0".
    list_id = np.where((offsets >= 0).to_numpy())[0]
    if len(list_id) == 0:
        return None
    return list_id[0]
    
def get_vol_table_tabula(file_path):
    """Extract the issued-shares table from a PDF using tabula.

    Every table tabula finds is normalised with ``convert_table`` and the
    first one that satisfies all of the following is returned:

    * column 0 contains the key text (located with ``find_row``),
    * it has fewer than 10 columns,
    * its first cell (the former first column label) is shorter than
      100 characters.

    Args:
        file_path: path or URL of the PDF. When the string contains
            ``'https'`` tabula is additionally called with ``stream=True``.

    Returns:
        The matching normalised DataFrame, or ``None`` if no table qualifies.
    """
    read_options = {"pages": "all", "multiple_tables": True, "silent": True}
    if 'https' in file_path:
        read_options["stream"] = True
    candidates = tabula.read_pdf(file_path, **read_options)

    for raw_table in candidates:
        frame = convert_table(pd.DataFrame(raw_table))
        if find_row(frame, "期末発行済株式数(自己株式を含む)") is None:
            continue
        if len(frame.columns) >= 10:
            continue
        if len(frame[0][0]) >= 100:
            continue
        return frame
    return None


In [4]:
def cut_vol_table(tables):
    """Return the first table, trimmed to start at the key-text row.

    Each element of ``tables`` must expose its data as a ``.df`` DataFrame.
    Column 0 of each table is searched for the key text; for the first cell
    that contains it, every row from that one onward is returned with a
    fresh ``0..m`` index.

    Args:
        tables: sequence of objects with a ``.df`` attribute.

    Returns:
        The trimmed DataFrame, or ``None`` when no table contains the key.
    """
    for candidate in tables:
        frame = candidate.df
        offsets = frame[0].str.find('期末発行済株式数（自己株式を含む）')
        for row_label, offset in offsets.items():
            if offset >= 0:
                # The row label doubles as the positional start for iloc.
                trimmed = frame.iloc[row_label:, :]
                return trimmed.reset_index(drop=True)
    return None

def convert_table_mix_data(df):
    """Split cells where two values were merged into one column.

    For each row:

    * if column 2 is ``''``, column 3 is split on a space and its first two
      parts are written to columns 2 and 3;
    * if column 5 is ``''``, column 4 is split on a space and its first two
      parts are written to columns 4 and 5.

    Cells are written with ``.loc`` instead of chained indexing
    (``df[2][i] = ...``), which pandas does not guarantee to update the
    original frame and which is a no-op under Copy-on-Write.

    Args:
        df: DataFrame with string columns labelled 2, 3, 4 and 5. It is
            modified in place.

    Returns:
        The same DataFrame object, after modification.
    """
    # Split once up front so the writes below never read already-fixed cells.
    left_parts = df[3].str.split(' ', expand=True)
    right_parts = df[4].str.split(' ', expand=True)
    for i in df.index:
        if df.loc[i, 2] == '':
            df.loc[i, 2] = left_parts.loc[i, 0]
            df.loc[i, 3] = left_parts.loc[i, 1]
        if df.loc[i, 5] == '':
            df.loc[i, 5] = right_parts.loc[i, 1]
            df.loc[i, 4] = right_parts.loc[i, 0]
    return df
     
def drop_empty_col(df):
    """Remove columns made up solely of empty strings and renumber the rest.

    Args:
        df: DataFrame to clean. When no column is dropped, the column labels
            of this very object are overwritten.

    Returns:
        A DataFrame whose columns are labelled ``0..n-1``: a new object if
        any column was dropped, otherwise ``df`` itself.
    """
    blank_cols = [label for label in df.columns if (df[label] == '').all()]
    if blank_cols:
        df = df.drop(columns=blank_cols)
    df.columns = np.arange(len(df.columns))
    return df

def get_vol_table_camelot(file_path):
    """Extract the issued-shares table from a PDF using camelot.

    The tables camelot reads are trimmed to the key-text row with
    ``cut_vol_table``, merged cells are split by ``convert_table_mix_data``,
    and all-empty columns are removed by ``drop_empty_col``.

    Args:
        file_path: path or URL of the PDF, passed to ``camelot.read_pdf``.

    Returns:
        The cleaned DataFrame, or ``None`` when no table contains the key
        text (previously this case failed with a ``TypeError`` on ``None``).
    """
    # NOTE(review): ``multiple_tables`` looks like a tabula option; confirm
    # camelot accepts it.
    tables = camelot.read_pdf(file_path, pages="all", multiple_tables=True, flavor="stream", suppress_stdout=True)
    df = cut_vol_table(tables)
    if df is None:
        # Same "not found" contract as get_vol_table_tabula.
        return None
    df = convert_table_mix_data(df)
    return drop_empty_col(df)

In [5]:
def get_vol_table(file_path):
    """Extract the issued-shares table, trying tabula first and camelot second.

    Args:
        file_path: path or URL of the PDF.

    Returns:
        The tabula result when it is not ``None``; otherwise whatever
        ``get_vol_table_camelot`` returns.
    """
    table = get_vol_table_tabula(file_path)
    return table if table is not None else get_vol_table_camelot(file_path)


In [28]:
import setup
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
import time



class FinancailStatement(setup.Setup):
    """Scraper for buffett-code.com company library pages.

    Pages are loaded through ``self.driver``, which is presumably a Selenium
    WebDriver created by ``setup.Setup`` (verify against that base class).
    None of the methods here close the driver.
    """

    def __init__(self):
        super().__init__('Selenium',source="VS")
        # Example library page: https://www.buffett-code.com/company/5486/library

    def get_data(self, link):
        """Load ``link`` in the browser and return the parsed page.

        Args:
            link: URL to open.

        Returns:
            A ``BeautifulSoup`` document built from the driver's page source.
        """
        self.driver.get(link)
        # Fixed wait to give the page time to finish loading.
        time.sleep(5)
        # page_source is presumably already-decoded text, for which
        # from_encoding is ignored with a warning, so it is not passed.
        return BeautifulSoup(self.driver.page_source, 'html.parser')

    def get_table(self,soup = "", id_company = 5486):
        """Return every ``<table>`` element of a company library page.

        Args:
            soup: raw HTML markup to parse. When left as ``""`` the library
                page of ``id_company`` is downloaded instead.
            id_company: company id used to build the library URL.

        Returns:
            The result of ``find_all('table')`` on the parsed page.
        """
        if soup == "":
            soup = self.get_data(f"https://www.buffett-code.com/company/{id_company}/library")
        elif isinstance(soup, bytes):
            # An explicit encoding only applies to undecoded markup.
            soup = BeautifulSoup(soup, 'html.parser', from_encoding='utf-8')
        else:
            soup = BeautifulSoup(soup, 'html.parser')
        return soup.find_all('table')

    def get_pdf_link(self,link_):
        """Open a preview page and return the first link that mentions "pdf".

        Args:
            link_: URL of the preview page.

        Returns:
            The ``href`` of the first anchor whose href contains ``"pdf"``,
            or ``""`` when there is none.
        """
        self.driver.get(link_)
        time.sleep(1)
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        # href=True skips anchors without an href attribute, which used to
        # raise KeyError on i["href"].
        for anchor in soup.find_all('a', href=True):
            if "pdf" in anchor["href"]:
                return anchor["href"]
        return ""




In [None]:
def create_link_df(table):
    """Build a per-year table of earnings-report titles and preview links.

    Only the first element of ``table`` is read. For every ``<tr>`` the
    ``<td>`` cells are scanned in order:

    * list items whose text contains ``'決算短信'`` contribute their text and
      their anchor's href (prefixed with ``https://www.buffett-code.com``);
    * every cell except positions 0 and 5 stores those lists under
      ``Time_Q{position}`` and ``Link_Q{position}``;
    * a cell whose first CSS class is ``center`` supplies the row's year.

    Rows without a year are skipped. Cells without a ``class`` attribute and
    list items without a usable anchor are ignored rather than raising.

    Args:
        table: sequence of parsed ``<table>`` elements (BeautifulSoup tags).

    Returns:
        DataFrame indexed by year with ``Time_Q*`` / ``Link_Q*`` columns
        holding lists.
    """
    json_company = {}
    for tr_year in table[0].find_all('tr'):
        json_company_quy = {}
        year = ''
        for id_quy, td_quy in enumerate(tr_year.find_all('td')):
            lst_text, lst_link = [], []
            for li in td_quy.find_all('li'):
                if '決算短信' not in li.text:
                    continue
                anchor = li.find('a')
                if anchor is None or not anchor.has_attr('href'):
                    # Title and link are skipped together so the two lists
                    # stay aligned index-for-index.
                    continue
                lst_text.append(li.text)
                lst_link.append(f"https://www.buffett-code.com{anchor['href']}")
            if id_quy not in (0, 5):
                json_company_quy[f'Time_Q{id_quy}'] = lst_text
                json_company_quy[f'Link_Q{id_quy}'] = lst_link
            css_classes = td_quy.get('class') or []
            if css_classes and css_classes[0] == 'center':
                year = td_quy.text
        if year != '':
            json_company[year] = json_company_quy
    return pd.DataFrame(json_company).T


In [None]:
def save_csv(F, df, id_company, df_check):
    """Download every linked report, save its share table and record the outcome.

    Quarters ``Q1``..``Q4`` are processed in turn, and within each quarter
    every year of ``df``. For each preview link that is not the ``#``
    placeholder, the PDF link is resolved with ``F.get_pdf_link``, the table
    is extracted with ``get_vol_table`` and written to
    ``{id_company}/{year}_{quy}.csv``. The matching entry of ``df_check`` is
    set to ``'Done'`` or ``'Error'`` and the checklist CSV is rewritten after
    every processed link.

    NOTE(review): several links in the same year and quarter are written to
    the same CSV path, so later ones overwrite earlier ones.

    Args:
        F: object providing ``get_pdf_link(url)``.
        df: DataFrame indexed by year with ``Time_Q*`` / ``Link_Q*`` columns
            holding lists.
        id_company: company id; used as the output directory name.
        df_check: DataFrame with the same layout as ``df``; its ``Link_Q*``
            list entries are overwritten with the status.
    """
    for quy in ['Q1', 'Q2', 'Q3', 'Q4']:
        for year in df.index:
            times = df.loc[year, f'Time_{quy}']
            links = df.loc[year, f'Link_{quy}']
            if not isinstance(times, (list, tuple)):
                # Nothing recorded for this quarter (e.g. a NaN cell).
                continue
            for id_link in range(len(times)):
                link_preview = links[id_link]
                if link_preview == 'https://www.buffett-code.com#':
                    # Placeholder link: no preview page worth loading.
                    continue
                link_pdf = F.get_pdf_link(link_preview)
                print(year, quy, link_pdf)
                try:
                    df_vol = get_vol_table(link_pdf)
                    df_vol.to_csv(f'{id_company}/{year}_{quy}.csv', index=False)
                except Exception:
                    # Best effort: record the failure and carry on.
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    status = 'Error'
                else:
                    status = 'Done'
                df_check.loc[year, f'Link_{quy}'][id_link] = status
                df_check.to_csv(f'{id_company}/docs/checklist.csv')

In [None]:
def get_vol_buffett(id_company):
    """Scrape one company's report links and export its share tables.

    Creates ``{id_company}/`` and ``{id_company}/docs/`` if needed, collects
    the report links from the company's library page, writes them to
    ``link.csv`` and to an initial ``docs/checklist.csv``, then hands over to
    ``save_csv``.

    Args:
        id_company: company id, used both for the URL and as directory name.
    """
    out_dir = f'{id_company}'
    # Creating the nested docs directory also creates its parent.
    os.makedirs(f'{out_dir}/docs', exist_ok=True)

    scraper = FinancailStatement()
    link_table = create_link_df(scraper.get_table(id_company = id_company))

    # NOTE(review): DataFrame.copy() presumably does not duplicate the list
    # objects stored in the cells, so statuses written into the checklist
    # may also show up in link_table - confirm if that matters.
    checklist = link_table.copy()
    link_table.to_csv(f'{out_dir}/link.csv')
    checklist.to_csv(f'{out_dir}/docs/checklist.csv')

    save_csv(scraper, link_table, id_company, checklist)

In [None]:
# Run the full scrape-and-export pipeline for company id 5468.
# NOTE(review): the get_table default and the example URL in this file use
# 5486 - confirm that 5468 is the intended id and not a transposition.
get_vol_buffett(5468)