<a href="https://colab.research.google.com/github/okana2ki/transcribe/blob/main/transcribe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install python-docx



In [None]:
import os
from docx import Document
from openpyxl import load_workbook

def extract_data(doc_path):
    """Extract the subject name and a summary paragraph from a syllabus .docx.

    Every table in the document is scanned and cells are picked by fixed
    row/cell position:

    * row 0, cell 1      -> subject name
    * row 2, cell 6      -> class format
    * row 5, cell 1      -> attainment goals (newlines removed)
    * rows 6-20, cell 2  -> one lesson-plan entry per row (newlines
      inside an entry become '、')

    A row that is too short to contain the expected cell is ignored.  When
    the document holds several tables, a later table overwrites the scalar
    fields and its lesson rows are appended to the same plan.

    Args:
        doc_path: Path of the .docx file; passed unchanged to ``Document``.

    Returns:
        A ``(subject_name, content)`` tuple of strings.  ``content`` always
        contains the fixed template text, so it is never empty.
    """
    doc = Document(doc_path)
    # Named ``class_format`` (not ``format``) so the builtin is not shadowed.
    subject_name = class_format = goals = ""
    lesson_plan = []

    for table in doc.tables:
        for i, row in enumerate(table.rows):
            cells = row.cells
            # Subject name
            if i == 0 and len(cells) > 1:
                subject_name = cells[1].text.strip()
            # Class format
            elif i == 2 and len(cells) > 6:
                class_format = cells[6].text.strip()
            # Attainment goals
            elif i == 5 and len(cells) > 1:
                goals = cells[1].text.replace('\n', '').strip()
            # Lesson plan: assumes up to 15 sessions in rows 6..20, with the
            # text in the cell to the right of the session number.
            elif 6 <= i <= 20 and len(cells) > 2:
                entry = cells[2].text.strip().replace('\n', '、')
                # Blank sessions would otherwise show up as doubled '、'.
                if entry:
                    lesson_plan.append(entry)

    # Close the list with a full stop on whatever entry is actually last,
    # instead of only when row index 20 happens to exist.
    if lesson_plan:
        lesson_plan[-1] = lesson_plan[-1].rstrip('、') + '。'

    # Join all lesson-plan entries into a single string.
    plan = '、'.join(lesson_plan)
    content = f"本科目の授業形態は{class_format}である。授業の到達目標は次の通りである。{goals}\n授業計画は次の内容を含む。{plan}"

    return subject_name, content

def main():
    """Transcribe every syllabus .docx in the source folder into the workbook.

    The .docx files are visited in ascending filename order.  For each one,
    ``extract_data`` supplies a subject name and a content paragraph; when
    both are non-empty they are written to columns C and E of the active
    sheet, one row per document starting at row 6.  The workbook is then
    saved back to the path it was loaded from.
    """
    source_dir = '/content/drive/MyDrive/transcribe/sp1'
    workbook_path = '/content/drive/MyDrive/transcribe/2_3-1sp1.xlsx'

    workbook = load_workbook(workbook_path)
    sheet = workbook.active

    # Ascending filename order keeps the output rows reproducible.
    docx_names = [
        name for name in sorted(os.listdir(source_dir))
        if name.endswith('.docx')
    ]

    target_row = 6  # first Excel row to write to
    for name in docx_names:
        subject_name, content = extract_data(os.path.join(source_dir, name))
        if not (subject_name and content):
            continue
        # Column 3 (C) takes the subject name, column 5 (E) the content.
        for column, value in ((3, subject_name), (5, content)):
            sheet.cell(row=target_row, column=column, value=value)
        target_row += 1  # advance to the next row

    workbook.save(workbook_path)

if __name__ == "__main__":
    main()


以下は、作成途中のプログラム。念のために残しておく。

In [None]:
import os
from docx import Document
from openpyxl import load_workbook

def extract_text_from_docx(doc_path):
    """Extract the subject name and a summary paragraph from a syllabus .docx.

    Work-in-progress variant that reads the whole lesson plan from a single
    row.  For every table in the document:

    * row 0, cell 1 -> subject name
    * row 2, cell 1 -> class format
    * row 5, cell 1 -> attainment goals (newlines removed)
    * row 6, cells 3, 5, ..., 31 -> up to 15 lesson-plan entries (newlines
      inside an entry become '、')

    Rows that are too short for the expected cell are skipped instead of
    raising ``IndexError``; a short row 6 contributes as many entries as it
    actually has.

    Args:
        doc_path: Path of the .docx file; passed unchanged to ``Document``.

    Returns:
        A ``(subject_name, content)`` tuple of strings.
    """
    doc = Document(doc_path)
    subject_name = ''
    class_format = ''
    goals = ''
    plan = []

    for table in doc.tables:
        for i, row in enumerate(table.rows):
            cells = row.cells
            if i == 0 and len(cells) > 1:
                subject_name = cells[1].text.strip()  # subject name
            elif i == 2 and len(cells) > 1:
                class_format = cells[1].text.strip()  # class format
            elif i == 5 and len(cells) > 1:
                goals = cells[1].text.strip().replace('\n', '')  # attainment goals
            elif i == 6:
                # Session j (1..15) sits at cell index 1 + 2*j, i.e. 3..31.
                # Slicing clips to the row length, so short rows are safe.
                entries = [
                    cell.text.strip().replace('\n', '、')
                    for cell in cells[3:32:2]
                ]
                if entries:
                    # Terminate the last available entry with a full stop.
                    entries[-1] = entries[-1].rstrip('、') + '。'
                plan.extend(entries)

    plan_text = ''.join(plan)
    content = f"本科目の授業形態は{class_format}である。授業の到達目標は次の通りである。{goals}\n授業計画は次の内容を含む。{plan_text}"
    return subject_name, content

def main():
    """Transcribe every syllabus .docx in the folder into the workbook.

    For each .docx file, ``extract_text_from_docx`` supplies a subject name
    and a content paragraph; when both are non-empty they are written to
    columns C and E of the active sheet, one row per document starting at
    row 6.  The workbook is saved back to the path it was loaded from.
    """
    folder_path = '/content/drive/MyDrive/transcribe'
    excel_path = '/content/drive/MyDrive/transcribe/2_3-1auto.xlsx'
    start_row = 6  # first Excel row to write to

    wb = load_workbook(excel_path)
    ws = wb.active

    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order in the sheet reproducible (ascending by filename).
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.docx'):
            continue
        doc_path = os.path.join(folder_path, filename)
        subject_name, content = extract_text_from_docx(doc_path)
        if subject_name and content:
            ws.cell(row=start_row, column=3, value=subject_name)  # column C: subject name
            ws.cell(row=start_row, column=5, value=content)  # column E: content
            start_row += 1  # advance to the next row

    wb.save(excel_path)

if __name__ == "__main__":
    main()

In [None]:
import os
from docx import Document
from openpyxl import load_workbook

def extract_text_from_table(doc_path):
    """Collect label/value pairs from the tables of a .docx file.

    For every row of every table, the stripped text of the first cell is
    used as the key and the stripped text of the second cell (with newlines
    removed) as the value.  A key that occurs more than once keeps the value
    from its last occurrence.  Rows with fewer than two cells carry no value
    and are skipped.

    Args:
        doc_path: Path of the .docx file; passed unchanged to ``Document``.

    Returns:
        A dict mapping label text to value text.
    """
    doc = Document(doc_path)
    data = {}
    for table in doc.tables:
        for row in table.rows:
            # Look the cells up once per row and reuse them.
            cells = row.cells
            if len(cells) < 2:
                # No value cell to read; indexing cells[1] would raise.
                continue
            key = cells[0].text.strip()
            data[key] = cells[1].text.strip().replace('\n', '')
    return data

def format_content(format, goals, plan):
    """Build the summary paragraph for one subject.

    Args:
        format: Class-format text inserted into the first sentence.  (The
            parameter name shadows the builtin but is kept so existing
            callers keep working.)
        goals: Attainment-goal text; a '。' is appended after it.
        plan: Lesson-plan text with one entry per line.  Line breaks are
            turned into '、' and the list is closed with a single '。'.

    Returns:
        The assembled paragraph as a string.
    """
    # Drop surrounding whitespace (including a trailing newline) first so the
    # last entry is never followed by a dangling separator.
    entries = plan.strip().replace('\n', '、')
    # Close the list with exactly one full stop.  Slicing off the last
    # character unconditionally would eat real text whenever the plan does
    # not end with a newline.
    plan = entries.rstrip('、。') + '。'
    content = f"本科目の授業形態は{format}である。授業の到達目標は次の通りである。{goals}。\n授業計画は次の内容を含む。{plan}"
    return content

def main():
    """Transcribe every syllabus .docx in the folder into the workbook.

    For each .docx file, ``extract_text_from_table`` returns a dict of
    label/value pairs.  The class format, goals and lesson plan are combined
    by ``format_content`` and written, together with the subject name, to
    columns E and C of the active sheet, one row per document starting at
    row 6.  Documents that lack any of the four required labels are reported
    and skipped.  The workbook is saved back to the path it was loaded from.
    """
    folder_path = '/content/drive/MyDrive/transcribe'
    excel_path = '/content/drive/MyDrive/transcribe/2_3-1auto.xlsx'
    start_row = 6  # first row to write to
    required_keys = ('科目名', '授業形態', '授業の到達目標', '授業計画・内容')

    # Load the Excel file.
    wb = load_workbook(excel_path)
    ws = wb.active

    # Process every Word file in the folder.  os.listdir() returns names in
    # arbitrary order, so sort to keep the output rows reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.docx'):
            continue
        doc_path = os.path.join(folder_path, filename)
        data = extract_text_from_table(doc_path)

        # One document without the expected labels must not abort the whole
        # run (and lose the rows already written) with a KeyError.
        missing = [key for key in required_keys if key not in data]
        if missing:
            print(f"Skipping {filename}: missing {', '.join(missing)}")
            continue

        # Build the formatted content.
        content = format_content(data['授業形態'], data['授業の到達目標'], data['授業計画・内容'])
        # Write to Excel.
        ws.cell(row=start_row, column=3).value = data['科目名']
        ws.cell(row=start_row, column=5).value = content
        start_row += 1  # next row

    # Save the changes.
    wb.save(excel_path)

if __name__ == "__main__":
    main()

In [None]:
import os
from docx import Document
from openpyxl import load_workbook

def extract_text_from_table(doc_path):
    """Locate labelled fields in the tables of a .docx file.

    Each cell of each row is checked for one of four label strings.  When a
    label is found and a cell exists to its right, the value is taken from
    the following cell(s):

    * '科目名', '授業形態' -> stripped text of the next cell
    * '授業の到達目標'     -> stripped text of the next cell, newlines removed
    * '授業計画・内容'     -> up to 15 following cells, each stripped and with
      newlines turned into '、', concatenated without a separator; the last
      piece has trailing '、' removed and '。' appended

    Args:
        doc_path: Path of the .docx file; passed unchanged to ``Document``.

    Returns:
        A dict holding whichever of the four keys were found.
    """
    document = Document(doc_path)
    found = {}
    for table in document.tables:
        for row in table.rows:
            row_cells = row.cells
            cell_count = len(row_cells)
            for idx, cell in enumerate(row_cells):
                label = cell.text.strip()
                nxt = idx + 1
                # Every branch needs a cell to the right of the label.
                if nxt >= cell_count:
                    continue
                if '科目名' in label:
                    found['科目名'] = row_cells[nxt].text.strip()
                elif '授業形態' in label:
                    found['授業形態'] = row_cells[nxt].text.strip()
                elif '授業の到達目標' in label:
                    goal_text = row_cells[nxt].text.strip()
                    found['授業の到達目標'] = goal_text.replace('\n', '')
                elif '授業計画・内容' in label:
                    # Gather sessions 1..15; the slice clips at the row end.
                    sessions = [
                        c.text.strip().replace('\n', '、')
                        for c in row_cells[nxt:idx + 16]
                    ]
                    *leading, final = sessions
                    found['授業計画・内容'] = ''.join(leading) + final.rstrip('、') + '。'
    return found

def format_content(format, goals, plan):
    """Assemble the summary paragraph for one subject.

    Args:
        format: Class-format text inserted into the first sentence.
        goals: Attainment-goal text; a '。' follows it in the output.
        plan: Lesson-plan text, inserted as given.

    Returns:
        The paragraph as a single string: the format and goals sentences,
        a newline, then the lesson-plan sentence.
    """
    template = "本科目の授業形態は{format}である。授業の到達目標は次の通りである。{goals}。\n授業計画は次の内容を含む。{plan}"
    return template.format(format=format, goals=goals, plan=plan)

def main():
    """Transcribe every syllabus .docx in the folder into the workbook.

    For each .docx file, ``extract_text_from_table`` returns a dict of the
    labelled fields it found.  When all four required fields are present,
    the class format, goals and lesson plan are combined by
    ``format_content`` and written, together with the subject name, to
    columns E and C of the active sheet, one row per document starting at
    row 6.  The workbook is saved back to the path it was loaded from.
    """
    folder_path = '/content/drive/MyDrive/transcribe'
    excel_path = '/content/drive/MyDrive/transcribe/2_3-1auto.xlsx'
    start_row = 6  # first Excel row to write to
    required_keys = ('科目名', '授業形態', '授業の到達目標', '授業計画・内容')

    wb = load_workbook(excel_path)
    ws = wb.active

    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order in the sheet reproducible (ascending by filename).
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.docx'):
            continue
        doc_path = os.path.join(folder_path, filename)
        data = extract_text_from_table(doc_path)
        # Only documents in which every required field was found are written.
        if all(key in data for key in required_keys):
            content = format_content(data['授業形態'], data['授業の到達目標'], data['授業計画・内容'])
            ws.cell(row=start_row, column=3, value=data['科目名'])
            ws.cell(row=start_row, column=5, value=content)
            start_row += 1

    wb.save(excel_path)

if __name__ == "__main__":
    main()