# Amazon Transcribeで文字起こししたJSONファイルを解析するツール

## パッケージのインポート

In [None]:
import json
import spacy
import pandas as pd

## 【入力】設定

In [None]:
# Name of the Amazon Transcribe result file to analyse (read from ./data/).
INPUT_JSON_FILE_NAME = 'murakami-20250421-1400-pmo.json'
# Backward-compatible alias: the constant was originally spelled with a typo
# and other cells still refer to it under that name.
INIPUT_JSON_FILE_NAME = INPUT_JSON_FILE_NAME
# The Excel output reuses the full input file name (".json" included) and
# appends ".xlsx".
OUTPUT_EXCEL_FILE_NAME = '{}.xlsx'.format(INPUT_JSON_FILE_NAME)

## JSONファイルの読み取り

In [None]:
# Build the relative path of the input JSON under ./data/.
input_json_file_path = f'./data/{INIPUT_JSON_FILE_NAME}'
input_json_file_path  # echo the path as the cell output

In [None]:
# Parse the transcription result into a Python object.
# The encoding is pinned to UTF-8: without it open() falls back to the locale
# encoding, which fails to decode (or garbles) Japanese text on platforms whose
# default is not UTF-8.
with open(input_json_file_path, 'r', encoding='utf-8') as f:
    read_json = json.load(f)

## Amazon Transcribeの会話内容の解析

### 話者ごとの会話内容をJSONから取得して先頭5行を表示

In [None]:
# Pull the list stored under results -> audio_segments out of the parsed JSON.
transcribe_results = read_json['results']
audio_segments = transcribe_results['audio_segments']
audio_segments[:5]  # preview the first five segments as the cell output

### 同じ話者の会話が連続している場合はマージする

In [None]:
# Collapse runs of consecutive segments spoken by the same speaker into a
# single entry, concatenating their transcripts in order.
merge_list: list = []
prev_speaker_label = None  # label of the previously processed segment
for segment in audio_segments:
    speaker_label = segment['speaker_label']
    # `merge_list` is checked first so the very first segment can never be
    # "merged" into a previous entry that does not exist.
    if merge_list and speaker_label == prev_speaker_label:
        # Same speaker as the previous segment: extend the last entry in place.
        merge_list[-1]['transcript'] += segment['transcript']
    else:
        # First segment or a change of speaker: start a new entry. It keeps the
        # id of the first segment of the run; 'speaker_name' is a placeholder
        # that is filled in once the label-to-name mapping has been defined.
        merge_list.append({
            'id': segment['id'],
            'speaker_label': speaker_label,
            'speaker_name': None,
            'transcript': segment['transcript'],
        })
    prev_speaker_label = speaker_label
# merge_list

In [None]:
# Print every merged entry as "<speaker_label><TAB><transcript>" for review.
for entry in merge_list:
    print(f"{entry['speaker_label']}\t{entry['transcript']}")

### 【入力】会話内容から、speaker_labelごとに名前を設定

In [None]:
# Maps each speaker label ('spk_N') to the display name used for that speaker.
# This is a manual input: replace the placeholder names on the right-hand side
# with the real speakers after reading the transcript printed above.
# Labels are looked up in this mapping when speaker names are assigned.
speakers: dict= {
    'spk_0': 'Aさん',
    'spk_1': 'Bさん',
    'spk_2': 'Cさん',
    'spk_3': 'Dさん',
    'spk_4': 'Eさん',
    'spk_5': 'Fさん',
    'spk_6': 'Gさん',
    'spk_7': 'Hさん',
    'spk_8': 'Iさん',
    'spk_9': 'Jさん',
}

### 会話リストの各会話に話者名を設定

In [None]:
# Attach a display name to every merged entry.
# A label that is missing from `speakers` falls back to the raw speaker label
# instead of raising KeyError, so a recording with more speakers than the
# mapping lists is still processed and the unmapped label stays visible.
for row in merge_list:
    label = row['speaker_label']
    row['speaker_name'] = speakers.get(label, label)

## フィラーの削除

フィラー：

- えー
- えーと
- あのー

など

In [None]:
# Load the spaCy pipeline registered under the name 'ja_ginza'; it is used
# below to tokenize and tag the transcripts.
# NOTE(review): presumably the GiNZA Japanese model, which has to be installed
# separately from spaCy — confirm against the environment setup.
nlp = spacy.load('ja_ginza')

In [None]:
# Strip filler tokens from every merged transcript, printing the speaker name
# before and the cleaned text after each entry.
for row in merge_list:
    print(row['speaker_name'])
    doc = nlp(row['transcript'])
    # Walk the document sentence by sentence and keep the surface form of every
    # token whose tag is not the filler tag, then join them back together.
    result = ''.join(
        str(token.text)
        for sent in doc.sents
        for token in sent
        if token.tag_ != '感動詞-フィラー'
    )
    row['transcript'] = result
    print(row['transcript'])

## 解析結果をExcelに保存

In [None]:
# Turn the list of per-entry dicts into a table: one row per entry, one column
# per dict key.
df: pd.DataFrame = pd.DataFrame(data=merge_list)
df  # render the table as the cell output

In [None]:
# Build the relative path of the Excel output under ./data/.
output_excel_file_path = f'./data/{OUTPUT_EXCEL_FILE_NAME}'
output_excel_file_path  # echo the path as the cell output

In [None]:
# Write the table to the Excel path built above; no other options are passed,
# so pandas' defaults apply.
df.to_excel(excel_writer=output_excel_file_path)