In [65]:
import pandas as pd

# Path of the input JSON file that is loaded into a DataFrame by the driver code.
json_file_path = "/content/sample_data/addCategory.json"

def read_json_to_df(file_path):
    """Load the JSON file at ``file_path`` into a pandas DataFrame.

    Returns the DataFrame produced by ``pd.read_json``. If that call raises
    ``FileNotFoundError``, the message "File not found." is printed and
    ``None`` is returned instead.
    """
    try:
        # Parse the JSON file straight into a DataFrame.
        return pd.read_json(file_path)
    except FileNotFoundError:
        print("File not found.")
    return None

In [66]:
def make_lower_case(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

def getTotalSyllableCnt(text):
    """Approximate the syllable count of ``text`` by counting its vowels.

    Each occurrence of a, e, i, o or u (upper or lower case) contributes one
    to the total; no other characters are counted.
    """
    vowel_chars = "aeiouAEIOU"
    return sum(1 for ch in text if ch in vowel_chars)

def getTotalWordCnt(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

In [67]:
# Sentence-based difficulty extraction

# Ascending cut-off values passed as `cutlines` when the readability score
# computed in calculate_difficulty_by_sentence is mapped to a section number.
sentence_cutline = [42.13323525303835, 45.31457886044376, 50.02139772682483, 53.98793352312772, 56.91700066393434]

def find_sentence_difficulty_section(flesh_ease, cutlines, reverse=False):
    """Map a score onto a 1-based section number using ``cutlines``.

    The section is the 1-based position of the first cutline that is greater
    than or equal to ``flesh_ease``. A score above every cutline lands in the
    extra last section, ``len(cutlines) + 1``.

    When ``reverse`` is truthy the numbering is flipped, so section 1 becomes
    ``len(cutlines) + 1`` and vice versa.
    """
    position = next(
        (idx + 1 for idx, limit in enumerate(cutlines) if flesh_ease <= limit),
        None,
    )
    if position is None:
        # Larger than every cutline: classify into the final section.
        position = len(cutlines) + 1

    if not reverse:
        return position
    # Flip the ranking.
    return len(cutlines) + 2 - position

def calculate_difficulty_by_sentence(row) -> "int | None":
    """Rank a script's difficulty from its Flesch reading-ease score.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``'senteceList'`` (a sized collection of sentences; the key
            spelling matches the input data) and ``'full_script'`` (the
            script text).

    Returns:
        The section number from ``find_sentence_difficulty_section`` called
        with ``sentence_cutline`` and ``reverse=True``, so a lower
        reading-ease score yields a higher number. Returns ``None`` when the
        script has no sentences or no words.
    """
    sentence_count = len(row['senteceList'])
    script = row['full_script']
    syllable_count = getTotalSyllableCnt(script)
    word_count = getTotalWordCnt(script)

    # Both counts are used as divisors below, so bail out before dividing.
    if sentence_count == 0 or word_count == 0:
        return None

    # Flesch reading-ease: penalises long sentences and syllable-heavy words.
    flesch_ease = 206.835 - 1.015 * (word_count / sentence_count) - 84.6 * (syllable_count / word_count)
    return find_sentence_difficulty_section(flesch_ease, sentence_cutline, reverse=True)

In [68]:
# Word-based difficulty extraction

# Ascending cut-off values used to map the weighted CEFR average to a section.
word_cutline = [1.75, 1.85, 1.92, 1.96, 2.00]
# CEFR level labels matched against the 'CEFR' column of word_csv.
cefr_levels = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
# Word table loaded once at import time; its 'CEFR' and 'headword' columns
# are read by calculate_difficulty_by_word.
word_csv = pd.read_csv('/content/sample_data/result.csv')

def find_word_difficulty_section(flesh_ease, cutlines):
    """Return the 1-based section of ``flesh_ease`` within ``cutlines``.

    The result is the position (counting from 1) of the first cutline that is
    greater than or equal to the value, or ``len(cutlines) + 1`` when the
    value exceeds every cutline.
    """
    section = next(
        (position for position, limit in enumerate(cutlines, start=1) if flesh_ease <= limit),
        None,
    )
    if section is None:
        # Above every cutline: falls into the extra last section.
        return len(cutlines) + 1
    return section  # Section number is returned as an integer.

def calculate_difficulty_by_word(row) -> int:
    """Rank a script's difficulty from the CEFR levels of the words it uses.

    The script text is lower-cased and split on whitespace only, so
    punctuation attached to a word stays part of the token. For every CEFR
    level, the distinct tokens that appear among that level's ``headword``
    entries in ``word_csv`` are collected. The score is the weighted average
    of the per-level distinct-word counts (A1=1 ... C2=6), or 0 when nothing
    matched.

    Args:
        row: Mapping-like record providing ``'full_script'`` (the script text).

    Returns:
        The section number produced by ``find_word_difficulty_section`` for
        the score, using ``word_cutline``.
    """
    level_weights = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}

    # Only distinct tokens matter because matches are de-duplicated per level.
    script_words = set(row['full_script'].lower().split())

    matched_by_level = {}
    for level in cefr_levels:
        # Keep only the rows that belong to the current level.
        level_rows = word_csv[word_csv['CEFR'] == level]

        # Build the headword set once per level; rebuilding a list for every
        # token made the lookup needlessly quadratic.
        headwords = set(level_rows['headword'])

        # Distinct script tokens that are headwords of this CEFR level.
        matched_by_level[level] = script_words & headwords

    weighted_sum = sum(
        len(matched_by_level[level]) * weight
        for level, weight in level_weights.items()
    )
    matched_total = sum(len(words) for words in matched_by_level.values())
    score = weighted_sum / matched_total if matched_total > 0 else 0

    return find_word_difficulty_section(score, word_cutline)

In [68]:
import math

def calculate_difficulty(row) -> int:
    """Combine the sentence-based and word-based levels into one level.

    The two levels are averaged. The average is rounded up when the word
    level is greater than or equal to the sentence level, and rounded down
    otherwise. The resulting level is printed and returned.

    Args:
        row: Record passed through to ``calculate_difficulty_by_sentence``
            and ``calculate_difficulty_by_word``.

    Returns:
        The combined level. When the sentence-based level is unavailable
        (``None``), the word-based level is returned on its own.
    """
    sentence = calculate_difficulty_by_sentence(row)
    word = calculate_difficulty_by_word(row)

    # calculate_difficulty_by_sentence returns None for scripts without
    # sentences or words; averaging with None would raise TypeError, so fall
    # back to the word-based level alone.
    if sentence is None:
        print(word)
        return word

    average_difficulty = (sentence + word) / 2

    # Round up when word >= sentence, otherwise round down.
    rounding = math.ceil if word >= sentence else math.floor
    difficulty = rounding(average_difficulty)
    print(difficulty)
    return difficulty

In [72]:
df = read_json_to_df(json_file_path)

# read_json_to_df returns None when the file is missing; fail with a clear
# error here instead of an AttributeError on the None value below.
if df is None:
    raise FileNotFoundError("Cannot compute difficulty, input could not be read: " + json_file_path)

# Compute the combined difficulty level for every row.
df['video_level'] = df.apply(calculate_difficulty, axis=1)
# Correct the misspelled input column name before exporting.
df.rename(columns={'senteceList': 'sentence_list'}, inplace=True)
df.to_json('addDifficulty.json', orient='records')

스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.
3
4
3
1
4
3
1
1
2
1
5
4
4
2
3
2
1
1
3
3
1
5
4
4
2
5
3
6
1
4
1
3
1
3
1
1
1
1
4
1
2
1
1
6
1
1
6
5
3
1
1
5
6
6
1
4
2
3
4
6
1
2
6
1
4
6
6
1
1
3
2
5
5
4
1
1
4
4
1
1
6
6
6
1
2
2
1
2
4
6
2
3
4
1
1
1
3
1
1
1
6
3
1
1
1
1
6
1
1
1
1
2
1
3
4
6
1
1
1
1
1
1
2
1
3
3
4
2
4
1
1
1
1
1
5
1
1
1
4
6
6
2
1
1
4
2
1
1
2
4
1
1
1
3
1
1
1
3
5
6
3
4
1
1
1
1
3
1
3
3
3
1
1
1
1
3
6
5
1
2
2
2
1
4
1
1
2
4
3
4
5
1
3
1
1
2
1
1
4
3
1
6
4
6
1
4
4
6
6
1
1
1
1
1
4
2
4
6
2
2
4
1
1
4
4
1
2
6
6
6
6
6
4
1
2
2
1
1
4
1
3
1
2
1
4
3
3
6
4
4
1
1
1
2
1
1
1
4
1
6
1
6
4
6
1
1
6
3
1
1
1
2
1
2
1
2
1
2
1
1
1
1
1
4
1
3
1
1
4
4
4
2
6
6
6
6
6
6
1
3
1
1
6
1
1
4
1
3
5
1
3
1
1
3
4
3
3
6
3
5
4
2
1
1
1
6
2
4
1
1
2
3
3
1
3
3
3
3
5
3
4
3
3
4
3
2
1
5
3
3
1
6
5
3
1
5
6
4
6
4
6
5
6
2
2
3
2
1
3
1
6
3
6
1
1
1
1
2
3
6
3
1
3
5
3
3
1
1
1
3
1
2
1
1
1
6
3
5
1
6
2
6
1
4
2
1
2
5
1
5
1
1
5
1
4
3
1
5
2
6
6
4
1
1
3
1
5
4
4
4
1
1
1
1
1
1
2
6
1
6
6
6
6
6
1
1
5
4
6
2
3
3
6
3
4
4
1
1
4
1
6
1
3
1
1
2
1
4
2
3
4
2
2
2
2
