In [35]:
import pandas as pd
from collections import Counter

In [72]:
# убираем колоку с иероглифами
df = pd.read_csv('final_chinese_dataset.csv')

In [73]:
# смотрим как выглядит табличка
hieroglyph_column = pd.DataFrame(data=df['Chinese'], columns=['Chinese'])
df.drop(['Chinese'], axis=1, inplace=True)
df

Unnamed: 0,Pinyin,Wade-Giles,Romatzyh,Palladiy
0,аи,аи,аы,ай
1,аи хао,аи хао,аы хауо,ай хао
2,ба,па,ба,ба
3,ба ба,па ба,бах ба,ба ба
4,ба,па,бах,ба
...,...,...,...,...
5155,зуи,цуи,тзуеы,цзуй
5156,зуи е,цуи е,тзуеы ех,цзуй э
5157,зуо,цо,тзуох,цзо
5158,зуо феи,цо феи,тзуох феы,цзо фэй


In [74]:
def preprocess(entries: list) -> list:
    # Токенизация через пробелы
    tokenized = [entry.split() for entry in entries]
    # Уравнивание списков токенов по количеству элементов
    max_token_number = max([len(token_list) for token_list in tokenized])
    for token_list in tokenized:
        for i in range(max_token_number - len(token_list)):
            token_list.append(" ")
    # Сортировка токенов в отдельные списки по их порядку
    tokenized_sorted = [
        [token[i] for token in tokenized] for i in range(len(tokenized[0]))]
    # print(tokenized)

    return tokenized_sorted

def mean_text_value(entries: list) -> str:
    if not entries:
        return ""

    tokenized_sorted = preprocess(entries)

    average_text = []

    for token_variations in tokenized_sorted:
        average_token_length = round(
            sum(len(token) for token in token_variations) / len(token_variations))

        average_token = ""

        for i in range(average_token_length):
            characters_at_position = [
                variation[i] if i < len(variation) else "" for variation in token_variations]

            print(f"Token Variations: {token_variations}\n\
                    Average Token Length: {average_token_length}\n\
                    Characters at Position: {characters_at_position}\n\
                    Index: {i}\n")

        # # Тут проверка может выйти за пределы списка, т.к. проверяется i из average_token_length 
        #     if not characters_at_position[i]:   
        #         print("PING!")
        #         characters_at_position[i] = Counter(characters_at_position).most_common(2)[1][0]

            most_common_character, _ = Counter(characters_at_position).most_common(1)[0]
            average_token += most_common_character
        
        average_text.append(average_token)

    return " ".join(average_text)

In [75]:
# применяем ко всему датасету
df['Mean'] = df.apply(lambda row: mean_text_value(list(row)), axis=1)
df = pd.concat([hieroglyph_column, df], ignore_index=True, axis=1)
print(df)

Token Variations: ['аи', 'аи', 'аы', 'ай']
                    Average Token Length: 2
                    Characters at Position: ['а', 'а', 'а', 'а']
                    Index: 0

Token Variations: ['аи', 'аи', 'аы', 'ай']
                    Average Token Length: 2
                    Characters at Position: ['и', 'и', 'ы', 'й']
                    Index: 1

Token Variations: ['аи', 'аи', 'аы', 'ай']
                    Average Token Length: 2
                    Characters at Position: ['а', 'а', 'а', 'а']
                    Index: 0

Token Variations: ['аи', 'аи', 'аы', 'ай']
                    Average Token Length: 2
                    Characters at Position: ['и', 'и', 'ы', 'й']
                    Index: 1

Token Variations: ['хао', 'хао', 'хауо', 'хао']
                    Average Token Length: 3
                    Characters at Position: ['х', 'х', 'х', 'х']
                    Index: 0

Token Variations: ['хао', 'хао', 'хауо', 'хао']
                    Average Token Len

In [76]:
# сохраняем
df.to_csv('mean_result.csv', index=False)