# Автоматическое удаление однокоренных слов из конкорданса

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 250)
pd.set_option('display.max_rows', 250)
import xml.etree.ElementTree as et 

## parsing xml file

In [2]:
tree = et.parse('coll_2019-04-06_15_07_13.xml')
root = tree.getroot()

cols = ['Colloc', 'Cooccurrence count', 'Candidate count',
        'T-score', 'MI', 'MI3', 'log likelihood', 
        'min. sensitivity', 'logDice', 'MI.log_f']

items = []
for item_num in range(len(root[1])):
    item = []
    for it in root[1][item_num]:
        item.append(it.text)
    items.append(item)

## Фрейм до очистки:

In [3]:
df = pd.DataFrame(items, columns=cols)
print(f'Длина датафрейма: {len(df)}')

Длина датафрейма: 200


### Все слова фрейма

In [17]:
words = []
for row in df.iterrows():
    words.append(row[1][0])

## Повторяющиеся леммы

Убираем их из фрейма, суммируем Cooccurrence count	и Candidate count, добавляем только к первому

In [5]:
same_root_words = []
for word_0 in words:
    for word_1 in words:
        if word_0[:4] == word_1[:4] and word_0 != word_1:
            same_root_words.append(word_0)
            same_root_words.append(word_1)
same_root_words = set(same_root_words)

for word_0 in same_root_words:
    df_rows = set()
    for word_1 in same_root_words:
        if word_0[:4] == word_1[:4]:
            df_rows.add(word_0)
            df_rows.add(word_1)
    
    temp_df = df[df['Colloc'].isin(df_rows)]
    for word in df_rows:
        df = df[df.Colloc != word]
    
    try:
        temp_df.iloc[0][1] = temp_df['Cooccurrence count'].apply(int).sum()
        temp_df.iloc[0][2] = temp_df['Candidate count'].apply(int).sum()
    except IndexError:
        continue
    print(df_rows)
    df = df.append(temp_df.iloc[0])
    same_root_words = same_root_words - df_rows
    
for col in df:
    if col != 'Colloc':
        df[col] = df[col].apply(float)

df = df.sort_values(by=['MI.log_f'], ascending=False)
df = df.reset_index()
df = df.drop(df.columns[0], axis=1)

while len(df) % 20 != 0:
    df.drop(df.tail(1).index, inplace=True)

{'парфюмированная', 'парфюмерная', 'парфюмированной'}
{'газированная', 'газировать', 'газированный'}
{'прокипяченной', 'прокипятить'}
{'дистилированной', 'дистиллированной', 'дистиллированная', 'дистиллированную'}
{'родниковую', 'родниковый'}
{'водопроводная', 'водопроводную', 'водопроводный'}
{'промывный', 'промывать', 'промыть'}
{'смывать', 'смываем', 'смываться'}
{'закипеть', 'закипевшей'}
{'талый', 'талые'}
{'умягчение', 'умягчения', 'умягченной'}
{'откачки', 'откачивать'}
{'мицеллярной', 'мицеллярную', 'мицеллярная'}
{'дождевая', 'дождевой', 'дождевую'}
{'разбавлять', 'разбавить', 'разбавляется'}
{'негазировать', 'негазированная', 'негазированную'}
{'вскипятить', 'вскипятите'}
{'питьевой', 'питьевую', 'питье', 'пить'}
{'минеральная', 'минеральный'}
{'кипяченый', 'кипячение', 'кипяченной', 'кипятить'}
{'наливать', 'налить'}
{'литра', 'литр'}
{'слива', 'сливать', 'сливаем'}
{'укропная', 'укропную'}
{'нагретая', 'нагретой', 'нагрев'}
{'очистка', 'очистить'}
{'теплый', 'тепловатой'}
{

## Фрейм после чистки

In [6]:
print(f'Длина датафрейма: {len(df)}')

Длина датафрейма: 120


## Валидность коллокатов

In [8]:
valid_collocate = [1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 
                   1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0 ,1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 
                   1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 
                   0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1]

df['valid_colocate'] = valid_collocate

# Подсчет таблицы 1 и таблицы 2

In [9]:
df

Unnamed: 0,Colloc,Cooccurrence count,Candidate count,T-score,MI,MI3,log likelihood,min. sensitivity,logDice,MI.log_f,valid_colocate
0,сточный,130914.0,154954.0,350.96066,10.74553,44.56955,1762600.0,0.01233,8.63816,125.96466,1
1,питьевой,289310.0,1910352.0,390.14641,10.46387,44.89945,2060717.0,0.01524,8.93382,124.88075,0
2,кипяченый,84758.0,166614.0,252.87773,10.70184,42.63449,905969.1,0.0064,7.7019,118.4376,1
3,грунтовый,76507.0,121803.0,276.37634,10.2788,42.72541,1001202.0,0.00765,7.95234,115.58664,0
4,проточный,47216.0,74673.0,213.01799,10.31481,41.25866,598212.6,0.00454,7.20829,110.61955,0
5,горячий,328738.0,1771140.0,571.79491,8.52002,45.17317,3302850.0,0.03287,9.83783,108.22994,1
6,пресный,46170.0,92028.0,203.15924,9.93728,40.60904,512354.3,0.00414,7.0704,105.63379,1
7,подсоленный,23184.0,26409.0,132.86312,10.75895,38.97747,253192.4,0.00177,5.85286,105.22091,1
8,талый,26210.0,38490.0,153.02245,10.38216,39.41688,312198.5,0.00235,6.25886,104.47263,1
9,дистиллированной,27792.0,33474.0,129.23927,10.69195,38.75104,236059.6,0.00167,5.77317,103.97495,0


In [10]:
table1 = []
table2 = []

n = 20
while n <= len(df):
    table1_row = [f'{n-20}-{n}']
    table2_row = [f'1-{n}']
    for col in df.columns[1:-1]:
        table1_row.append(df.sort_values(by=[col], ascending=False).iloc[n-20:n].valid_colocate.sum())
        table2_row.append(df.sort_values(by=[col], ascending=False).iloc[:n].valid_colocate.sum())
    table1.append(table1_row)
    table2.append(table2_row)
    n += 20

cols = ['Ранг', 'Cooccurrence count', 'Candidate count',
        't-score', 'MI', 'MI3', 'log likelihood', 
        'min. sensitivity', 'logDice', 'MI.log_f']

table1_df = pd.DataFrame(table1, columns=cols)
table2_df = pd.DataFrame(table2, columns=cols)


In [11]:
table2_df

Unnamed: 0,Ранг,Cooccurrence count,Candidate count,t-score,MI,MI3,log likelihood,min. sensitivity,logDice,MI.log_f
0,1-20,12,12,12,13,12,12,12,12,10
1,1-40,29,30,27,26,27,27,27,27,26
2,1-60,39,39,39,41,38,38,39,39,39
3,1-80,53,51,53,51,53,52,53,53,55
4,1-100,67,67,67,62,68,67,68,67,70
5,1-120,78,78,78,78,78,78,78,78,78


In [12]:
table1_df

Unnamed: 0,Ранг,Cooccurrence count,Candidate count,t-score,MI,MI3,log likelihood,min. sensitivity,logDice,MI.log_f
0,0-20,12,12,12,13,12,12,12,12,10
1,20-40,17,18,15,13,15,15,15,15,16
2,40-60,10,9,12,15,11,11,12,12,13
3,60-80,14,12,14,10,15,14,14,14,16
4,80-100,14,16,14,11,15,15,15,14,15
5,100-120,11,11,11,16,10,11,10,11,8


In [13]:
# Итог и точность
all_rows = []

for table in [table1_df, table2_df]:
    cycle = 0
    total = ['Итого']+[table[i].sum() for i in table.columns[1:]]
    all_rows.append(total)
    
    accuracy_list = ['Точность']
    for col in table.columns[1:]:   
        if cycle == 1:
            iter_num = 0
        accuracy = []
        for val in table[col].values:
            if cycle == 0:
                accuracy.append(val/20)
            else:
                accuracy.append(val/(iter_num+1)*20)
                iter_num += 1
        
        for weight in range(1, len(accuracy)+1):
            accuracy[-weight] *= weight
        accuracy = sum(accuracy)
        accuracy_list.append(accuracy)
    all_rows.append(accuracy_list)
    cycle+=1

In [14]:
table1_df = table1_df.append(pd.DataFrame(data=all_rows[:2], columns=cols))
table2_df = table2_df.append(pd.DataFrame(data=all_rows[2:], columns=cols))

for table in [table1_df, table2_df]:
    table.index = table['Ранг']
    table.drop('Ранг', axis=1, inplace=True)



In [15]:
table1_df

Unnamed: 0_level_0,Cooccurrence count,Candidate count,t-score,MI,MI3,log likelihood,min. sensitivity,logDice,MI.log_f
Ранг,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0-20,12.0,12.0,12.0,13.0,12.0,12.0,12.0,12.0,10.0
20-40,17.0,18.0,15.0,13.0,15.0,15.0,15.0,15.0,16.0
40-60,10.0,9.0,12.0,15.0,11.0,11.0,12.0,12.0,13.0
60-80,14.0,12.0,14.0,10.0,15.0,14.0,14.0,14.0,16.0
80-100,14.0,16.0,14.0,11.0,15.0,15.0,15.0,14.0,15.0
100-120,11.0,11.0,11.0,16.0,10.0,11.0,10.0,11.0,8.0
Итого,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0
Точность,13.9,13.85,13.8,13.55,13.8,13.7,13.85,13.8,13.9


In [16]:
table2_df

Unnamed: 0_level_0,Cooccurrence count,Candidate count,t-score,MI,MI3,log likelihood,min. sensitivity,logDice,MI.log_f
Ранг,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1-20,12.0,12.0,12.0,13.0,12.0,12.0,12.0,12.0,10.0
1-40,29.0,30.0,27.0,26.0,27.0,27.0,27.0,27.0,26.0
1-60,39.0,39.0,39.0,41.0,38.0,38.0,39.0,39.0,39.0
1-80,53.0,51.0,53.0,51.0,53.0,52.0,53.0,53.0,55.0
1-100,67.0,67.0,67.0,62.0,68.0,67.0,68.0,67.0,70.0
1-120,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0
Итого,278.0,277.0,276.0,271.0,276.0,274.0,277.0,276.0,278.0
Точность,37.2,37.15,36.7,36.35,36.6,36.35,36.8,36.7,36.45
