In [32]:
# Mount Google Drive at /content/drive; the CSV/TSV paths used by the
# later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pandas
import pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Read the source table (first CSV column becomes the index, '&' is the
# quote character) and keep only the columns used by the rest of the notebook.
df = pandas.read_csv(
    '/content/drive/MyDrive/diplom/mono_df_all_new.csv',
    index_col=0,
    quotechar='&',
).loc[:, [
    'word',
    'POS_ms',
    'left_syll',
    'syll_onset',
    'syll_type',
    'right_onset',
    'syll_num_reversed',
    'left_neigh',
    'line_id',
    'ict',
]]

In [4]:
# Derive boolean position flags from `syll_num_reversed` (value 1 -> last_syll,
# value 2 -> second_last_syll; presumably the syllable position counted from
# the end of the line -- confirm against the data description).
# A vectorised comparison yields the same boolean columns as the previous
# row-wise `apply(..., axis=1)` but avoids calling a Python lambda per row.
# Missing values compare as False in both forms.
df['last_syll'] = df['syll_num_reversed'] == 1
df['second_last_syll'] = df['syll_num_reversed'] == 2

In [5]:
# The raw position column is no longer needed once the two flags exist.
df = df.drop(columns=['syll_num_reversed'])

In [6]:
# Replace every missing value with the literal string 'nan'.
# NOTE(review): presumably so that missing categories can be matched with `==`
# in the later lookups (NaN never compares equal to itself) -- confirm.
df = df.fillna(value='nan')

In [7]:
# Show the prepared frame (the notebook renders a truncated preview).
df

Unnamed: 0,word,POS_ms,left_syll,syll_onset,syll_type,right_onset,left_neigh,line_id,ict,last_syll,second_last_syll
0,за,PR,,closed,open,open,False,0,False,False,False
1,стол,S,closed,closed,closed,open,False,0,True,False,False
2,я,SPRO,closed,open,open,closed,True,0,False,False,False
3,вновь,ADV,open,closed,closed,closed,True,0,True,False,False
4,об,PR,closed,open,closed,open,False,1,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
3694755,же,PART,open,closed,open,closed,True,2440705,False,False,False
3694756,ты,SPRO,open,closed,open,closed,True,2440705,False,False,False
3694757,где,ADVPRO,,closed,open,closed,False,2440706,True,False,False
3694758,же,PART,open,closed,open,closed,True,2440706,False,False,False


In [31]:
# Persist the prepared frame; '&' is used as the quote character, matching
# how the input CSV is read. The path is a plain literal (the former f-string
# prefix had no placeholders and was a no-op).
df.to_csv('/content/drive/MyDrive/diplom/df_alg.csv', quotechar='&')

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 1000 rows for evaluation; the fixed random_state keeps the split
# reproducible between runs.
train, test = train_test_split(
    df,
    random_state=156,
    test_size=1000,
)

In [10]:
# Count training rows for every combination of context features and the
# `ict` label; the count ends up in the `word` column of a flat frame.
sum_data = (
    train
    .groupby([
        'POS_ms',
        'left_syll',
        'syll_onset',
        'syll_type',
        'right_onset',
        'left_neigh',
        'ict',
    ])['word']
    .count()
    .reset_index()
)

In [11]:
# Show the per-context counts table (the notebook renders a truncated preview).
sum_data

Unnamed: 0,POS_ms,left_syll,syll_onset,syll_type,right_onset,left_neigh,ict,word
0,A,closed,closed,closed,closed,False,False,1253
1,A,closed,closed,closed,closed,False,True,2545
2,A,closed,closed,closed,closed,True,False,67
3,A,closed,closed,closed,closed,True,True,1687
4,A,closed,closed,closed,,False,False,24
...,...,...,...,...,...,...,...,...
1220,,open,open,closed,,True,True,8
1221,,open,open,closed,open,False,False,2
1222,,open,open,closed,open,False,True,7
1223,,open,open,closed,open,True,False,2


In [12]:
# Persist the per-context counts table. The path is a plain literal (the
# former f-string prefix had no placeholders and was a no-op).
sum_data.to_csv('/content/drive/MyDrive/diplom/sum_data.csv', quotechar='&')

In [13]:
def is_ict(POS_ms, left_syll, syll_onset, syll_type, right_onset, left_neigh, last_syll, second_last_syll, counts=None):
    """Estimate the probability that a word is marked `ict` in its context.

    Parameters
    ----------
    POS_ms, left_syll, syll_onset, syll_type, right_onset, left_neigh
        Context features; each is matched by equality against the column of
        the same name in the counts table.
    last_syll, second_last_syll
        Position flags. When one of them equals True a fixed probability is
        returned and the counts table is not consulted.
    counts : pandas.DataFrame, optional
        Table with the six context columns above, an `ict` column and a
        `word` column holding the number of observations. Defaults to the
        notebook-level `sum_data` frame, so existing calls keep working.

    Returns
    -------
    float
        0.95 if `last_syll` is True, 0.07 if `second_last_syll` is True,
        0.5 if the context has no observations in `counts`, otherwise the
        share of observations whose `ict` is True, rounded to 2 decimals.
    """
    # Explicit `== True` (rather than truthiness) so that a non-boolean
    # placeholder such as the string 'nan' is never treated as a set flag.
    if last_syll == True:
        return 0.95
    if second_last_syll == True:
        return 0.07

    if counts is None:
        # Resolved at call time so the notebook-level table is picked up
        # even if it is rebuilt after this function was defined.
        counts = sum_data

    res_df = counts[(counts['POS_ms'] == POS_ms) &
                    (counts['left_syll'] == left_syll) &
                    (counts['syll_onset'] == syll_onset) &
                    (counts['syll_type'] == syll_type) &
                    (counts['right_onset'] == right_onset) &
                    (counts['left_neigh'] == left_neigh)]

    total = res_df['word'].sum()
    if total == 0:
        # Unseen context: no evidence either way.
        return 0.5

    # Sum the counts by label instead of unpacking rows positionally, so the
    # result does not depend on the row order of the table or on both labels
    # being present. A context seen with only one label yields 1.0 or 0.0.
    ict_t = res_df.loc[res_df['ict'] == True, 'word'].sum()
    return round(float(ict_t) / float(total), 2)

In [14]:
# Score every held-out row with the count-based estimator.
test['prob'] = test.apply(
    lambda row: is_ict(
        row['POS_ms'],
        row['left_syll'],
        row['syll_onset'],
        row['syll_type'],
        row['right_onset'],
        row['left_neigh'],
        row['last_syll'],
        row['second_last_syll'],
    ),
    axis=1,
)

In [15]:
# Predict `ict` when the estimated probability is strictly above 0.5.
# Vectorised comparison; same boolean result as the former row-wise apply.
test['pred'] = test['prob'] > 0.5

In [16]:
# Mark each row as correct (True) when the prediction equals the `ict` label.
# Vectorised element-wise comparison; same result as the former row-wise apply.
test['res'] = test['pred'] == test['ict']

In [17]:
# Number of incorrect (False) and correct (True) predictions on the hold-out set.
print(test.groupby('res')['res'].count())

res
False    189
True     811
Name: res, dtype: int64


In [23]:
# Show the misclassified hold-out rows.
test.loc[test['res'].eq(False)]

Unnamed: 0,word,POS_ms,left_syll,syll_onset,syll_type,right_onset,left_neigh,line_id,ict,last_syll,second_last_syll,prob,pred,res
812736,был,V,closed,closed,closed,closed,False,580176,False,False,False,0.75,True,False
3493809,от,PR,closed,open,closed,closed,False,2310173,True,False,False,0.24,False,False
3647815,вот,PART,,closed,closed,closed,False,2405840,True,False,False,0.25,False,False
1031989,дни,S,,closed,open,closed,False,737192,False,False,False,0.52,True,False
1238520,не,PART,closed,closed,open,closed,True,871405,True,False,False,0.22,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2833964,так,ADVPRO,,closed,closed,open,False,1905359,True,False,False,0.34,False,False
3200421,за,PR,closed,closed,open,closed,True,2126783,True,False,False,0.23,False,False
1595610,нем,A,closed,closed,closed,closed,False,1124607,False,False,False,0.67,True,False
38737,что,CONJ,,closed,open,closed,False,31499,True,False,False,0.15,False,False


In [24]:
# Misclassified rows whose POS tag is CONJ.
false_conj = test.loc[test['res'].eq(False) & test['POS_ms'].eq('CONJ')]

In [19]:
# Misclassification counts per POS tag, in ascending order.
(
    test.loc[test['res'].eq(False)]
    .groupby('POS_ms')['word']
    .count()
    .sort_values()
)

POS_ms
A          1
INTJ       1
NUM        1
nan        1
V          6
S         11
ADVPRO    12
APRO      12
PART      31
PR        36
SPRO      36
CONJ      41
Name: word, dtype: int64

In [25]:
# Load the tab-separated verse table; its `line` column is looked up by
# `line_id` when the misclassified words are printed.
verse_data = pandas.read_csv(
    '/content/drive/MyDrive/diplom/syllab-tonic-lines.tsv',
    quotechar='&',
    sep='\t',
)

In [30]:
# Print each misclassified conjunction followed by the verse line it occurs in.
# `line_id` is used as a row label of `verse_data`.
for word, line_id in zip(false_conj['word'], false_conj['line_id']):
    print(word, '\n', verse_data.loc[line_id, 'line'])

и 
 Кто̀ ещѐ любѝмей ѝ суро̀вей,
но 
 Но̀ во мно̀го ра̀з хитрѐе
что 
 «То̀т Воѐйков, что̀ бранѝлся,
и 
 зуба̀ми скрѝпну -- ѝ оста̀нусь.
и 
 ѝ Ван Го̀га за̀ его̀ рома̀шки.
но 
 Но̀ ожерѐлье из слѐз но̀сит Царѝца небѐс.
и 
 Ѝ в мину̀ту мра̀ком ту̀ч
а 
 Нѐ житу̀ха, а̀ страда̀
и 
 Крепко-на̀крепко вста̀ли лаба̀зы, обмѐн и обма̀н.
и 
 Желѐзом ржа̀вым ѝ кремня̀ми;
и 
 на Рѐйне, в Кѐльне, ѝ в долѝнах
и 
 Ѝ прекра̀сней, ѝ умнѐй
и 
   Приро̀ды у̀жас ѝ позо̀р!
и 
 А̀ в душѐ пусты̀нно ѝ напѐвно...
иль 
 Ѝль с карто̀фелѐм котлѐту!
как 
 Он бу̀дет, ка̀к онѝ, глубо̀ко пу̀ст и нѝщ!»
но 
 в шко̀лу вот-во̀т упаду̀, но иду̀.
и 
 Как в то̀й, так ѝ в друго̀м,
но 
 Ты̀ не птѝчка, но̀ твой ло̀кон --
а 
 А̀ пехо̀та нѐ хвастлѝво,
а 
 А̀ нам ра̀дость, а̀ нам смѐх.
и 
   Ста̀рика̀м уж ѝ грешно̀.
но 
 Гремѝте, но̀ не в гру̀дь, не в сѐрдце на̀с разѝте. 
и 
 Как? ѝ мой мѐч, тот са̀мый мѐч,
но 
 Оста̀лась... Но̀ с тех по̀р прошло̀ не мно̀го днѐй,
и 
 глядѐли влѐво ѝ