In [1]:
#import libraries
import pickle
import stanza
import pandas as pd
import numpy as np

In [2]:
#download stanza and load Armenian treebank
stanza.download('hy')
nlp = stanza.Pipeline('hy')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 4.60MB/s]
2021-10-14 14:54:39 INFO: Downloading default packages for language: hy (Armenian)...
2021-10-14 14:54:42 INFO: File exists: C:\Users\Nigol\stanza_resources\hy\default.zip.
2021-10-14 14:54:47 INFO: Finished downloading models and saved to C:\Users\Nigol\stanza_resources.
2021-10-14 14:54:47 INFO: Loading these models for language: hy (Armenian):
| Processor | Package |
-----------------------
| tokenize  | armtdp  |
| mwt       | armtdp  |
| pos       | armtdp  |
| lemma     | armtdp  |
| depparse  | armtdp  |

2021-10-14 14:54:47 INFO: Use device: cpu
2021-10-14 14:54:47 INFO: Loading: tokenize
2021-10-14 14:54:48 INFO: Loading: mwt
2021-10-14 14:54:48 INFO: Loading: pos
2021-10-14 14:54:48 INFO: Loading: lemma
2021-10-14 14:54:48 INFO: Loading: depparse
2021-10-14 14:54:49 INFO: Done loading processors!


In [3]:
english = pd.read_csv('English.txt', sep = '\n', header = None, encoding = 'utf-8')
armenian = pd.read_csv('Armenian.txt', sep = '\n', header = None, encoding = 'utf-8')

In [4]:
armenian = armenian[0]
english = english[0]

In [5]:
print(len(armenian))

9631


In [6]:
pos = []

for i in range(len(armenian)):
    g = nlp(armenian[i]).to_dict()
    if len(g) != 0:
        p = nlp(armenian[i]).to_dict()[0][0]['upos']
        pos.append(p)
    else:
        pos.append('')

In [7]:
#add POS tag
POS = []
for i in range(len(pos)):
    if pos[i] == 'ADJ':
        POS.append('a')
    elif pos[i] == 'ADV':
        POS.append('r')
    elif pos[i] == 'NOUN':
        POS.append('n')
    elif pos[i] == 'VERB':
        POS.append('v')
    else:
        POS.append('n/a')

In [8]:
#create the dataframe and print it
columns = ['Armenian', 'English', 'POS']
translation = pd.DataFrame(columns = columns)
translation['Armenian'] = armenian
translation['English'] = english
translation['POS'] = POS
translation.head()

Unnamed: 0,Armenian,English,POS
0,ա,the first letter of the Armenian alphabet,a
1,ագահ,"greedy,glutton",a
2,ագահություն,"greed,greediness,glutton",n
3,ագարիկոն,brown mushroom,n
4,ագռավ,"crow,raven,spades",n


In [9]:
len(translation)

9631

In [10]:
#read SentiWordNet
swn = pd.read_csv('SentiWordNet_3.0.0.txt', sep = '\t', encoding = 'utf8', dtype = {'ID':str})
swn.head()

Unnamed: 0,POS,ID,PosScore,NegScore,SynsetTerms,Gloss
0,a,1740,0.125,0.0,able#1,(usually followed by `to') having the necessar...
1,a,2098,0.0,0.75,unable#1,(usually followed by `to') not having the nece...
2,a,2312,0.0,0.0,dorsal#2 abaxial#1,facing away from the axis of an organ or organ...
3,a,2527,0.0,0.0,ventral#2 adaxial#1,nearest to or facing toward the axis of an org...
4,a,2730,0.0,0.0,acroscopic#1,facing or on the side toward the apex


In [11]:
for i in range(len(swn)):
    terms = swn['SynsetTerms'][i].split()
    s = ''
    for j in range(len(terms)):
        s = s + terms[j][:-2] + ' '
    swn['SynsetTerms'][i] = s[:-1]

swn.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swn['SynsetTerms'][i] = s[:-1]


Unnamed: 0,POS,ID,PosScore,NegScore,SynsetTerms,Gloss
0,a,1740,0.125,0.0,able,(usually followed by `to') having the necessar...
1,a,2098,0.0,0.75,unable,(usually followed by `to') not having the nece...
2,a,2312,0.0,0.0,dorsal abaxial,facing away from the axis of an organ or organ...
3,a,2527,0.0,0.0,ventral adaxial,nearest to or facing toward the axis of an org...
4,a,2730,0.0,0.0,acroscopic,facing or on the side toward the apex


In [12]:
#create the necessary columns
columns = ['POS', 'HWN_Offset', 'SWN_Offset', 'Armenian', 'English', 'Positive', 'Negative', 'Objective']
lexicon = pd.DataFrame(columns = columns)
lexicon

Unnamed: 0,POS,HWN_Offset,SWN_Offset,Armenian,English,Positive,Negative,Objective


In [13]:
def retrieve_scores(index):
    pos = swn['PosScore'][index]
    neg = swn['NegScore'][index]
    num = swn['ID'][index]
    return [np.array([pos, neg]), num]

In [14]:
armenian

0                 ա
1              ագահ
2       ագահություն
3          ագարիկոն
4             ագռավ
           ...     
9626         ֆիկտիվ
9627        Ֆրանսիա
9628      ֆրանսուհի
9629     ֆրանսիական
9630      ֆրանսիացի
Name: 0, Length: 9631, dtype: object

In [15]:
new = []
for i in range(len(translation)):
    if translation['POS'][i] != 'n/a':
        words = str(translation['English'][i]).strip().split(',')
        n = len(words)
        found = False
        for j in range(n):
            if words[j] in swn['SynsetTerms'].values:
                ind = np.where(swn['SynsetTerms'].values == words[j])[0]
                for k in range(len(ind)):
                    if swn['POS'][ind[k]] == translation['POS'][i]:
                        scores = retrieve_scores(ind[k])
                        found = True
                        new.append({'POS': translation['POS'][i],'SWN_Offset': scores[1], 'Armenian': translation['Armenian'][i], 
                                    'English': translation['English'][i], 'Positive': scores[0][0], 'Negative': scores[0][1], 
                                    'Objective': 1 - np.sum(scores[0])})
        p = n
        while p > 0 and found == False:
            if ' '.join(words[:p]) in swn['Gloss'].values:
                ind = np.where(swn['Gloss'].values == ' '.join(words[:p]))[0]
                for j in range(len(ind)):
                    if swn['POS'][ind[j]] == translation['POS'][i]:
                        scores = retrieve_scores(ind[j])
                        found = True
                        new.append({'POS': translation['POS'][i], 'SWN_Offset': scores[1], 'Armenian': translation['Armenian'][i], 
                                    'English': translation['English'][i], 'Positive': scores[0][0], 'Negative': scores[0][1], 
                                    'Objective': 1 - np.sum(scores[0])})
            p = p - 1
        if found == False:
            scores = [[np.nan, np.nan], np.nan]
            new.append({'POS': translation['POS'][i], 'SWN_Offset': scores[1], 'Armenian': translation['Armenian'][i], 
                        'English': translation['English'][i], 'Positive': scores[0][0], 'Negative': scores[0][1], 
                        'Objective': 1 - np.sum(scores[0])})
lexicon = lexicon.append(new, ignore_index = True)
lexicon.head()

Unnamed: 0,POS,HWN_Offset,SWN_Offset,Armenian,English,Positive,Negative,Objective
0,a,,,ա,the first letter of the Armenian alphabet,,,
1,a,,11160.0,ագահ,"greedy,glutton",0.0,0.0,1.0
2,n,,4945530.0,ագահություն,"greed,greediness,glutton",0.0,0.375,0.625
3,n,,,ագարիկոն,brown mushroom,,,
4,n,,1579028.0,ագռավ,"crow,raven,spades",0.0,0.125,0.875


In [16]:
lexicon['Armenian']

0                  ա
1               ագահ
2        ագահություն
3           ագարիկոն
4              ագռավ
            ...     
17071     ֆուտբոլիստ
17072         ֆիկտիվ
17073      ֆրանսուհի
17074     ֆրանսիական
17075      ֆրանսիացի
Name: Armenian, Length: 17076, dtype: object

In [17]:
lexicon.tail()

Unnamed: 0,POS,HWN_Offset,SWN_Offset,Armenian,English,Positive,Negative,Objective
17071,n,,,ֆուտբոլիստ,footballer,,,
17072,a,,,ֆիկտիվ,fictitious,,,
17073,n,,,ֆրանսուհի,French,,,
17074,a,,,ֆրանսիական,French,,,
17075,a,,,ֆրանսիացի,French,,,


In [18]:
#check the percentage of nan entries in the lexicon dataframe
lexicon['Positive'].isna().sum() / len(lexicon) * 100

25.890138205668773

In [19]:
len(lexicon)

17076

In [20]:
#drop the rows with nan scores, reset its index and drop the old one
lexicon = lexicon.dropna(subset = ['Positive', 'Negative', 'Objective'])
lexicon = lexicon.reset_index()
lexicon = lexicon.drop('index', axis = 1)

In [21]:
#save the dataframe as a pickle file
with open('translationScores.pickle', 'wb') as f:
    pickle.dump(lexicon, f)
    
#save the dataframe as a text file (for those interested to read the results)
with open('translationScores.txt', 'w', encoding = 'utf-8') as f:
    f.write(lexicon.to_string())