In [1]:
#import libraries
import re
import pickle
import numpy as np
import pandas as pd

In [2]:
#read the files necessary: English wordnet, Armenian wordnet, SentiWordNet
ewn = pd.read_csv('wn-wikt-eng.tab', sep = '\t', encoding = 'utf8')
hwn = pd.read_csv('wn-wikt-hye.tab', sep = '\t', encoding = 'utf8')
swn = pd.read_csv('SentiWordNet_3.0.0.txt', sep = '\t', encoding = 'utf8', dtype = {'ID':'str'})

In [3]:
#print the first five rows of each wordnet
print(ewn.head())
print(hwn.head())
print(swn.head())

  # Wiktionary        eng http://wiktionary.org/  CC BY-SA
0   00002098-a  eng:lemma                 unable       NaN
1   00004171-a  eng:lemma               moribund       NaN
2   00004413-a  eng:lemma               abridged       NaN
3   00005205-a  eng:lemma               absolute       NaN
4   00005599-a  eng:lemma               implicit       NaN
  # Wiktionary        hye http://wiktionary.org/  CC BY-SA
0   00014490-a  hye:lemma     միանգամայն բավարար       NaN
1   00014490-a  hye:lemma                 լիառատ       NaN
2   00024996-a  hye:lemma                    նոր       NaN
3   00025470-a  hye:lemma                թթվային       NaN
4   00029933-a  hye:lemma                   ագահ       NaN
  POS        ID  PosScore  NegScore          SynsetTerms  \
0   a  00001740     0.125      0.00               able#1   
1   a  00002098     0.000      0.75             unable#1   
2   a  00002312     0.000      0.00   dorsal#2 abaxial#1   
3   a  00002527     0.000      0.00  ventral#2 adaxi

In [4]:
#check the column types of swn
swn.dtypes

POS             object
ID              object
PosScore       float64
NegScore       float64
SynsetTerms     object
Gloss           object
dtype: object

In [5]:
#change swn IDs to match English wordnet ID format
swn['Number'] = swn['ID']
for i in range(len(swn)):
    swn['Number'][i] = swn['Number'][i] + '-' + swn['POS'][i]
swn.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swn['Number'][i] = swn['Number'][i] + '-' + swn['POS'][i]


TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [6]:
#check last row of swn
swn.tail()

Unnamed: 0,POS,ID,PosScore,NegScore,SynsetTerms,Gloss,Number
117655,v,2771888.0,0.0,0.125,fog_up#1,"get foggy; ""The windshield fogged up""",02771888-v
117656,v,2771997.0,0.0,0.0,coal#1 char#1,"burn to charcoal; ""Without a drenching rain, t...",02771997-v
117657,v,2772202.0,0.125,0.25,haze#1,"become hazy, dull, or cloudy",02772202-v
117658,v,2772310.0,0.125,0.0,deflagrate#1,cause to burn rapidly and with great intensity...,02772310-v
117659,,,,,#,,


In [7]:
#drop that row
swn = swn.drop(117659)

In [8]:
#add the three columns: English words, Positive, Negative, Objective scores to hwn
hwn['English'] = np.nan
hwn['Positive'] = np.nan
hwn['Negative'] = np.nan
hwn['Objective'] = np.nan

#change column names and drop some unneeded columns
hwn = hwn.rename(columns = {'# Wiktionary':'ID', 'http://wiktionary.org/':'Armenian'})
hwn = hwn.drop(['hye', 'CC BY-SA'], axis = 1)

#print the result
hwn.head()

Unnamed: 0,ID,Armenian,English,Positive,Negative,Objective
0,00014490-a,միանգամայն բավարար,,,,
1,00014490-a,լիառատ,,,,
2,00024996-a,նոր,,,,
3,00025470-a,թթվային,,,,
4,00029933-a,ագահ,,,,


In [9]:
#get the scores from swn
for i in range(len(swn)):
    score = hwn.loc[hwn['ID'] == (swn['Number'][i])]
    if score.empty == False:
        index = list(score.index)
        for ind in index:
            hwn['English'][ind] = swn['SynsetTerms'][i]
            hwn['Positive'][ind] = swn['PosScore'][i]
            hwn['Negative'][ind] = swn['NegScore'][i]
            hwn['Objective'][ind] = 1 - (swn['PosScore'][i] + swn['NegScore'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hwn['English'][ind] = swn['SynsetTerms'][i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hwn['Positive'][ind] = swn['PosScore'][i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hwn['Negative'][ind] = swn['NegScore'][

In [10]:
#check if any of the entries has no score
indexes = hwn['Positive'].index[hwn['Positive'].apply(np.isnan)]
print(indexes)

Int64Index([], dtype='int64')


In [11]:
#rename columns of hwn
hwn = hwn.rename(columns = {'ID':'HWN_Offset'})
hwn['SWN_Offset'] = hwn['HWN_Offset']
for i in range(len(hwn)):
    hwn['SWN_Offset'][i] = hwn['SWN_Offset'][i][:-2]
hwn = hwn[['HWN_Offset', 'SWN_Offset', 'Armenian', 'English', 'Positive', 'Negative', 'Objective']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hwn['SWN_Offset'][i] = hwn['SWN_Offset'][i][:-2]


In [12]:
#print the result
hwn.head()

Unnamed: 0,HWN_Offset,SWN_Offset,Armenian,English,Positive,Negative,Objective
0,00014490-a,14490,միանգամայն բավարար,rich#12 plentiful#2 plenteous#1 copious#2 ample#2,0.125,0.0,0.875
1,00014490-a,14490,լիառատ,rich#12 plentiful#2 plenteous#1 copious#2 ample#2,0.125,0.0,0.875
2,00024996-a,24996,նոր,new#11,0.0,0.125,0.875
3,00025470-a,25470,թթվային,acid#3,0.0,0.375,0.625
4,00029933-a,29933,ագահ,prehensile#3 greedy#1 grasping#1 grabby#1 cove...,0.0,0.0,1.0


In [13]:
#change how the words appear in the English column of hwn (to be more presentable)
for i in range(len(hwn)):
    s = hwn['English'][i]
    s = s.replace('#', '')
    s = re.sub(r'\d+', '', s)
    s = s.replace(' ', ',')
    hwn['English'][i] = s

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hwn['English'][i] = s


In [14]:
#print final result
hwn.head()

Unnamed: 0,HWN_Offset,SWN_Offset,Armenian,English,Positive,Negative,Objective
0,00014490-a,14490,միանգամայն բավարար,"rich,plentiful,plenteous,copious,ample",0.125,0.0,0.875
1,00014490-a,14490,լիառատ,"rich,plentiful,plenteous,copious,ample",0.125,0.0,0.875
2,00024996-a,24996,նոր,new,0.0,0.125,0.875
3,00025470-a,25470,թթվային,acid,0.0,0.375,0.625
4,00029933-a,29933,ագահ,"prehensile,greedy,grasping,grabby,covetous,ava...",0.0,0.0,1.0


In [15]:
#save the dataframe as a pickle file
with open('hwn.pickle', 'wb') as f:
    pickle.dump(hwn, f)
    
#save the dataframe as a text file (for those interested to read the results)
with open('hwn.txt', 'w', encoding = 'utf-8') as f:
    f.write(hwn.to_string())