## Word Verification with BabelNet

In [2]:
import re
import pandas as pd
import json

import babelnet as bn
from babelnet.language import Language
from babelnet.pos import POS

2023-12-18 15:08:30,143 [babelnet.conf] INFO: Loaded configuration from ['/home/bartlm/tryout/code/babelnet_conf.yml']
2023-12-18 15:08:30,147 [babelnet.api] INFO: BabelNet online RESTful API v1.1.0


In [3]:
def get_variants(word:str):
    new_variants = []
    # if word contains a dash or space, create variant without & with the respective other
    if '-' in word or ' ' in word:
        # remove non-word character from string with regex
        new_variants.append(re.sub(r'[^a-zA-Z]', '', word))

        # if word contains a space, create variant with dash
        if ' ' in word:
            new_variants.append(word.replace(' ', '-'))
        # if word contains a dash, create variant with space
        if '-' in word:
            new_variants.append(word.replace('-', ' '))
    
    return [word] + new_variants

In [4]:
def verify_wordlist_bn(wordlist):
    processed = dict()
    # go through all of the words
    for word in wordlist:
    # check if word is in already done
        if word in processed.keys():
            continue
        # if not, check if word has synsets
        else:
            synsets = bn.get_synsets(word, from_langs=[Language.EN])
            # if yes, add word to done as (word, word)
            if synsets:
                processed[word] = word
            # if not, check if there are variants
            else:
                variants = get_variants(word)
                # if there are variants, check each variant
                if len(variants) > 1:
                    for v in variants[1:]:
                        synsets = bn.get_synsets(v, from_langs=[Language.EN])
                        # if a variant exists, add it to list of done as (word, variant)
                        if synsets:
                            processed[word] = v
                # if there are no variants, continue
    return processed


## Manship

In [8]:
manship = ['airmanship',
 'batsmanship',
 'brinkmanship',
 'brinksmanship',
 'cellarmanship',
 'chairmanship',
 'chartsmanship',
 'churchmanship',
 'craftmanship',
 'craftsmanship',
 'debtsmanship',
 'draftsmanship',
 'draughtsmanship',
 'foremanship',
 'gamesmanship',
 'gentlemanship',
 'godmanship',
 'grantsmanship',
 'handcraftsmanship',
 'horsemanship',
 'journeymanship',
 'lifemanship',
 'manship',
 'marksmanship',
 'oarsmanship',
 'onedownmanship',
 'oneupmanship',
 'oneupsmanship',
 'outdoorsmanship',
 'penmanship',
 'pitchmanship',
 'policemanship',
 'punmanship',
 'quizmanship',
 'quotemanship',
 'salesmanship',
 'seamanship',
 'showmanship',
 'spearmanship',
 'specmanship',
 'sportsmanship',
 'statesmanship',
 'stateswomanship',
 'stockmanship',
 'successmanship',
 'swordsmanship',
 'teamsmanship',
 'turfmanship',
 'upmanship',
 'upsmanship',
 'workitsmanship',
 'workmanship',
 'workwomanship']

In [9]:
manship_verified = verify_wordlist_bn(manship)

In [10]:
print(f'before: {len(manship)} after: {len(manship_verified)}')

before: 53 after: 32


In [12]:
print(manship_verified.keys())

dict_keys(['airmanship', 'batsmanship', 'brinkmanship', 'brinksmanship', 'chairmanship', 'churchmanship', 'craftmanship', 'craftsmanship', 'draftsmanship', 'draughtsmanship', 'foremanship', 'gamesmanship', 'gentlemanship', 'grantsmanship', 'handcraftsmanship', 'horsemanship', 'journeymanship', 'lifemanship', 'manship', 'marksmanship', 'oarsmanship', 'oneupmanship', 'penmanship', 'salesmanship', 'seamanship', 'showmanship', 'sportsmanship', 'statesmanship', 'stockmanship', 'swordsmanship', 'upmanship', 'workmanship'])


### Suffixes

In [6]:
suffix_word_file = '../words/suffixes.txt'
suffix_words = pd.read_csv(suffix_word_file, names=['word', 'count', 'cat'])
# add new empty columns for variants at index 3
suffix_words.insert(3, 'variant', '')

suffix_words.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   word     422 non-null    object
 1   count    422 non-null    int64 
 2   cat      422 non-null    object
 3   variant  422 non-null    object
dtypes: int64(1), object(3)
memory usage: 13.3+ KB


In [7]:
suffix_words.head()

Unnamed: 0,word,count,cat,variant
0,spokesman,18072,man,
1,congressman,1702,man,
2,businessman,1588,man,
3,policeman,1155,man,
4,freshman,412,man,


In [5]:
processed_words = verify_wordlist_bn(suffix_words['word'].tolist())

In [72]:
# add processed words to dataframe
for word in processed_words.keys():
    suffix_words.loc[suffix_words['word'] == word, 'variant'] = processed_words[word]

In [73]:
# remove rows with empty variant
suffix_words_verified = suffix_words[suffix_words['variant'] != '']
suffix_words_verified.head()

Unnamed: 0,word,count,cat,variant
0,spokesman,18072,man,spokesman
1,congressman,1702,man,congressman
2,businessman,1588,man,businessman
3,policeman,1155,man,policeman
4,freshman,412,man,freshman


In [74]:
# save to file
suffix_words_verified.to_csv('../words/verified_suffixes.csv', header=True)

In [5]:
suffix_words_verified = pd.read_csv('../words/verified_suffixes.csv', index_col=0)

#### Words left behind

In [11]:
# words that were not verified
for t in set(suffix_words_verified['word']) ^ set(suffix_words['word']):
    print(t,end=', ')

twigman, crateman, musicman, bullyboys, dudesman, backwoodswoman, chatirygirl, hateboy, horseboys, smallpoxgirl, boygirl, ombudman, teeman, taxgirl, crewmen, safeman, posterboys, homosextranswoman, gagboy, weathermen, coverman, windsorgirl, browngirl, badboys, eyeman, spokeman, factorygirl, sofaman, lesbiman, spiderboy, londongirl, posterboy, funboy, furnituregirl, squintygirl, hangedman, bigboy, netsman, buttboy, boigirl, sickman, machogirl, pro-woman, tenordrumman, valleygirl, girlboy, pushman, antibusinessman, craftsmen, penisman, non-girl, ploughboys, craftman, stickwoman, trannygirl, postergirl, axman, tradman, enterdaveman, countrygirl, breakupgirl, cameragirl, harmlessfangirl, sissyboy, badboy, marblecargirl, wingwoman, beatboxman, trannyboy, marketingman, watergirl, dirtboy, paceman, stickboy, tarpman, coastman, idiotboy, cityboy, homosextransman, aspergirl, mainman, racketboy, 

In [121]:
print(suffix_words_verified['word'].tolist())

['spokesman', 'congressman', 'businessman', 'policeman', 'freshman', 'fisherman', 'cameraman', 'statesman', 'defenseman', 'madman', 'frontman', 'fireman', 'postman', 'baseman', 'strongman', 'foreman', 'craftsman', 'ombudsman', 'hitman', 'superman', 'caveman', 'batsman', 'lineman', 'workman', 'barman', 'showman', 'newsman', 'pitchman', 'serviceman', 'defenceman', 'snowman', 'watchman', 'marksman', 'sportsman', 'hangman', 'weatherman', 'mailman', 'taxman', 'tribesman', 'wingman', 'batman', 'seaman', 'anchorman', 'conman', 'assemblyman', 'draftsman', 'crewman', 'freeman', 'journeyman', 'gentlemen', 'militiaman', 'horseman', 'headman', 'motorman', 'linesman', 'swordsman', 'bondsman', 'lawman', 'warehouseman', 'midshipman', 'yeoman', 'jokerman', 'spaceman', 'workingman', 'kinsman', 'corpsman', 'bluesman', 'woodsman', 'newspaperman', 'guardsman', 'frontiersman', 'boatman', 'triggerman', 'iceman', 'policemen', 'dairyman', 'freedman', 'stallman', 'stockman', 'longshoreman', 'bagman', 'swingman

### Prefixes

In [75]:
prfx_json = '../words/prefixes.json'
prefixes = json.load(open(prfx_json, 'r'))

In [76]:
# turn into df
prefixes_rowwise = []
for cat, word_dict in prefixes.items():
    for word, count in word_dict.items():
        prefixes_rowwise.append([cat, word, count])

prefixes_df = pd.DataFrame(prefixes_rowwise, columns=['cat', 'word', 'count'])

# add new empty column at position 2
prefixes_df.insert(2, 'variant', '')

In [77]:
prefixes_df.head()

Unnamed: 0,cat,word,variant,count
0,man,man-boobs,,11
1,man,man-marking,,6
2,man,man-made,,200
3,man,man-fasting,,4
4,man,man-ape,,9


In [78]:
prefixes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   cat      186 non-null    object
 1   word     186 non-null    object
 2   variant  186 non-null    object
 3   count    186 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 5.9+ KB


In [35]:
prefixes_verified = verify_wordlist_bn(prefixes_df['word'].tolist())

In [79]:
# add variants to df
for orig, variant in prefixes_verified.items():
    prefixes_df.loc[prefixes_df['word'] == orig, 'variant'] = variant

# reduce df to only those with variants
prefixes_df_verified = prefixes_df[prefixes_df['variant'] != '']

In [80]:
len(prefixes_df_verified)

88

In [81]:
prefixes_df_verified.to_csv('../words/verified_prefixes.csv', header=True)

#### Words left behind

In [66]:
# words that were not verified
set(prefixes_verified.keys()) ^ set(prefixes_df['word'])

{"boy's",
 'boyCan',
 'boyLocation',
 'boyShow',
 'boychick',
 'boycoott',
 'boycottchina',
 'boycottexxonmobil',
 'boycottfox',
 'boycottvista',
 'boycottwas',
 'boyeee',
 'boyeeeeee',
 'boyeeeez',
 'boyeez',
 'boyfrienddom',
 'boyfriendhave',
 'boyfrind',
 'boygirl',
 'boygrrl',
 'boyie',
 'boynu',
 'boynunda',
 'boynunu',
 'boyon',
 'boyos',
 'boysand',
 'boysfirsttime',
 'boysroom',
 'boyut',
 'boyutlarda',
 'boywear',
 'boywho',
 'boywithstick',
 'boyworld',
 'girlMore',
 'girland',
 'girlboi',
 'girlboy',
 'girlcotting',
 'girlcrush',
 'girlfiend',
 'girlfirend',
 'girlfried',
 'girlfrield',
 'girlfrien',
 'girlfriendd',
 'girlinblack',
 'girllie',
 'girlsCute',
 'girlsandon',
 'girlsof',
 'man-God',
 'man-advantage',
 'man-and',
 'man-bib',
 'man-bra',
 'man-bracelet',
 'man-brain',
 'man-building',
 'man-bush',
 'man-but',
 'man-certificate',
 'man-chick',
 'man-children',
 'man-clit',
 'man-compatibility',
 'man-fashion',
 'man-fasting',
 'man-flesh',
 'man-food',
 'man-hand',

In [112]:
prefixes_df_verified.word.tolist()

['man-boobs',
 'man-marking',
 'man-made',
 'man-ape',
 'man-pack',
 'man-hater',
 'man-eater',
 'man-bag',
 'man-ass',
 'man-crush',
 'man-child',
 'man-stealing',
 'man-woman',
 'man-machine',
 'man-cession',
 'man-hours',
 'man-fest',
 'man-code',
 'man-eating',
 'man-hour',
 'man-hating',
 'man-hug',
 'man-up',
 'man-year',
 'man-thing',
 'man-cat',
 'man-stopper',
 'man-love',
 'man-dog',
 'man-power',
 'man-month',
 'man-servant',
 'man-rating',
 'man-boy',
 'man-db',
 'man-magnet',
 'man-whore',
 'man-cave',
 'man-He',
 'man-trap',
 'man-friend',
 'man-hunt',
 'man-hood',
 'man-page',
 'man-slaughter',
 'man-kind',
 'man-mark',
 'womankind',
 'womanist',
 'womanly',
 'womanism',
 'girlfriend',
 'girliness',
 'girlie',
 'girlfag',
 'girlish',
 'girldom',
 'girlvinyl',
 'girlification',
 'girlishly',
 'girlfight',
 'girlpower',
 'girlcott',
 'girllove',
 'boyfriend',
 'boyle',
 'boyne',
 'boyar',
 'boyband',
 'boyish',
 'boying',
 'boyscout',
 'boyness',
 'boyism',
 'boyishly',
 '

In [113]:
not_people = {'boycot', 'boyardee','boyun', 'boyce', 'boynton', 'boyne', 'boyle'}
# remove rows in which word starts with word in not_people
prefixes_df_verified = prefixes_df_verified[~prefixes_df_verified['word'].str.startswith(tuple(not_people))]

In [114]:
len(prefixes_df_verified)

75

## Join Prefixes and Suffixes

In [115]:
# join suffixes and prefixes into one df
prefixes_df_verified['affix_type'] = 'prefix'
suffix_words_verified['affix_type'] = 'suffix'

assert list(prefixes_df_verified.columns) == list(suffix_words_verified.columns)

Index(['word', 'variant', 'count', 'cat', 'affix_type'], dtype='object') Index(['word', 'count', 'cat', 'variant', 'affix_type'], dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prefixes_df_verified['affix_type'] = 'prefix'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  suffix_words_verified['affix_type'] = 'suffix'


In [116]:
# adjust the oder of columns
prefixes_df_verified = prefixes_df_verified[['word', 'variant', 'count', 'cat', 'affix_type']]
suffixes_df_verified = suffix_words_verified[['word', 'variant', 'count', 'cat', 'affix_type']]

In [118]:
# concatenate dataframes
affixes_df = pd.concat([suffixes_df_verified, prefixes_df_verified], ignore_index=True)

In [119]:
affixes_df.columns

Index(['word', 'variant', 'count', 'cat', 'affix_type'], dtype='object')

In [120]:
affixes_df.to_csv('/home/bartlm/tryout/words/verified_affixes.csv', header=True)