# Add plurals

In [3]:
# load terms from file with pandas

import pandas as pd
import numpy as np
import nltk
import inflect

# load terms from file
path_to_terms = '../words/replacements.csv'
df_sg = pd.read_csv(path_to_terms, sep=',')
df_sg.head()

Unnamed: 0,word,variant,frequency,category,affix_type,replacement
0,spokesman,spokesman,18072.0,man,suffix,spokesperson
1,girlfriend,girlfriend,8436.0,girl,prefix,partner
2,boyfriend,boyfriend,6069.0,boy,prefix,partner
3,spokeswoman,spokeswoman,5731.0,woman,suffix,spokesperson
4,congressman,congressman,1702.0,man,suffix,congressperson


In [4]:
df_sg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335 entries, 0 to 334
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   word         335 non-null    object 
 1   variant      335 non-null    object 
 2   frequency    300 non-null    float64
 3   category     335 non-null    object 
 4   affix_type   335 non-null    object 
 5   replacement  335 non-null    object 
dtypes: float64(1), object(5)
memory usage: 15.8+ KB


In [5]:
df_sg.affix_type.value_counts()

affix_type
suffix    282
prefix     53
Name: count, dtype: int64

In [6]:
df_sg['number'] = 'SG'
df_sg.head()

Unnamed: 0,word,variant,frequency,category,affix_type,replacement,number
0,spokesman,spokesman,18072.0,man,suffix,spokesperson,SG
1,girlfriend,girlfriend,8436.0,girl,prefix,partner,SG
2,boyfriend,boyfriend,6069.0,boy,prefix,partner,SG
3,spokeswoman,spokeswoman,5731.0,woman,suffix,spokesperson,SG
4,congressman,congressman,1702.0,man,suffix,congressperson,SG


In [7]:
# get all words that end in -man
man_words = df_sg[(df_sg['category'] == 'man') & (df_sg['affix_type'] == 'suffix')]['word']
woman_words = df_sg[(df_sg['category'] == 'woman') & (df_sg['affix_type'] == 'suffix')]['word']

In [8]:
# replace -man with -woman
man_words_cut = [word[:-3]+'woman' for word in man_words]

In [9]:
# words that have a -woman ending but not a -man ending
for w in woman_words:
    if w not in man_words_cut:
        print(w)

charwoman
man-woman
anti-woman


In [55]:
excluded = ['manass']
for index, row in df_sg.iterrows():
    if row['word'] != row['variant'] and row['variant'] not in excluded:
        print(row['word'], row['variant'])
        new_row = {'word': row['variant'], 
               'variant': row['variant'], 
               'frequency': np.nan,
               'category': row['category'], 
               'affix_type': row['affix_type'], 
               'replacement': row['replacement'],
               'number': row['number'],
               'source': row['source']}
        df_sg.loc[len(df_sg)] = new_row

anti-woman antiwoman
cabin-boy cabin boy
con-boy conboy
mad-man mad man
man-cave man cave
man-cession mancession
man-friend manfriend
man-hater man hater
man-hating man hating
man-hug man hug
man-hunt man hunt
man-kind mankind
man-magnet man magnet
man-marking man marking
man-ny manny
man-pack manpack
man-power man power
man-servant man servant
man-stealing manstealing
man-stopper manstopper
man-up man up
man-woman manwoman
pot-boy potboy
spy-master spy master


In [10]:
df_sg.tail()

Unnamed: 0,word,variant,frequency,category,affix_type,replacement,number
330,statesmanship,statesmanship,,manship,suffix,public management skill,SG
331,stockmanship,stockmanship,,manship,suffix,livestock raising skill,SG
332,swordsmanship,swordsmanship,,manship,suffix,fencing skill,SG
333,upmanship,upmanship,,manship,suffix,competitive behavior,SG
334,workmanship,workmanship,,manship,suffix,workpersonship,SG


In [11]:
m = inflect.engine()
# input word to be pluralized 
example = 'man-bun'
# Pass the word to be pluralized as an argument to the plural() 
# function to make the word plural.
print(f"The plural form of {example} is: ", m.plural(example))

The plural form of man-bun is:  man-buns


In [13]:
# add plurals
plural_rows = []
jjs = ['boyish', 'girlish', 'man-made', 'manmade']

for index, row in df_sg.iterrows():
    replacement = row['replacement'].split(', ')
    word = row['word']
    pos = nltk.pos_tag([word])[0][1]
    if word in jjs:
        pass
    elif pos == 'NN':
        word_pl = m.plural(word)
        replacement_pl = ', '.join([m.plural(r) for r in replacement])
    elif pos == 'RB': # adverbs such as 'womanly'
        pass
    new_row = {'word': word_pl, 
               'variant': word_pl,
               'frequency': np.nan,
               'category': row['category'],
                'affix_type': row['affix_type'], 
               'replacement': replacement_pl, 
               'number': 'PL'}
    plural_rows.append(new_row)

df_plural = pd.DataFrame(plural_rows)

In [14]:
df_plural

Unnamed: 0,word,variant,frequency,category,affix_type,replacement,number
0,spokesmen,spokesmen,,man,suffix,spokespeople,PL
1,girlfriends,girlfriends,,girl,prefix,partners,PL
2,boyfriends,boyfriends,,boy,prefix,partners,PL
3,spokeswomen,spokeswomen,,woman,suffix,spokespeople,PL
4,congressmen,congressmen,,man,suffix,congresspeople,PL
...,...,...,...,...,...,...,...
330,statesmanships,statesmanships,,manship,suffix,public management skills,PL
331,stockmanships,stockmanships,,manship,suffix,livestock raising skills,PL
332,swordsmanships,swordsmanships,,manship,suffix,fencing skills,PL
333,upmanships,upmanships,,manship,suffix,competitive behaviors,PL


In [15]:
# concatenate plural to original df
df = pd.concat([df_sg, df_plural])
df.sort_values(by=['category', 'word'], inplace=True)

In [16]:
df.head()

Unnamed: 0,word,variant,frequency,category,affix_type,replacement,number
156,ballboy,ballboy,5.0,boy,suffix,ball person,SG
156,ballboys,ballboys,,boy,suffix,ball people,PL
157,batboy,batboy,5.0,boy,suffix,bat person,SG
157,batboys,batboys,,boy,suffix,bat people,PL
193,bellboy,bellboy,3.0,boy,suffix,bellhop,SG


In [17]:
print(len(df))
df.drop_duplicates(inplace=True, subset=['word', 'variant'])
print(len(df))

670
659


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 659 entries, 156 to 227
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   word         659 non-null    object 
 1   variant      659 non-null    object 
 2   frequency    299 non-null    float64
 3   category     659 non-null    object 
 4   affix_type   659 non-null    object 
 5   replacement  659 non-null    object 
 6   number       659 non-null    object 
dtypes: float64(1), object(6)
memory usage: 41.2+ KB


In [19]:
df.head(60)

Unnamed: 0,word,variant,frequency,category,affix_type,replacement,number
156,ballboy,ballboy,5.0,boy,suffix,ball person,SG
156,ballboys,ballboys,,boy,suffix,ball people,PL
157,batboy,batboy,5.0,boy,suffix,bat person,SG
157,batboys,batboys,,boy,suffix,bat people,PL
193,bellboy,bellboy,3.0,boy,suffix,bellhop,SG
193,bellboys,bellboys,,boy,suffix,bellhops,PL
98,boyband,boyband,13.0,boy,prefix,band,SG
98,boybands,boybands,,boy,prefix,bands,PL
2,boyfriend,boyfriend,6069.0,boy,prefix,partner,SG
2,boyfriends,boyfriends,,boy,prefix,partners,PL


In [22]:
save_path = '../words/replacements+plural.csv'
df.to_csv(save_path, index=False)

In [21]:
df_sg[df_sg['word'] == 'girlish'.lower()]['replacement'].values[0]

'feminine'

In [23]:
final_path = '../words/replacements+plural-final.csv'

final_df = pd.read_csv(final_path)
print(final_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692 entries, 0 to 691
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   word         692 non-null    object
 1   replacement  692 non-null    object
 2   category     692 non-null    object
 3   affix_type   692 non-null    object
 4   number       692 non-null    object
dtypes: object(5)
memory usage: 27.2+ KB
None
