In [72]:
import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")
from spacy.matcher import DependencyMatcher
import nltk.data
import nltk
import numpy as np

from spacy import displacy



## Using SpaCy Dependency Matcher to examine gendered verbs in Arthurian literture
### A few notes: 
- Link to the spaCy dependency matcher: https://spacy.io/api/dependencymatcher
- Link to the inspiration for this piece, from the pudding!! https://pudding.cool/2020/07/gendered-descriptions/

### First up: example/test case

In [None]:
# I made up this sentence
example = "He smote the knight, and she carefully walked over and thought about everything that had happened. The man ran to her and the queen cried. And the king was walking to the garden, and Guinevere saw that he was angry. He angrily thrust the sword"
# you need to turn your text into a 'doc' to use it in spacy
doc = nlp(example)

In [None]:
#trying to get all the verbs and their subjects

# RIGHT_ID should be a new node and
# LEFT_ID should already have have been declared in previous edges

pattern = [
  {
    "RIGHT_ID": "target",
    "RIGHT_ATTRS": {"POS": "VERB"}
  },
  {
    "LEFT_ID": "target",
    "REL_OP": ">",
    "RIGHT_ID": "subject",
    "RIGHT_ATTRS": {"DEP": "nsubj"}
  }
]



In [None]:
matcher = DependencyMatcher(nlp.vocab)
matcher.add("pattern", [pattern])
matches = matcher(doc)


In [None]:
#working, and I have the lemmas!!!!
# lemmas are the base form of the word. 
# Lemmas will make walk, walking, and walked all into walk
 
for match_id, elements in matcher(doc):
    pieces = [doc[e].lemma_ for e in elements]
    print(pieces, sep='\t')
    print('-----')

### Now that my test is working, I'll make this into a function 


In [None]:
#Now I'll put this in a pandas df
#I'll start by making a function so that I don't get confused

def collect(text):
    #cleaning the text:
    for char in ["\n", "\r", "\d", "\t"]:
        text = text.replace(char, " ")
    
    #making the doc: 
    doc = nlp(text)
    
    #making the pattern(verb + subject) 
    pattern = [
  {
    "RIGHT_ID": "target",
    "RIGHT_ATTRS": {"POS": "VERB"}
  },
  {
    "LEFT_ID": "target",
    "REL_OP": ">",
    "RIGHT_ID": "subject",
    "RIGHT_ATTRS": {"DEP": "nsubj"}
  }
]
    
    #setting up the matcher: 
    matcher = DependencyMatcher(nlp.vocab)
    matcher.add("pattern", [pattern])
    matches = matcher(doc)
    
    #printing stuff and making list!
    snippets = []
    for match_id, elements in matcher(doc):
        pieces = [doc[e].lemma_ for e in elements]
        snippets.append(pieces)
    return(snippets)

In [None]:
example_list = collect(example)

In [None]:
example_list
#ok, seems to be working! 

### Running the function on the real text
-  Note that there are rather strict limits on how long your text can be to put it in spacy. I had to split my books into 3 or 4 parts.
 - note that this step takes a little while so you'll want to save it after you're done to a csv 

In [None]:
#Let's try it with my real texts: 
mists1 = open('mists1.txt').read()
mists1_list = collect(mists1)

In [None]:
mists2 = open('mists2.txt').read()
mists2_list = collect(mists2)

In [None]:
mists3 = open('mists3.txt').read()
mists3_list = collect(mists3)

In [None]:
# appending all the lists together so that I can make it into a pandas df
for item in mists2_list:
    mists1_list.append(item)

In [None]:
for item in mists3_list:
    mists1_list.append(item)

In [None]:
mists_df = pd.DataFrame(mists1_list, columns = ['verb', 'subject'])

In [None]:
#important to save to csv as this step takes a bit!!!
mists_df.to_csv('mists_verbs.csv')

In [None]:
#now for malory
malory1 = open('malory1.txt').read()
malory1_list = collect(malory1)

In [None]:
malory2 = open('malory2.txt').read()
malory2_list = collect(malory2)

In [None]:
malory3 = open('malory3.txt').read()
malory3_list = collect(malory3)

In [None]:
malory4 = open('malory4.txt').read()
malory4_list = collect(malory4)

In [None]:
#just double checking the count
# that way I can check that I appended the list correctly 
print(len(malory1_list) + len(malory2_list) + len(malory3_list) + len(malory4_list))


In [None]:
for item in malory2_list:
    malory1_list.append(item)

In [None]:
for item in malory3_list:
    malory1_list.append(item)

In [None]:
for item in malory4_list:
    malory1_list.append(item)

In [None]:
malory_df = pd.DataFrame(malory1_list, columns = ['verb', 'subject'])

In [None]:
malory_df #df is same length so we know that the lists appended correctly

In [None]:
# important!!!
malory_df.to_csv('malory_verbs.csv')

### now that we have a couple of nice looking data frames, let's do some simple analysis.

In [2]:
malory_df = pd.read_csv('malory_verbs.csv')

In [3]:
malory_df

Unnamed: 0.1,Unnamed: 0,verb,subject
0,0,send,Pendragon
1,1,befell,it
2,2,hold,that
3,3,send,Uther
4,4,like,king
...,...,...,...
31237,31237,readeth,that
31238,31238,send,God
31239,31239,pray,I
31240,31240,pray,you


In [4]:
mists_df = pd.read_csv('mists_verbs.csv')

In [5]:
mists_df

Unnamed: 0.1,Unnamed: 0,verb,subject
0,0,become,she
1,1,acknowledgment,-malory
2,2,acknowledgment,d'Arthur
3,3,cite,I
4,4,give,who
...,...,...,...
42541,42541,smile,Morgaine
42542,42542,turn,she
42543,42543,need,she
42544,42544,need,she


In [None]:
# now to do some testing!
malory_df['subject'].value_counts().head(50)
#most common subjects: 'he', 'I', 'she', 'Launcelot', 'Tristram', 'Arthur'

In [None]:
mists_df['subject'].value_counts().head(50)
#here, the most common is 'she', then 'i', 'he', 'Morgaine', 'Arthur', 'Gwenhwyfar', 'Igraine'

In [None]:
#verbs!
malory_df['verb'].value_counts().head(30)
#say is up there, so is ride, take, depart, smite, slay, fight

In [None]:
mists_df['verb'].value_counts().head(30)
#think, lie, love, etc are higher here. 

### a bit of simple exploration of the data.

In [None]:
#Let's just look at what is most common for both
mists_df.value_counts().head(30)

In [None]:
malory_df.value_counts().head(30)

### Loading in my lists of gendered names and titles
Since I was only working with two books that I am familiar with, I made these lists manually. If you were working with hundreds of books you might want to find some sort of predetermined list with typical gendered names and titles (like sister, mother, Mary, Susan, queen, etc).

LMD

In [6]:
male_malory = ['Carados', 'earl', 'Melodias', 'nephew', 'Segwarides', 'Elias', 'Linet','Galahalt','Colgrevance', 'Epinogris','Kehydius', 'Gringamore', 'Nabon', 'Dinas', 'himself','Solomon', 'Andred', 'Lucan', 'Balan', 'Pelles', 'Joseph', 'Persant','Certes','Aglovale','Turquine', 'Pope', 'herald', 'clerk', 'Breunor','Lucius', 'Urre','yeoman','Archbishop', 'Lucas', 'Constantine', 'Meliagaunce', 'Darras', 'Plenorius', 'Brandiles', 'Ontzlake', 'Anglides', 'Blamore', 'Bromel', 'Harry', 'Maris', 'Safere', 'Priamus', 'father', 'Melias','Mador','Sadok','Persides', 'baron','son', 'duke', 'Lord','Bishop','Bedivere','Ulfius','Griflet','Ban', 'Ganis', 'Sagramore','father', 'Gouvernail','Bagdemagus','squire', 'Gramercy','Pelleas', 'Lionesse','Agravaine', 'Accolon', 'Taile', 'Lionel','fellow', 'Tor', 'lord', 'Palamides', 'sir', 'Alisander', 'Bleoberis', 'hermit','Uwaine','man', 'Marhaus', 'Lavaine', 'Meliagrance','Gaheris', 'Uwaine' 'lord', 'Balin', 'brother','Ector','Sir','he', 'Pendragon', 'Uther', 'Launcelot', 'Tristram', 'Arthur', 'Palomides', 'Gawaine', 'Mark', 'Bors', 'Dinadan', 'Galahad', 'Gareth', 'Lamorak', 'Percivale', 'Merlin', 'Galahad' ,'Kay', 'Beaumains', 'Lot', 'Uriens', 'Pellinore', 'Mordred', 'King', 'king', 'Knight', 'knight', 'Prince', 'prince',]

In [7]:
female_malory = ['maiden', 'wife', 'Lady', 'madam','Bragwaine', 'woman','daughter', 'sister', 'mother','Brisen', 'gentlewoman','Damosel','Madam', 'damosel', 'lady','she', 'Guenever', 'Isoud', 'Elaine', 'Margawse', 'Igraine', 'Nimue', 'Morgan', 'queen', 'Queen', 'princess', 'Princess']

Mists

In [8]:
female_mists = ['maiden', 'Isotta', 'Priscilla', 'abbess','midwife','her','nun','daughter','Columba','viviane','Mother', 'Maline', 'wife','woman', 'girl', 'Lady', 'priestess', 'sister', 'morgaine', 'urien', 'lady', 'mother', 'girl', 'herself','Brisen','she','Goddess','Morgaine', 'Gwenhwyfar', 'Viviane', 'Morgause', 'Igraine', 'Elaine', 'Niniane', 'Raven', 'Nimue', 'queen', 'Queen', 'princess', 'Princess' ]

In [9]:
male_mists = ['Archbishop', 'Pendragon', 'Gawan', 'Agravaine', 'husband', 'Ectorius', 'Griflet', 'Marcus', 'brother','Eian', 'gawaine', 'Ban', 'Joseph', 'Alienor', 'Drustan', 'Duke', 'himself', 'Northmen', 'Griffin', 'Mordred', 'lad', 'Lancelot','Lamorak', 'Meleas','monk','soldier','Leodegranz', 'bishop', 'lord', 'Pellinore', 'son', 'Balin', 'Patricius', 'Uwaine', 'gwydion', 'boy','Uwiane', 'Meleagrant', 'lancelet', 'Ambrosius', 'Balan', 'Cai', 'Avalloch', 'father', 'man', 'Taliesin', 'priest', 'Accolon', 'Lot','he','Gwydion', 'Galahad', 'Gareth', 'Gawaine', 'Uriens', 'Merlin', 'Kevin', 'Gorlois', 'Uther', 'Lancelet', 'Arthur', 'King', 'king', 'Knight', 'knight', 'Prince', 'prince']

### Matching subject with gender. 
1. Run the function which goes to the 'subject' column, checks if the subject is in the male or female list. If it is, the function adds either 'male' or 'female' to the gender column. If not, it adds 'unknown' to the gender column. 
2. Use value_counts() to examine all subjects that are labeled as 'unknown'. I went from the most mentioned subjects to the least mentioned. If 'monk' or 'knight' or 'king' or 'Gawain' were listed as 'unknown', I would add those words to the male list and then rerun the function. I repeated this until all of the unknown gender subjects were actually unknown to me (I, we, they, people, horses).

### Malory 

In [10]:
# Function to determine gender based on the name
def determine_gender(name):
    if name in female_malory:
        return 'female'
    elif name in male_malory:
        return 'male'
    else:
        return 'unknown'  # You can customize this if there are names not found in the lists.

# Apply the function to create a new 'gender' column
malory_df['gender'] = malory_df['subject'].apply(determine_gender)

# Display the resulting dataframe
malory_df

Unnamed: 0.1,Unnamed: 0,verb,subject,gender
0,0,send,Pendragon,male
1,1,befell,it,unknown
2,2,hold,that,unknown
3,3,send,Uther,male
4,4,like,king,male
...,...,...,...,...
31237,31237,readeth,that,unknown
31238,31238,send,God,unknown
31239,31239,pray,I,unknown
31240,31240,pray,you,unknown


In [11]:
counts = malory_df[malory_df['gender'] == 'unknown']['subject'].value_counts()
#I would change counts > ? to look at different parts of the unknown list
counts = counts[counts > 12]
#I would change this to head or tail to look at the beginning or the end of the list
counts.tail(60)

subject
ye           1411
that         1039
we            553
it            433
you           222
thou          200
which         157
God           117
horse         105
Nay           103
one            77
hath           74
hast           74
alas           72
shalt          72
none           70
what           66
Jesu           63
who            56
this           46
anon           42
people         40
dwarf          39
bor            39
sword          34
all            33
some           32
other          31
ah             29
fie            28
thee           28
Fay            28
day            27
heart          26
art            24
blood          24
either         24
many           23
wilt           23
spear          22
yea            22
?              21
Lo             20
party          19
ship           19
book           19
adventure      19
messenger      19
Ye             18
host           18
voice          17
wit            17
shield         17
everych        16
word           16
Al

### Mists

In [12]:
# Function to determine gender based on the name
def determine_gender(name):
    if name in female_mists:
        return 'female'
    elif name in male_mists:
        return 'male'
    else:
        return 'unknown'  # You can customize this if there are names not found in the lists.

# Apply the function to create a new 'gender' column
mists_df['gender'] = mists_df['subject'].apply(determine_gender)

# Display the resulting dataframe
mists_df

Unnamed: 0.1,Unnamed: 0,verb,subject,gender
0,0,become,she,female
1,1,acknowledgment,-malory,unknown
2,2,acknowledgment,d'Arthur,unknown
3,3,cite,I,unknown
4,4,give,who,unknown
...,...,...,...,...
42541,42541,smile,Morgaine,female
42542,42542,turn,she,female
42543,42543,need,she,female
42544,42544,need,she,female


In [13]:
counts = mists_df[mists_df['gender'] == 'unknown']['subject'].value_counts()
#I would change the counts > number in order to view different parts of the list of unknown subjects
counts = counts[counts > 20]
counts.tail(60)

subject
they          1331
it            1140
who           1041
we             837
that           346
which          309
one            226
God            213
what           185
eye            123
voice          113
hand           104
face            99
this            69
child           69
people          68
time            61
Saxons          57
sun             56
none            54
all             51
hair            51
some            47
day             46
body            42
world           42
folk            41
something       37
word            35
thing           35
heart           35
someone         35
moon            31
nothing         31
tear            31
anyone          31
life            31
Avalon          30
Gods            29
mind            29
thought         29
whatever        28
Christians      27
Christ          27
light           27
head            27
sword           26
servant         25
other           25
mist            25
horse           24
land            24
bloo

### Here is just some exploratory stuff looking at popular verbs that men and women use. 


In [14]:
mists_df[mists_df['gender'] == 'female']['verb'].value_counts().head(30)

verb
say         1634
see          597
know         588
think        567
feel         456
have         427
look         281
hear         253
come         240
do           228
go           212
take         158
tell         132
bear         130
give         124
ask          122
wonder       122
speak        122
sit          119
remember     118
lie          115
make         112
stand        105
turn         105
want          98
find          90
whisper       84
put           84
smile         83
shake         79
Name: count, dtype: int64

In [None]:
mists_df[mists_df['gender'] == 'male']['verb'].value_counts().head(30)

In [None]:
malory_df[malory_df['gender'] == 'male']['verb'].value_counts().head(30)
#men are smiting, fighting, jousting, falling

In [None]:
malory_df[malory_df['gender'] == 'female']['verb'].value_counts().head(30)
#women are dying, crying, loving, weeping, and praying

In [None]:
## Mists: 31242 total. 14498 is unknown. 

In [None]:
## Malory: 42546 total #20581 unknown

### Now I am going to remove all rows with unknown gender. 
I choose to do this because the subject of the unknown gendered verbs are locations,people, animals, objects, or subjects such as I, we, and they. Since I want to look at the gendering of words, these rows are not important

In [37]:
#here I create a filtered dataframe that does not include the verbs with unknown gender
mists_df_filtered = mists_df[mists_df['gender'] != 'unknown'].copy()


In [46]:
mists_df_filtered.value_counts('gender')

gender
female    13213
male       8752
Name: count, dtype: int64

In [16]:
#here I create a filtered dataframe that does not include the verbs with unknown gender
malory_df_filtered = malory_df[malory_df['gender'] != 'unknown'].copy()


In [None]:
malory_df_filtered.value_counts('gender')
#male: 14596
#female: 2148

### Next I will make a df that has: 
    - one row for each verb
    - a column for the female verb usage
    - a column for the male verb usage
    - a column for the total verb usage


In [17]:

# Group by 'verb' and 'gender' columns, then count the occurrences
malory_grouped = malory_df_filtered.groupby(['verb', 'gender']).size().reset_index(name='count')

# Pivot the table to get male and female counts as separate columns
malory_pivot_table = malory_grouped.pivot(index='verb', columns='gender', values='count').reset_index()

# Fill NaN values with 0, in case some verbs were not used by a particular gender
malory_pivot_table = malory_pivot_table.fillna(0)

# Calculate the total count of each verb
malory_pivot_table['total_count'] = malory_pivot_table['male'] + malory_pivot_table['female']



### Mists

In [39]:

# Group by 'verb' and 'gender' columns, then count the occurrences
mists_grouped = mists_df_filtered.groupby(['verb', 'gender']).size().reset_index(name='count')

# Pivot the table to get male and female counts as separate columns
mists_pivot_table = mists_grouped.pivot(index='verb', columns='gender', values='count').reset_index()

# Fill NaN values with 0, in case some verbs were not used by a particular gender
mists_pivot_table = mists_pivot_table.fillna(0)

# Calculate the total count of each verb
mists_pivot_table['total_count'] = mists_pivot_table['male'] + mists_pivot_table['female']


### Making pctF and pctM columns
I will take the number of times the verb appeared for females and divide it by the total # of female verb appearances. And the same for male. 
- Malory female verbs: 2148
- Malory male verbs: 16744

- Mists female verbs: 13213
- mists male verbs: 8752
Then, I will calculate the skew with the following equation: 
- if pctM > pctf, then skew = pctM/ pctF, else skew = -pctF /pctM

These equations came from the Pudding: https://pudding.cool/2020/07/gendered-descriptions/

In [18]:
malory_pivot_table['pctF'] = malory_pivot_table['female'] / 2148

In [19]:
malory_pivot_table['pctM'] = malory_pivot_table['male'] / 14596

In [20]:
# Initialize the 'skew' column to 0

#calculating skew: if pctM > pctf, then skew = pctM/ pctf, else skew = -pctf /pctm
malory_pivot_table['skew'] = 0

# Calculate the skew for each row
for index, row in malory_pivot_table.iterrows():
    pctM = row['pctM']
    pctF = row['pctF']

    if pctM > pctF:
        if pctF == 0:
            malory_pivot_table.loc[index, 'skew'] = 0
        else:
            malory_pivot_table.loc[index, 'skew'] = pctM / pctF
    else:
        if pctM == 0:
            malory_pivot_table.loc[index, 'skew'] = 0
        else:
            malory_pivot_table.loc[index, 'skew'] = -1 * pctF / pctM

In [None]:
malory_pivot_table.sort_values('skew').head(60)

#Cool! why is kill used more for women? Interesting! 
#rejoice, nourish, scorn, rebuke, bleed, heal, conjour, laugh, complain

In [None]:
malory_pivot_table.sort_values('skew').tail(60)
#smite, lol

In [21]:
#adding an empty column for y axis so that I don't need to use the y axis
malory_pivot_table['yaxis'] = 0

In [22]:
#saving as a csv so that I don't have to run all the code again!

malory_pivot_table.to_csv('malory_verbs_plot.csv')

In [23]:
malory_pivot_table = pd.read_csv('malory_verbs_plot.csv')

In [24]:
malory_pivot_table

Unnamed: 0.1,Unnamed: 0,verb,female,male,total_count,pctF,pctM,skew,yaxis
0,0,abash,1.0,3.0,4.0,0.000466,0.000206,-2.265053,0
1,1,abate,1.0,2.0,3.0,0.000466,0.000137,-3.397579,0
2,2,abide,7.0,24.0,31.0,0.003259,0.001644,-1.981921,0
3,3,abiden,0.0,1.0,1.0,0.000000,0.000069,0.000000,0
4,4,abideth,0.0,1.0,1.0,0.000000,0.000069,0.000000,0
...,...,...,...,...,...,...,...,...,...
755,755,yede,4.0,14.0,18.0,0.001862,0.000959,-1.941474,0
756,756,yield,1.0,29.0,30.0,0.000466,0.001987,4.267745,0
757,757,yode,2.0,2.0,4.0,0.000931,0.000137,-6.795158,0
758,758,yonder,0.0,1.0,1.0,0.000000,0.000069,0.000000,0


### Now for mists

In [47]:
mists_pivot_table['pctF'] = mists_pivot_table['female'] / 13213
mists_pivot_table['pctM'] = mists_pivot_table['male'] / 8752

In [48]:
# Initialize the 'skew' column to 0

#calculating skew: if pctM > pctf, then skew = pctM/ pctf, else skew = -pctf /pctm
mists_pivot_table['skew'] = 0

# Calculate the skew for each row
for index, row in mists_pivot_table.iterrows():
    pctM = row['pctM']
    pctF = row['pctF']

    if pctM > pctF:
        if pctF == 0:
            mists_pivot_table.loc[index, 'skew'] = 0
        else:
            mists_pivot_table.loc[index, 'skew'] = pctM / pctF
    else:
        if pctM == 0:
            mists_pivot_table.loc[index, 'skew'] = 0
        else:
            mists_pivot_table.loc[index, 'skew'] = -1 * pctF / pctM

In [49]:
#adding an empty column for y axis so that I don't need to use the y axis
mists_pivot_table['yaxis'] = 0

In [55]:
mists_pivot_table.sort_values('skew').head(60)

gender,verb,female,male,total_count,pctF,pctM,skew,yaxis
1137,wonder,122.0,2.0,124.0,0.009233,0.000229,-40.405056,0
750,realize,55.0,2.0,57.0,0.004163,0.000229,-18.215394,0
400,feel,456.0,21.0,477.0,0.034511,0.002399,-14.383064,0
754,recall,18.0,1.0,19.0,0.001362,0.000114,-11.922803,0
899,sink,13.0,1.0,14.0,0.000984,0.000114,-8.610913,0
642,note,12.0,1.0,13.0,0.000908,0.000114,-7.948536,0
412,flee,12.0,1.0,13.0,0.000908,0.000114,-7.948536,0
480,guess,12.0,1.0,13.0,0.000908,0.000114,-7.948536,0
329,dream,21.0,2.0,23.0,0.001589,0.000229,-6.954969,0
357,endure,10.0,1.0,11.0,0.000757,0.000114,-6.62378,0


### Choosing which words to put on my graph
The Pudding has about 50 on their word graphs, so I want to try to stay below that. 

To start, let's only get the verbs that have at least total mentions. If we don't do this, we get really high skews for words that might be used only 1 or 2 times for both women and men. 

Next, we'll only grab the very polarized words: the verbs that have a skew lower than -2 or higher than 2. 

In [52]:
#ok, now we only have the words that have at least 10 mentions 
# i picked this number arbitrarily
malory_smaller = malory_pivot_table[malory_pivot_table['total_count'] > 10 ].copy()

In [26]:
#now let's try to make it smaller - 
#we'll just use words that have a skew smaller than -2 and larger than 2
malory_smaller2 = malory_smaller[(malory_smaller['skew'] < -2) | (malory_smaller['skew'] > 2)]


In [27]:
len(malory_smaller2)
#46 is a good number 


46

### Mists

In [56]:
mists_smaller = mists_pivot_table[mists_pivot_table['total_count'] > 10 ].copy()

In [60]:
mists_smaller2 = mists_smaller[(mists_smaller['skew'] < -2.5) | (mists_smaller['skew'] > 2.5)]

In [62]:
len(mists_smaller2) 
#54 sounds good

54

In [64]:
mists_smaller2

gender,verb,female,male,total_count,pctF,pctM,skew,yaxis
104,betray,6.0,22.0,28.0,0.000454,0.002514,5.535611,0
155,cast,23.0,6.0,29.0,0.001741,0.000686,-2.539116,0
158,cease,10.0,2.0,12.0,0.000757,0.000229,-3.31189,0
175,chuckle,11.0,21.0,32.0,0.000833,0.002399,2.882178,0
176,claim,3.0,8.0,11.0,0.000227,0.000914,4.025899,0
184,close,14.0,3.0,17.0,0.00106,0.000343,-3.091097,0
259,cut,13.0,3.0,16.0,0.000984,0.000343,-2.870304,0
284,deserve,3.0,8.0,11.0,0.000227,0.000914,4.025899,0
285,desire,9.0,19.0,28.0,0.000681,0.002171,3.18717,0
294,die,44.0,81.0,125.0,0.00333,0.009255,2.779243,0


### Next steps: 
1. Create chart as a scatterplot in Datawrapper. 
2. Download SVG + Edit in illustrator 

In [28]:
malory_smaller2.sort_values('total_count').to_clipboard()
#Sending this to my clipboard for copying into Datawrapper 

In [67]:
malory_smaller2.to_csv('malory_smaller2.csv')

In [69]:
mists_smaller2.sort_values('skew').to_clipboard()


In [66]:
mists_smaller2.to_csv('mists_smaller2.csv')


In [76]:
mists_smaller2['total_count'] == mists_smaller2['total_count']
                        

104     True
155     True
158     True
175     True
176     True
184     True
259     True
284     True
285     True
294     True
329     True
330     True
357     True
400     True
404     True
412     True
414     True
426     True
480     True
498     True
552     True
554     True
585     True
601     True
615     True
635     True
642     True
643     True
651     True
672     True
687     True
690     True
721     True
734     True
750     True
754     True
781     True
782     True
824     True
854     True
860     True
875     True
878     True
883     True
885     True
899     True
956     True
1012    True
1021    True
1036    True
1061    True
1127    True
1128    True
1137    True
Name: total_count, dtype: bool

# Deciding on font size
I used log to make the font sizes managable. Since some words were used 11 times and others were used 400 times, I had to figure out a way to size them in an understandable way. 

In [80]:
mists_smaller2['log_total_count'] = np.log2(mists_smaller2[ 'total_count'])
 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mists_smaller2['log_total_count'] = np.log2(mists_smaller2[ 'total_count'])


In [100]:
mists_smaller2['font_size'] = mists_smaller2['log_total_count'] * 4.2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mists_smaller2['font_size'] = mists_smaller2['log_total_count'] * 4.2


In [102]:
mists_smaller2['font_size'] = mists_smaller2['font_size'].round(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mists_smaller2['font_size'] = mists_smaller2['font_size'].round(1)


In [103]:
mists_smaller2.sort_values('skew')

gender,verb,female,male,total_count,pctF,pctM,skew,yaxis,log_total_count,font_size
1137,wonder,122.0,2.0,124.0,0.009233,0.000229,-40.405056,0,6.954196,29.2
750,realize,55.0,2.0,57.0,0.004163,0.000229,-18.215394,0,5.83289,24.5
400,feel,456.0,21.0,477.0,0.034511,0.002399,-14.383064,0,8.897845,37.4
754,recall,18.0,1.0,19.0,0.001362,0.000114,-11.922803,0,4.247928,17.8
899,sink,13.0,1.0,14.0,0.000984,0.000114,-8.610913,0,3.807355,16.0
480,guess,12.0,1.0,13.0,0.000908,0.000114,-7.948536,0,3.70044,15.5
412,flee,12.0,1.0,13.0,0.000908,0.000114,-7.948536,0,3.70044,15.5
642,note,12.0,1.0,13.0,0.000908,0.000114,-7.948536,0,3.70044,15.5
329,dream,21.0,2.0,23.0,0.001589,0.000229,-6.954969,0,4.523562,19.0
357,endure,10.0,1.0,11.0,0.000757,0.000114,-6.62378,0,3.459432,14.5


### Creating a scatterplot in datawrapper
1. I used datawrapper to create a scatterplot that put the verb skew on the x axis and 0 on the y axis 
    - I made a column called yaxis with 0s in it
    - Then, I manually changed some of the yaxis values for words that had a similar skew, so that they were not completely overlapping.
2. I sized the bubbles using the total verb count. 
3. Under annotations in datawrapper, I added the verb label to every dot (I had to click on each one manually to add it. It is annoying but it works)
4. I downloaded it as an svg


### Editing in illustrator
1. I didn't know how to size the font appropriately since some words had a total count of 12 and some (ahem, smite) had a total count of 360. 
    - I solved this in an annoying way. I used the ruler on illustrator to measure every circle. The smallest one was 7 pixels and the largest was 31 pixels. 
    - The font size I would use was the # of pixels + 5. So the smallest circle, the word with the smallest total count, had a font size of 12, and the largest had a font size of 37. 
    - For the second graph that I made, I actually took the log of the counts. The sizes of the words were about the same as when I measured the pixels of the scatterplot. 
    - I removed the background of the graph and the x and y axis and added my own lines. 
    - I moved my resized words around (never to the left or right, because that would mess with the skew, but I would move them up or down, as the y axis holds no meaning. 
    - I deleted all of the circles
    - I changed the colors and fonts of all the words
    - I made a longer mobile version of the graph so that 
 

In [1]:
#what is the one woman who uses the verb smote??? 
#
malory_df.loc[(malory_df['verb'] == 'smite') &
              (malory_df['gender'] == 'female')
               ]

NameError: name 'malory_df' is not defined

The section with smite being used by a woman: 

And in the meanwhile the damosel said to Sir Mordred: I ween my foolish
knight be either slain or taken prisoner: then were they ware where he
came riding. And when he was come unto them he told all how he had sped
and escaped in despite of them all: And some of the best of them will
tell no tales. Thou liest falsely, said the damosel, that dare I make
good, but as a fool and a dastard to all knighthood they have let thee
pass. That may ye prove, said La Cote Male Taile. With that she sent a
courier of hers, that rode alway with her, for to know the truth of
this deed; and so he rode thither lightly, and asked how and in what
manner that La Cote Male Taile was escaped out of the castle. Then all
the knights cursed him, and said that he was a fiend and no man: For he
hath slain here twelve of our best knights, and we weened unto this day
that it had been too much for Sir Launcelot du Lake or for Sir Tristram
de Liones. And in despite of us all he is departed from us and maugre
our heads.

With this answer the courier departed and came to Maledisant his lady,
and told her all how Sir La Cote Male Taile had sped at the Castle
Orgulous. Then she smote down her head, and said little. By my head,
said Sir Mordred to the damosel, ye are greatly to blame so to rebuke
him, for I warn you plainly he is a good knight, and I doubt not but he
shall prove a noble knight; but as yet he may not yet sit sure on
horseback, for he that shall be a good horseman it must come of usage
and exercise. But when he cometh to the strokes of his sword he is then
noble and mighty, and that saw Sir Bleoberis and Sir Palomides, for wit
ye well they are wily men of arms, and anon they know when they see a
young knight by his riding, how they are sure to give him a fall from
his horse or a great buffet. But for the most part they will not light
on foot with young knights, for they are wight and strongly armed. For
in likewise Sir Launcelot du Lake, when he was first made knight, he
was often put to the worse upon horseback, but ever upon foot he
recovered his renown, and slew and defoiled many knights of the Round
Table. And therefore the rebukes that Sir Launcelot did unto many
knights causeth them that be men of prowess to beware; for often I have
seen the old proved knights rebuked and slain by them that were but
young beginners. Thus they rode sure talking by the way together.

Here leave we off a while of this tale, and speak we of Sir Launcelot
du Lake.


In [36]:
 malory_smaller2[malory_smaller2['verb'] == 'smite']

Unnamed: 0.1,Unnamed: 0,verb,female,male,total_count,pctF,pctM,skew,yaxis
610,610,smite,1.0,359.0,360.0,0.000466,0.024596,52.831735,0
