In [107]:
# Import spacy
import spacy

# Install the English language model through spaCy's Python API instead of the
# `!spacy` shell command: the shell resolves `spacy` from PATH, which is not
# guaranteed to be the environment this kernel is running in
spacy.cli.download('en_core_web_sm')



Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [108]:
# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing package
import plotly.express as px


In [109]:
# Create empty lists for file names and contents
texts = []
file_names = []

# Iterate through each file in the folder
for _file_name in os.listdir('txt_files'):
    # Look for only text files
    if _file_name.endswith('.txt'):
        # Read the file inside a context manager so the handle is closed as
        # soon as its contents have been read (a bare open(...).read() leaves
        # closing the file to the garbage collector)
        with open(os.path.join('txt_files', _file_name), 'r', encoding='utf-8') as _file:
            texts.append(_file.read())
        # Append name of each file to file name list, keeping it aligned with `texts`
        file_names.append(_file_name)

In [110]:
# Create dictionary object associating each file name with its text:
# the keys become the DataFrame column names, the lists become the column values
d = {'Filename':file_names,'Text':texts}

In [111]:
# Turn dictionary into a dataframe with one row per text file
# (columns: 'Filename' and 'Text')
lyrics = pd.DataFrame(d)


In [112]:
# Preview the first rows to check that file names and texts were loaded
lyrics.head()


Unnamed: 0,Filename,Text
0,tisthedamnseason.txt,’tis the damn season Lyrics[Verse 1]\nIf I wan...
1,evermore.txt,evermore Lyrics[Verse 1: Taylor Swift]\nGray N...
2,happiness.txt,"happiness Lyrics[Verse 1]\nHoney, when I'm abo..."
3,tolerateit.txt,tolerate it Lyrics[Verse 1]\nI sit and watch y...
4,willow.txt,willow Lyrics[Verse 1]\nI'm like the water whe...


In [113]:
# Remove extra whitespace from the lyrics: every run of whitespace characters
# (spaces, tabs, newlines) becomes a single space, and leading/trailing
# whitespace is stripped.
# The source column is `lyrics['Text']`; the previous version read from
# `paper_df`, a name that is never defined in this notebook.
lyrics['Text'] = lyrics['Text'].str.replace(r'\s+', ' ', regex=True).str.strip()
lyrics.head()


Unnamed: 0,Filename,Text
0,tisthedamnseason.txt,’tis the damn season Lyrics[Verse 1] If I want...
1,evermore.txt,evermore Lyrics[Verse 1: Taylor Swift] Gray No...
2,happiness.txt,"happiness Lyrics[Verse 1] Honey, when I'm abov..."
3,tolerateit.txt,tolerate it Lyrics[Verse 1] I sit and watch yo...
4,willow.txt,willow Lyrics[Verse 1] I'm like the water when...


In [114]:
# Remove the .txt extension from each file name.
# The pattern is a regex, so the dot is escaped (an unescaped '.' matches any
# character) and anchored with '$' so only a trailing '.txt' is removed
lyrics['Filename'] = lyrics['Filename'].str.replace(r'\.txt$', '', regex=True)

In [115]:
import spacy
# Load the small English pipeline; `nlp` is the object later cells call on text
nlp = spacy.load('en_core_web_sm')

In [116]:
# `nlp` was already loaded in the previous cell, so it is reused here rather
# than loading the same model from disk a second time

# Check what functions it performs
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [117]:
# Define example sentence (includes quotes and a question mark)
sentence = "This is 'an' example? sentence"

# Call the nlp model on the sentence; the result is stored in `doc`
doc = nlp(sentence)

In [118]:
# Wrapper around the loaded pipeline so it can be handed to DataFrame.apply
def process_text(text):
    """Run the module-level `nlp` pipeline on `text` and return its result."""
    processed = nlp(text)
    return processed

In [119]:
# Apply the function to the "Text" column, so that the nlp pipeline is called on
# each song's lyrics; the results are stored in a new "Doc" column
lyrics['Doc'] = lyrics['Text'].apply(process_text)

In [120]:
# Collect the text of every token in a doc object
def get_token(doc):
    """Return a list holding the `.text` of each token in `doc`, in order."""
    token_texts = []
    for tok in doc:
        token_texts.append(tok.text)
    return token_texts

In [121]:
# Run the token retrieval function on the doc objects in the dataframe,
# storing each song's list of token strings in a new "Tokens" column
lyrics['Tokens'] = lyrics['Doc'].apply(get_token)
lyrics.head()

Unnamed: 0,Filename,Text,Doc,Tokens
0,tisthedamnseason,’tis the damn season Lyrics[Verse 1] If I want...,"(’, tis, the, damn, season, Lyrics[Verse, 1, ]...","[’, tis, the, damn, season, Lyrics[Verse, 1, ]..."
1,evermore,evermore Lyrics[Verse 1: Taylor Swift] Gray No...,"(evermore, Lyrics[Verse, 1, :, Taylor, Swift, ...","[evermore, Lyrics[Verse, 1, :, Taylor, Swift, ..."
2,happiness,"happiness Lyrics[Verse 1] Honey, when I'm abov...","(happiness, Lyrics[Verse, 1, ], Honey, ,, when...","[happiness, Lyrics[Verse, 1, ], Honey, ,, when..."
3,tolerateit,tolerate it Lyrics[Verse 1] I sit and watch yo...,"(tolerate, it, Lyrics[Verse, 1, ], I, sit, and...","[tolerate, it, Lyrics[Verse, 1, ], I, sit, and..."
4,willow,willow Lyrics[Verse 1] I'm like the water when...,"(willow, Lyrics[Verse, 1, ], I, 'm, like, the,...","[willow, Lyrics[Verse, 1, ], I, 'm, like, the,..."


In [122]:
# Copy only the "Text" and "Tokens" columns into a separate DataFrame so the
# cleaned text and its tokens can be compared side by side
tokens = lyrics[['Text', 'Tokens']].copy()
tokens.head()

Unnamed: 0,Text,Tokens
0,’tis the damn season Lyrics[Verse 1] If I want...,"[’, tis, the, damn, season, Lyrics[Verse, 1, ]..."
1,evermore Lyrics[Verse 1: Taylor Swift] Gray No...,"[evermore, Lyrics[Verse, 1, :, Taylor, Swift, ..."
2,"happiness Lyrics[Verse 1] Honey, when I'm abov...","[happiness, Lyrics[Verse, 1, ], Honey, ,, when..."
3,tolerate it Lyrics[Verse 1] I sit and watch yo...,"[tolerate, it, Lyrics[Verse, 1, ], I, sit, and..."
4,willow Lyrics[Verse 1] I'm like the water when...,"[willow, Lyrics[Verse, 1, ], I, 'm, like, the,..."


In [123]:
# Collect the lemma of every token in a doc object
def get_lemma(doc):
    """Return a list holding the `.lemma_` of each token in `doc`, in order."""
    lemmas = []
    for tok in doc:
        lemmas.append(tok.lemma_)
    return lemmas

# Run the lemma retrieval function on the doc objects in the dataframe,
# storing each song's list of lemmas in a new "Lemmas" column
lyrics['Lemmas'] = lyrics['Doc'].apply(get_lemma)

In [124]:
# Count exact matches of the lowercase string 'write' in each song's list and
# sum over all songs, once for the raw tokens and once for the lemmas.
# One loop with a real f-string replaces two near-identical lines that mixed
# placeholder-less f-strings with str() concatenation; the printed text is unchanged.
for column, label in (('Tokens', 'text tokens'), ('Lemmas', 'lemmas')):
    n_write = lyrics[column].apply(lambda words: words.count('write')).sum()
    print(f'"Write" appears in the {label} column {n_write} times.')

"Write" appears in the text tokens column 5 times.
"Write" appears in the lemmas column 5 times.


In [125]:
# Define a function to retrieve part-of-speech tags from a doc object
def get_pos(doc):
    """Return one `(pos_, tag_)` tuple per token in `doc`, in order.

    `pos_` is the coarse-grained part of speech and `tag_` the fine-grained one.
    """
    tag_pairs = []
    for tok in doc:
        tag_pairs.append((tok.pos_, tok.tag_))
    return tag_pairs

# Run the part-of-speech retrieval function on the doc objects in the dataframe,
# storing each song's list of (coarse, fine) tag pairs in a new "POS" column
lyrics['POS'] = lyrics['Doc'].apply(get_pos)

In [126]:
# Create a list of part of speech tags: one inner list of (coarse, fine) tuples per song
# NOTE(review): this displays the tags for every token of every song, which
# produces very long output; consider viewing a slice instead
list(lyrics['POS'])

[[('PUNCT', '``'),
  ('VERB', 'VBP'),
  ('DET', 'DT'),
  ('ADJ', 'JJ'),
  ('NOUN', 'NN'),
  ('NOUN', 'NN'),
  ('X', 'LS'),
  ('PUNCT', '-RRB-'),
  ('SCONJ', 'IN'),
  ('PRON', 'PRP'),
  ('VERB', 'VBD'),
  ('PART', 'TO'),
  ('VERB', 'VB'),
  ('PRON', 'WP'),
  ('PRON', 'PRP'),
  ('AUX', 'VBD'),
  ('VERB', 'VBG'),
  ('ADP', 'IN'),
  ('SCONJ', 'IN'),
  ('PRON', 'PRP'),
  ('AUX', 'VBD'),
  ('VERB', 'VBN'),
  ('PUNCT', ','),
  ('PRON', 'PRP'),
  ('AUX', 'MD'),
  ('AUX', 'VB'),
  ('VERB', 'VBN'),
  ('PRON', 'PRP'),
  ('PRON', 'PRP'),
  ('AUX', 'VBZ'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('ADJ', 'JJ'),
  ('PUNCT', ','),
  ('VERB', 'VBP'),
  ('ADP', 'RP'),
  ('NOUN', 'NN'),
  ('NOUN', 'NN'),
  ('CCONJ', 'CC'),
  ('PRON', 'PRP'),
  ('VERB', 'VBD'),
  ('PRON', 'PRP'),
  ('SCONJ', 'WRB'),
  ('PRON', 'PRP'),
  ('VERB', 'VBD'),
  ('PRON', 'PRP'),
  ('PRON', 'EX'),
  ('VERB', 'VBZ'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('PRON', 'PRP'),
  ('PUNCT', ','),
  ('VERB', '

In [127]:
# Look up spaCy's description of the fine-grained tag "IN"
spacy.explain("IN")

'conjunction, subordinating or preposition'

In [128]:
# Keep only the tokens that are tagged as proper nouns
def extract_proper_nouns(doc):
    """Return the `.text` of each token in `doc` whose `pos_` is 'PROPN'."""
    proper_nouns = []
    for tok in doc:
        if tok.pos_ != 'PROPN':
            continue
        proper_nouns.append(tok.text)
    return proper_nouns

# Apply function to Doc column and store resulting proper nouns in new
# "Proper_Nouns" column (one list of strings per song)
lyrics['Proper_Nouns'] = lyrics['Doc'].apply(extract_proper_nouns)

In [129]:
# Collect the proper-noun lists for the row labelled 3 and for the last row
# NOTE(review): `result` is assigned but not displayed in this cell
result = list(lyrics.loc[[3, lyrics.index[-1]], 'Proper_Nouns'])

In [130]:
# Get all NE labels from the pipeline's "ner" component and assign to variable
labels = nlp.get_pipe("ner").labels

# Print each label and its description, formatted as "LABEL : description"
for label in labels:
    print(label + ' : ' + spacy.explain(label))

CARDINAL : Numerals that do not fall under another type
DATE : Absolute or relative dates or periods
EVENT : Named hurricanes, battles, wars, sports events, etc.
FAC : Buildings, airports, highways, bridges, etc.
GPE : Countries, cities, states
LANGUAGE : Any named language
LAW : Named documents made into laws.
LOC : Non-GPE locations, mountain ranges, bodies of water
MONEY : Monetary values, including unit
NORP : Nationalities or religious or political groups
ORDINAL : "first", "second", etc.
ORG : Companies, agencies, institutions, etc.
PERCENT : Percentage, including "%"
PERSON : People, including fictional
PRODUCT : Objects, vehicles, foods, etc. (not services)
QUANTITY : Measurements, as of weight or distance
TIME : Times smaller than a day
WORK_OF_ART : Titles of books, songs, etc.


In [131]:
# Collect the entity label of every named entity found in a doc object
def extract_named_entities(doc):
    """Return the `.label_` of each entity in `doc.ents`, in order."""
    entity_labels = []
    for entity in doc.ents:
        entity_labels.append(entity.label_)
    return entity_labels

# Apply function to Doc column and store resulting named entity labels in new
# "Named_Entities" column, then display that column
lyrics['Named_Entities'] = lyrics['Doc'].apply(extract_named_entities)
lyrics['Named_Entities']

0     [CARDINAL, DATE, CARDINAL, NORP, DATE, ORG, DA...
1     [CARDINAL, PERSON, DATE, DATE, PERSON, PERSON,...
2     [CARDINAL, CARDINAL, PERSON, TIME, DATE, CARDI...
3     [CARDINAL, CARDINAL, CARDINAL, NORP, CARDINAL,...
4     [CARDINAL, PERSON, DATE, CARDINAL, PERSON, PER...
5     [CARDINAL, CARDINAL, PERSON, WORK_OF_ART, DATE...
6     [CARDINAL, NORP, PERSON, CARDINAL, CARDINAL, D...
7     [CARDINAL, PERSON, ORG, PERSON, PERSON, GPE, P...
8                  [CARDINAL, CARDINAL, PERSON, PERSON]
9     [CARDINAL, PERSON, CARDINAL, CARDINAL, CARDINA...
10    [CARDINAL, DATE, ORG, CARDINAL, CARDINAL, DATE...
11    [PERSON, CARDINAL, GPE, ORG, CARDINAL, GPE, OR...
12             [ORG, ORG, CARDINAL, ORG, CARDINAL, ORG]
13    [CARDINAL, ORG, PERSON, LOC, PERSON, PERSON, C...
14    [CARDINAL, PERSON, DATE, TIME, PERSON, ORG, DA...
15    [CARDINAL, PRODUCT, ORG, CARDINAL, PERSON, ORG...
16    [CARDINAL, PERSON, CARDINAL, PERSON, LOC, PERS...
Name: Named_Entities, dtype: object

In [132]:
# Collect the entities themselves (rather than their labels) from a doc object.
# NOTE: this reuses the name `extract_named_entities`, so from this cell on it
# replaces the earlier label-returning function of the same name.
def extract_named_entities(doc):
    """Return the items of `doc.ents` as a list, in order.

    The entity objects are returned as-is, not converted to strings.
    """
    return list(doc.ents)

# Apply function to Doc column and store the resulting entities in new
# "NE_Words" column, then display that column
lyrics['NE_Words'] = lyrics['Doc'].apply(extract_named_entities)
lyrics['NE_Words']

0     [(1), (the, weekend), (2), (Methodist), (the, ...
1     [(1), (Taylor, Swift, ]), (November), (July), ...
2     [(1), (2), (Sorry), (midnight), (seven, years)...
3                   [(1), (1), (2), (polish), (1), (2)]
4     [(1), (Anywhere), (', 90s), (3), (Anywhere), (...
5     [(1), (2), (Nevеr), (Pre, -, Chorus), (autumn)...
6     [(1), (Tarnished), (Grieving), (2), (3), (Spri...
7     [(1), (Dorothea), (Honey), (Pre, -, Chorus), (...
8                       [(1), (2), (Staying), (Guilty)]
9     [(1), (Dancin), (one), (one), (2), (one), (one...
10    [(1), (2, ], Twenty, years), (Post, -, Chorus,...
11    [(l​ong, story, short), (1), (Clung), (Post, -...
12    [(Eyes), (anticipatin), (1), (anticipatin), (2...
13    [(1), (Strangers), (Matches), (Pages), (Pre, -...
14    [(1), (Taylor, Swift, ]), (Tuesday), (night), ...
15    [(1), (Champagne), (Champagne), (2), (Chorus, ...
16    [(1), (Taylor, Swift, ]), (two), (Taylor, Swif...
Name: NE_Words, dtype: object

In [133]:
# Extract the Doc object stored under index label 1 (this is the second row,
# because the DataFrame's default index starts at 0)
doc = lyrics['Doc'][1]

# Visualize named entity tagging in a single song
displacy.render(doc, style='ent', jupyter=True)

In [134]:
# Save DataFrame as csv named 'ts-lyrics.csv'
# The path is relative, so the file is written to the current working directory
# NOTE(review): on Colab the working directory would presumably need to be a
# mounted Google Drive folder for the file to end up in Drive — confirm
lyrics.to_csv('ts-lyrics.csv')