In [1]:
import csv
from collections import defaultdict
from pathlib import Path

from gensim.corpora.textcorpus import TextCorpus
from gensim.parsing.preprocessing import preprocess_string
from gensim import utils
import spacy

In [2]:
# Load the spaCy pipeline "en_core_web_lg" once; later cells reuse it as `nlp`.
nlp = spacy.load("en_core_web_lg")

In [26]:
# Run the pipeline on a short sample text so its named entities can be inspected.
doc = nlp("David wrote some of her own songs apparently. is a test document. Google sponsored them for this investment.")

In [27]:
# Show the entity spans found in the sample document.
doc.ents

(David,)

In [28]:
# List each detected entity in the sample document with its label.
for entity_span in doc.ents:
    surface_text = entity_span.text
    entity_label = entity_span.label_
    print(surface_text, entity_label)

David PERSON


In [30]:
# Fetch the pretrained word vector for "cat" (GloVe, per the original note):
# resolve the string to its vocab id, then index the vector table with it.
cat_id = nlp.vocab.strings["cat"]
cat_vec = nlp.vocab.vectors[cat_id]
print(cat_vec.shape)

(300,)


In [7]:
class CSVCorpus(TextCorpus):
    """Gensim text corpus backed by a CSV file that has a ``text`` column.

    ``self.input`` is used as the path of the CSV file. Every data row
    contributes one document, taken from that row's ``text`` field.
    """

    def getstream(self):
        """Yield the raw ``text`` field of each CSV row, in file order.

        Raises:
            KeyError: if the CSV header has no ``text`` column.
        """
        # newline="" hands line-ending handling to the csv module, as its
        # documentation requires; without it, newlines embedded in quoted
        # fields (common in lyrics) may be misread on some platforms.
        with open(Path(self.input), newline="") as csv_file:
            for row in csv.DictReader(csv_file):
                yield row["text"]

    def get_docs(self):
        """Yield one spaCy ``Doc`` per CSV row, in file order.

        Uses the module-level ``nlp`` pipeline. Rows are streamed through
        ``nlp.pipe`` so spaCy can batch them instead of being invoked once
        per document; the yielded docs and their order are unchanged.
        """
        yield from nlp.pipe(self.getstream())

    def __len__(self):
        """Return the number of documents (CSV data rows) in the corpus.

        The count is computed on first use and cached in ``self.length``.
        Rows are counted from the raw stream, so no tokenisation is done
        just to obtain a length.
        """
        # NOTE(review): assumes get_texts() yields exactly one item per
        # getstream() row (true for the stock TextCorpus) -- confirm if
        # get_texts is ever overridden.
        if getattr(self, "length", None) is None:
            self.length = sum(1 for _ in self.getstream())
        return self.length

In [8]:
# Location of the lyrics CSV, relative to the notebook's working directory.
datapath = Path("data/songdata.csv")
# Corpus wrapper that streams the CSV's "text" column (see CSVCorpus.getstream).
lyrics_corpus = CSVCorpus(datapath)

In [9]:
# Tally how often each PERSON entity string occurs across every song.
entity_counter = {}
for lyrics_doc in lyrics_corpus.get_docs():
    # Only entities labelled PERSON are counted; everything else is skipped.
    person_names = (span.text for span in lyrics_doc.ents if span.label_ == "PERSON")
    for person_name in person_names:
        entity_counter[person_name] = entity_counter.get(person_name, 0) + 1

In [10]:
# The 50 most frequent (entity, count) pairs, highest count first.
top_50 = sorted(
    entity_counter.items(),
    key=lambda pair: -pair[1],
)[:50]

In [11]:
# Report each top entity together with its observation count.
for entity_name, observation_count in top_50:
    print(f"Named Entity: {entity_name}\tNumber Observations: {observation_count}")

Named Entity: Jesus	Number Observations: 2909
Named Entity: Mary	Number Observations: 642
Named Entity: Said	Number Observations: 630
Named Entity: Johnny	Number Observations: 616
Named Entity: Kiss	Number Observations: 505
Named Entity: Santa Claus	Number Observations: 472
Named Entity: Jack	Number Observations: 448
Named Entity: Joe	Number Observations: 422
Named Entity: God	Number Observations: 302
Named Entity: Billy	Number Observations: 291
Named Entity: John	Number Observations: 274
Named Entity: Feelin	Number Observations: 271
Named Entity: Deep	Number Observations: 270
Named Entity: Sun	Number Observations: 266
Named Entity: Nigga	Number Observations: 247
Named Entity: Sweet	Number Observations: 238
Named Entity: Jesus Christ	Number Observations: 237
Named Entity: Lucy	Number Observations: 230
Named Entity: Sittin	Number Observations: 216
Named Entity: Sally	Number Observations: 212
Named Entity: Jimmy	Number Observations: 210
Named Entity: Charlie	Number Observations: 206
Name