In [112]:
import pandas as pd
import spacy
from spacy.lang.en import English
import tracery
import os
import shutil



In [113]:

nlp = spacy.load('en_core_web_md')
nlp_en = English()

In [12]:
headlines = pd.read_csv('../headlines.csv')
headline_list = headlines.headline.to_list()
headline_str = str(headline_list)
doc = nlp(headline_str)

In [25]:
sentences = list(doc.sents)
words = [w for w in list(doc) if w.is_alpha]
noun_chunks = list(doc.noun_chunks)
entities = list(doc.ents)
nouns = [w for w in words if w.pos_ == "NOUN"]
verbs = [w for w in words if w.pos_ == "VERB"]
adjs = [w for w in words if w.pos_ == "ADJ"]
advs = [w for w in words if w.pos_ == "ADV"]

In [26]:
print("len of —")
print(f"sentences: {len(sentences)}")
print(f"words: {len(words)}")
print(f"noun chunks: {len(noun_chunks)}")
print(f"entities: {len(entities)}")
print("- - -")
print(f"nouns: {len(nouns)}")
print(f"verbs: {len(verbs)}")
print(f"adjs: {len(adjs)}")
print(f"advs: {len(advs)}")


len of —
sentences: 32
words: 336
noun chunks: 111
entities: 53
- - -
nouns: 17
verbs: 37
adjs: 4
advs: 7


In [20]:
import random
for item in random.sample(noun_chunks, 5):
    print(item.text.strip().replace("\n", " "))
    print()

Gastric Sleeve

California Soccer Player

Depo

Beaver' Star Dead

RHONY' Star Sonja Morgan



In [21]:
sentence_strs = [item.text for item in doc.sents]

In [22]:
random.sample(sentence_strs, 10)

['If Pregnancy Unplanned", " \'RHONY\' Star Sonja Morgan Lists NYC Townhouse ...',
 "But I'm OK!!!",
 "Involving Players & Fans', ' Lana Del Rey Gets TRO Against Alleged Stalker ...",
 "For Mental Health Break ', ' Mama June Honey Boo Boo Too Young For Gastric Sleeve!!!",
 '\', " \'RHOA\' STAR SHEREE WHITFIELD',
 'For 1st Speech Back in D.C.',
 'Since Jan. 6\', " Kevin Hart Shames Ex-NFLer Over Nudist Colony Hobby With Wife ... \'You Hear Whatchu Sayin\'!?!\'',
 "'I'm Jamie Motherf***ing Foxx!!!'",
 "At Cowboys Camp ', ' Buzz Aldrin Apollo 11 Jacket Sells For $2.7 Mil ... Space Artifact Record!!!', ' Donald Trump Protesters Flood Hotel ...",
 '", " Jim Harbaugh I\'m Willing To Raise My Players\' Baby ...']

In [30]:
for item in random.sample(nouns, 5): # change "nouns" to "verbs" or "adjs" or "advs" to sample from those lists!
    print(item.text)

Message
Shoot
Cops
Customers
Sayin


In [31]:
people = [e for e in entities if e.label_ == "PERSON"]
locations = [e for e in entities if e.label_ == "LOC"]
times = [e for e in entities if e.label_ == "TIME"]

In [35]:
from collections import Counter
word_count = Counter([w.text for w in words])

In [37]:
word_count['Jamie']

2

In [38]:
word_count.most_common(10)

[('For', 6),
 ('To', 5),
 ('I', 5),
 ('My', 4),
 ('Star', 4),
 ('In', 3),
 ('At', 3),
 ('Wife', 3),
 ('He', 3),
 ('at', 3)]

In [68]:
with open("words.txt", "w") as fh:
    fh.write("\n".join([w.text for w in words]))
shutil.move("words.txt", "data")

'data'

In [84]:
# saves a list of spacy values to a text file in ../data
def save_spacy_list(filename, t):
    with open(filename, "w") as fh:
        fh.write("\n".join([item.text for item in t]))
    shutil.move(filename, "../data")

In [119]:
spacy_values = [
    "noun_chunks",
    "entities",
    "words",
    "adjs",
    "advs",
    "verbs",
    "nouns"
]

In [83]:
save_spacy_list("words.txt", words)
save_spacy_list("noun_chunks.txt", noun_chunks)
save_spacy_list("entities.txt", entities)
save_spacy_list("adjs.txt", adjs)
save_spacy_list("advs.txt", advs)
save_spacy_list("verbs.txt", verbs)
save_spacy_list("nouns.txt", nouns)

In [85]:
def save_counter_tsv(filename, counter, limit=1000):
    with open(filename, "w") as outfile:
        outfile.write("key\tvalue\n")
        for item, count in counter.most_common():
            outfile.write(item.strip() + "\t" + str(count) + "\n")
    shutil.move(filename, "../data")

In [86]:
save_counter_tsv("100_common_words.tsv", word_count, 100)

In [87]:
people_counter = Counter([e.text.lower() for e in people])
save_counter_tsv("people_count.tsv", people_counter, 100)

In [89]:
for word in random.sample(words, 12):
    print(word.text, "→", word.lemma_)

Prez → Prez
I → I
TRO → tro
Rights → Rights
Says → say
Cops → cop
Dead → Dead
Bari → Bari
Car → car
Aldrin → Aldrin
Cover → Cover
Lear → Lear


In [90]:
sentence = random.choice(sentences)
for word in sentence:
    print(word.text)

Despite
Facebook
Death
Post
"
,
'
Helicopter
Tragedy
Man
Dies
Walking
into
Blades
...


In [91]:
for item in random.sample(words, 24):
    print(item.text, "/", item.pos_, "/", item.tag_)

I / PRON / PRP
Zelensky / PROPN / NNP
Jerry / PROPN / NNP
Tony / PROPN / NNP
OK / ADJ / JJ
SHEREE / PROPN / NNP
Renaissance / PROPN / NNP
My / PRON / PRP$
Pregnancy / PROPN / NNP
it / PRON / PRP
Trump / PROPN / NNP
NFLer / PROPN / NNP
Dow / PROPN / NNP
Buy / VERB / VBP
Maybach / PROPN / NNP
at / ADP / IN
Despite / SCONJ / IN
I / PRON / PRP
Again / ADV / RB
Throws / VERB / VBZ
Involving / VERB / VBG
Kanye / PROPN / NNP
If / SCONJ / IN
Record / PROPN / NNP


In [93]:
spacy.explain('SCONJ')

'subordinating conjunction'

In [94]:
only_past = [item.text for item in doc if item.tag_ == 'VBN']

In [100]:
random.sample(only_past, 2)

['Deposed', 'Alleged']

In [101]:
only_plural = [item.text for item in doc if item.tag_ == 'NNS']

In [None]:
sent = random.choice(sentences)
print("Original sentence:", sent.text.replace("\n", " "))
for word in sent:
    print()
    print("Word:", word.text)
    print("Tag:", word.tag_)
    print("Head:", word.head.text)
    print("Dependency relation:", word.dep_)
    print("Children:", list(word.children))

- nsubj: this word's head is a verb, and this word is itself the subject of the verb
- nsubjpass: same as above, but for subjects in sentences in the passive voice
- dobj: this word's head is a verb, and this word is itself the direct object of the verb
- iobj: same as above, but indirect object
- aux: this word's head is a verb, and this word is an "auxiliary" verb (like "have", "will", "be")
- attr: this word's head is a copula (like "to be"), and this is the description attributed to the subject of the sentence (e.g., in "This product is a global brand", brand is dependent on is with the attr dependency relation)
- det: this word's head is a noun, and this word is a determiner of that noun (like "the," "this," etc.)
- amod: this word's head is a noun, and this word is an adjective describing that noun
- prep: this word is a preposition that modifies its head
- pobj: this word is a dependent (object) of a preposition

In [103]:
def flatten_subtree(st):
    return ''.join([w.text_with_ws for w in list(st)]).strip()

In [None]:
sent = random.choice(sentences)
print("Original sentence:", sent.text.replace("\n", " "))
for word in sent:
    print()
    print("Word:", word.text.replace("\n", " "))
    print("Flattened subtree: ", flatten_subtree(word.subtree).replace("\n", " "))

In [105]:
subjects = []
for word in doc:
    if word.dep_ in ('nsubj', 'nsubjpass'):
        subjects.append(flatten_subtree(word.subtree))

In [106]:
random.sample(subjects, 12)

['PetSmart',
 'Tony Dow',
 'Brittney Griner',
 'Britney Spears',
 "'s",
 'I',
 'Cops',
 'Helicopter Tragedy Man',
 'He',
 'Your Groomers',
 'Former Cops',
 "ITo Raise My Players' Baby"]

In [107]:
prep_phrases = []
for word in doc:
    if word.dep_ == 'prep':
        prep_phrases.append(flatten_subtree(word.subtree).replace("\n", " "))

In [108]:
subjects = [flatten_subtree(word.subtree).replace("\n", " ")
            for word in doc if word.dep_ in ('nsubj', 'nsubjpass')]
past_tense_verbs = [word.text for word in words if word.tag_ == 'VBD' and word.lemma_ != 'be']
adjectives = [word.text for word in words if word.tag_.startswith('JJ')]
nouns = [word.text for word in words if word.tag_.startswith('NN')]
prep_phrases = [flatten_subtree(word.subtree).replace("\n", " ")
                for word in doc if word.dep_ == 'prep']

In [109]:
import tracery
from tracery.modifiers import base_english

In [110]:
rules = {
    "origin": [
        "#subject.capitalize# #predicate#.",
        "#subject.capitalize# #predicate#.",
        "#prepphrase.capitalize#, #subject# #predicate#."
    ],
    "predicate": [
        "#verb#",
        "#verb# #nounphrase#",
        "#verb# #prepphrase#"
    ],
    "nounphrase": [
        "the #noun#",
        "the #adj# #noun#",
        "the #noun# #prepphrase#",
        "the #noun# and the #noun#",
        "#noun.a#",
        "#adj.a# #noun#",
        "the #noun# that #predicate#"
    ],
    "subject": subjects,
    "verb": past_tense_verbs,
    "noun": nouns,
    "adj": adjectives,
    "prepphrase": prep_phrases
}
grammar = tracery.Grammar(rules)
grammar.add_modifiers(base_english)
grammar.flatten("#origin#")

'Tony Dow Sued the Rey.'

In [111]:
from textwrap import fill
output = " ".join([grammar.flatten("#origin#") for i in range(12)])
print(fill(output, 60))

Tony Dow Wrecked. Brittney Griner Wrecked the Former Morgan.
For Derogatory Term, Brittney Griner Killed a Cops. For
Violating His Civil Rights ", ' Kanye West Gifts A$AP Bari
New Maybach… Days After He Wrecked His Old One!!! ', "
Norman Lear Singin', Jeff Bezos Parents Wrecked the Break in
FL. To Russia, Cops Unplanned the Hart and the Players. Your
Groomers Wrecked the RHONY that Stole In Five Minutes. I
Stole Since Jan. 6', " Kevin Hart Shames Ex-NFLer Over
Nudist Colony Hobby With Wife. Motherf***ing Foxx, Your
Groomers Dragged. He Sued the Honey at Trial. I Stole an OK
Exam. At 100, He Killed. At Customers, Helicopter Tragedy
Man Killed WITH DOMESTIC VIOLENCE.
