References: 
https://www.kaggle.com/code/niyamatalmass/texts-summarizing-with-the-help-of-spacy/notebook
https://spacy.io/usage/spacy-101
https://www.analyticsvidhya.com/blog/2020/03/spacy-tutorial-learn-natural-language-processing/


In [115]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
import pymysql
import pprint
pp = pprint.PrettyPrinter(indent = 4)

In [116]:
# Fetch every transcript chunk of episode 269 (at most 10000 rows, ordered by
# id) as (id, episode_id, text) tuples.
# NOTE(review): the connection credentials are hard-coded in the notebook;
# consider reading them from the environment or a config file.
conn = pymysql.connect(host='localhost', user='admin',
                            passwd="password", db='joe_rogan')
try:
    cur = conn.cursor()
    try:
        cur.execute(
            "SElECT id, episode_id, text FROM `transcript_time_split` "
            "WHERE episode_id=269 "
            "ORDER BY id ASC "
            "LIMIT 10000")
        data_tuple = cur.fetchall()
    finally:
        # Release the cursor even when the query or the fetch fails.
        cur.close()
finally:
    # Always give the connection back, otherwise a failed query leaks it.
    conn.close()

In [117]:
# Copy each fetched row into a list so that its fields can be edited in place.
data = [list(row) for row in data_tuple]

In [118]:
# Collapse every run of whitespace in the text column (index 2) into a single
# space and strip leading/trailing whitespace.
for row in data:
    words = row[2].split()
    row[2] = " ".join(words)

In [119]:
# Load the large English pipeline and parse the text of one sample row
# (row index 7, text column 2) for the exploratory cells that follow.
nlp = spacy.load("en_core_web_lg")
sample_row = data[7]
doc = nlp(sample_row[2])

In [120]:
# Display the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [121]:
# Print the coarse part-of-speech tag of the first 14 tokens of `doc`.
# Slicing (instead of indexing 0..13) tolerates documents with fewer than 14
# tokens, and the f-string renders a label that spacy.explain() does not know
# as "None" instead of raising TypeError on str + None.
for token in doc[:14]:
    print(f"{token.text} --> {spacy.explain(token.pos_)} ({token.pos_})")

By --> adposition (ADP)
the --> determiner (DET)
way --> noun (NOUN)
. --> punctuation (PUNCT)
There --> pronoun (PRON)
is --> verb (VERB)
a --> determiner (DET)
system --> noun (NOUN)
like --> adposition (ADP)
that --> pronoun (PRON)
in --> adposition (ADP)
the --> determiner (DET)
brain --> noun (NOUN)
. --> punctuation (PUNCT)


In [122]:
# Print the syntactic dependency label of the first 14 tokens of `doc`;
# labels that spacy.explain() has no description for are shown as "None".
for position in range(14):
    tok = doc[position]
    print(f"{tok.text} --> {spacy.explain(tok.dep_)} ({tok.dep_})")

By --> None (ROOT)
the --> determiner (det)
way --> object of preposition (pobj)
. --> punctuation (punct)
There --> expletive (expl)
is --> None (ROOT)
a --> determiner (det)
system --> attribute (attr)
like --> prepositional modifier (prep)
that --> object of preposition (pobj)
in --> prepositional modifier (prep)
the --> determiner (det)
brain --> object of preposition (pobj)
. --> punctuation (punct)


In [123]:
# For the first four named entities, print the entity text followed by one
# indented line per token: token text, entity label, POS tag, dependency.
for entity in doc.ents[:4]:
    print(entity.text)
    label = entity.label_
    for tok in entity:
        print(f"    {tok.text} ({label} {tok.pos_} {tok.dep_})")


Aeropostale
    Aeropostale (ORG PROPN compound)
last night
    last (TIME ADJ amod)
    night (TIME NOUN npadvmod)
12 hours
    12 (TIME NUM nummod)
    hours (TIME NOUN pobj)
no 16 hours
    no (TIME DET det)
    16 (TIME NUM nummod)
    hours (TIME NOUN npadvmod)


In [124]:
# Loaded spaCy pipelines, keyed by model name, so repeated calls reuse them.
_SUMMARY_MODEL_CACHE = {}


def _load_summary_model(name="en_core_web_sm"):
    """Return the spaCy pipeline *name*, loading it only on first use."""
    if name not in _SUMMARY_MODEL_CACHE:
        _SUMMARY_MODEL_CACHE[name] = spacy.load(name)
    return _SUMMARY_MODEL_CACHE[name]


def summarize(text, per, nounweight, verbweight, personweight, orgweight):
    """Return an extractive summary of *text*.

    Words are weighted, the weights are normalised by the largest one, each
    sentence is scored with the sum of the weights of its words, and the
    best-scoring sentences are returned.

    Args:
        text: The text to summarise.
        per: Fraction of the sentences to keep. The number of sentences is
            ``int(sentence_count * per)``, so short texts can yield an empty
            summary.
        nounweight: Weight added for every occurrence of a noun used as a
            direct object (``dobj``).
        verbweight: Weight added for every occurrence of a verb whose
            dependency is ``ROOT`` or ``advcl``.
        personweight: Weight given once to each proper-noun token of a
            ``PERSON`` entity that has no weight yet.
        orgweight: Weight given once to each proper-noun token of an ``ORG``
            entity that has no weight yet.

    Returns:
        The selected sentences, highest score first, joined by single
        spaces. An empty string when no word received a weight or when the
        largest weight is zero.
    """
    # Loading a pipeline is expensive; reuse it across calls.
    doc = _load_summary_model()(text)

    # Keys are lower-cased token texts so the lookup during sentence scoring
    # (which also lower-cases) can actually find them.
    word_frequencies = {}
    for word in doc:
        key = word.text.lower()
        if key in STOP_WORDS or key in punctuation:
            continue
        if word.pos_ == "NOUN" and word.dep_ == "dobj":
            word_frequencies[key] = word_frequencies.get(key, 0) + nounweight
        elif word.pos_ == "VERB" and word.dep_ in ("ROOT", "advcl"):
            word_frequencies[key] = word_frequencies.get(key, 0) + verbweight

    # Entity weights are stored per token: a multi-word entity text can never
    # equal a single token, so keying on it would make the weight unused.
    entity_weights = {"PERSON": personweight, "ORG": orgweight}
    for ent in doc.ents:
        if ent.label_ not in entity_weights:
            continue
        for word in ent:
            if word.pos_ == "PROPN":
                word_frequencies.setdefault(
                    word.text.lower(), entity_weights[ent.label_]
                )

    # Nothing to rank: avoid max() on an empty dict and division by zero.
    if not word_frequencies:
        return ""
    max_frequency = max(word_frequencies.values())
    if max_frequency == 0:
        return ""
    for key in word_frequencies:
        word_frequencies[key] = word_frequencies[key] / max_frequency

    sentences = list(doc.sents)
    # Only sentences containing at least one weighted word get a score.
    sentence_scores = {}
    for sent in sentences:
        for word in sent:
            key = word.text.lower()
            if key in word_frequencies:
                sentence_scores[sent] = (
                    sentence_scores.get(sent, 0) + word_frequencies[key]
                )

    select_length = int(len(sentences) * per)
    best_sentences = nlargest(
        select_length, sentence_scores, key=sentence_scores.get
    )
    # Separate the sentences with a space so they do not run into each other.
    return " ".join(sent.text for sent in best_sentences)

In [125]:
# Weights handed to summarize(); they are also recorded in the report file.
nounweight = 1
verbweight = 1
personweight = 1
orgweight = 1

# One [episode_id, chunk_id, summary] entry per row of `data`, where a row is
# [id, episode_id, text]; each summary keeps 5% of the chunk's sentences.
output_summaries = [
    [
        row[1],
        row[0],
        summarize(row[2], 0.05, nounweight, verbweight, personweight, orgweight),
    ]
    for row in data
]

In [126]:
# Append a report to spacy_summaries.txt: a header listing the weights used,
# then for every summary its ids, its text and the character lengths of the
# original chunk and of the summary.
with open('spacy_summaries.txt', 'a') as out_file:
    header = f"""

    SPACY SUMMARIES
    Tuned to:
        nounweight = {nounweight}
        verbweight = {verbweight}
        personweight = {personweight}
        orgweight = {orgweight}
    """
    out_file.write(header)

    for idx, entry in enumerate(output_summaries):
        summary_text = entry[2]
        out_file.write(f"""
        EpisodeID: {entry[0]}
        10 minute ID: {entry[1]}
        Summary: 
        {summary_text}
        """)
        # data[idx] is the source row of output_summaries[idx].
        out_file.write(f"""
        Length of original: {len(data[idx][2])}, Length of summary: {len(summary_text)}
        
        """)


In [127]:
# Disabled displaCy demo: the snippet is wrapped in a string literal, so it is
# not executed; as a bare expression the notebook only echoes the string.
"""import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("AI & ML is a fantastic course, and we love it")
displacy.serve(doc, style="dep")"""

'import spacy\nfrom spacy import displacy\n\nnlp = spacy.load("en_core_web_sm")\ndoc = nlp("AI & ML is a fantastic course, and we love it")\ndisplacy.serve(doc, style="dep")'