In [11]:
import pandas as pd

# Read the preprocessed article table from CSV and report how many rows it has.
term_list_df = pd.read_csv(
    "data/NOW_preprocessed/BLM_filtered_preprocessed_news_sources_df.csv"
)
print(term_list_df.shape[0])

# Keep the text of the first row (index label 0) as a sample document.
content = term_list_df.loc[0, "text"]

term_list_df

41729


Unnamed: 0,index,text,ID,weird_id,Date,Country,News_source_name,link,title
0,3,Colorado Senate Primary: John Hickenlooper Fa...,31930103,1492.0,20-07-01,US,npr.org,https://www.npr.org/2020/06/28/883922991/one-o...,Colorado Senate Primary: John Hickenlooper Fac...
1,13,SACRAMENTO -- Public access to police discipl...,31930303,775.0,20-07-01,US,mercurynews.com,https://www.mercurynews.com/2020/06/29/bill-wo...,Bill would broaden and speed up access to Cali...
2,18,"Dolores Attea Sapienza, 88, queen of special ...",31930400,1056.0,20-07-01,US,buffalonews.com,https://buffalonews.com/news/local/dolores-att...,"Dolores Attea Sapienza, 88, queen of special e..."
3,19,Roswell Park plans expansion of cancer care i...,31930401,1014.0,20-07-01,US,buffalonews.com,https://buffalonews.com/news/local/roswell-par...,Roswell Park plans expansion of cancer care in...
4,25,Subscribers to The Climate Crisis newsletter ...,31930409,2318.0,20-07-01,US,newyorker.com,https://www.newyorker.com/news/annals-of-a-war...,At the Core of the Climate Crisis | The New Yo...
...,...,...,...,...,...,...,...,...,...
41724,450142,Colorado Rockies shortstop Ian Desmond became...,85307989,403.0,20-06-30,US,UPI.com,https://www.upi.com/Sports_News/MLB/2020/06/30...,"All-Star SS Ian Desmond to skip MLB season, ci..."
41725,450144,""" I have very, very strict standards about me...",85307991,443.0,20-06-30,US,Us Weekly,https://www.usmagazine.com/celebrity-moms/news...,Why Phaedra Parks Hasn't Introduced Her 2 Kids...
41726,450153,"Comedy legend Carl Reiner, one of the earlies...",85308187,980.0,20-06-30,US,NBC News,https://www.nbcnews.com/news/us-news/comedy-le...,"Comedy legend Carl Reiner, of 'The Dick Van Dy..."
41727,450185,Jon Stewart and Rose Byrne want' Irresistible...,85308688,1086.0,20-06-30,US,Houston Chronicle,https://www.chron.com/entertainment/article/Jo...,Jon Stewart and Rose Byrne want 'Irresistible'...


## SV extraction ##

In [9]:
### applying SV extractor (self made from prev. trial 18 versions)
# it is based on the textpipeliner package: https://github.com/krzysiekfonal/textpipeliner
# decoding of pipes: https://github.com/krzysiekfonal/grammaregex
# decoding of POS: https://www.guru99.com/pos-tagging-chunking-nltk.html

# adding additional named entity filters to pipe structure
# list of filters: https://stackoverflow.com/questions/59319207/ner-entity-recognition-country-filter

from textpipeliner import PipelineEngine, Context
from textpipeliner.pipes import *
import spacy

# Load the `en_core_web_lg` spaCy model once at module level; SV_extractor() below
# calls this `nlp` object for every text it receives.
nlp = spacy.load('en_core_web_lg')
# Parse the sample text `content` taken from the first row in the data-loading cell.
# NOTE(review): SV_extractor() assigns its own local `doc`, so this module-level `doc`
# is not read by it — looks like a leftover smoke test; confirm before removing.
doc = nlp(content)

def SV_extractor(content):
    """Extract subject-verb pairs from a single text with a textpipeliner pipeline.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and the
    resulting document is handed to a ``PipelineEngine`` with two pipes:

    0. the subject: the first non-empty result (AnyPipe semantics as described
       in the textpipeliner README — TODO confirm) among, in this order,
       named-entity subjects matching ``VERB/nsubj/*``, named-entity agents
       matching ``VERB/agent/*/pobj/*``, then the plain tokens matching each of
       those two patterns;
    1. the verb: tokens matching ``VERB``.

    Parameters
    ----------
    content : str
        Text to parse; passed unchanged to ``nlp``.

    Returns
    -------
    The value of ``PipelineEngine.process()`` for the output structure
    ``[0, 1]``. In this notebook's rendered output each element displays as a
    pair such as ``([voters], [decide])`` (subject tokens, verb tokens).
    """
    doc = nlp(content)

    # Entity labels accepted for a subject; the order is kept exactly as in the
    # original hand-written filter list.
    entity_labels = (
        "PERSON", "FAC", "LOC", "GPE", "PRODUCT", "LAW", "LANGUAGE", "DATE",
        "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "ORG",
        "WORK_OF_ART", "EVENT",
    )

    def entity_branch(pattern):
        # Tokens matching `pattern`, passed through one filter per entity label,
        # then through NamedEntityExtractorPipe. Fresh pipe objects are built for
        # every branch, as in the original code.
        return SequencePipe([
            FindTokensPipe(pattern),
            AggregatePipe([NamedEntityFilterPipe(label) for label in entity_labels]),
            NamedEntityExtractorPipe()])

    def token_branch(pattern):
        # Fallback branch: the plain tokens matching `pattern`, no entity filtering.
        return SequencePipe([FindTokensPipe(pattern)])

    subject_patterns = ("VERB/nsubj/*", "VERB/agent/*/pobj/*")

    pipes_structure = [
        # Both entity branches first, then both plain-token fallbacks.
        AnyPipe(
            [entity_branch(pattern) for pattern in subject_patterns]
            + [token_branch(pattern) for pattern in subject_patterns]),
        FindTokensPipe("VERB"),
    ]

    engine = PipelineEngine(pipes_structure, Context(doc), [0, 1])
    return engine.process()

In [10]:
# applying the SV_extractor function on each row of my pandas dataframe

# Only the "text" column is needed, so map SV_extractor over that Series directly
# instead of materialising every full row with DataFrame.apply(axis=1).
# The result is the same: one list of SV pairs per article.
term_list_df["SVs"] = term_list_df["text"].apply(SV_extractor)

In [11]:
# Keep only the article identifiers and the extracted SV pairs.
sv_columns = ["ID", "Date", "News_source_name", "SVs"]
SV_df = term_list_df[sv_columns]

In [12]:
# within the dataframe, separating the list of SV pairs into individual SV pairs per row

# explode() turns an article with an empty SVs list into a single NaN row.
# Those rows are dropped so that every remaining entry is a (subject, verb) pair;
# otherwise the split into two columns below fails on the mix of tuples and NaN.
SV_df_exploded = SV_df.explode("SVs").dropna(subset=["SVs"])

# Split each pair into its own "S" and "V" column. The exploded index contains
# duplicates, so the helper frame reuses it to stay aligned row by row.
# Naming the columns up front also keeps this working when no pairs remain.
SV_df_exploded[["S", "V"]] = pd.DataFrame(
    SV_df_exploded["SVs"].tolist(),
    index=SV_df_exploded.index,
    columns=["S", "V"],
)

SV_df_exploded

Unnamed: 0,ID,Date,News_source_name,SVs,S,V
0,31930103,20-07-01,npr.org,"([1], [Stumbles])",[1],[Stumbles]
0,31930103,20-07-01,npr.org,"([Getty, Images], [hide])","[Getty, Images]",[hide]
0,31930103,20-07-01,npr.org,"([voters], [decide])",[voters],[decide]
0,31930103,20-07-01,npr.org,"([voters], [pick])",[voters],[pick]
0,31930103,20-07-01,npr.org,"([Romanoff], [run])",[Romanoff],[run]
...,...,...,...,...,...,...
41728,85309194,20-06-30,YAHOO!,"([MSNBC], [came])",[MSNBC],[came]
41728,85309194,20-06-30,YAHOO!,"([FNC], [averaged])",[FNC],[averaged]
41728,85309194,20-06-30,YAHOO!,"([MSNBC], [came])",[MSNBC],[came]
41728,85309194,20-06-30,YAHOO!,"([show], [drew])",[show],[drew]


In [13]:
# saving df with SV pairs (one subject-verb pair per row) into a csv file;
# index = False leaves the (duplicated, post-explode) row index out of the file

SV_df_exploded.to_csv("data/NOW_preprocessed/total_SV_df.csv", index = False)

## VO extraction ## 

In [4]:
### applying VO extractor (self made from prev. trial 18 versions)
# it is based on the textpipeliner package: https://github.com/krzysiekfonal/textpipeliner
# decoding of pipes: https://github.com/krzysiekfonal/grammaregex
# decoding of POS: https://www.guru99.com/pos-tagging-chunking-nltk.html

# adding additional named entity filters to pipe structure
# list of filters: https://stackoverflow.com/questions/59319207/ner-entity-recognition-country-filter

from textpipeliner import PipelineEngine, Context
from textpipeliner.pipes import *
import spacy

# Load the `en_core_web_lg` spaCy model; VO_extractor() below calls this `nlp` object.
# NOTE(review): the SV-extraction section earlier in this notebook already loads the
# same model into `nlp`; when the notebook is run top to bottom this reload looks
# redundant — confirm and consider removing it.
nlp = spacy.load('en_core_web_lg')
# Parse the sample text `content` taken from the first row in the data-loading cell.
# NOTE(review): VO_extractor() assigns its own local `doc`, so this module-level `doc`
# is not read by it — confirm before removing.
doc = nlp(content)

def VO_extractor(content):
    """Extract verb-object pairs from a single text with a textpipeliner pipeline.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and the
    resulting document is handed to a ``PipelineEngine`` with two pipes:

    0. the verb: tokens matching ``VERB``;
    1. the object: the first non-empty result (AnyPipe semantics as described
       in the textpipeliner README — TODO confirm) among, in this order,
       named-entity direct objects matching ``VERB/dobj/*``, named-entity
       passive subjects matching ``VERB/nsubjpass/*``, then the plain tokens
       matching each of those two patterns.

    Parameters
    ----------
    content : str
        Text to parse; passed unchanged to ``nlp``.

    Returns
    -------
    The value of ``PipelineEngine.process()`` for the output structure
    ``[0, 1]``. In this notebook's rendered output each element displays as a
    pair such as ``([made], [missteps])`` (verb tokens, object tokens).
    """
    doc = nlp(content)

    # Entity labels accepted for an object; the order is kept exactly as in the
    # original hand-written filter list.
    entity_labels = (
        "PERSON", "FAC", "LOC", "GPE", "PRODUCT", "LAW", "LANGUAGE", "DATE",
        "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "ORG",
        "WORK_OF_ART", "EVENT",
    )

    def entity_branch(pattern):
        # Tokens matching `pattern`, passed through one filter per entity label,
        # then through NamedEntityExtractorPipe. Fresh pipe objects are built for
        # every branch, as in the original code.
        return SequencePipe([
            FindTokensPipe(pattern),
            AggregatePipe([NamedEntityFilterPipe(label) for label in entity_labels]),
            NamedEntityExtractorPipe()])

    def token_branch(pattern):
        # Fallback branch: the plain tokens matching `pattern`, no entity filtering.
        return SequencePipe([FindTokensPipe(pattern)])

    object_patterns = ("VERB/dobj/*", "VERB/nsubjpass/*")

    pipes_structure = [
        FindTokensPipe("VERB"),
        # Both entity branches first, then both plain-token fallbacks.
        AnyPipe(
            [entity_branch(pattern) for pattern in object_patterns]
            + [token_branch(pattern) for pattern in object_patterns]),
    ]

    engine = PipelineEngine(pipes_structure, Context(doc), [0, 1])
    return engine.process()

In [5]:
# applying the VO_extractor function on each row of my pandas dataframe

# Only the "text" column is needed, so map VO_extractor over that Series directly
# instead of materialising every full row with DataFrame.apply(axis=1).
# The result is the same: one list of VO pairs per article.
term_list_df["VOs"] = term_list_df["text"].apply(VO_extractor)

In [6]:
# Keep only the article identifiers and the extracted VO pairs.
vo_columns = ["ID", "Date", "News_source_name", "VOs"]
VO_df = term_list_df[vo_columns]

In [7]:
# within the dataframe, separating the list of VO pairs into individual VO pairs per row

# explode() turns an article with an empty VOs list into a single NaN row.
# Those rows are dropped so that every remaining entry is a (verb, object) pair;
# otherwise the split into two columns below fails on the mix of tuples and NaN.
VO_df_exploded = VO_df.explode("VOs").dropna(subset=["VOs"])

# Split each pair into its own "V" and "O" column. The exploded index contains
# duplicates, so the helper frame reuses it to stay aligned row by row.
# Naming the columns up front also keeps this working when no pairs remain.
VO_df_exploded[["V", "O"]] = pd.DataFrame(
    VO_df_exploded["VOs"].tolist(),
    index=VO_df_exploded.index,
    columns=["V", "O"],
)

VO_df_exploded

Unnamed: 0,ID,Date,News_source_name,VOs,V,O
0,31930103,20-07-01,npr.org,"([seen], [[John, Hickenlooper], [Colorado, Sen...",[seen],"[[John, Hickenlooper], [Colorado, Senate, Prim..."
0,31930103,20-07-01,npr.org,"([made], [missteps])",[made],[missteps]
0,31930103,20-07-01,npr.org,"([hide], [caption])",[hide],[caption]
0,31930103,20-07-01,npr.org,"([considered], [John, Hickenlooper])",[considered],"[John, Hickenlooper]"
0,31930103,20-07-01,npr.org,"([made], [missteps])",[made],[missteps]
...,...,...,...,...,...,...
41728,85309194,20-06-30,YAHOO!,"([posted], [gains])",[posted],[gains]
41728,85309194,20-06-30,YAHOO!,"([led], [way])",[led],[way]
41728,85309194,20-06-30,YAHOO!,"([averaged], [viewers])",[averaged],[viewers]
41728,85309194,20-06-30,YAHOO!,"([drew], [viewers])",[drew],[viewers]


In [8]:
# saving df with VO pairs (one verb-object pair per row) into a csv file;
# index = False leaves the (duplicated, post-explode) row index out of the file

VO_df_exploded.to_csv("data/NOW_preprocessed/total_VO_df.csv", index = False)