In [4]:
import pandas as pd
import spacy
 
nlp = spacy.load("en_core_web_sm")
 
# Load dataset
data = pd.read_csv("cross_domain_article.csv")
 
def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]
 
# Apply NER to each sentence
data["entities"] = data["body"].apply(lambda x: extract_entities(str(x)) if pd.notnull(x) else [])
 
# Display sample
print(data.head())
 
# Save processed data
data.to_csv("entities_extracted.csv", index=False)
print("Entities saved to entities_extracted.csv")

         category                                              title  \
0  ARTS & CULTURE  Modeling Agencies Enabled Sexual Predators For...   
1  ARTS & CULTURE  Actor Jeff Hiller Talks ‚ÄúBright Colors And Bol...   
2  ARTS & CULTURE  New Yorker Cover Puts Trump 'In The Hole' Afte...   
3  ARTS & CULTURE  Man Surprises Girlfriend By Drawing Them In Di...   
4  ARTS & CULTURE  This Artist Gives Renaissance-Style Sculptures...   

                                                body  \
0  In October 2017, Carolyn Kramer received a dis...   
1  This week I talked with actor Jeff Hiller abou...   
2  The New Yorker is taking on President Donald T...   
3  Kellen Hickey, a 26-year-old who lives in Huds...   
4  There‚Äôs something about combining the traditio...   

                                            entities  
0  [(October 2017, DATE), (Carolyn Kramer, PERSON...  
1  [(This week, DATE), (Jeff Hiller, PERSON), (Br...  
2  [(The New Yorker, ORG), (Donald Trump, PERSON)...  
3  [(K

In [None]:
#TASK

import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

data = pd.read_csv("cross_domain_article.csv")

def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

data["entities"] = data["body"].apply(lambda x: extract_entities(str(x)) if pd.notnull(x) else [])

persons = set()
orgs = set()
dates = set()

for entity in data["entities"]:
    for ent_text, ent_label in entity:
        if ent_label == "PERSON":
            persons.add(ent_text)
        elif ent_label == "ORG":
            orgs.add(ent_text)
        elif ent_label == "DATE":
            dates.add(ent_text)

print(persons)
print(orgs)
print(dates)
print("No. of PERSON entities:", len(persons))# it will be unique
print("No. of ORG entities:", len(orgs))
print("No. of DATE entities:", len(dates))

{'Nick Hexum', 'Bobby Kennedy', 'Maya Hawke', 'Will Hurd', 'Rob Sherman', 'Billy Rice', 'Amritpal Singh', 'Voltaire', 'Kings', 'Richard Feynman', 'Darby Johns', 'Bruno Massot', 'Jorgen Nielsen', 'Barbara Bush‚Äôs', 'Mox', 'Citizen Lab', 'Joe Estlack', 'Joseph Savoie', 'Chamath Palihapitiya', 'Josh Holt', "Stanislaw Moniuszko's", 'Geoff Morrell', 'David Shell', 'Vivien Zhang', 'Colleen', 'Neo-Pagans', "O'Hara", 'David Sparsholt', 'Hailey Baldwin', 'Travis Kalanick', 'Erosie', 'Caroline Perzan', 'aloe vera', 'Xavier Becerra', 'Joana Matthias', 'Thandumzi Moyakhe', 'Michel Targe', 'Garc√≠a Gonz√°lez', 'Samuel Rund', 'Woodcock', 'John Hollenbeck', 'Wine Business', 'Michael Solomon', 'Losen', 'Essie Carmichael', 'Stephon Clark', 'Darren Criss', 'Jimmy', 'Mike Clifford', 'Maarja Nuut', 'Blogger', 'Lynn Hershman Leeson', 'Hope', 'Vernon Barrett Jr.', 'Siobhan Vivian', 'Mel Gibson', 'Judy H.', 'lesbian progress', 'J√§gerst√§tter', 'Greg Locke', 'Franklin Furnace', 'Gina Ortiz Jones', 'Oprah', 

In [7]:
from transformers import pipeline
 
re_model = pipeline("text2text-generation", model="Babelscape/rebel-large")

text = "Barack Obama was born in Honolulu and became the 44th President of the United States."
 
print("Extracted relations:")

print(re_model(text)[0]['generated_text'])

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


Extracted relations:
 Barack Obama  Honolulu  place of birth  President of the United States  position held


In [13]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

print("Loading model...")
re_model = pipeline("text2text-generation", model="Babelscape/rebel-large") 

data = pd.read_csv("cross_domain_article.csv")
TEXT_COLUMN = "body"

# Limit to first 500 rows
data = data.head(500)

data["Extracted_Relations"] = ""

print("Extracting relations...")

for index, row in tqdm(data.iterrows(), total=len(data)):
    text = str(row[TEXT_COLUMN]).strip()
    if not text or text == "nan":
        data.at[index, "Extracted_Relations"] = "No Text"
        continue

    # Truncate to avoid excessive compute
    text = text[:200] 
    try:
        result = re_model(text, max_length=200)[0]['generated_text']
    except Exception as e:
        result = f"Error: {e}"
    data.at[index, "Extracted_Relations"] = result

output_filename = "cross_domain_relations.csv"
data.to_csv(output_filename, index=False)

print(f"File saved: {output_filename}")


Loading model...


Device set to use cpu


Extracting relations...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [43:44<00:00,  5.25s/it] 

File saved: cross_domain_relations.csv





In [None]:
#MAIN TASK

import pandas as pd
import spacy
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")

data = pd.read_csv("cross_domain_article.csv") 

TEXT_COLUMN = "body"  

def extract_relations(text):
    doc = nlp(text)
    triples = []
    for sent in doc.sents:
        subject = None
        relation = None
        object_ = None
        for token in sent:
            if "subj" in token.dep_:
                subject = token.text
            if "obj" in token.dep_:
                object_ = token.text
            if token.dep_ == "ROOT":
                relation = token.text
        if subject and relation and object_:
            triples.append((subject, relation, object_))
    return triples

all_triples = []

for _, row in tqdm(data.iterrows(), total=len(data)):
    text = row[TEXT_COLUMN]
    triples = extract_relations(str(text))
    for triple in triples:
        all_triples.append({
            "Text": text,
            "Subject": triple[0],
            "Relation": triple[1],
            "Object": triple[2]
        })

triples_df = pd.DataFrame(all_triples)
triples_df.to_csv("extracted_triples.csv", index=False)

print("Triples saved to extracted_triples.csv")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6877/6877 [17:04<00:00,  6.71it/s] 


Triples saved to extracted_triples.csv


: 

In [1]:
import pandas as pd

# Load your dataset
data = pd.read_csv("extracted_triples.csv")   # replace with your actual file name

# Randomly sample up to 800 rows (if dataset smaller, it takes all)
sampled_data = data.sample(n=400, random_state=42)  # random_state keeps it reproducible

# Save to new CSV
sampled_data.to_csv("tripless.csv", index=False)

print("‚úÖ Random 800 rows saved as 'triples.csv'")
print("Total rows selected:", len(sampled_data))


‚úÖ Random 800 rows saved as 'triples.csv'
Total rows selected: 400
