# Project - Part 1 - Web scraping and knowledge base construction

## Environment setup

In [92]:
import nltk
# Fetch the NLTK resources this notebook relies on: the stop-word list,
# the Punkt tokenizer data and WordNet.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\julie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\julie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\julie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\julie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [93]:
from datasets import load_dataset

# Load the CoNLL-2003 dataset.
# NOTE(review): trust_remote_code=True allows the dataset's own loading code to run — confirm the source is trusted.
dataset = load_dataset("conll2003", trust_remote_code=True)

# Bind each split to its own name
train_dataset, validation_dataset, test_dataset = (
    dataset[split] for split in ('train', 'validation', 'test')
)

# Show the first training record to see which fields an example carries
print(train_dataset[0])

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


## Task 1: Model for NER

### Text Cleaning & Preprocessing

In [94]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

import inflect, string

def write_numbers(text: str):
	"""Spell out every all-digit token of ``text`` in words.

	The text is split on whitespace and re-joined with single spaces, so runs
	of whitespace in the input are collapsed. Tokens that are not purely digits
	(for example ``2-1`` or ``1.5``) are left unchanged.
	"""
	engine = inflect.engine()
	return ' '.join(
		engine.number_to_words(token) if token.isdigit() else token
		for token in text.split()
	)

def remove_punctuation(text: str):
	"""Turn hyphens into spaces and delete every other ``string.punctuation`` character.

	Only the ASCII characters listed in ``string.punctuation`` are affected;
	typographic marks such as an en dash are left in place.
	"""
	# Single translation pass: '-' becomes ' ', the remaining punctuation maps to nothing
	table = str.maketrans('-', ' ', string.punctuation.replace('-', ''))
	return text.translate(table)

def remove_stopwords(text):
    """Tokenize ``text`` and drop the tokens found in NLTK's English stop-word list.

    The comparison is exact, so a capitalised token such as ``The`` is kept
    unless that exact spelling is in the list. The kept tokens are returned
    joined by single spaces.
    """
    english_stopwords = set(stopwords.words("english"))
    kept_tokens = (
        token for token in word_tokenize(text) if token not in english_stopwords
    )
    return ' '.join(kept_tokens)

def stem_words(text):
    """Tokenize ``text`` and replace each token with its stem from the module-level ``stemmer``.

    The stems are returned joined by single spaces.
    """
    return ' '.join(map(stemmer.stem, word_tokenize(text)))

def lemma_words(text):
    """Tokenize ``text`` and lemmatize each token with the module-level ``lemmatizer``.

    ``lemmatize`` is called with the token only (no part-of-speech argument).
    The lemmas are returned joined by single spaces.
    """
    lemmatized = []
    for token in word_tokenize(text):
        lemmatized.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmatized)

In [95]:
def preprocess_text(text: str):
	"""Clean raw text before it is handed to the extraction steps.

	Applies, in this order: punctuation removal, digit-to-word conversion and
	lemmatization. Lowercasing, stop-word removal and stemming are available as
	helpers but are deliberately not part of this pipeline.
	"""
	for step in (remove_punctuation, write_numbers, lemma_words):
		text = step(text)
	return text

# Sanity check on one test sentence: print it before and after preprocessing
text = ' '.join(test_dataset[3]['tokens'])
print(text, preprocess_text(text), sep='\n')

Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .
Japan began the defence of their Asian Cup title with a lucky two one win against Syria in a Group C championship match on Friday


### Named Entity Recognition (NER)

spaCyʼs en_ner_conll03 pre-trained NER model (loaded below from the local `./best_ner_model` directory):

In [96]:
import spacy

# Loaded spaCy pipelines keyed by model path, so a model is read from disk
# only once instead of on every call.
_NER_MODELS = {}


def extract_types(text: str, model_path: str = "./best_ner_model"):
	"""Return the named entities spaCy finds in ``text``.

	Args:
		text: Text to analyse.
		model_path: Name or path passed to ``spacy.load``; defaults to the
			local ``./best_ner_model`` directory.

	Returns:
		A list of ``(entity_text, entity_label)`` tuples, in the order of
		``doc.ents``. Empty when no entity is found.
	"""
	nlp = _NER_MODELS.get(model_path)
	if nlp is None:
		# First use of this model: load it and remember it for later calls
		nlp = _NER_MODELS[model_path] = spacy.load(model_path)

	doc = nlp(text)
	return [(ent.text, ent.label_) for ent in doc.ents]

# Example usage: run the NER model on one preprocessed test sentence and
# print the raw sentence, the cleaned sentence and the entities found
text = ' '.join(test_dataset[3]['tokens'])
clean_text = preprocess_text(text)
entities = extract_types(clean_text)
print(text, clean_text, entities, sep='\n')

Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .
Japan began the defence of their Asian Cup title with a lucky two one win against Syria in a Group C championship match on Friday
[('Japan', 'LOC'), ('Asian Cup', 'MISC'), ('Syria', 'LOC')]


CRF (Conditional Random Fields) model using sklearn-crfsuite:

In [97]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from collections import defaultdict

# Integer NER tag ids mapped to their BIO label strings (list index = tag id)
tag_map = dict(enumerate(
	['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
))

def extract_features(tokens):
	"""Build the CRF feature sequence for one sentence.

	Each token becomes a one-entry dict ``{'word': token}``; the token itself
	is the only feature.
	"""
	features = []
	for token in tokens:
		features.append({'word': token})
	return features

def get_labels(dataset):
	"""Return, for every example in ``dataset``, its ``ner_tags`` translated through ``tag_map``.

	The result is a list with one list of label strings per example.
	"""
	labels = []
	for example in dataset:
		labels.append([tag_map[tag] for tag in example['ner_tags']])
	return labels

# Per-sentence feature sequences (inputs) and label sequences (targets) for both splits
X_train = [extract_features(sentence['tokens']) for sentence in train_dataset]
X_test = [extract_features(sentence['tokens']) for sentence in test_dataset]
y_train, y_test = get_labels(train_dataset), get_labels(test_dataset)

# CRF settings, kept together so they are easy to adjust
crf_params = {
	'algorithm': 'lbfgs',
	'c1': 0.1,
	'c2': 0.1,
	'max_iterations': 100,
	'all_possible_transitions': False,
}
crf = sklearn_crfsuite.CRF(**crf_params)

# Fit on the training split
crf.fit(X_train, y_train)

# Predict the test split and print the per-tag classification report
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-LOC       0.89      0.78      0.83      1668
      B-MISC       0.83      0.67      0.74       702
       B-ORG       0.81      0.55      0.66      1661
       B-PER       0.74      0.43      0.55      1617
       I-LOC       0.42      0.66      0.51       257
      I-MISC       0.63      0.60      0.61       216
       I-ORG       0.53      0.63      0.58       835
       I-PER       0.72      0.55      0.62      1156
           O       0.94      0.97      0.95     38323

    accuracy                           0.91     46435
   macro avg       0.72      0.65      0.67     46435
weighted avg       0.90      0.91      0.90     46435



In [98]:
# Per-label scores of the CRF predictions as a nested dictionary
report = metrics.flat_classification_report(y_test, y_pred, output_dict=True)

metric_names = ('precision', 'recall', 'f1-score')

# Collect the B-/I- scores of each entity type under (entity_type, metric)
entity_metrics = defaultdict(list)
for label, scores in report.items():
	if not label.startswith(('B-', 'I-')) or not isinstance(scores, dict):
		continue
	entity_type = label.split('-')[1]  # LOC, MISC, PER or ORG
	for metric in metric_names:
		if metric in scores:
			entity_metrics[(entity_type, metric)].append(scores[metric])

# Plain (unweighted) mean of the collected B- and I- scores per entity type
entity_means = {
	key: sum(values) / len(values)
	for key, values in entity_metrics.items()
	if values
}

# Print the summary table
rule = "-" * 50
print("\nMean Metrics by Entity Type:")
print(rule)
print(f"{'Entity':<10} {'Precision':<12} {'Recall':<12} {'F1-Score':<12}")
print(rule)
for entity_type in ['LOC', 'MISC', 'PER', 'ORG']:
	precision, recall, f1 = (
		entity_means.get((entity_type, metric), 0) for metric in metric_names
	)
	print(f"{entity_type:<10} {precision:<12.2f} {recall:<12.2f} {f1:<12.2f}")


Mean Metrics by Entity Type:
--------------------------------------------------
Entity     Precision    Recall       F1-Score    
--------------------------------------------------
LOC        0.65         0.72         0.67        
MISC       0.73         0.63         0.68        
PER        0.73         0.49         0.59        
ORG        0.67         0.59         0.62        


In [99]:
# Compare the gold annotation of one test sentence with the CRF prediction for it
sample = test_dataset[3]
print(sample)
print(crf.predict([extract_features(sample['tokens'])]))

{'id': '3', 'tokens': ['Japan', 'began', 'the', 'defence', 'of', 'their', 'Asian', 'Cup', 'title', 'with', 'a', 'lucky', '2-1', 'win', 'against', 'Syria', 'in', 'a', 'Group', 'C', 'championship', 'match', 'on', 'Friday', '.'], 'pos_tags': [22, 38, 12, 21, 15, 29, 16, 22, 21, 15, 12, 16, 11, 41, 15, 22, 15, 12, 22, 22, 21, 21, 15, 22, 7], 'chunk_tags': [11, 21, 11, 12, 13, 11, 12, 12, 12, 13, 11, 12, 12, 21, 13, 11, 13, 11, 12, 12, 12, 12, 13, 11, 0], 'ner_tags': [5, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
[['B-LOC' 'O' 'O' 'O' 'O' 'O' 'B-MISC' 'I-MISC' 'O' 'O' 'O' 'O' 'O' 'O'
  'O' 'B-LOC' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O']]


### Relation Extraction (RE)

In [100]:
import spacy

# Loaded spaCy pipelines keyed by model name, so repeated calls do not
# reload the model.
_RELATION_MODELS = {}

# Dependency labels of the children that are folded into a subject/object phrase.
_PHRASE_MODIFIER_DEPS = ("compound", "amod")


def _phrase_with_modifiers(token):
	"""Return the token text preceded by the text of its compound/amod children.

	The modifiers are joined in the order the children are yielded, so several
	modifiers keep their relative order (``big red ball``, not ``red big ball``).
	"""
	modifiers = [
		child.text for child in token.children
		if child.dep_ in _PHRASE_MODIFIER_DEPS
	]
	return " ".join(modifiers + [token.text])


def extract_relations(text, model_name="en_core_web_sm"):
	"""Extract subject-predicate-object triples from ``text`` with a dependency parse.

	For every token whose dependency label is ``nsubj`` or ``nsubjpass`` and
	whose head is tagged ``VERB``, one triple is produced per

	* direct object (``dobj``) of the verb: ``(subject, verb, object)``;
	* ``pobj`` under a ``prep``/``agent`` child of the verb:
	  ``(subject, "verb preposition", object)``.

	Subject and object texts include their ``compound``/``amod`` children.

	Args:
		text: Text to parse.
		model_name: Name or path passed to ``spacy.load``.

	Returns:
		A list of ``(subject, predicate, object)`` string tuples, in document
		order of the subjects. Empty when no pattern matches.
	"""
	nlp = _RELATION_MODELS.get(model_name)
	if nlp is None:
		# First use of this model: load it and remember it for later calls
		nlp = _RELATION_MODELS[model_name] = spacy.load(model_name)

	doc = nlp(text)

	relations = []
	for token in doc:
		if token.dep_ not in ("nsubj", "nsubjpass"):
			continue
		verb = token.head
		if verb.pos_ != "VERB":
			continue

		subject = _phrase_with_modifiers(token)
		predicate = verb.text

		for child in verb.children:
			if child.dep_ == "dobj":
				# Direct object
				relations.append((subject, predicate, _phrase_with_modifiers(child)))
			elif child.dep_ in ("prep", "agent"):
				# Prepositional object: keep the preposition as part of the predicate
				full_predicate = predicate + " " + child.text
				for pobj in child.children:
					if pobj.dep_ == "pobj":
						relations.append((subject, full_predicate, _phrase_with_modifiers(pobj)))
	return relations

# Try the relation extractor on one preprocessed test sentence
text = ' '.join(test_dataset[3]['tokens'])
clean_text = preprocess_text(text)
relations = extract_relations(clean_text)
print(clean_text, relations, sep='\n')

Japan began the defence of their Asian Cup title with a lucky two one win against Syria in a Group C championship match on Friday
[('Japan', 'began', 'defence'), ('Japan', 'began with', 'lucky win'), ('Japan', 'began on', 'Friday')]


### Knowledge Graph Building

In [107]:
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS

def create_graph(text):
	"""Build an RDF graph from ``text`` and return it with its RDF/XML serialization.

	The text is preprocessed first. Every entity labelled ORG, PER, LOC or MISC
	gets an ``rdf:type`` triple (Company, Person, Location, Miscellaneous);
	entities with any other label are ignored. Every extracted relation becomes
	one triple. All resources live under ``http://example.org/`` and spaces in
	names are replaced by underscores.

	Returns:
		A tuple ``(g, graph)``: the populated graph object and the result of
		serializing it with ``format="xml"``.
	"""
	EX = Namespace("http://example.org/")
	g = Graph()

	text = preprocess_text(text)

	# NER label -> class used as the object of the rdf:type triple
	class_for_label = {
		"ORG": EX.Company,
		"PER": EX.Person,
		"LOC": EX.Location,
		"MISC": EX.Miscellaneous,
	}
	for name, label in extract_types(text):
		if label in class_for_label:
			g.add((EX[name.replace(" ", "_")], RDF.type, class_for_label[label]))

	# One triple per extracted (subject, predicate, object) relation
	for relation in extract_relations(text):
		subject, predicate, obj = (part.replace(" ", "_") for part in relation)
		g.add((EX[subject], EX[predicate], EX[obj]))

	return g, g.serialize(format="xml")

# Build the knowledge graph for one test sentence and show its RDF/XML form
sample_tokens = test_dataset[3]['tokens']
text = ' '.join(sample_tokens)
g, graph = create_graph(text)
print(graph)

<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
   xmlns:ns1="http://example.org/"
   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
>
  <rdf:Description rdf:about="http://example.org/Japan">
    <rdf:type rdf:resource="http://example.org/Location"/>
    <ns1:began rdf:resource="http://example.org/defence"/>
    <ns1:began_with rdf:resource="http://example.org/lucky_win"/>
    <ns1:began_on rdf:resource="http://example.org/Friday"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Asian_Cup">
    <rdf:type rdf:resource="http://example.org/Miscellaneous"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Syria">
    <rdf:type rdf:resource="http://example.org/Location"/>
  </rdf:Description>
</rdf:RDF>



In [108]:
# SPARQL query: every predicate/object pair whose subject is the Japan resource
query = """
SELECT ?predicate ?object
WHERE {
  <http://example.org/Japan> ?predicate ?object .
}
"""

# Print one "predicate object" line per result row
for result in g.query(query):
	line = f"{result.predicate} {result.object} "
	print(line)

http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://example.org/Location 
http://example.org/began http://example.org/defence 
http://example.org/began_with http://example.org/lucky_win 
http://example.org/began_on http://example.org/Friday 


In [109]:
# Free-text example: flatten the paragraph onto one line, then build and print its graph
text = """Star Wars IV is a Movie where there are different kinds of creatures, like
humans and wookies. Some creatures are Jedis; for instance, the human Luke
is a Jedi, and Master Yoda – for whom the species is not known – is also a
Jedi. The wookie named Chewbacca is Han’s co-pilot on the Millennium
Falcon starship. The speed of Millennium Falcon is 1.5 (above the speed of
light!)""".replace("\n", " ")

g, graph = create_graph(text)
print(graph)

<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
   xmlns:ns1="http://example.org/"
   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
>
  <rdf:Description rdf:about="http://example.org/Millennium_Falcon">
    <rdf:type rdf:resource="http://example.org/Company"/>
    <rdf:type rdf:resource="http://example.org/Person"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Movie">
    <rdf:type rdf:resource="http://example.org/Person"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Luke">
    <rdf:type rdf:resource="http://example.org/Person"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Chewbacca">
    <rdf:type rdf:resource="http://example.org/Location"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/specie">
    <ns1:known_for rdf:resource="http://example.org/whom"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/Master_Yoda">
    <rdf:type rdf:resource="http://exam

## Task 2: Pipeline for Knowledge Graph Construction

### Fetch News Articles

In [None]:
from bs4 import BeautifulSoup
import requests

def fetch_reuters_articles(url="https://www.reuters.com/world/", limit=10, timeout=10):
	"""Scrape the article cards of a Reuters section page and download each article.

	Args:
		url: Section page that lists the articles (defaults to the World section).
		limit: Maximum number of article cards read from the section page.
		timeout: Seconds to wait on each HTTP request before giving up.

	Returns:
		A list of dicts with the keys ``'title'``, ``'url'``, ``'content'``
		(paragraph texts joined by newlines, possibly empty) and
		``'publication_date'`` (``None`` when the article page has no
		``article:published_time`` meta tag). Cards without a usable link are
		skipped.

	Raises:
		requests.HTTPError: If the section page answers with an error status.
	"""
	from urllib.parse import urljoin

	# Browser-like headers for the requests
	headers = {
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
		'Accept-Language': 'en-US,en;q=0.9',
	}

	response = requests.get(url, headers=headers, timeout=timeout)
	# Fail loudly instead of parsing an error page into an empty result
	response.raise_for_status()
	soup = BeautifulSoup(response.content, "html.parser")

	articles = []

	# NOTE(review): this class name looks build-generated and may change — verify against the live page.
	for div in soup.find_all('div', {"class": "media-story-card__body__3tRWy"}, limit=limit):
		# First link of the card that is not wrapped in a <span> and has a target
		article = next(
			(
				anchor for anchor in div.find_all('a')
				if anchor.parent.name != 'span' and anchor.get('href')
			),
			None,
		)
		if article is None:
			# No usable link in this card: skip it rather than reuse the previous card's link
			continue

		title = article.get_text()

		# urljoin handles both site-relative and absolute hrefs
		article_url = urljoin(url, article['href'])

		article_response = requests.get(article_url, headers=headers, timeout=timeout)
		article_soup = BeautifulSoup(article_response.content, 'html.parser')

		# Article body: every div whose data-testid starts with "paragraph-"
		paragraphs = article_soup.find_all('div', {"data-testid": lambda value: value and value.startswith("paragraph-")})
		content = '\n'.join([p.get_text() for p in paragraphs])

		# The meta tag may be absent; report None instead of crashing
		date_meta = article_soup.find('meta', {'name': 'article:published_time'})
		publication_date = date_meta.get('content') if date_meta is not None else None

		articles.append({
			'title': title,
			'url': article_url,
			'content': content,
			'publication_date': publication_date
		})

	return articles

In [111]:
# Download the current article cards and report how many articles were collected
articles = fetch_reuters_articles()
print(len(articles))

10


In [112]:
# Number of content characters shown per article
PREVIEW_CHARS = 200

# For each article: title and publication date, a short content preview, then a blank line
for article in articles:
	headline = f"{article['title']} {article['publication_date']}"
	preview = article['content'][:PREVIEW_CHARS]
	print(headline, preview, "", sep="\n")

Iran wants indirect talks with US, warns regional countries over strikes against it 2025-04-06T09:58:30Z

Israeli military changes initial account of Gaza aid worker killings 2025-04-06T09:56:05Z
JERUSALEM, April 6 (Reuters) - The Israeli military has provided new details that changed its initial account of the killing of 15 emergency workers near the southern Gaza city of Rafah last month but

Russian missile strike kills one, injures three in Kyiv, Ukraine says 2025-04-06T09:29:15Z
KYIV, April 6 (Reuters) - A Russian missile attack on Kyiv killed one man and injured three other people overnight, causing damage and fires in several districts in the biggest such attack on Ukraine 

As judges stymie Trump with nationwide orders, pressure builds on US Supreme Court 2025-04-06T10:22:30Z
WASHINGTON, April 6 (Reuters) - Republican President Donald Trump and his Democratic predecessor Joe Biden may not agree on much, but there is one issue on which they have been united: The need to blu

EU 

## Test the pipeline

In [116]:
import random

# Select a random article, preferring those whose body text was actually scraped:
# an article with empty 'content' gives the graph step nothing to process.
# Falls back to the full list when no article has any text.
articles_with_text = [article for article in articles if article['content'].strip()]
random_article = random.choice(articles_with_text or articles)

print("Title:", random_article['title'])
content = random_article['content']

Title: As judges stymie Trump with nationwide orders, pressure builds on US Supreme Court


In [117]:
# Run the full pipeline on the selected article's text and print the resulting RDF/XML
g, graph = create_graph(content)
print(graph)



<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
   xmlns:ns1="http://example.org/"
   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
>
  <rdf:Description rdf:about="http://example.org/Trumps">
    <rdf:type rdf:resource="http://example.org/Person"/>
    <rdf:type rdf:resource="http://example.org/Location"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://example.org/that">
    <ns1:stop rdf:resource="http://example.org/government_policy"/>
    <ns1:stop_in rdf:resource="http://example.org/it"/>
    <ns1:offer rdf:resource="http://example.org/relief"/>
    <ns1:offer_to rdf:resource="http://example.org/specific_plaintiff"/>
    <ns1:deny rdf:resource="http://example.org/citizenship"/>
    <ns1:deny_to rdf:resource="http://example.org/baby"/>
    <ns1:applied_beyond rdf:resource="http://example.org/plaintiff"/>
    <ns1:applied_in rdf:resource="http://example.org/case"/>
    <ns1:applied_In rdf:resource="http://example.org/one_thousand"/>
    <ns1:protected rdf:resourc