In [1]:
!pip install pandas matplotlib networkx seaborn
!pip install pyvis --quiet  # For interactive network graphs
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install langdetect
!pip install validators

# Qixian: This cell takes around 20 seconds and may need to restart kernel

import pandas as pd
import spacy
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import pandas as pd

# Provide the path to your Excel file
file_path = "news_excerpts_parsed.xlsx"

# Load the dataset
df_original = pd.read_excel(file_path)
df = df_original.copy()

# View the first few rows
print(df.head())
print(df.info())


                                                Link  \
0  https://edition.cnn.com/2023/09/29/business/st...   
1  https://www.channelnewsasia.com/singapore/su-w...   
2  https://edition.cnn.com/2023/05/22/tech/meta-f...   
3  https://www.channelnewsasia.com/singapore/bill...   
4  https://edition.cnn.com/2024/03/05/politics/li...   

                                                Text  
0  Starbucks violated federal labor law when it i...  
1  The first suspect to plead guilty in Singapore...  
2  Meta has been fined a record-breaking €1.2 bil...  
3  SINGAPORE: A 45-year-old man linked to Singapo...  
4  The Department of Education imposed a record $...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509 entries, 0 to 1508
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Link    1509 non-null   object
 1   Text    1509 non-null   object
dtypes: object(2)
memory usage: 23.7+ KB
None


In [3]:
df = df_original.copy()

print(df.head())  # View the first few rows
print(df.info())  # Check for missing or null valuesdf = df_original.copy

                                                Link  \
0  https://edition.cnn.com/2023/09/29/business/st...   
1  https://www.channelnewsasia.com/singapore/su-w...   
2  https://edition.cnn.com/2023/05/22/tech/meta-f...   
3  https://www.channelnewsasia.com/singapore/bill...   
4  https://edition.cnn.com/2024/03/05/politics/li...   

                                                Text  
0  Starbucks violated federal labor law when it i...  
1  The first suspect to plead guilty in Singapore...  
2  Meta has been fined a record-breaking €1.2 bil...  
3  SINGAPORE: A 45-year-old man linked to Singapo...  
4  The Department of Education imposed a record $...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509 entries, 0 to 1508
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Link    1509 non-null   object
 1   Text    1509 non-null   object
dtypes: object(2)
memory usage: 23.7+ KB
None


In [4]:
def validate_data(dataframe):
    original_rows = len(dataframe)

    # Check for missing values
    dataframe = dataframe.dropna(subset=['Text'])
    rows_removed_missing = original_rows - len(dataframe)

    # Check for empty strings or whitespaces
    dataframe['content_length'] = dataframe['Text'].apply(lambda x: len(str(x).strip()))
    dataframe = dataframe[dataframe['content_length'] > 0]
    rows_removed_empty = original_rows - len(dataframe) - rows_removed_missing

    # Remove duplicates
    original_rows = len(dataframe)
    dataframe = dataframe.drop_duplicates(subset=['Text'], keep='first')
    rows_removed_duplicates = original_rows - len(dataframe)

    # Check for invalid lengths
    min_length, max_length = 50, 10000
    # Use 'content_length' instead of 'Text_length' for filtering
    dataframe = dataframe[(dataframe['content_length'] >= min_length) & (dataframe['content_length'] <= max_length)]
    rows_removed_length = original_rows - len(dataframe)

    # Final count
    final_rows = len(dataframe)
    rows_removed_total = original_rows - final_rows

    # Summary
    print("Data Validation Summary:")
    print(f"Total Rows Initially: {original_rows}")
    print(f"Rows Removed (Missing Content): {rows_removed_missing}")
    print(f"Rows Removed (Empty Content): {rows_removed_empty}")
    print(f"Rows Removed (Duplicates): {rows_removed_duplicates}")
    print(f"Rows Removed (Invalid Length): {rows_removed_length}")
    print(f"Final Row Count: {final_rows}")
    print(f"Total Rows Removed: {rows_removed_total}")

    return dataframe

df = validate_data(df)

Data Validation Summary:
Total Rows Initially: 1509
Rows Removed (Missing Content): 0
Rows Removed (Empty Content): 0
Rows Removed (Duplicates): 0
Rows Removed (Invalid Length): 0
Final Row Count: 1509
Total Rows Removed: 0


In [5]:
import validators
import requests
from tqdm import tqdm
from langdetect import detect

def additional_checks_with_progress(dataframe):
    """
    Perform additional checks for non-English content and validate URLs.
    Perform accessibility checks only on links with invalid format and remove inaccessible ones.
    """
    original_rows = len(dataframe)  # Initial row count

    # Initialize tqdm progress bar
    tqdm.pandas(desc="Processing rows")

    # 1. Detect Non-English Content
    print("Checking for non-English content...")
    def detect_language_with_progress(text):
        try:
            return detect(text)
        except:
            return "error"

    dataframe['language'] = dataframe['Text'].progress_apply(detect_language_with_progress)
    non_english_rows = dataframe[dataframe['language'] != 'en']  # Identify non-English rows
    dataframe = dataframe[dataframe['language'] == 'en']  # Keep only English rows
    rows_removed_non_english = len(non_english_rows)

    # 2. Validate URLs in the 'Link' column
    print("Validating URL format...")
    def is_format_valid(url):
        """
        Validates the URL format using validators.
        Returns True if the format is valid, False otherwise.
        """
        try:
            return validators.url(url.strip()) is True  # Strip whitespace before validation
        except:
            return False

    dataframe['valid_format'] = dataframe['Link'].progress_apply(is_format_valid)

    # Log invalid format links
    invalid_format_links = dataframe[~dataframe['valid_format']]  # Links with invalid format
    if not invalid_format_links.empty:
        print("\nLinks with Invalid Format:")
        print(invalid_format_links['Link'].tolist())

    # 3. Perform Accessibility Check on Invalid Format Links
    print("Checking accessibility for invalid format links...")
    def is_accessible(url):
        """
        Checks accessibility of a URL using a GET request with headers.
        Returns True if the URL is accessible, False otherwise.
        """
        try:
            # Normalize the URL by stripping spaces and adding http:// if missing
            url = url.strip()
            if not url.startswith(("http://", "https://")):
                url = "http://" + url

            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
            response = requests.get(url, headers=headers, timeout=5)
            return response.status_code < 400  # Accessible if status code is OK
        except:
            return False

    # Perform accessibility check only on invalid format links
    invalid_format_links['accessible'] = invalid_format_links['Link'].progress_apply(is_accessible)

    # Log inaccessible links
    inaccessible_links = invalid_format_links[~invalid_format_links['accessible']]
    if not inaccessible_links.empty:
        print("\nInaccessible Links Removed:")
        print(inaccessible_links['Link'].tolist())

    # Remove inaccessible links
    valid_invalid_links = invalid_format_links[invalid_format_links['accessible']]
    dataframe = pd.concat([dataframe[dataframe['valid_format']], valid_invalid_links])

    # Final row count
    final_rows = len(dataframe)
    rows_removed_invalid_links = len(inaccessible_links)
    rows_removed_total = original_rows - final_rows

    # Print the additional validation summary
    print("\nAdditional Validation Summary:")
    print(f"Total Rows Initially: {original_rows}")
    print(f"Rows Removed (Non-English Content): {rows_removed_non_english}")
    print(f"Rows Removed (Invalid Format and Inaccessible Links): {rows_removed_invalid_links}")
    print(f"Final Row Count After Additional Checks: {final_rows}")
    print(f"Total Rows Removed in Additional Checks: {rows_removed_total}")

    # Return the cleaned dataframe
    return dataframe

# Apply the additional checks with progress bar
df = additional_checks_with_progress(df)


Checking for non-English content...


Processing rows: 100%|██████████| 1509/1509 [00:03<00:00, 435.49it/s]


Validating URL format...


Processing rows: 100%|██████████| 1509/1509 [00:00<00:00, 106274.95it/s]


Checking accessibility for invalid format links...


Processing rows: 0it [00:00, ?it/s]


Additional Validation Summary:
Total Rows Initially: 1509
Rows Removed (Non-English Content): 0
Rows Removed (Invalid Format and Inaccessible Links): 0
Final Row Count After Additional Checks: 1509
Total Rows Removed in Additional Checks: 0





In [6]:
from tqdm import tqdm
import spacy

# Initialize SpaCy model and TQDM progress bar
nlp = spacy.load("en_core_web_sm")
tqdm.pandas(desc="Extracting Entities")

# Define the function for entity extraction
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Apply the function with a progress bar
df['entities'] = df['Text'].progress_apply(extract_entities)

# Define a function for extracting relationships using dependency parsing
def extract_relationships(text):
    """
    Extracts relationships between entities based on dependency parsing.
    Returns a list of (subject, verb, object) tuples.
    """
    doc = nlp(text)
    relationships = []
    for token in doc:
        if token.dep_ in ("nsubj", "dobj") and token.head.pos_ == "VERB":
            relationships.append((token.text, token.head.text, token.head.i))
    return relationships

# Add a progress bar for the relationship extraction
tqdm.pandas(desc="Extracting Relationships")
df['relationships'] = df['Text'].progress_apply(extract_relationships)

# Display a few rows of the DataFrame with entities and relationships
print(df[['Text', 'entities', 'relationships']].head())

Extracting Entities: 100%|██████████| 1509/1509 [00:22<00:00, 65.86it/s]
Extracting Relationships: 100%|██████████| 1509/1509 [00:21<00:00, 71.11it/s]

                                                Text  \
0  Starbucks violated federal labor law when it i...   
1  The first suspect to plead guilty in Singapore...   
2  Meta has been fined a record-breaking €1.2 bil...   
3  SINGAPORE: A 45-year-old man linked to Singapo...   
4  The Department of Education imposed a record $...   

                                            entities  \
0  [(National Labor Relations Board, ORG), (Thurs...   
1  [(first, ORDINAL), (Singapore, GPE), (13 month...   
2  [(Meta, ORG), (€1.2 billion, MONEY), ($1.3 bil...   
3  [(SINGAPORE, GPE), (45-year-old, DATE), (Singa...   
4  [(The Department of Education, ORG), (a record...   

                                       relationships  
0  [(Starbucks, violated, 1), (law, violated, 1),...  
1  [(Wenqiang, admitted, 39), (proceeds, possessi...  
2  [(billion, fined, 3), (laws, violating, 21), (...  
3        [(This, amounts, 50), (Zhang, pleaded, 90)]  
4  [(Department, imposed, 4), (fine, imposed, 4),..




In [7]:
total_entities = df['entities'].apply(len).sum()
total_relationships = df['relationships'].apply(len).sum()

print(f"Total number of entities: {total_entities}")
print(f"Total number of relationships: {total_relationships}")

Total number of entities: 21323
Total number of relationships: 20062


In [8]:
# Build the entity-relationship graph
for _, row in df.iterrows():
    entities = row['entities']
    relationships = row['relationships']

    # Add entities as nodes
    for ent, label in entities:
        entity_graph.add_node(ent, label=label)

    # Add relationships as edges
    for subj, verb, obj_token in relationships:
        obj = obj_token.text
        if subj in entity_graph and obj in entity_graph:
            entity_graph.add_edge(subj, obj, relation=verb)

NameError: name 'entity_graph' is not defined