In [1]:
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus; nltk reports when the package is already up to date
nltk.download('stopwords')

# Load stop words
# Built once as a set so filter1 can subtract it from each word set
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\NEELKANTH
[nltk_data]     RAWAT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Function to apply filter 1: Check common non-stop-words
def filter1(headline, first_sentence):
    """Return True when the headline and first sentence share a non-stop-word.

    Both texts are lowercased and split on whitespace; tokens present in the
    module-level ``stop_words`` set are discarded before the comparison.

    Args:
        headline: Headline text.
        first_sentence: First sentence of the article.

    Returns:
        True if at least one remaining token occurs in both texts, else False.
    """
    def content_tokens(text):
        # Distinct lowercase tokens with stop words removed
        return {token for token in text.lower().split() if token not in stop_words}

    return not content_tokens(headline).isdisjoint(content_tokens(first_sentence))

# Function to apply filter 2: Check for bylines or extraneous marks
def filter2(headline):
    """Return True when the headline has no byline and no extraneous dash marks.

    A byline is the standalone word "by" (any letter case) followed by
    whitespace and a word, e.g. "... rule india by UNK roy". Extraneous marks
    are an em dash or a double hyphen.

    Args:
        headline: Headline text to inspect.

    Returns:
        False if a byline or one of the marks is found, True otherwise.
    """
    # \b anchors "by" at the start of a word, so words that merely end in
    # "by" ("baby boom", "rugby final") are not mistaken for bylines.
    byline_pattern = re.compile(r'\bby\s+\w+', re.IGNORECASE)
    extraneous_marks = ('—', '--')
    if byline_pattern.search(headline):
        return False
    return not any(mark in headline for mark in extraneous_marks)

# Function to apply filter 3: Check for question marks or colons
def filter3(headline):
    """Return True when the headline contains neither a colon nor a question mark."""
    return all(mark not in headline for mark in (':', '?'))

### Assigning an ID to each line

In [3]:
def add_ids_to_file(input_file, output_file):
    """Write a copy of ``input_file`` with a 1-based line number and a tab before each line.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the numbered copy to write (overwritten if present).
    """
    with open(input_file, 'r') as source, open(output_file, 'w') as target:
        target.writelines(
            f"{line_no}\t{text}" for line_no, text in enumerate(source, start=1)
        )

# Raw aligned test set and the ID-prefixed copy derived from it
input_file, output_file = 'alignedtest.txt', 'alignedtest_with_ids.txt'

# Produce the numbered copy
add_ids_to_file(input_file, output_file)


In [4]:
def check_filters(test_file):
    """Apply the three headline filters to every pair in an ID-prefixed file.

    Each non-blank line is expected to hold
    ``id<TAB>first_sentence<TAB>headline``. Prints the number of pairs and,
    for each filter and for all filters combined, the pass count and
    percentage.

    Args:
        test_file: Path to the tab-separated file.

    Returns:
        dict with keys 'filter1_fail_ids', 'filter2_fail_ids' and
        'filter3_fail_ids'; each value is the list of ID strings that failed
        that filter, in file order.
    """
    with open(test_file, 'r') as file:
        # Blank lines are skipped so they are neither counted as pairs nor
        # break the field unpacking below.
        rows = [line.strip().split('\t') for line in file if line.strip()]

    total_pairs = len(rows)
    pass_counts = {1: 0, 2: 0, 3: 0}
    fail_ids = {1: [], 2: [], 3: []}
    all_filters_pass = 0

    for parts in rows:
        # strip() removes a trailing tab, so a pair whose headline (or
        # sentence) is empty yields fewer than three fields; treat the
        # missing ones as empty strings instead of raising IndexError.
        parts += [''] * (3 - len(parts))
        idx, first_sentence, headline = parts[0], parts[1], parts[2]

        outcomes = {
            1: filter1(headline, first_sentence),
            2: filter2(headline),
            3: filter3(headline),
        }
        for filter_num, passed in outcomes.items():
            if passed:
                pass_counts[filter_num] += 1
            else:
                fail_ids[filter_num].append(idx)

        if all(outcomes.values()):
            all_filters_pass += 1

    def percent(count):
        # An empty file has no pairs: report 0% rather than dividing by zero.
        return (count / total_pairs) * 100 if total_pairs else 0.0

    print(f'Total pairs: {total_pairs}')
    for filter_num in (1, 2, 3):
        count = pass_counts[filter_num]
        print(f'Pairs passing filter {filter_num}: {count} ({percent(count):.2f}%)')
    print(f'Pairs passing all filters: {all_filters_pass} ({percent(all_filters_pass):.2f}%)')

    return {
        'filter1_fail_ids': fail_ids[1],
        'filter2_fail_ids': fail_ids[2],
        'filter3_fail_ids': fail_ids[3]
    }

In [5]:
# Specify the path to the test set file with IDs
# (same file name that the add_ids_to_file call above writes)
test_file = 'alignedtest_with_ids.txt'

# Run the filter checks and get the IDs of lines that do not pass the filters
# check_filters also prints the per-filter pass counts and percentages
failed_ids = check_filters(test_file)
# Preview only the first five failing IDs per filter to keep the output short
print(f"IDs of a few lines failing filter 1: {failed_ids['filter1_fail_ids'][:5]}")
print(f"IDs of a few lines failing filter 2: {failed_ids['filter2_fail_ids'][:5]}")
print(f"IDs of a few lines failing filter 3: {failed_ids['filter3_fail_ids'][:5]}")

Total pairs: 1951
Pairs passing filter 1: 1645 (84.32%)
Pairs passing filter 2: 1798 (92.16%)
Pairs passing filter 3: 1602 (82.11%)
Pairs passing all filters: 1246 (63.86%)
IDs of a few lines failing filter 1: ['10', '44', '66', '82', '91']
IDs of a few lines failing filter 2: ['6', '11', '23', '25', '27']
IDs of a few lines failing filter 3: ['6', '41', '42', '43', '51']


In [6]:
import random

# Load the test file to get statements
def load_statements(test_file):
    """Load the sentence/headline pairs of an ID-prefixed file into a dict.

    Each non-blank line is expected to hold
    ``id<TAB>first_sentence<TAB>headline``.

    Args:
        test_file: Path to the tab-separated file.

    Returns:
        dict mapping each ID string to a ``(first_sentence, headline)`` tuple.
        If an ID occurs more than once, the last occurrence wins.
    """
    statements = {}
    with open(test_file, 'r') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank line: nothing to record
                continue
            parts = stripped.split('\t')
            # strip() removes a trailing tab, so an empty headline leaves
            # fewer than three fields; pad instead of raising IndexError.
            parts += [''] * (3 - len(parts))
            idx, first_sentence, headline = parts[0], parts[1], parts[2]
            statements[idx] = (first_sentence, headline)

    return statements

# Print random examples of statements failing each filter
def print_filter_examples(statements, failed_ids, filter_num, num_examples=3):
    """Print randomly chosen statements that failed a given filter.

    Args:
        statements: dict mapping ID to a ``(first_sentence, headline)`` tuple.
        failed_ids: IDs that failed the filter.
        filter_num: Filter number shown in the heading.
        num_examples: Maximum number of examples to print (default 3).
    """
    print(f"\nExamples of statements failing filter {filter_num}:")
    candidates = list(failed_ids)
    # Sample without replacement and cap at the number of failures: the same
    # example is never printed twice and an empty list prints only the heading
    # instead of raising IndexError.
    sample_size = max(0, min(num_examples, len(candidates)))
    for idx in random.sample(candidates, sample_size):
        if idx in statements:
            print(f"ID: {idx}, Sentence: {statements[idx][0]}, Headline: {statements[idx][1]}")
        else:
            print(f"No statement found for ID: {idx}")

# Specify the path to the test set file with IDs
test_file = 'alignedtest_with_ids.txt'

# Run the filter checks and get the IDs of lines that do not pass the filters
# NOTE(review): this repeats the check_filters call from the previous cell, so
# the pass-rate summary is printed a second time in this cell's output.
failed_ids = check_filters(test_file)

# Load statements
# Maps each ID to its (first_sentence, headline) pair for lookup below
statements = load_statements(test_file)

# Print examples for each filter
# NOTE(review): no random seed is set in the visible cells, so the examples
# printed here will presumably differ from run to run.
print_filter_examples(statements, failed_ids['filter1_fail_ids'], 1)
print_filter_examples(statements, failed_ids['filter2_fail_ids'], 2)
print_filter_examples(statements, failed_ids['filter3_fail_ids'], 3)


Total pairs: 1951
Pairs passing filter 1: 1645 (84.32%)
Pairs passing filter 2: 1798 (92.16%)
Pairs passing filter 3: 1602 (82.11%)
Pairs passing all filters: 1246 (63.86%)

Examples of statements failing filter 1:
ID: 1208, Sentence: UNK lindner watches her boys asleep in a sofa bed ., Headline: keeping together in tough times
ID: 1838, Sentence: UNK, Headline: britons ready to pay more on health services
ID: 1572, Sentence: st. louis - it 's been so long now - eight years - that the gold medal seems as if it happened to someone else ., Headline: lipinski treasures golden memories

Examples of statements failing filter 2:
ID: 662, Sentence: the shape of india 's future government could be decided sunday as politicians huddled into meetings to work out alliances between parties and cobble together a coalition ., Headline: parties meet to work out a coalition to rule india by UNK roy
ID: 119, Sentence: the launch of a long-range north korean missile would provide the first real test of 