## Loading data

In [19]:
import os

from bs4 import BeautifulSoup

# Define the directory where your files are located
directory = 'dataset/reuters21578/'

# Function to extract topics and body content from a Reuters entry

# Set of desired topics
desired_topics = {'earn', 'acq', 'crude', 'corn'}

# Function to extract topics and body content from a Reuters entry
def extract_topics_and_body(reuter):
    # First find the TOPICS tag
    topics_tag = reuter.find('topics')
    # Then find all D tags within the TOPICS tag
    topics = [topic.text for topic in topics_tag.find_all('d')] if topics_tag else []
    body = reuter.find('body')
    body_content = body.text if body else None

    # Keep only the desired topics, if there is exactly one match
    matched_topics = [topic for topic in topics if topic in desired_topics]

    if len(matched_topics) == 1:
        # Return the single matched topic and the body content
        return matched_topics[0], body_content
    else:
        # Return None to indicate no topic or multiple topics found
        return None, None

# List to hold all articles from all files
all_articles = []

# Counter for 'earn' topics
earn_counter = 0
acq_counter = 0
crude_counter = 0

# Process the first N files
for i in range(10):
    filename = f'reut2-00{i}.sgm' if i < 10 else f'reut2-0{i}.sgm'
    file_path = os.path.join(directory, filename)

    # Open and parse the file
    with open(file_path, 'r', encoding='latin1') as file:
        parsed_content = BeautifulSoup(file.read(), 'html.parser')

        # Extract information from each Reuters entry
        for reuter in parsed_content.find_all('reuters'):
            topic, body_content = extract_topics_and_body(reuter)

            # Continue to the next iteration if body_content is empty or None
            if not body_content:
                continue

            # Check if the topic is 'earn'
            if topic == 'earn':
                earn_counter += 1
                # Only add every fifth 'earn' entry
                if earn_counter % 15 == 0:
                    all_articles.append({'topic': topic, 'body': body_content})
             # Check if the topic is 'acq'
            elif topic == 'acq':
                acq_counter += 1
                # Only add every second 'earn' entry
                if acq_counter % 9 == 0:
                    all_articles.append({'topic': topic, 'body': body_content})
             # Check if the topic is 'acq'
            elif topic == 'crude':
                crude_counter += 1
                # Only add every second 'earn' entry
                if crude_counter % 2 == 0:
                    all_articles.append({'topic': topic, 'body': body_content})
            elif topic:
                # Add all other topics as usual
                all_articles.append({'topic': topic, 'body': body_content})


#
## Extracted data display

In [20]:
from collections import Counter

# Initialize a counter for all topics
topics_counter = Counter()

# Iterate over each article and update the count of the topic
for article in all_articles:
    # The topic is now a string, not a list, so we increment its count by 1
    topics_counter[article['topic']] += 1

# Now we have a counter object with all topics and their counts
# Let's convert it to a list of (topic, count) pairs for easier viewing
topics_count_list = topics_counter.most_common()

all_articles = all_articles[:480]

print("Number of articles: ", len(all_articles))
print()
for topic, count in topics_count_list:
    print(f"Topic: {topic}, Count: {count}")


Number of articles:  480

Topic: earn, Count: 139
Topic: crude, Count: 131
Topic: acq, Count: 117
Topic: corn, Count: 116


#
#
#
# Train SVM with the SSK kernel

#
## Prepare text data for the SVM

In [23]:
from sklearn.model_selection import train_test_split

# Split your articles into the text content and labels
X = [article['body'] for article in all_articles]  # list of text contents
y = [article['topic'] for article in all_articles]  # list of corresponding labels

# Since the SSK kernel expects the input as arrays of strings, we need to reshape our data
# Convert the lists to numpy arrays and reshape them to be 2D with one column
import numpy as np
X = np.array(X).reshape(-1, 1)  # Text data
y = np.array(y)  # Labels

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=90, train_size=370, random_state=42)


480


#
## Effectiveness of Varying Sequence length

In [24]:
from sklearn.svm import SVC
from string_kernel import string_kernel

# Define the SSK kernel parameters
max_substring = 5
lambda_decay = 0.8

# Assuming `string_kernel` is properly imported and callable
# Define the wrapper function for the string kernel to be used with scikit-learn
def get_ssk_kernel_for_scikit(max_substring, lambda_decay):
    def ssk_kernel(X, Y):
        if X.ndim < 2:
            X = np.array(X).reshape(-1, 1)
        if Y.ndim < 2:
            Y = np.array(Y).reshape(-1, 1)

        return string_kernel(X, Y, max_substring, float(lambda_decay))
    return ssk_kernel

# Create the custom SSK kernel function for scikit-learn
my_ssk_kernel = get_ssk_kernel_for_scikit(max_substring, lambda_decay)

# Create the SVM classifier using the custom kernel
clf = SVC(kernel=my_ssk_kernel)

# Train the SVM classifier
clf.fit(X_train, y_train)

# Now you can make predictions and evaluate the classifier
# ...


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return string_kernel(X, Y, max_substring, float(lambda_decay))


### Make predictions on the test data

In [26]:
from sklearn.metrics import classification_report, accuracy_score

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return string_kernel(X, Y, max_substring, float(lambda_decay))


KeyboardInterrupt: 