In [95]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dropout, Dense
from keras.optimizers import RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam, SGD
from keras.layers import Flatten, LSTM, Reshape, BatchNormalization, Activation, UpSampling1D, ZeroPadding1D, PReLU, GRU
from random import shuffle
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import time
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # CPU:-1; GPU0: 1; GPU1: 0;

## Dataset

In [41]:
import pandas as pd
import glob, os, re
import xml.etree.ElementTree as ET

def clean_xml_content(content):
    content = content.replace('<br>', '<br />')  # Fix malformed tags
    content = content.replace('&', '&amp;')  # Replace & with &amp;
    content = re.sub(r'&(?!amp;)', '&amp;', content)  # Fix any remaining unescaped &

    # Replace invalid characters (e.g., ASCII control characters)
    content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', content)

    return content


def parse_xml_content(content):
    data = {}
    root = ET.fromstring(content)

    data['NEWID'] = root.attrib.get('NEWID', '')
    data['DATE'] = root.findtext('DATE', '')
    data['TOPICS'] = ','.join([topic.text for topic in root.findall('.//TOPICS/D')]) if root.find('.//TOPICS') is not None else ''
    data['PLACES'] = ','.join([place.text for place in root.findall('.//PLACES/D')]) if root.find('.//PLACES') is not None else ''
    data['PEOPLE'] = ','.join([person.text for person in root.findall('.//PEOPLE/D')]) if root.find('.//PEOPLE') is not None else ''
    data['ORGS'] = ','.join([org.text for org in root.findall('.//ORGS/D')]) if root.find('.//ORGS') is not None else ''
    data['EXCHANGES'] = ','.join([exchange.text for exchange in root.findall('.//EXCHANGES/D')]) if root.find('.//EXCHANGES') is not None else ''
    data['COMPANIES'] = ','.join([company.text for company in root.findall('.//COMPANIES/D')]) if root.find('.//COMPANIES') is not None else ''
    data['TITLE'] = root.findtext('.//TITLE', '')
    data['DATELINE'] = root.findtext('.//DATELINE', '')
    data['BODY'] = root.findtext('.//BODY', '')
    data['UNKNOWN'] = root.findtext('UNKNOWN', '')
    
    return data

def parse_xml_content_lists(content):
    data = {}
    root = ET.fromstring(content)

    data['NEWID'] = root.attrib.get('NEWID', '')
    data['DATE'] = root.findtext('DATE', '')
    data['TOPICS'] = [topic.text for topic in root.findall('.//TOPICS/D')] if root.find('.//TOPICS') is not None else []
    data['PLACES'] = [place.text for place in root.findall('.//PLACES/D')] if root.find('.//PLACES') is not None else  []
    data['PEOPLE'] = [person.text for person in root.findall('.//PEOPLE/D')] if root.find('.//PEOPLE') is not None else  []
    data['ORGS'] = [org.text for org in root.findall('.//ORGS/D')] if root.find('.//ORGS') is not None else  []
    data['EXCHANGES'] = [exchange.text for exchange in root.findall('.//EXCHANGES/D')] if root.find('.//EXCHANGES') is not None else  []
    data['COMPANIES'] = [company.text for company in root.findall('.//COMPANIES/D')] if root.find('.//COMPANIES') is not None else []
    data['TITLE'] = root.findtext('.//TITLE', '')
    data['DATELINE'] = root.findtext('.//DATELINE', '')
    data['BODY'] = root.findtext('.//BODY', '')
    data['UNKNOWN'] = root.findtext('UNKNOWN', '')
    
    return data

all_data = []
file_list = glob.glob("./reuters21578/reut2-*")
for file_name in file_list:
    content = ""

    with open(os.path.abspath(file_name), "r", encoding="iso-8859-1") as file:
        for line in file:
            if line.startswith("<!DOCTYPE"):
                continue

            content += line
            if line.startswith("</REUTERS>"):
                try:
                    data = parse_xml_content_lists(clean_xml_content(content))
                    all_data.append(data)
                except ET.ParseError as e:
                    print(f"ParseError: {e}")
                content = ""
                
reuters = pd.DataFrame(all_data)

In [42]:
display(reuters.head(5))

Unnamed: 0,NEWID,DATE,TOPICS,PLACES,PEOPLE,ORGS,EXCHANGES,COMPANIES,TITLE,DATELINE,BODY,UNKNOWN
0,4001,11-MAR-1987 18:04:17.59,[],[canada],[],[],[],[],INCO SEES NO MAJOR IMPACT FROM DOW REMOVAL,"TORONTO, March 11 -",Inco Ltd said it did not expect its\nearlier r...,\n&#5;&#5;&#5;M\n&#22;&#22;&#1;f0849&#31;reut...
1,4002,11-MAR-1987 18:06:47.22,[],[usa],[],[],[],[],FORMER EMPIRE OF CAROLINA &lt;EMP> EXEC SENTENCED,"NEW YORK, March 11 -","Mason Benson, former president and\nchief oper...",\n&#5;&#5;&#5;F\n&#22;&#22;&#1;f0852&#31;reut...
2,4003,11-MAR-1987 18:09:39.66,[],[usa],[],[],[],[],"DOCTORS FIND LINK BETWEEN AIDS, SMALLPOX VIRUS","BOSTON, March 11 -",In a discovery that could complicate the\nsear...,\n&#5;&#5;&#5;F\n&#22;&#22;&#1;f0856&#31;reut...
3,4004,11-MAR-1987 18:13:59.93,[],[usa],[],[],[],[],BIRTH CONTROL PILLS HELP PREVENT CANCER - STUDY,"BOSTON, March 11 -",Doctors at the Centers for Disease\nControl in...,\n&#5;&#5;&#5;F\n&#22;&#22;&#1;f0860&#31;reut...
4,4005,11-MAR-1987 18:14:49.93,"[interest, retail, ipi]",[usa],[],[],[],[],U.S. ECONOMIC DATA KEY TO DEBT FUTURES OUTLOOK,"CHICAGO, March 11 -",U.S. economic data this week could be\nthe key...,\n&#5;&#5;&#5;RM C\n&#22;&#22;&#1;f0861&#31;r...


## Model

In [60]:
(x_train, y_train), (x_test, y_test) = keras.datasets.reuters.load_data(
    path="reuters.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    test_split=0.2,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3,
)

In [118]:
from tensorflow.keras.datasets import reuters
from tensorflow.keras.utils import to_categorical

#(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=30000, maxlen=50, test_split=0.3)
(X_train, y_train), (X_test, y_test) = keras.datasets.reuters.load_data(
    path="reuters.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    test_split=0.2,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3,
)


#Padding the data
X_train_pad = pad_sequences(X_train, padding='post')
X_test_pad = pad_sequences(X_test, padding='post')

y_data = np.concatenate([y_train, y_test])
y_data = to_categorical(y_data)

y_train_en = y_data[:len(y_train)]
y_test_en = y_data[len(y_train):]


In [122]:
# Define the model
model = keras.Sequential(
    [
        keras.layers.LSTM(46, input_shape=(50, 1), return_sequences=False),   
        # keras.layers.LSTM(46),   
        keras.layers.Activation('softmax')
    ]
)
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
# model.summary()
model.fit(X_train_pad, y_train_en, epochs=10, batch_size=8)

Epoch 1/150
[1m 123/1123[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m3:35[0m 215ms/step - accuracy: 0.3131 - loss: 0.0211

KeyboardInterrupt: 

In [119]:
model.evaluate(X_test_pad, y_test_en)

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.3664 - loss: 0.0195


[0.019481094554066658, 0.36197686195373535]