# Machine Learning
## Loading Data

In [2]:
import pandas as pd

df = pd.read_parquet('data/ml_data.parquet')
df.head()


Unnamed: 0,inputs,target,n_posts
0,"[0.45066947, 0.22533473, 0.8637831, 0.13968499...",18,30
1,"[0.4656903, 0.23284516, 0.8537656, -0.02034902...",12,16
2,"[0.48154342, 0.24077171, 0.84270096, 0.3549081...",34,104
3,"[0.48154342, 0.24077171, 0.84270096, -0.196450...",20,35
4,"[0.49827287, 0.24913643, 0.8304548, 0.1336856,...",3,4


## Loading df into torch, train test split

In [61]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split

# Using cuda
my_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('We are using ', my_device)

# Create a TensorDataset
inputs = torch.tensor(df['inputs'].tolist(), device=my_device)
targets = torch.tensor(df['target'].values, device=my_device)
dataset = TensorDataset(inputs, targets)

# Split into training and test set
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


We are using  cuda


## Model definitions
### Deep regression network

In [64]:
class RegressionNN(nn.Module):
    def __init__(self, drop, layer_sizes):
        super(RegressionNN, self).__init__()

        # Create layers dynamically
        self.layers = nn.ModuleList()
        for i in range(len(layer_sizes)-1):
            self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i+1]))

        # Dropout layer
        self.dropout = nn.Dropout(drop)

        # Output layer
        self.output_layer = nn.Linear(layer_sizes[-1], 1)

    def forward(self, x):
        # Forward pass through each layer with ReLU activation and dropout
        for layer in self.layers:
            x = F.relu(layer(x))
            x = self.dropout(x)

        # Forward pass through the output layer
        x = self.output_layer(x)
        return x

model = RegressionNN(
    drop = 0.3,
    layer_sizes = [303, 303, 256, 256, 256, 128, 128, 64]
    ).to(my_device)

## Training loop

In [65]:
import torch
import torch.optim as optim

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Number of epochs
epochs = 200

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(my_device), targets.to(my_device)
        optimizer.zero_grad()
        outputs = model(inputs.float())
        loss = criterion(outputs.squeeze(), targets.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")


Epoch 1/200, Loss: 340.2196580260044
Epoch 2/200, Loss: 297.0465605284216
Epoch 3/200, Loss: 285.3003563527084
Epoch 4/200, Loss: 278.00222415890374
Epoch 5/200, Loss: 270.8819308230397
Epoch 6/200, Loss: 265.32222969034956
Epoch 7/200, Loss: 261.49599332607255
Epoch 8/200, Loss: 257.31408604631997
Epoch 9/200, Loss: 253.51739373392437
Epoch 10/200, Loss: 251.2925152896571
Epoch 11/200, Loss: 248.54408621029802
Epoch 12/200, Loss: 245.60065855238548
Epoch 13/200, Loss: 242.913513230772
Epoch 14/200, Loss: 241.3364811651277
Epoch 15/200, Loss: 239.03457661315326
Epoch 16/200, Loss: 237.31969179955473
Epoch 17/200, Loss: 234.76920635034676
Epoch 18/200, Loss: 233.13777491566148
Epoch 19/200, Loss: 231.66937827726977
Epoch 20/200, Loss: 229.6819375833437
Epoch 21/200, Loss: 228.20610024314044
Epoch 22/200, Loss: 227.37804649238453
Epoch 23/200, Loss: 225.8756240079765
Epoch 24/200, Loss: 223.70189172387543
Epoch 25/200, Loss: 222.27757913063778
Epoch 26/200, Loss: 221.34309499592325
Epoch

## Evaluation

In [67]:
model.eval()
test_loss = 0
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(my_device), targets.to(my_device)
        outputs = model(inputs.float())
        loss = criterion(outputs.squeeze(), targets.float())
        test_loss += loss.item()
print(f"Test Loss: {test_loss/len(test_loader)}")


Test Loss: 251.99174196897042


## Saving the model

In [13]:
model_path = 'reg_model.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")


Model saved to rnn_model.pth


## Testing on the current frontpage

In [109]:
import numpy as np
from datetime import datetime, timedelta
from time import sleep
from bs4 import BeautifulSoup
import requests
import spacy

def get_articles(date):
    link = f'https://www.derstandard.at/frontpage/{date.strftime('%Y/%m/%d')}'
    # fetch the html content of a derstandard.at page
    response = requests.get(link, cookies={'DSGVO_ZUSAGE_V1': 'true'})
    soup = BeautifulSoup(response.content, 'html.parser')
    # get the articles
    articles_dict = {}
    articles = soup.select('div.chronological>section article')
    for article in articles:
        title_tag = article.find('a')
        if title_tag and title_tag.has_attr('title'):
            title = title_tag['title']
            articles_dict[title] = article
    # make a list of the articles
    HOST = 'https://www.derstandard.at'
    article_data = []
    for title, article in articles_dict.items():
        data = {
            'title': title,
            'teaser-subtitle': None,
            'link': None,
            'time': None,
            'teaser-kicker': None,
            'n_posts': None,
            'storylabels': None
        }
        link = article.find('a')['href']
        if not link.startswith(HOST):
            link = HOST + link
        data['link'] = link
        time = [tag for tag in article.find_all('time') if 'datetime' in tag.attrs][0]
        data['time'] = time['datetime'].rstrip('\r\n')
        n_posts = article.find('div', 'teaser-postingcount')
        try: data['n_posts'] = int(n_posts.get_text(strip=True).rstrip('Posting').replace('.', ''))
        except: data['n_posts'] = 0
        for tag, class_name in [('p', 'teaser-kicker'), 
                                ('p', 'teaser-subtitle'), 
                                ('div', 'storylabels')]:
            found_tag = article.find(tag, class_=class_name)
            if found_tag:
                data[class_name] = found_tag.get_text(strip=True)
        article_data.append(data)
    # make a df
    df = pd.DataFrame(article_data)
    df.columns = df.columns.str.replace('teaser-', '')
    df.rename(columns={'time': 'datetime'}, inplace=True)
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['text'] = df['title'] +  df['kicker'].fillna('') + df['subtitle'].fillna('')
    # add embeddings
    nlp = spacy.load("de_core_news_lg")
    df['doc_vector'] = df['text'].apply(lambda t: nlp(t).vector)
    # add date
    def process_row(row):
        date_vector = np.array([
            row['datetime'].month,
            row['datetime'].weekday(),
            row['datetime'].hour
            ])
        # Normalize the date_vector, ensure it is float32 like the doc vectors
        norm_date_vector = (date_vector / np.linalg.norm(date_vector)).astype(np.float32)
        return np.concatenate((norm_date_vector, row['doc_vector']))
    df['inputs'] = df.apply(process_row, axis=1)
    return df


In [113]:
lastweek = datetime.now() - timedelta(days=7)
lastweek
# print the current date
articles = get_articles(lastweek)

target_map = pd.read_csv('data/target_map.csv')

for index, row in articles.iterrows():
    # Extract the input data and convert it to a tensor
    input_data = torch.tensor(row['inputs'], dtype=torch.float)

    # If your model is on GPU, move the input data to GPU
    input_data = input_data.to(next(model.parameters()).device)

    # Reshape the input data and pass it through the model
    input_data = input_data.unsqueeze(0)  # Add a batch dimension
    with torch.no_grad():  # Ensure gradients are not computed
        output = model(input_data)

    # Convert the output tensor to a Python number and print it
    target_index = int(output.item())
    predicted_b = target_map.iloc[target_index]['bounds']
    actual = row['n_posts']
    print(f"Predicted: {predicted_b}, Actual: {actual}")

Predicted: (36.0, 40.0], Actual: 197
Predicted: (9.0, 11.0], Actual: 2
Predicted: (6.0, 7.0], Actual: 10
Predicted: (14.0, 16.0], Actual: 18
Predicted: (14.0, 16.0], Actual: 4
Predicted: (9.0, 11.0], Actual: 14
Predicted: (176.0, 189.0], Actual: 372
Predicted: (14.0, 16.0], Actual: 11
Predicted: (30.0, 33.0], Actual: 15
Predicted: (30.0, 33.0], Actual: 77
Predicted: (14.0, 16.0], Actual: 4
Predicted: (218.0, 235.0], Actual: 233
Predicted: (11.0, 12.0], Actual: 5
Predicted: (86.0, 93.0], Actual: 402
Predicted: (69.0, 74.446], Actual: 421
Predicted: (86.0, 93.0], Actual: 518
Predicted: (80.0, 86.0], Actual: 17
Predicted: (47.0, 51.0], Actual: 591
Predicted: (22.0, 25.0], Actual: 63
Predicted: (36.0, 40.0], Actual: 294
Predicted: (115.0, 123.0], Actual: 1614
Predicted: (36.0, 40.0], Actual: 52
Predicted: (8.0, 9.0], Actual: 4
Predicted: (30.0, 33.0], Actual: 138
Predicted: (20.0, 22.0], Actual: 124
Predicted: (47.0, 51.0], Actual: 17
Predicted: (43.0, 47.0], Actual: 1862
Predicted: (74.44