# Text Processing

This notebook imports scraped data, splits text column to words, extracts and normalizes tokens. Next, tokens are converted to embeddings.

In [1]:
### Setup

# pip install spacy
# python -m spacy download de_core_news_sm  - for small model (13 MB)
# python -m spacy download de_core_news_md  - for medium model (42 MB)
# source - https://spacy.io/models/de


import spacy
from spacy.lang.de.examples import sentences 


# Example from documentation
nlp = spacy.load("de_core_news_sm")
doc = nlp(sentences[0])
print(doc.text)
for token in doc:
    print(token.text, token.pos_, token.dep_)

Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen
Die DET nk
ganze ADJ nk
Stadt NOUN sb
ist AUX ROOT
ein DET nk
Startup NOUN pd
: PUNCT punct
Shenzhen NOUN sb
ist AUX cj
das DET nk
Silicon PROPN pnc
Valley PROPN sb
für ADP mnr
Hardware-Firmen NOUN nk


In [3]:
## Data loading

import numpy as np
import pandas as pd

standard_data = pd.read_csv('./data/4yrs_derstandard_frontpage_data.csv')
standard_data

Unnamed: 0,title,subtitle,link,datetime,kicker,n_posts,storylabels
0,Real Madrid stolpert mit Aluminiumpech im Tite...,Die Königlichen können Bilbao daheim nicht bes...,https://www.derstandard.at/story/2000112599363...,2019-12-22T23:44,Primera Division,30,
1,Bolivien weist venezolanische Diplomaten aus,InterimspräsidentinJeanine Áñez wirft denBotsc...,https://www.derstandard.at/story/2000112598924...,2019-12-22T22:50,Übergangsregierung,16,
2,Erdoğan warnt vor neuer Flüchtlingswelle aus S...,"Türkischer Präsident: ""80.000 Menschen Richtun...",https://www.derstandard.at/story/2000112598130...,2019-12-22T21:43,Bürgerkrieg,104,
3,Massenkarambolage mit 63 Fahrzeugen in Virginia,Autos stießen auf vereister Brücke zusammen,https://www.derstandard.at/story/2000112597972...,2019-12-22T21:29,Weihnachtsverkehr,35,
4,"Salzburg schlägt Caps, Meister KAC mit vierter...",Die Bullen sind damit der Gewinner der Runde: ...,https://www.derstandard.at/story/2000112595206...,2019-12-22T20:54,Eishockey,4,
...,...,...,...,...,...,...,...
182102,Wer braucht die Kirche?,Dass sich die Kirche nach soschwerwiegendenVer...,https://www.derstandard.at/story/3000000200743...,2023-12-22T06:00,Dominik Straub,1,Kommentar
182103,Sonderregelung verlängert: Mehr als 1.000 Ärzt...,"Der""Pandemieparagraf""im Ärztegesetz hat mehr a...",https://www.derstandard.at/story/3000000200621...,2023-12-22T06:00,Pandemieparagraf,148,
182104,"Stadtforscher: ""Architektur ist Teil unserer W...",Jetzt anhören: In Zukunft müssen Städte wieder...,https://www.derstandard.at/story/3000000200499...,2023-12-22T06:00,Edition Zukunft,54,Podcast
182105,David Alaba zum zehnten Mal zu Österreichs Fuß...,Zehn von zwölf Trainern wählten den derzeit ve...,https://www.derstandard.at/story/3000000200745...,2023-12-22T05:46,Fußball,34,


## Part 1,2: Tokenizing + lemmatizing text columns

First, title and subtitile columns are split into individual words. Second, words are replaced by their standard forms (großem - groß, rettete - retten). Finally, the list of standard forms is joined back to a string.

In [6]:
# Initializing two columns
standard_data['title_tokens'] = standard_data['title']
standard_data['subtitle_tokens'] = standard_data['subtitle']

standard_data['title_vectors'] = standard_data['title_tokens']
standard_data['subtitle_vectors'] = standard_data['subtitle_tokens']

# Load model
nlp = spacy.load("de_core_news_sm")
nlp.get_pipe('lemmatizer')
nlp.get_pipe("tok2vec")

<spacy.pipeline.tok2vec.Tok2Vec at 0x2c9a850e7b0>

In [7]:
# loops through rows

for index, row in standard_data.iterrows():
    
    text = nlp(row['title_tokens'].replace("-", " ").replace(",", " ").replace(": ", " "))
    token_list = ' '.join([token.lemma_ for token in text]) # list of standardized tokens joined to a string

    vector_list = [token.vector for token in text]

    standard_data.at[index, 'title_tokens'] = token_list # store in the dataframe
    standard_data.at[index, 'title_vectors'] = vector_list 

    # try except because subtitle is sometimes empty, then it outputs error

    try:

        text = nlp(row['subtitle_tokens'])
        token_list = ' '.join([token.lemma_ for token in text]) # list of standardized tokens joined to a string
        vector_list = [token.vector for token in text]

    except:
        token_list = '' # if empty, token list empty
        token_list = []

    standard_data.at[index, 'subtitle_tokens'] = token_list # store in the dataframe
    standard_data.at[index, 'subtitle_vectors'] = vector_list



In [8]:
standard_data

Unnamed: 0,title,subtitle,link,datetime,kicker,n_posts,storylabels,title_tokens,subtitle_tokens,title_vectors,subtitle_vectors
0,Real Madrid stolpert mit Aluminiumpech im Tite...,Die Königlichen können Bilbao daheim nicht bes...,https://www.derstandard.at/story/2000112599363...,2019-12-22T23:44,Primera Division,30,,Real Madrid stolpern mit Aluminiumpech in Tite...,der Königlich können Bilbao daheim nicht besiegen,"[[0.96273005, 2.1436386, 0.8224912, 0.13955253...","[[-3.936081, -2.228262, 1.3592032, 0.5570401, ..."
1,Bolivien weist venezolanische Diplomaten aus,InterimspräsidentinJeanine Áñez wirft denBotsc...,https://www.derstandard.at/story/2000112598924...,2019-12-22T22:50,Übergangsregierung,16,,Bolivien weisen venezolanisch Diplomat aus,InterimspräsidentinJeanine Áñez werfen denBots...,"[[-1.4946201, 1.7278106, -1.7266822, 2.2589207...","[[1.1945176, 2.054468, -1.6709826, -2.319276, ..."
2,Erdoğan warnt vor neuer Flüchtlingswelle aus S...,"Türkischer Präsident: ""80.000 Menschen Richtun...",https://www.derstandard.at/story/2000112598130...,2019-12-22T21:43,Bürgerkrieg,104,,Erdoğan warnen vor neu Flüchtlingswelle aus Sy...,Türkischer Präsident -- -- 80.000 Mensch Richt...,"[[-0.95836586, 1.7555954, 3.0623906, -2.843035...","[[-0.35086232, -1.4924178, 0.8819543, -5.56001..."
3,Massenkarambolage mit 63 Fahrzeugen in Virginia,Autos stießen auf vereister Brücke zusammen,https://www.derstandard.at/story/2000112597972...,2019-12-22T21:29,Weihnachtsverkehr,35,,Massenkarambolage mit 63 Fahrzeug in Virginia,Auto stoßen auf vereist Brücke zusammen,"[[0.51267594, 2.9074597, -4.5659633, -0.097496...","[[-4.7611976, -3.1829047, -1.3174409, 1.369393..."
4,"Salzburg schlägt Caps, Meister KAC mit vierter...",Die Bullen sind damit der Gewinner der Runde: ...,https://www.derstandard.at/story/2000112595206...,2019-12-22T20:54,Eishockey,4,,Salzburg schlagen Caps Meister KAC mit viert...,der Bulle sein damit der Gewinner der Runde --...,"[[0.32886082, 3.6778326, -1.9720674, 0.3804658...","[[-3.6109104, -2.2922082, 0.29590467, -1.18459..."
...,...,...,...,...,...,...,...,...,...,...,...
182102,Wer braucht die Kirche?,Dass sich die Kirche nach soschwerwiegendenVer...,https://www.derstandard.at/story/3000000200743...,2023-12-22T06:00,Dominik Straub,1,Kommentar,wer brauchen der Kirche --,dass sich der Kirche nach soschwerwiegendenVer...,"[[-1.3705893, 0.8805902, -2.953909, 2.2801714,...","[[0.1805143, -3.8692951, -1.4018897, -3.042925..."
182103,Sonderregelung verlängert: Mehr als 1.000 Ärzt...,"Der""Pandemieparagraf""im Ärztegesetz hat mehr a...",https://www.derstandard.at/story/3000000200621...,2023-12-22T06:00,Pandemieparagraf,148,,Sonderregelung verlängern mehr als 1.000 Arzt ...,der -- Pandemieparagraf -- in Ärztegesetz habe...,"[[-1.371767, 3.4383852, -3.9077249, -1.940309,...","[[0.8398843, -0.224679, 0.9350044, -3.3680367,..."
182104,"Stadtforscher: ""Architektur ist Teil unserer W...",Jetzt anhören: In Zukunft müssen Städte wieder...,https://www.derstandard.at/story/3000000200499...,2023-12-22T06:00,Edition Zukunft,54,Podcast,Stadtforscher -- Architektur sein Teil unser W...,jetzt anhören -- in Zukunft müssen Stadt wiede...,"[[0.13110606, 2.49866, -1.1296308, -2.0916157,...","[[-1.1108141, 0.34965265, -2.2161102, 2.272627..."
182105,David Alaba zum zehnten Mal zu Österreichs Fuß...,Zehn von zwölf Trainern wählten den derzeit ve...,https://www.derstandard.at/story/3000000200745...,2023-12-22T05:46,Fußball,34,,David Alaba zu zehnter Mal zu Österreich Fußba...,zehn von zwölf Trainer wählen der derzeit verl...,"[[-0.6869915, -1.9356687, 0.2901172, -2.323410...","[[0.40371332, -0.29915434, 0.6121069, 1.124182..."


## Step 3: Embeddings

Two new columns are created. They contain lists of vectors. A vector represents embedding of a single token, a title then is converted to a list of such token. The numeric representation of text enables us standard training of a neural net with the vectors as inputs.

**note: moved up**

standard_data['title_vectors'] = standard_data['title_tokens']
standard_data['subtitle_vectors'] = standard_data['subtitle_tokens']
nlp.get_pipe("tok2vec")

for index, row in standard_data.iterrows():
    
    token_list = nlp(row['title_tokens'])
    vector_list = [token.vector for token in token_list]
    standard_data.at[index, 'title_vectors'] = vector_list 

    try:

        token_list = nlp(row['subtitle_tokens'])
        vector_list = [token.vector for token in token_list]

    except:
        token_list = []

    standard_data.at[index, 'subtitle_vectors'] = vector_list



In [9]:
standard_data.iloc[1,10]

[array([ 1.1945176e+00,  2.0544679e+00, -1.6709826e+00, -2.3192761e+00,
        -2.8761718e+00, -3.6133254e+00,  1.1357737e+00,  3.5059733e+00,
        -1.5503708e+00, -2.2641780e+00,  8.0017340e-01,  2.8729141e-02,
         7.8730440e-01,  5.7879090e-03, -4.0702635e-01, -1.4896834e+00,
        -3.0737720e+00, -1.9451411e+00, -8.8539243e-02, -3.5955997e+00,
         1.0919449e+00,  1.0278519e+00,  3.0935278e+00, -3.0229239e+00,
        -1.7530608e+00, -1.5500312e+00, -1.8601739e+00,  8.8529861e-01,
         1.8957378e+00,  1.0409238e+00,  4.1576304e+00,  1.4223669e+00,
        -3.3631283e-01, -4.7835767e-01, -7.8345305e-01,  7.8910856e+00,
        -2.2569852e+00,  1.0568581e+00,  1.2868596e+00,  3.6069062e+00,
        -2.6259234e+00,  2.7638268e+00, -5.3197646e-01,  1.8343658e+00,
         1.2027991e+00, -3.7033862e-01, -4.5672369e-01,  7.3619461e+00,
        -5.8385782e+00,  2.9527030e+00, -4.0685368e+00,  5.3899269e+00,
         8.3109838e-01,  8.0962420e-01, -7.4443048e-01, -7.33091

In [10]:
import numpy as np

standard_data_copy = standard_data

for index,row in standard_data_copy.iterrows():

    l = standard_data_copy.loc[index, 'title_vectors']
    standard_data_copy.at[index, 'title_vectors'] = np.sum(l, axis=0)

    l = standard_data_copy.loc[index, 'subtitle_vectors']
    standard_data_copy.at[index, 'subtitle_vectors'] = np.sum(l, axis=0)

In [11]:
X = np.vstack(standard_data_copy['title_vectors'].to_numpy())
X



array([[  9.081697  ,   6.8953657 ,  -3.216856  , ...,   8.002752  ,
        -10.435414  ,   8.292292  ],
       [  6.5489306 ,  11.075163  ,   1.0434568 , ...,   5.016325  ,
        -11.797001  ,  -2.2782614 ],
       [  6.039239  ,   3.42458   ,   7.0853643 , ...,   6.87098   ,
        -14.393793  ,   4.5429664 ],
       ...,
       [-12.674403  ,   2.175221  ,   3.6549788 , ...,  11.986953  ,
          3.295246  ,  17.236275  ],
       [  4.402149  , -12.800626  ,  14.75838   , ...,   6.024646  ,
         -7.96585   ,   5.300123  ],
       [ 12.515891  ,   3.4930995 ,   0.40148246, ...,   8.258898  ,
          1.8255993 ,  -0.6187186 ]], dtype=float32)

In [12]:
y = (standard_data_copy['n_posts'].to_numpy() > 50).astype(int)
y = np.column_stack(standard_data_copy['n_posts'].to_numpy() > 50).astype(int).T

y.shape
#X.shape

(182107, 1)

In [13]:
from sklearn.model_selection import train_test_split


X_train, X_test, Y_train, Y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42069)


In [14]:
print("Baseline accuracy: ")
np.sum(Y_test)/np.size(Y_test)

Baseline accuracy: 


0.5655098566800285

In [15]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [16]:
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

Y_train = torch.tensor(Y_train, dtype=torch.float32).reshape(-1, 1)
Y_test = torch.tensor(Y_test, dtype=torch.float32).reshape(-1, 1)

In [51]:
class CustomClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(96, 12)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(12, 12)
        self.act2 = nn.ReLU()
        self.output = nn.Linear(12, 1)
        self.act_output = nn.Sigmoid()

    def forward(self, x):
        x = self.act1(self.hidden1(x))
        x = self.act2(self.hidden2(x))
        x = self.act_output(self.output(x))
        return x

model = CustomClassifier()


In [52]:
loss_fn = nn.L1Loss(size_average=None, reduce=None, reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [53]:
import random
random.seed(42069)
n_epochs = 5000
batch_size = int(len(X_train) / 50)
print(batch_size)
for epoch in range(n_epochs):
    for i in range(0, len(X_train), batch_size):
        Xbatch = X_train[i:i+batch_size]
        y_pred = model(Xbatch)
        ybatch = Y_train[i:i+batch_size]
        loss = loss_fn(y_pred, ybatch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Finished epoch {epoch}, latest loss {loss}, accuracy {sum(ybatch == y_pred.round())/len(ybatch)} vs baseline {sum(ybatch) / len(ybatch)}')

2913
Finished epoch 0, latest loss 0.2893819510936737, accuracy tensor([0.7429]) vs baseline tensor([0.6000])
Finished epoch 1, latest loss 0.18757584691047668, accuracy tensor([0.8286]) vs baseline tensor([0.6000])
Finished epoch 2, latest loss 0.1739780455827713, accuracy tensor([0.8286]) vs baseline tensor([0.6000])
Finished epoch 3, latest loss 0.17016100883483887, accuracy tensor([0.8286]) vs baseline tensor([0.6000])
Finished epoch 4, latest loss 0.1670149713754654, accuracy tensor([0.8286]) vs baseline tensor([0.6000])
Finished epoch 5, latest loss 0.16128791868686676, accuracy tensor([0.8286]) vs baseline tensor([0.6000])
Finished epoch 6, latest loss 0.14992594718933105, accuracy tensor([0.8571]) vs baseline tensor([0.6000])
Finished epoch 7, latest loss 0.1432872712612152, accuracy tensor([0.8571]) vs baseline tensor([0.6000])
Finished epoch 8, latest loss 0.13580690324306488, accuracy tensor([0.8571]) vs baseline tensor([0.6000])
Finished epoch 9, latest loss 0.1051688566803

In [54]:
Y_hat = model(X_test).round()
# round predictions
Y_hat

tensor([[0.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]], grad_fn=<RoundBackward0>)

In [55]:
print(sum(Y_test == Y_hat)/len(Y_test))
loss = loss_fn(Y_test, Y_hat)

tensor([0.6351])


In [56]:
loss

tensor(0.3649, grad_fn=<MeanBackward0>)

In [57]:
sum(Y_test)/len(Y_test)