# Text Processing

This notebook imports scraped data, splits text column to words, extracts and normalizes tokens. Next, tokens are converted to embeddings.

In [1]:
### Setup

# pip install spacy
# python -m spacy download de_core_news_sm  - for small model (13 MB)
# python -m spacy download de_core_news_md  - for medium model (42 MB)
# source - https://spacy.io/models/de


import spacy
from spacy.lang.de.examples import sentences 


# Example from documentation
nlp = spacy.load("de_core_news_sm")
doc = nlp(sentences[0])
print(doc.text)
for token in doc:
    print(token.text, token.pos_, token.dep_)

Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen
Die DET nk
ganze ADJ nk
Stadt NOUN sb
ist AUX ROOT
ein DET nk
Startup NOUN pd
: PUNCT punct
Shenzhen NOUN sb
ist AUX cj
das DET nk
Silicon PROPN pnc
Valley PROPN sb
für ADP mnr
Hardware-Firmen NOUN nk


In [2]:
## Data loading

import numpy as np
import pandas as pd

standard_data = pd.read_csv('./data/derstandard_frontpage_data.csv')
standard_data

Unnamed: 0,title,subtitle,link,datetime,kicker,n_posts,storylabels
0,Liverpool nach großem Zittern im Ligacup-Halbf...,Minamino rettete die Reds in der Nachspielzeit...,https://www.derstandard.at/story/2000132122809...,2021-12-22T23:47,Premier League,7,
1,USA lassen Pfizers Covid-Tablette Paxlovid für...,Am Dienstag wurden in Österreich 2.269 Neuinfe...,https://www.derstandard.at/jetzt/livebericht/2...,2021-12-22T23:39,Omikron-Welle,22127,NachleseLivebericht
2,Real Madrid baut Vorsprung in LaLiga aus,Madrilenen nach 2:1 in Bilbao acht Punkte vor ...,https://www.derstandard.at/story/2000132122694...,2021-12-22T23:38,Primera Division,40,
3,Paris Saint-Germain wendet zweite Liga-Niederl...,Icardi besorgteAusgleichstreffergegen Grbic-Te...,https://www.derstandard.at/story/2000132122616...,2021-12-22T23:30,Fußball,3,
4,OSZE verkündet Einigung auf Waffenstillstand i...,"DieKonfliktparteiensollen zugestimmt haben, da...",https://www.derstandard.at/story/2000132122215...,2021-12-22T22:33,Weihnachtsfrieden,44,
...,...,...,...,...,...,...,...
90979,Jesus-Geburt unter Palmen,Auch der Koran erzählt über die Geburt von Jes...,https://www.derstandard.at/story/3000000200744...,2023-12-22T06:00,Wussten Sie schon?,1,
90980,"""Aquaman and the Lost Kingdom"" scheitert an ve...",Der Erfolg derSuperheldenfilmeleidet immer öft...,https://www.derstandard.at/story/3000000200724...,2023-12-22T06:00,Im Kino,242,
90981,One-Man-Show mit fatalen Folgen für die Demokr...,Die Wahlen in Serbien waren eine Farce. Die Eu...,https://www.derstandard.at/story/3000000200698...,2023-12-22T06:00,Vedran Džihić,113,Kommentar der anderen
90982,David Alaba zum zehnten Mal zu Österreichs Fuß...,Zehn von zwölf Trainern wählten den derzeit ve...,https://www.derstandard.at/story/3000000200745...,2023-12-22T05:46,Fußball,33,


## Part 1,2: Tokenizing + lemmatizing text columns

First, title and subtitile columns are split into individual words. Second, words are replaced by their standard forms (großem - groß, rettete - retten). Finally, the list of standard forms is joined back to a string.

In [3]:
# Initializing two columns
standard_data['title_tokens'] = standard_data['title']
standard_data['subtitle_tokens'] = standard_data['subtitle']

# Load model
nlp = spacy.load("de_core_news_sm")
nlp.get_pipe('lemmatizer')


<spacy.pipeline.edit_tree_lemmatizer.EditTreeLemmatizer at 0x13d6a7ba450>

In [4]:
# loops through rows

for index, row in standard_data.iterrows():
    
    text = nlp(row['title_tokens'].replace("-", " ").replace(",", " ").replace(": ", " "))
    token_list = ' '.join([token.lemma_ for token in text]) # list of standardized tokens joined to a string
    standard_data.at[index, 'title_tokens'] = token_list # store in the dataframe

    # try except because subtitle is sometimes empty, then it outputs error

    try:

        text = nlp(row['subtitle_tokens'])
        token_list = ' '.join([token.lemma_ for token in text]) # list of standardized tokens joined to a string

    except:
        token_list = '' # if empty, token list empty

    standard_data.at[index, 'subtitle_tokens'] = token_list # store in the dataframe



In [5]:
standard_data

Unnamed: 0,title,subtitle,link,datetime,kicker,n_posts,storylabels,title_tokens,subtitle_tokens
0,Liverpool nach großem Zittern im Ligacup-Halbf...,Minamino rettete die Reds in der Nachspielzeit...,https://www.derstandard.at/story/2000132122809...,2021-12-22T23:47,Premier League,7,,Liverpool nach groß Zittern in Ligacup Halbfinal,Minamino retten der Red in der Nachspielzeit i...
1,USA lassen Pfizers Covid-Tablette Paxlovid für...,Am Dienstag wurden in Österreich 2.269 Neuinfe...,https://www.derstandard.at/jetzt/livebericht/2...,2021-12-22T23:39,Omikron-Welle,22127,NachleseLivebericht,USA lassen Pfizer Covid Tablette Paxlovid für ...,an Dienstag werden in Österreich 2.269 Neuinfe...
2,Real Madrid baut Vorsprung in LaLiga aus,Madrilenen nach 2:1 in Bilbao acht Punkte vor ...,https://www.derstandard.at/story/2000132122694...,2021-12-22T23:38,Primera Division,40,,Real Madrid bauen Vorsprung in LaLiga aus,Madrilen nach 2:1 in Bilbao acht Punkt vor FC ...
3,Paris Saint-Germain wendet zweite Liga-Niederl...,Icardi besorgteAusgleichstreffergegen Grbic-Te...,https://www.derstandard.at/story/2000132122616...,2021-12-22T23:30,Fußball,3,,Paris Saint Germain wenden zweiter Liga Nieder...,Icardi besorgteAusgleichstreffergegen Grbic-Te...
4,OSZE verkündet Einigung auf Waffenstillstand i...,"DieKonfliktparteiensollen zugestimmt haben, da...",https://www.derstandard.at/story/2000132122215...,2021-12-22T22:33,Weihnachtsfrieden,44,,OSZE verkünden Einigung auf Waffenstillstand i...,DieKonfliktparteiensollen zustimmen haben -- d...
...,...,...,...,...,...,...,...,...,...
90979,Jesus-Geburt unter Palmen,Auch der Koran erzählt über die Geburt von Jes...,https://www.derstandard.at/story/3000000200744...,2023-12-22T06:00,Wussten Sie schon?,1,,Jesus Geburt unter Palme,auch der Koran erzählen über der Geburt von Je...
90980,"""Aquaman and the Lost Kingdom"" scheitert an ve...",Der Erfolg derSuperheldenfilmeleidet immer öft...,https://www.derstandard.at/story/3000000200724...,2023-12-22T06:00,Im Kino,242,,-- Aquaman and the Lost Kingdom -- scheitern a...,der Erfolg derSuperheldenfilmeleidet immer öft...
90981,One-Man-Show mit fatalen Folgen für die Demokr...,Die Wahlen in Serbien waren eine Farce. Die Eu...,https://www.derstandard.at/story/3000000200698...,2023-12-22T06:00,Vedran Džihić,113,Kommentar der anderen,One man Show mit fatal Folge für der Demokrati...,der Wahl in Serbien sein ein Farce -- der euro...
90982,David Alaba zum zehnten Mal zu Österreichs Fuß...,Zehn von zwölf Trainern wählten den derzeit ve...,https://www.derstandard.at/story/3000000200745...,2023-12-22T05:46,Fußball,33,,David Alaba zu zehnter Mal zu Österreich Fußba...,zehn von zwölf Trainer wählen der derzeit verl...


## Step 3: Embeddings

Two new columns are created. They contain lists of vectors. A vector represents embedding of a single token, a title then is converted to a list of such token. The numeric representation of text enables us standard training of a neural net with the vectors as inputs.

In [6]:
standard_data['title_vectors'] = standard_data['title_tokens']
standard_data['subtitle_vectors'] = standard_data['subtitle_tokens']
nlp.get_pipe("tok2vec")

for index, row in standard_data.iterrows():
    
    token_list = nlp(row['title_tokens'])
    vector_list = [token.vector for token in token_list]
    standard_data.at[index, 'title_vectors'] = vector_list 

    try:

        token_list = nlp(row['subtitle_tokens'])
        vector_list = [token.vector for token in token_list]

    except:
        token_list = []

    standard_data.at[index, 'subtitle_vectors'] = vector_list



In [7]:
standard_data.iloc[1,10]

[array([ 3.692384  , -0.79341817,  0.8014476 ,  0.21542087,  3.3309894 ,
        -1.7508197 ,  1.5662454 ,  4.1913977 ,  1.610764  ,  0.9857319 ,
         2.6465712 , -1.1758938 , -2.8921268 , -1.4700353 , -0.65892154,
         0.65152645, -2.3798072 , -1.7871587 , -1.6304971 ,  1.1759071 ,
         1.1745667 , -0.54721475, -1.747736  ,  1.7272619 , -0.68712485,
         2.0964904 ,  4.372034  , -1.7668109 ,  2.5359218 ,  0.7000592 ,
        -2.1182008 , -0.87628216,  0.8273686 ,  4.1580505 , -1.0567715 ,
        -4.0108004 , -1.9487038 , -2.7814364 ,  1.1276196 , -0.4488173 ,
        -1.7948387 ,  2.4206367 , -2.1504884 , -1.4431915 ,  3.3593378 ,
         1.4053435 , -0.2260904 ,  0.47888428, -1.6598024 , -0.23835683,
        -3.222202  , -0.57097244,  3.3159392 , -2.5651915 , -1.507254  ,
        -0.7389581 , -1.1597584 , -1.6395983 ,  0.4221772 , -3.281804  ,
        -0.6234259 , -0.8441899 ,  1.3783156 ,  2.5865333 , -3.5404754 ,
         1.8326724 ,  0.5941204 , -1.8874086 , -0.9

In [8]:
import numpy as np

standard_data_copy = standard_data

for index,row in standard_data_copy.iterrows():

    l = standard_data_copy.loc[index, 'title_vectors']
    standard_data_copy.at[index, 'title_vectors'] = np.sum(l, axis=0)

    l = standard_data_copy.loc[index, 'subtitle_vectors']
    standard_data_copy.at[index, 'subtitle_vectors'] = np.sum(l, axis=0)

In [9]:
X = np.vstack(standard_data_copy['title_vectors'].to_numpy())
X



array([[  4.561758  ,  -6.725279  ,  -2.3595262 , ...,   5.359077  ,
         -1.8784206 ,   8.5382805 ],
       [  5.2742243 , -11.190229  ,  -2.6303904 , ...,  12.713042  ,
         -9.304037  ,  -4.9678845 ],
       [ 12.325029  ,  -3.5658174 ,   2.7926927 , ...,  13.089906  ,
        -10.963812  ,  -1.7138405 ],
       ...,
       [ 18.424406  ,  -8.1488085 ,  -8.883829  , ...,   2.5849488 ,
        -15.968649  ,  -7.9356976 ],
       [ -2.038211  , -18.766294  ,   7.7005734 , ...,  17.543652  ,
        -13.355871  ,   6.8655033 ],
       [ 12.515891  ,   3.4930995 ,   0.40148246, ...,   8.258898  ,
          1.8255993 ,  -0.6187186 ]], dtype=float32)

In [10]:
y = (standard_data_copy['n_posts'].to_numpy() > 50).astype(int)
y = np.column_stack(standard_data_copy['n_posts'].to_numpy() > 50).astype(int).T

y.shape
#X.shape

(90984, 1)

In [11]:
from sklearn.model_selection import train_test_split


X_train, X_test, Y_train, Y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42069)


In [12]:
print("Baseline accuracy: ")
np.sum(Y_test)/np.size(Y_test)

Baseline accuracy: 


0.5777325932846074

In [13]:
def shapes(names,values):
    names = names.split(',')
    for i in range(len(names)): print(names[i]+':',values[i].shape)

def f(x):
    return 1. / (1. + np.exp(-x))

def err(a, y):
    return np.sqrt(np.sum((a - y)**2) / np.size(y))

def acc(a, y):
    a = np.round(a).astype(int)
    correct = (a == y)
    return np.sum(correct) / np.size(correct)


def nn2(X, y, l_init=0.001, epochs=500, h=24, rs=1):
    np.random.seed(rs)
    Wij = np.random.rand(h, X.shape[1]) - 0.5
    Wjk = np.random.rand(y.shape[1], h) - 0.5
    for ep in range(epochs):
        l = l_init / (int(ep / epochs * 5) + 1)
        zj = np.dot(Wij, X.T)
        aj = f(zj)
        zk = np.dot(Wjk, aj)
        ak = f(zk)
        dk = (ak - y.T) * ak * (1 - ak)
        dj = aj * (1 - aj) * np.dot(dk.T, Wjk).T
        Wij += np.dot(-l * dj, X)
        Wjk += np.dot(-l * dk, aj.T)
        if ep % (epochs/10) == 0: 
            print('ep: %3d  err: %9.3f  acc: %4.2f' % 
                  (ep, err(ak.T, y), acc(ak.T, y),))
            print(l)
    shapes('X,y,Wij,Wjk,aj,ak',(X,y,Wij,Wjk,aj,ak))
    return Wij, Wjk

Wij, Wjk = nn2(X_train, Y_train)


ep:   0  err:     0.515  acc: 0.55
0.001


  return 1. / (1. + np.exp(-x))


ep:  50  err:     0.477  acc: 0.62
0.001
ep: 100  err:     0.475  acc: 0.63
0.0005
ep: 150  err:     0.471  acc: 0.63
0.0005
ep: 200  err:     0.470  acc: 0.63
0.0003333333333333333
ep: 250  err:     0.469  acc: 0.63
0.0003333333333333333
ep: 300  err:     0.469  acc: 0.63
0.00025
ep: 350  err:     0.468  acc: 0.64
0.00025
ep: 400  err:     0.468  acc: 0.64
0.0002
ep: 450  err:     0.467  acc: 0.64
0.0002
X: (72787, 96)
y: (72787, 1)
Wij: (24, 96)
Wjk: (1, 24)
aj: (24, 72787)
ak: (1, 72787)


In [14]:
int(99/100)

0

In [15]:
def ffwd(X, y, Wij, Wjk):
    if y.ndim==1: y = np.reshape(np.ravel(y), (len(y),1))
    zj = np.dot(Wij, X.T)
    aj = f(zj)
    zk = np.dot(Wjk, aj)
    ak = f(zk)
    print(err(ak.T, y))
    return acc(ak.T, y)

#Wij, Wjk = nn2(X_train, Y_train)
ffwd(X_test, Y_test, Wij, Wjk)

#Wij, Wjk = nn2(Xtr, ytr, epochs=500, h=50)
#ffwd(Xte, yte, Wij, Wjk)


0.47605918586227197


  return 1. / (1. + np.exp(-x))


0.6248832225092048

In [16]:
np.sum(Y_test) / np.size(Y_test)

0.5777325932846074