## Import required packages

In [1]:
import pandas as pd
import numpy as np

# data manipulating:
import unicodedata
import re 
from datetime import datetime
from langdetect import detect

# NLP packages:
import hu_core_ud_lg as hu
nlp2 = hu.load()
import spacy

# Tokenization:
from spacy.tokenizer import Tokenizer
from spacy.lang.hu import Hungarian
nlp1 = Hungarian()

# Faster than Spacy:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

In [42]:
import warnings

# Suppress every warning category for the remainder of the session.
warnings.simplefilter('ignore')

## Import files

In [2]:
# Load the custom Hungarian stop-word list (one entry per line) and drop empty entries.
with open("stop_words.txt", "r", encoding='utf-8') as f:
    stop = [entry for entry in f.read().split('\n') if entry]

In [57]:
# Load the raw article dump.
# The multi-character separator "%%" is only supported by pandas' Python parsing
# engine; requesting it explicitly avoids the silent C-engine fallback (and its
# ParserWarning) and makes the choice visible to the reader.
df_origo = pd.read_csv(
    "origo2015.csv",
    sep="%%",
    encoding="utf-8",
    header=0,
    engine="python",
)
len(df_origo)

38968

## Corpus preprocessing


In [25]:
def data_cleaning(df):
    """
    Clean a raw Origo.hu article dataframe and return the result as a NEW frame.

    Steps, in order:
    - strip double quotes from the column names,
    - cut the trailing "TOVÁBBI CIKKEK A ROVATBÓL ..." block from `szoveg`,
    - cast every column to str and strip double quotes from the values,
    - replace non-breaking spaces in `szoveg` with regular spaces,
    - drop duplicated articles (same `cim` and `szoveg`), keeping the last one,
    - drop rows whose `cim` is the literal string 'NA',
    - add a `nap` column holding the date part of `datum`.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame whose (possibly quoted) column names include `cim`, `szoveg`
        and `datum`. The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with the additional `nap` column.
    """
    # Work on a copy so the caller's raw frame is not modified and the function
    # can be re-run safely on the same input.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.replace('"', '', regex=False)

    # Cast to str before applying the regex so missing values cannot raise a
    # TypeError. DOTALL lets '.*' run across line breaks; without it the footer
    # is not removed at all whenever a newline follows the marker.
    cleaned['szoveg'] = cleaned['szoveg'].astype(str).str.replace(
        r'TOVÁBBI CIKKEK A ROVATBÓL.*$', '', regex=True, flags=re.DOTALL
    )

    for column in cleaned:
        cleaned[column] = cleaned[column].astype(str).str.replace('"', '', regex=False)

    cleaned['szoveg'] = cleaned['szoveg'].str.replace('\xa0', ' ', regex=False)

    # Both conditions are evaluated on the full frame, exactly as when the two
    # filters are applied one after the other.
    is_last_occurrence = ~cleaned.duplicated(['cim', 'szoveg'], keep='last')
    has_title = cleaned['cim'] != 'NA'
    # .copy() detaches the result from `cleaned`, so the column assignment below
    # does not write into a slice (SettingWithCopyWarning).
    cleaned = cleaned[is_last_occurrence & has_title].copy()

    # NOTE: a disabled sketch of language filtering (langdetect `detect` on each
    # `szoveg`, keeping only 'hu' rows) used to live here as a bare string; it was
    # never executed and never stored its result on the frame.

    cleaned['nap'] = pd.to_datetime(cleaned['datum']).dt.date

    return cleaned

In [6]:
def title_cleaning(df, stop_words=None):
    """
    Prepare the title column (`cim`) for NLP and store it as `cim_cleaned`.

    Steps, in order:
    - replace hyphens with spaces,
    - remove punctuation (anything that is neither a word character nor whitespace),
    - replace digits with spaces,
    - lowercase,
    - replace non-breaking spaces with regular spaces,
    - remove stop words and collapse whitespace to single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column `cim`. The `cim_cleaned` column is added to
        this frame in place.
    stop_words : iterable of str, optional
        Words to drop. Defaults to the notebook-level `stop` list.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the `cim_cleaned` column added.
    """
    # A set gives constant-time membership checks in the per-word filter below.
    stop_set = set(stop if stop_words is None else stop_words)

    # Patterns are compiled with Python's `re`, where \w and \s are Unicode-aware,
    # so accented Hungarian letters count as word characters. regex=True is
    # passed explicitly because the pandas default for `regex` differs between
    # versions; relying on it can turn these into literal (no-op) replacements.
    punctuation = re.compile(r'[^\w\s]')
    digits = re.compile(r'[0-9]')

    def drop_stop_words(text):
        # Leave non-string values (e.g. missing data) untouched instead of crashing.
        if not isinstance(text, str):
            return text
        return " ".join(word for word in text.split() if word not in stop_set)

    titles = (
        df['cim']
        .str.replace('-', ' ', regex=False)
        .str.replace(punctuation, '', regex=True)
        .str.replace(digits, ' ', regex=True)
        .str.lower()
        .str.replace('\xa0', ' ', regex=False)
    )
    df['cim_cleaned'] = titles.apply(drop_stop_words)

    return df

In [7]:
def article_cleaning(df, stop_words=None):
    """
    Prepare the article text column (`szoveg`) for NLP and store it as `szoveg_cleaned`.

    Steps, in order:
    - replace hyphens with spaces,
    - remove punctuation (anything that is neither a word character nor whitespace),
    - replace digits with spaces,
    - lowercase,
    - replace non-breaking spaces with regular spaces,
    - remove stop words and collapse whitespace to single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column `szoveg`. The `szoveg_cleaned` column is
        added to this frame in place.
    stop_words : iterable of str, optional
        Words to drop. Defaults to the notebook-level `stop` list.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the `szoveg_cleaned` column added.
    """
    # A set gives constant-time membership checks in the per-word filter below.
    stop_set = set(stop if stop_words is None else stop_words)

    # Patterns are compiled with Python's `re`, where \w and \s are Unicode-aware,
    # so accented Hungarian letters count as word characters. regex=True is
    # passed explicitly because the pandas default for `regex` differs between
    # versions; relying on it can turn these into literal (no-op) replacements.
    punctuation = re.compile(r'[^\w\s]')
    digits = re.compile(r'[0-9]')

    def drop_stop_words(text):
        # Leave non-string values (e.g. missing data) untouched instead of crashing.
        if not isinstance(text, str):
            return text
        return " ".join(word for word in text.split() if word not in stop_set)

    articles = (
        df['szoveg']
        .str.replace('-', ' ', regex=False)
        .str.replace(punctuation, '', regex=True)
        .str.replace(digits, ' ', regex=True)
        .str.lower()
        .str.replace('\xa0', ' ', regex=False)
    )
    df['szoveg_cleaned'] = articles.apply(drop_stop_words)

    return df

In [8]:
# NOTE(review): the triple-quoted string below is disabled code; nothing in it is
# executed and `desc_stat` is never defined. Before re-enabling it, note that:
#   - it reads columns named `*_without_punct` / `*_without_stop`, while the
#     cleaning functions in this notebook create `cim_cleaned` / `szoveg_cleaned`;
#   - the two article word-count lines both assign to `szoveg_word_cnt_wth_stop`,
#     so the second assignment would overwrite the first.
'''
def desc_stat(df):
    """
    Calculating basic statistics for title and article, after cleaning.
    """
    # descriptive statistics for the titles:
    df['cim_word_cnt_wth_stop'] = df['cim_without_punct'].apply(lambda x: len(str(x).split(' ')))
    df['cim_word_cnt_wthout_stop'] = df['cim_without_stop'].apply(lambda x: len(str(x).split(' ')))
    df['cim_stop_cnt'] = df['cim_without_punct'].apply(lambda x: len([x for x in x.split() if x in stop]))
    
    # descriptive statistics for the articles:
    df['szoveg_word_cnt_wth_stop'] = df['szoveg_without_punct'].apply(lambda x: len(str(x).split(' ')))
    df['szoveg_word_cnt_wth_stop'] = df['szoveg_without_stop'].apply(lambda x: len(str(x).split(' ')))
    df['szoveg_stop_cnt'] = df['szoveg_without_punct'].apply(lambda x: len([x for x in x.split() if x in stop]))
    
    return df
'''

In [59]:
# Run the cleaning stages in order: base cleaning, article text, then titles.
df_origo = (
    df_origo
    .pipe(data_cleaning)
    .pipe(article_cleaning)
    .pipe(title_cleaning)
)

print(df_origo.columns)
print(len(df_origo))

Index(['Id', 'cim', 'szerzo', 'szoveg', 'datum', 'url', 'nap',
       'szoveg_cleaned', 'cim_cleaned'],
      dtype='object')
35751


In [60]:
# Preview the parsed publication dates.
# Show a bounded sample instead of raising the global `display.max_rows` option
# and dumping the whole column, which bloats the notebook and changes how every
# later cell renders.
df_origo['nap'].head(50)

"1"        2015-01-01
"2"        2015-01-01
"3"        2015-01-02
"5"        2015-04-04
"6"        2015-04-04
"7"        2015-04-04
"8"        2015-04-04
"9"        2015-04-04
"10"       2015-04-04
"11"       2015-04-04
"12"       2015-04-04
"13"       2015-04-04
"14"       2015-04-04
"15"       2015-01-11
"17"       2015-04-04
"18"       2015-04-04
"19"       2015-04-04
"20"       2015-04-04
"21"       2015-04-04
"22"       2015-04-04
"23"       2015-04-04
"24"       2015-04-04
"25"       2015-04-04
"26"       2015-04-04
"27"       2015-01-11
"28"       2015-04-04
"29"       2015-04-04
"30"       2015-04-04
"31"       2015-04-04
"32"       2015-04-04
"33"       2015-04-04
"34"       2015-04-04
"35"       2015-04-04
"36"       2015-04-04
"37"       2015-04-04
"38"       2015-01-11
"39"       2015-04-04
"40"       2015-04-04
"41"       2015-04-04
"42"       2015-04-04
"44"       2015-04-04
"45"       2015-04-04
"46"       2015-04-04
"48"       2015-04-04
"49"       2015-04-04
"50"      

## Natural Language Processing - Hungarian

### Tokenization

In [61]:
# Tokenize the cleaned article text with the NLTK tokenizer instance `tknzr`.
df_origo["merged_tokens"] = df_origo["szoveg_cleaned"].map(tknzr.tokenize)

### Lemmatization

In [62]:
# Run each cleaned article through the `nlp2` pipeline and keep the lemma of every token.
df_origo['merged_lemmas'] = df_origo['szoveg_cleaned'].map(
    lambda text: [token.lemma_ for token in nlp2(text)]
)

In [63]:
# Compare tokens and lemmas side by side for rows 1-3 (positional).
preview_columns = ['merged_tokens', 'merged_lemmas']
df_origo.loc[:, preview_columns].iloc[1:4]

Unnamed: 0,merged_tokens,merged_lemmas
"""2""","[továbbra, fizetni, m, os, út, használatáért, ...","[továbbra, fizet, m, os, út, használat, autós,..."
"""3""","[illegális, bevándorlót, hagyott, sorsára, emb...","[illegális, bevándorló, hagy, sors, embercsemp..."
"""5""","[mínusz, fok, szombat, hajnalban, nógrád, megy...","[mínusz, fok, szombat, hajnal, nógra, megyei, ..."


### Named Entity Recognition


In [1]:
# NOTE(review): the triple-quoted string below is disabled code; the cell only
# evaluates to the string itself. Before re-enabling it, note that:
#   - it collects `i.pos_` (part-of-speech tags), not named entities;
#   - `return df` is indented less than the rest of the function body, so the
#     snippet would not parse as written;
#   - it is called on `df_index`, which is not defined in the visible cells.
'''
def ner_rec(df):
        """
        Named Entity Recognition on the article's tokens.
        """
        ners = []
        for j in range(0, len(df['merged_tokens'])):
            ner = [i.pos_ for i in nlp2(str(df['merged_tokens'][j]))]
            ner = str(ner)
            ner = re.sub("'", '', ner)
            ner = re.sub("\[|\]"," ", ner)
            #a = re.sub('  ,', '', a)
            ners.append(ner)

        df['merged_ners'] = ners
        df['merged_ners'] = df['merged_ners'].apply(lambda x: x.split(","))

    return df

df_index = ner_rec(df_index)
'''

'\ndef ner_rec(df):\n        """\n        Named Entity Recognition on the article\'s tokens.\n        """\n        ners = []\n        for j in range(0, len(df[\'merged_tokens\'])):\n            ner = [i.pos_ for i in nlp2(str(df[\'merged_tokens\'][j]))]\n            ner = str(ner)\n            ner = re.sub("\'", \'\', ner)\n            ner = re.sub("\\[|\\]"," ", ner)\n            #a = re.sub(\'  ,\', \'\', a)\n            ners.append(ner)\n\n        df[\'merged_ners\'] = ners\n        df[\'merged_ners\'] = df[\'merged_ners\'].apply(lambda x: x.split(","))\n\n    return df\n\ndf_index = ner_rec(df_index)\n'

In [None]:
#df_origo['merged_ners'] = df_origo['szoveg_cleaned'].apply(lambda x: [i.pos_ for i in nlp2(x)])

In [None]:
#df_origo[['merged_tokens','merged_lemmas', 'merged_ners']][0:2]

## Export preprocessed dataframe to pickle

In [64]:
# Persist the preprocessed frame (tokens and lemmas included) for later notebooks.
pd.to_pickle(df_origo, '2015_origo.pkl')

In [70]:
# Read back the file written by the export cell above to confirm it round-trips.
# The original read '2020_origo.pkl', a file this notebook never creates, so the
# check did not verify the export and fails on a fresh run unless that other
# file happens to exist.
df = pd.read_pickle('2015_origo.pkl')
df.columns

Index(['Id', 'cim', 'szerzo', 'szoveg', 'datum', 'url', 'nap',
       'szoveg_cleaned', 'cim_cleaned', 'merged_tokens', 'merged_lemmas'],
      dtype='object')