## Import required packages

In [None]:
import pandas as pd
import numpy as np

# data manipulating:
import unicodedata
import re 
from datetime import datetime
#from langdetect import detect

# NLP packages:
import hu_core_ud_lg as hu
nlp2 = hu.load()
import spacy

# Tokenization:
from spacy.tokenizer import Tokenizer
from spacy.lang.hu import Hungarian
nlp1 = Hungarian()

# Faster than Spacy:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

## Import files

In [None]:
# Load the hand-made Hungarian stop-word list (stop_words.txt):
# one entry per line, empty lines are dropped.
with open("stop_words.txt", "r", encoding='utf-8') as stop_file:
    stop = [word for word in stop_file.read().split('\n') if word]

In [None]:
# import dataframe:
# The multi-character separator "%%" is interpreted as a regular expression, which
# only the Python parsing engine supports. Naming the engine explicitly avoids the
# ParserWarning pandas emits when it silently falls back from the C engine.
df_index = pd.read_csv("C:/Users/molna/Desktop/Projektek/Python oktatás/python_bevez_oktatas/data/2020.csv", sep="%%", encoding="utf-8", header=0, engine="python")
len(df_index)

## Corpus preprocessing

In [None]:
def data_cleaning(df):
    """
    Clean a raw Index.hu article dump and derive the columns used downstream.

    Steps, in order:
    - strip the literal double quotes from the header and from every cell
      (every column is converted to ``str`` on the way),
    - drop rows whose text is 'NA' or one/two blanks, whose title is 'NA',
      or whose tag contains 'Napirajz',
    - drop duplicated (title, text) pairs, keeping the last occurrence,
    - build ``merged`` = lead (``head``) + body (``szoveg``) and strip
      embedded chart scripts, non-breaking spaces and the Facebook teaser,
    - cut everything from 'Módosítva:' onwards off ``datum`` and store the
      calendar day in ``nap``.

    Language filtering (dropping English articles) is currently disabled.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. After quotes are stripped from the header it must contain
        the columns ``szoveg``, ``tag``, ``cim``, ``head`` and ``datum``.
        The header and the cell values of this object are modified in place.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame with the additional columns ``merged`` and ``nap``.
    """
    df.columns = df.columns.str.replace('"', '', regex=False)

    for column in df:
        df[column] = df[column].astype(str).str.replace('"', '', regex=False)

    # All columns are plain strings at this point, so the row filters can be
    # combined into a single boolean mask.
    keep = (
        (df['szoveg'] != 'NA')
        & (df['szoveg'] != ' ')
        & (df['szoveg'] != '  ')
        & ~df['tag'].str.contains('Napirajz')
        & (df['cim'] != 'NA')
    )
    df = df[keep]

    # .copy() makes the result an independent frame, so the column
    # assignments below do not raise SettingWithCopyWarning.
    df = df[~df.duplicated(['cim', 'szoveg'], keep='last')].copy()

    # Blank out only a missing lead ('NA' as the whole cell). Removing the
    # substring 'NA' from the merged text would also corrupt real words
    # such as "NATO" or "NASA".
    lead = df['head'].mask(df['head'] == 'NA', '')
    df['merged'] = lead + df['szoveg']

    # NOTE(review): '.*' is greedy and DOTALL lets it span lines, so everything
    # from the first 'Common.charts.register' up to the LAST ';' of the article
    # is removed - confirm that this is the intended extent.
    chart_script = re.compile('Common.charts.register.*;', flags=re.DOTALL)
    df['merged'] = df['merged'].apply(lambda text: chart_script.sub('', text))

    df['merged'] = df['merged'].str.replace('\xa0', ' ', regex=False)
    df['merged'] = df['merged'].str.replace('Ne maradjon le semmiről! Facebook', '', regex=False)

    # Disabled: langdetect-based language filter (detect() per article, keep 'hu').

    # 'Módosítva:' = "Modified:"; keep only the part before it.
    df['datum'] = df['datum'].str.split('Módosítva:').str[0]
    df['nap'] = pd.to_datetime(df['datum']).dt.date

    return df

In [None]:
def title_cleaning(df, stop_words=None):
    """
    Normalise the title column (``cim``) into ``cim_cleaned`` for NLP.

    Hyphens and digits become spaces, every other punctuation character is
    deleted (not replaced by a space), the text is lower-cased, non-breaking
    spaces are turned into plain spaces and stop words are removed. Words
    in the result are separated by single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``cim``; ``cim_cleaned`` is added in place.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the module-level ``stop`` list.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the ``cim_cleaned`` column added.
    """
    if stop_words is None:
        stop_words = stop
    # Set membership is O(1); the stop list is consulted once per word.
    stop_set = set(stop_words)

    titles = df['cim'].str.replace('-', ' ', regex=False)
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would leave punctuation and digits untouched.
    titles = titles.str.replace(r'[^\w\s]', '', regex=True)
    titles = titles.str.replace(r'[0-9]', ' ', regex=True)

    titles = titles.str.lower()
    titles = titles.str.replace('\xa0', ' ', regex=False)

    df['cim_cleaned'] = titles.apply(
        lambda title: " ".join(word for word in title.split() if word not in stop_set)
    )

    return df

In [None]:
def merged_cleaning(df, stop_words=None):
    """
    Normalise the article text (``merged``) into ``szoveg_cleaned`` for NLP.

    Hyphens, punctuation and digits are replaced by spaces, the text is
    lower-cased, non-breaking spaces are turned into plain spaces and stop
    words are removed. Words in the result are separated by single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``merged``; ``szoveg_cleaned`` is added
        in place.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the module-level ``stop`` list.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the ``szoveg_cleaned`` column added.
    """
    if stop_words is None:
        stop_words = stop
    # Set membership is O(1); the stop list is consulted once per word of
    # every article.
    stop_set = set(stop_words)

    text = df['merged'].str.replace('-', ' ', regex=False)
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would leave punctuation and digits untouched.
    text = text.str.replace(r'[^\w\s]', ' ', regex=True)
    text = text.str.replace(r'[0-9]', ' ', regex=True)

    text = text.str.lower()
    text = text.str.replace('\xa0', ' ', regex=False)

    df['szoveg_cleaned'] = text.apply(
        lambda article: " ".join(word for word in article.split() if word not in stop_set)
    )

    return df

In [None]:
'''
def desc_stat(df):
    """
    #Calculating basic statistics for title and article, after cleaning.
    """

    # descriptive statistics for the titles:
    df['cim_word_cnt_wth_stop'] = df['cim_without_punct'].apply(lambda x: len(str(x).split(' ')))
    df['cim_word_cnt_wthout_stop'] = df['cim_without_stop'].apply(lambda x: len(str(x).split(' ')))
    df['cim_stop_cnt'] = df['cim_without_punct'].apply(lambda x: len([x for x in x.split() if x in stop]))

    # descriptive statistics for the articles:
    df['merged_word_cnt_wth_stop'] = df['merged_without_punct'].apply(lambda x: len(str(x).split(' ')))
    df['merged_word_cnt_wth_stop'] = df['merged_without_stop'].apply(lambda x: len(str(x).split(' ')))
    df['merged_stop_cnt'] = df['merged_without_punct'].apply(lambda x: len([x for x in x.split() if x in stop]))

    return df
'''

In [None]:
# Run the cleaning stages in order; each stage takes the frame and returns it.
for cleaning_step in (data_cleaning, title_cleaning, merged_cleaning):
    df_index = cleaning_step(df_index)
#df_index = desc_stat(df_index)

print(df_index.columns)
print(len(df_index))

In [None]:
df_index.head(2)

## Natural Language Processing - Hungarian

### Tokenization

In [None]:
# With Spacy:
'''
def tokenizer(df):
    tokenizer = Tokenizer(nlp1.vocab)
    df['cim_tokens'] = [tokenizer (i) for i in df['cim_without_stop']]
    df['merged_tokens'] = [tokenizer (i) for i in df['merged_without_stop']]

    return df 

df_index = tokenizer(df_index)
'''

In [None]:
# With NLTK: split every cleaned article into tokens with the TweetTokenizer.
df_index["merged_tokens"] = df_index["szoveg_cleaned"].apply(tknzr.tokenize)

### Lemmatization

In [None]:
'''
def lemmatizer(df):
    """
    #Lemmatize the article's tokens.
    """
    lemmas = []
    for j in range(0, len(df['merged_tokens'])):
        lemma = [i.lemma_ for i in nlp2(str(df['merged_tokens'][j]))]
        lemma = str(lemma)
        lemma = re.sub("'", '', lemma)
        lemma = re.sub("\[|\]"," ", lemma)
        #a = re.sub(' ,', '', a)
        #a = re.sub(' ', '', a)
        lemmas.append(lemma)

    df['merged_lemmas'] = lemmas
    df['merged_lemmas'] = df['merged_lemmas'].apply(lambda x: x.split(","))

    return df
    
df_index = lemmatizer(df_index) 
'''

In [None]:
def _lemmatize(text):
    """Return the lemma of every token the Hungarian pipeline finds in *text*."""
    return [token.lemma_ for token in nlp2(text)]


df_index['merged_lemmas'] = df_index['szoveg_cleaned'].map(_lemmatize)

In [None]:
df_index[['merged_tokens','merged_lemmas']][0:4]

### Named Entity Recognition

In [None]:
'''
def ner_rec(df):
        """
        Named Entity Recognition on the article's tokens.
        """
        ners = []
        for j in range(0, len(df['merged_tokens'])):
            ner = [i.pos_ for i in nlp2(str(df['merged_tokens'][j]))]
            ner = str(ner)
            ner = re.sub("'", '', ner)
            ner = re.sub("\[|\]"," ", ner)
            #a = re.sub('  ,', '', a)
            ners.append(ner)

        df['merged_ners'] = ners
        df['merged_ners'] = df['merged_ners'].apply(lambda x: x.split(","))

    return df

df_index = ner_rec(df_index)
'''

In [None]:
# NOTE(review): despite the 'merged_ners' column name, this stores the
# part-of-speech tag of every token (token.pos_), not named entities -
# confirm which of the two is intended.
df_index['merged_ners'] = df_index['szoveg_cleaned'].apply(lambda x: [i.pos_ for i in nlp2(x)])

In [None]:
df_index[['merged_tokens','merged_lemmas', 'merged_ners']][0:2]

## Export dataframe to pickle

In [None]:
# Save the processed frame as a pickle file in the working directory.
df_index.to_pickle('index_2020_nlp.pkl')

In [None]:
# Load a previously pickled frame and inspect its size and columns.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# you created yourself or otherwise trust.
df2 = pd.read_pickle('2016_index.pkl')
print(len(df2))
df2.columns

In [None]:
df2['cim']

## Import pickle dataframe

In [None]:
import gc
import pandas as pd
import numpy as np

In [None]:

gc.collect()

In [None]:
# Load the pickled sample frame.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
df = pd.read_pickle('minta_nlp.pkl')

In [None]:
df.columns

## Word embeddings (Word2Vec)

In [None]:
from gensim.models import Word2Vec
# https://towardsdatascience.com/word2vec-skip-gram-model-part-1-intuition-78614e4d6e0b

# https://towardsdatascience.com/an-implementation-guide-to-word2vec-using-numpy-and-google-sheets-13445eebd281

In [None]:
# Word-focused model: sg=0 selects the CBOW training algorithm in gensim.
CBOW_model = Word2Vec(df_index["merged_lemmas"], min_count=5, workers=3, window=9, sg=0)

In [None]:
# Words closest to the two query words together.
# Queried via .wv: most_similar on the Word2Vec object itself is deprecated
# and was removed in gensim 4.
CBOW_model.wv.most_similar(positive= ['koronavírus', 'covid'], negative=[], topn=10, restrict_vocab=None, indexer=None)

In [None]:
# Save the trained CBOW model to disk.
CBOW_model.save("word2vec.model")


In [None]:
# Skip-gram variant (sg=1 in gensim), trained on the lemma lists stored in `df`.
skipgram_model = Word2Vec(df["merged_lemmas"], min_count=5, workers=3, window=9, sg=1)

In [None]:
# 100 nearest words to ' politika' ("politics").
# Queried via .wv: most_similar on the Word2Vec object itself is deprecated
# and was removed in gensim 4.
skipgram_model.wv.most_similar(positive=[' politika'], negative=[], topn=100, restrict_vocab=None, indexer=None)

In [None]:
# Seed list of sport-related query words ("kell" = "needed").
# Every entry starts with a space - presumably to match how the lemmas in the
# pickled frame were stored; verify against the model vocabulary.
kell = [' szabadidősport',
        ' versenysport',
        ' utánpótlásprogram',
        ' mob',
        ' versenyrendszer',
        ' sportszövetség',
        ' foci',
        ' sportirányítás',
        ' sportágfejlesztési',
        ' sikersportág',
        ' bozsikprogram',
        ' futball',
        ' utánpótlásnevelés',
        ' utánpótlásbázis',
        ' ökölvívás',
        ' utánpótlásképzés',
        ' parasport',
        ' sportállamtitkárság',
        ' mobfőtitkár',
        ' jégkorongszövetség',
        ' utánpótlásnevelő',
        ' csapatsportág',
        ' látványsportág',
        ' héraklészprogram',
        ' labdarúgás',
        ' kajakkenu',
        ' utánpótlás',
        ' mefs',
        ' dzsúdó',
        ' társaságiadókedvezmény',
        ' látványcsapatsport',
        ' birkózás',
        ' öttusa',
        ' labdarúgószövetség',
        ' látványsport',
        ' küzdősport']

In [None]:
# 100 nearest words to the whole seed list taken together.
# Queried via .wv: most_similar on the Word2Vec object itself is deprecated
# and was removed in gensim 4.
CBOW_model.wv.most_similar(positive= kell, negative=[], topn=100, restrict_vocab=None, indexer=None)

In [None]:
def environment(mire, mennyi_kozel, mennyi_tavol, model=None):
    """
    Collect the second-order neighbourhood of a word (or list of words).

    The ``mennyi_kozel`` most similar words to ``mire`` are looked up first;
    then, for each of them, its own ``mennyi_tavol`` most similar words are
    collected.

    Parameters
    ----------
    mire : str or list of str
        Query word(s), passed as ``positive`` to ``most_similar``.
    mennyi_kozel : int
        Number of first-order neighbours to expand.
    mennyi_tavol : int
        Number of neighbours to collect for each first-order neighbour.
    model : optional
        A Word2Vec model or its keyed vectors. Defaults to the module-level
        ``skipgram_model``.

    Returns
    -------
    list of list of str
        One inner list per first-order neighbour, in similarity order.
        Empty when ``mennyi_kozel`` is smaller than 1.
    """
    if model is None:
        model = skipgram_model
    # Accept a full Word2Vec model (vectors live under ``.wv``) as well as
    # bare keyed vectors; most_similar on the model object itself is
    # deprecated and removed in gensim 4.
    vectors = getattr(model, 'wv', model)

    if mennyi_kozel < 1:
        return []

    # The first-order neighbours do not change between iterations, so they
    # are queried once up front.
    nearest = vectors.most_similar(positive=mire, topn=mennyi_kozel)

    neighbours_neighbours = []
    for neighbour, _similarity in nearest:
        second_order = vectors.most_similar(positive=[neighbour], topn=mennyi_tavol)
        # Append inside the loop so every neighbour contributes its list,
        # not just the last one.
        neighbours_neighbours.append([word for word, _score in second_order])
    return neighbours_neighbours

In [None]:
# np.unique(environment(' sport', 10, 10))

In [None]:
# Words least similar to ' korrupció' ("corruption"): it is passed as the negative term.
# Queried via .wv: most_similar on the Word2Vec object itself is deprecated
# and was removed in gensim 4.
skipgram_model.wv.most_similar(positive=[], negative=[' korrupció'], topn=10, restrict_vocab=None, indexer=None)

In [None]:
# 10 nearest words to ' korrupció' ("corruption").
# Queried via .wv: most_similar on the Word2Vec object itself is deprecated
# and was removed in gensim 4.
skipgram_model.wv.most_similar(positive=[' korrupció'], negative=[], topn=10, restrict_vocab=None, indexer=None)

In [None]:
# 20 nearest words to ' sport' and ' tao' taken together.
# Queried via .wv: most_similar on the Word2Vec object itself is deprecated
# and was removed in gensim 4.
CBOW_model.wv.most_similar(positive=[' sport', ' tao'], negative=[], topn=20, restrict_vocab=None, indexer=None)

In [None]:
# 20 nearest words to ' tao'.
# Queried via .wv: most_similar on the Word2Vec object itself is deprecated
# and was removed in gensim 4.
CBOW_model.wv.most_similar(positive=[' tao'], negative=[], topn=20, restrict_vocab=None, indexer=None)

In [None]:
# Analogy-style query: ' sport' + ' politika' - ' szabadidő'.
# Queried via .wv: most_similar on the Word2Vec object itself is deprecated
# and was removed in gensim 4.
CBOW_model.wv.most_similar(positive=[' sport', ' politika'], negative=[' szabadidő'], topn=20, restrict_vocab=None, indexer=None)

In [None]:
# 20 nearest words to ' origo'.
# Queried via .wv: most_similar on the Word2Vec object itself is deprecated
# and was removed in gensim 4.
CBOW_model.wv.most_similar(positive=[' origo'], negative=[], topn=20, restrict_vocab=None, indexer=None)

In [None]:
# Keep the (word, similarity) pairs for ' origo' for further processing.
# Queried via .wv: most_similar on the Word2Vec object itself is deprecated
# and was removed in gensim 4.
a = CBOW_model.wv.most_similar(positive=[' origo'], negative=[], topn=20, restrict_vocab=None, indexer=None)

In [None]:
# Word list ("szolista"): keep only the first element of every entry in `a`.
szolista = [entry[0] for entry in a]

In [None]:
szolista

In [None]:
# Same words with every space character removed.
lista = [word.replace(' ', '') for word in szolista]

In [None]:
# Filter with str.contains(): keep the rows whose stringified lemma list
# contains any word of `lista` as a whole word.
# re.escape stops regex metacharacters inside a word from being read as
# pattern syntax; regex=True states the matching mode explicitly.
# NOTE(review): other cells use the column 'merged_lemmas' - confirm that
# 'lemmas' exists in this frame.
df[df['lemmas'].astype(str).str.contains(r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in lista)), regex=True)]