### Importing libraries

In [1]:
import majka
import random
import unicodedata
import pandas as pd
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from nltk.probability import FreqDist
from gensim.models import Word2Vec

In [2]:
# Specifying Majka library for lemmatization
lemmatize = majka.Majka('./majka.w-lt')
lemmatize.tags = False # Shows only lemma form of word, no tags
lemmatize.first_only = True # Shows only one example of lemma form

### Defining functions

In [3]:
def list_of_lists(df):
    """ 
    Takes each row (=sentence) from dataframe
    and adds it to a list.
    Each list is then seperated into words.
    """
    df_values = df.values.tolist()
    lists = []
    for index in range(df.size):
        lists.append(df_values[index][0].split(',')) 
    return lists


def no_diacritics(sentences):
    """
    Removes diacritics from every word.
    """
    return "".join(word for word in unicodedata.normalize("NFD", sentences) 
                   if not unicodedata.combining(word))


def frequency(dataset):
    """
    Using FreqDist() from NLTK library to find out
    how often each words appears in dataset.
    """
    freq = FreqDist()
    for i in range(len(dataset)):
        for j in range(len(dataset[i])):
            freq[dataset[i][j]] += 1
    return(freq)


def no_freq_words(dataset):
    """
    Removing most frequent words, which
    do not have any particular meaning,
    from dataset.
    """
    for word in most_freq:
        for index in range(len(dataset)):
            while word in dataset[index]:
                dataset[index].remove(word)

        
def preprocessing(word):
    """
    Removing diacritics,
    lemmatizing and lowercasing word.
    """
    word=word.upper()
    if len(lemmatize.find(word))>0:
        word=no_diacritics(lemmatize.find(word)[0]['lemma'].lower())
    else:
        word=no_diacritics(word.lower())
    return word


def r_len(dataset):
    """
    Creating function range(len())
    to make code shorter.
    """
    return range(len(dataset))

## Data preprocessing

In [4]:
# Loading the dataset
df = pd.read_csv('psp_records_list.csv')
df.tail()

Unnamed: 0,Line
19816,"pan,poslanec,mikuláš,ferjenčík,faktickou,pozná..."
19817,"já,se,také,pokusím,vystoupit,naposledy,každopá..."
19818,"pan,předseda,kalousek"
19819,"rámci,konsenzuální,diskuse,navrhuji,kompromis,..."
19820,"tuto,chvíli,nemám,nikoho,přihlášeného,do,obecn..."


In [5]:
# Creating list of lists made of rows from df
vocabulary = list_of_lists(df)

In [6]:
# Example of vocabulary
vocabulary[:2]

[['já',
  'vám',
  'děkuji',
  'dále',
  'se',
  'hlásil',
  'pan',
  'předseda',
  'kalousek'],
 ['děkuji',
  'dámy',
  'pánové',
  'promiňte',
  'pokládám',
  'za',
  'vhodné',
  'aby',
  'poslanecké',
  'sněmovně',
  'nahlas',
  'zazněla',
  'informace',
  'že',
  'ústavní',
  'soud',
  'vyhlásil',
  'zrušení',
  'zákona',
  'zdanění',
  'restitucí',
  'ústava',
  'tedy',
  'zůstala',
  'tomto',
  'státě',
  'ochráněna',
  'právní',
  'stav',
  've',
  'věci',
  'vyrovnání',
  'státu',
  'církvemi',
  'zůstává',
  'stejný',
  'tak',
  'jak',
  'byl',
  'nastaven',
  'od',
  'vám',
  'milí',
  'kolegové',
  'kteří',
  'jste',
  'se',
  'na',
  'pokusu',
  'tuto',
  'komunistickou',
  'krádež',
  'podíleli',
  'zůstala',
  'jenom',
  'věčná',
  'ostuda',
  'dlouhý',
  'potlesk',
  'vpravo']]

In [7]:
# Creating list of lists containing words in their lemma form.
# As input for lemmatize.find using 
# uppercase words because Majka library works better then.
lemma_vocab = vocabulary.copy()

for i in r_len(lemma_vocab):
    for j in r_len(lemma_vocab[i]):
        if len((lemmatize.find(lemma_vocab[i][j].upper())))>0:
            lemma_vocab [i][j]=lemmatize.find(lemma_vocab[i][j].upper())[0]['lemma']

In [8]:
# Lemma_vocab example
lemma_vocab[:2]

[['já',
  'vy',
  'děkovat',
  'dále',
  'se',
  'hlásit',
  'Pan',
  'předseda',
  'Kalousek'],
 ['děkovat',
  'dáma',
  'pán',
  'prominout',
  'pokládat',
  'za',
  'vhodný',
  'aby',
  'poslanecký',
  'sněmovna',
  'nahlas',
  'zaznět',
  'informace',
  'že',
  'ústavní',
  'soud',
  'vyhlásit',
  'zrušení',
  'zákon',
  'zdanění',
  'restituce',
  'ústava',
  'Ted',
  'zůstat',
  'tento',
  'stát',
  'ochránit',
  'právní',
  'stav',
  'v',
  'věc',
  'vyrovnání',
  'stát',
  'církev',
  'zůstávat',
  'stejný',
  'tak',
  'jak',
  'být',
  'nastaven',
  'od',
  'vy',
  'milí',
  'kolega',
  'který',
  'být',
  'se',
  'na',
  'pokus',
  'tento',
  'komunistický',
  'krádež',
  'podílet',
  'zůstat',
  'jenom',
  'věčné',
  'ostuda',
  'Dlouhý',
  'potlesk',
  'vpravo']]

In [9]:
# Removing diacritics and 
# lowercasing all words
for i in r_len(lemma_vocab):
    for j in r_len(lemma_vocab[i]):
        lemma_vocab[i][j] = no_diacritics(lemma_vocab[i][j].lower())

In [10]:
# Lemma_vocab example
lemma_vocab[:2]

[['ja',
  'vy',
  'dekovat',
  'dale',
  'se',
  'hlasit',
  'pan',
  'predseda',
  'kalousek'],
 ['dekovat',
  'dama',
  'pan',
  'prominout',
  'pokladat',
  'za',
  'vhodny',
  'aby',
  'poslanecky',
  'snemovna',
  'nahlas',
  'zaznet',
  'informace',
  'ze',
  'ustavni',
  'soud',
  'vyhlasit',
  'zruseni',
  'zakon',
  'zdaneni',
  'restituce',
  'ustava',
  'ted',
  'zustat',
  'tento',
  'stat',
  'ochranit',
  'pravni',
  'stav',
  'v',
  'vec',
  'vyrovnani',
  'stat',
  'cirkev',
  'zustavat',
  'stejny',
  'tak',
  'jak',
  'byt',
  'nastaven',
  'od',
  'vy',
  'mili',
  'kolega',
  'ktery',
  'byt',
  'se',
  'na',
  'pokus',
  'tento',
  'komunisticky',
  'kradez',
  'podilet',
  'zustat',
  'jenom',
  'vecne',
  'ostuda',
  'dlouhy',
  'potlesk',
  'vpravo']]

In [11]:
# Most frequent words in lemma_vocab
# before removing any word
frequency_bef = frequency(lemma_vocab)

In [12]:
# 10 most common words in dataset
frequency_bef.most_common(n=10)

[('byt', 142491),
 ('se', 79751),
 ('to', 68783),
 ('ze', 51493),
 ('na', 46701),
 ('ten', 41451),
 ('ktery', 38006),
 ('pan', 30441),
 ('mit', 27321),
 ('navrh', 26061)]

In [13]:
# Saving 19 most common words from dataset into a list
# (20th words I believe is to be important)
# and then removing those words from lemma_vocab
most_freq = []
for i in range(20):
    most_freq.append(frequency_bef.most_common()[i][0])
            
no_freq_words(lemma_vocab)

In [14]:
# lemma_vocab example
lemma_vocab[:2]

[['vy', 'dale', 'hlasit', 'predseda', 'kalousek'],
 ['dama',
  'prominout',
  'pokladat',
  'vhodny',
  'poslanecky',
  'snemovna',
  'nahlas',
  'zaznet',
  'informace',
  'ustavni',
  'soud',
  'vyhlasit',
  'zruseni',
  'zakon',
  'zdaneni',
  'restituce',
  'ustava',
  'ted',
  'zustat',
  'stat',
  'ochranit',
  'pravni',
  'stav',
  'v',
  'vec',
  'vyrovnani',
  'stat',
  'cirkev',
  'zustavat',
  'stejny',
  'jak',
  'nastaven',
  'od',
  'vy',
  'mili',
  'kolega',
  'pokus',
  'komunisticky',
  'kradez',
  'podilet',
  'zustat',
  'jenom',
  'vecne',
  'ostuda',
  'dlouhy',
  'potlesk',
  'vpravo']]

In [15]:
# Most frequent words after removing 
# frequantly used words in dataset
frequency_aft1 = frequency(lemma_vocab)

In [16]:
# Load czech stopwords.
# Removing stopworsd from dataset.

cz_stopwords = pd.read_json("stop_words_czech.json")
for i in r_len(cz_stopwords):
    word = preprocessing(cz_stopwords.iloc[i,0])
    most_freq.append(word) if word not in most_freq else most_freq
    
no_freq_words(lemma_vocab)

In [17]:
# Example of final lemma_vocab form
lemma_vocab[:2]

[['hlasit', 'predseda', 'kalousek'],
 ['dama',
  'prominout',
  'pokladat',
  'vhodny',
  'poslanecky',
  'snemovna',
  'nahlas',
  'zaznet',
  'informace',
  'ustavni',
  'soud',
  'vyhlasit',
  'zruseni',
  'zakon',
  'zdaneni',
  'restituce',
  'ustava',
  'zustat',
  'stat',
  'ochranit',
  'pravni',
  'stav',
  'vec',
  'vyrovnani',
  'stat',
  'cirkev',
  'zustavat',
  'stejny',
  'nastaven',
  'mili',
  'kolega',
  'pokus',
  'komunisticky',
  'kradez',
  'podilet',
  'zustat',
  'vecne',
  'ostuda',
  'dlouhy',
  'potlesk',
  'vpravo']]

In [18]:
# Most frequent words in lemma_vocab
# after removing czech stopwords from dataset
frequency_aft = frequency(lemma_vocab)

In [19]:
# Finding out the changes in number of words 
# used in lemma_vocab before anything was removed
# and then after removing words
x=0
y=0
z=0

for value in frequency_bef.values():
    x += value
for value in frequency_aft.values():
    y += value
for value in frequency_aft1.values():
    z += value

print("\nNumber of all words in dataset before removing words:",x,
      "\nNumber of words in dataset after removing most_freq:",z,
      "\nNumber of words in dataset after removing most_freq+cz_stopwords:",y,
      "\nFinal difference:",x-y)


Number of all words in dataset before removing words: 2897666 
Number of words in dataset after removing most_freq: 2141137 
Number of words in dataset after removing most_freq+cz_stopwords: 1633914 
Final difference: 1263752


# Word2Vec - library gensim

In [20]:
# Making Word2Vec model and creating 
# vocabulary from lemma_vocab dataset
model = Word2Vec(size=300, min_count=25, window=5, workers=10)
model.build_vocab(lemma_vocab)
total_examples = model.corpus_count

# Training the model
model.train(lemma_vocab, total_examples=total_examples, epochs=15)

(21632420, 24508710)

In [21]:
# Word vectors
# = Vectors representing the meaning of a word
ex = ['Zeman','konference']
for word in ex:
    print(f'Reprezentace slova "{word}" jako vektor o velikost {model.vector_size}:\
    \n {model.wv[preprocessing(word)]}\n')

Reprezentace slova "Zeman" jako vektor o velikost 300:    
 [ 3.22607517e-01 -2.89785475e-01 -2.63615042e-01  2.64255434e-01
 -2.69250959e-01  1.78170234e-01 -3.89721423e-01  5.20880878e-01
  4.96032119e-01  4.55039263e-01  3.35975796e-01 -2.42405515e-02
  3.33715618e-01  2.16738746e-01  2.14182422e-01  3.45988333e-01
  1.16779789e-01 -2.55051613e-01 -5.64700007e-01  5.00202775e-01
 -6.29882365e-02 -2.23987438e-02 -3.80098075e-01 -2.78072476e-01
  1.45558432e-01 -1.11257091e-01  5.91279101e-03 -4.06233847e-01
 -1.99854717e-01  2.27981638e-02  1.33335620e-01  3.70354027e-01
  4.61969048e-01  3.07144374e-01  4.03128803e-01  7.35893175e-02
 -1.35614589e-01 -2.84695327e-01  4.68533635e-01 -2.80735642e-01
 -1.61377862e-01 -5.24016023e-01  4.88441825e-01 -5.62891662e-01
  6.75946474e-02 -1.49525359e-01 -1.35563806e-01  2.46801227e-01
 -2.18251675e-01  7.60917306e-01 -1.86539471e-01  5.46102151e-02
  4.38468248e-01  8.48959833e-02 -2.05510721e-01 -5.02337575e-01
  2.49713659e-01  2.90820390e-

In [22]:
# Finding most similar words 1
ex = ['Zeman', 'ODS', 'TOP', 'Fiala']
for word in ex:
    print(f'10 nejpodobnějších slov ke slovu "{word}":\n\
{model.wv.most_similar(preprocessing(word))}\n')

10 nejpodobnějších slov ke slovu "Zeman":
[('nemec', 0.592239499092102), ('babisovi', 0.5849370956420898), ('general', 0.5763773918151855), ('prezident', 0.5578403472900391), ('babis', 0.5389994382858276), ('krecek', 0.5361361503601074), ('ceskoslovensko', 0.5251922011375427), ('pavel', 0.5159570574760437), ('izraelsky', 0.5131109356880188), ('vrazda', 0.5087348818778992)]

10 nejpodobnějších slov ke slovu "ODS":
[('topit', 0.7720670700073242), ('cssd', 0.68853759765625), ('kscm', 0.6539381742477417), ('pirat', 0.6304103136062622), ('komunista', 0.5621398687362671), ('fiala', 0.5548536777496338), ('opozicni', 0.5468418598175049), ('-podporit', 0.5173612833023071), ('jmenem', 0.5000512599945068), ('kdu', 0.48242926597595215)]

10 nejpodobnějších slov ke slovu "TOP":
[('ods', 0.7720670700073242), ('kscm', 0.6561376452445984), ('pirat', 0.6377772092819214), ('cssd', 0.6203767657279968), ('-podporit', 0.566278338432312), ('opozicni', 0.544070303440094), ('klub', 0.5372275114059448), ('csl'

In [23]:
# Finding most similar words 2 
model.wv.most_similar(positive=['svoboda','volba'],negative=['babis'])

[('odneti', 0.4053291082382202),
 ('listina', 0.396473228931427),
 ('-podmineny', 0.3896358609199524),
 ('rovnost', 0.3809761106967926),
 ('kandidovat', 0.3777309060096741),
 ('korespondencni', 0.3730629086494446),
 ('prezidentsky', 0.3705733120441437),
 ('referendum', 0.36678335070610046),
 ('pokus', 0.36486852169036865),
 ('omezovani', 0.36199092864990234)]

In [24]:
# Looking at similarity between two words
ex = ['Poslanecká','sněmovna','KDU','ČSL',
           'Zeman','prezident','měna','peníze',
           'Česko','republika']
print('Podobnost mezi následujícími dvěma slovy:')
for i in range(len(ex)-1):
    print(f'"{ex[i]}" a "{ex[i+1]}":\
    {model.wv.similarity(preprocessing(ex[i]),preprocessing(ex[i+1]))}')

Podobnost mezi následujícími dvěma slovy:
"Poslanecká" a "sněmovna":    0.16101276874542236
"sněmovna" a "KDU":    0.11074370890855789
"KDU" a "ČSL":    0.8701298832893372
"ČSL" a "Zeman":    0.16828572750091553
"Zeman" a "prezident":    0.5578403472900391
"prezident" a "měna":    0.009858846664428711
"měna" a "peníze":    0.140177384018898
"peníze" a "Česko":    0.013132994994521141
"Česko" a "republika":    0.2768818140029907


In [25]:
# Looking at similarity between two sets of words
ex = [['zeman','prezident'],['babis','predseda'],
       ['ods','fiala'],['ano','babis'],
       ['ods','fiala','mistopredseda'],['ano','babis','predseda']]
for i in range(len(ex)-1):
    print(f'{ex[i]} X {ex[i+1]} = {model.wv.n_similarity(ex[i],ex[i+1])}')

['zeman', 'prezident'] X ['babis', 'predseda'] = 0.46512168645858765
['babis', 'predseda'] X ['ods', 'fiala'] = 0.463428795337677
['ods', 'fiala'] X ['ano', 'babis'] = 0.37958407402038574
['ano', 'babis'] X ['ods', 'fiala', 'mistopredseda'] = 0.341896653175354
['ods', 'fiala', 'mistopredseda'] X ['ano', 'babis', 'predseda'] = 0.5797679424285889


## Visualization 

In [26]:
# Using tsne for dimensionality reduction,
# that is, reducing currect dimension into 2 dimension
N=1000
N_words = random.sample(list(model.wv.vocab), N)
X = model.wv[N_words]
tsne_model = TSNE(n_components=2, init='pca', n_iter=2500,
                  random_state=20, metric='cosine', perplexity=35)
Y = tsne_model.fit_transform(X)



In [30]:
# Using Kmean for creating clusters of words
Kmean = KMeans(n_clusters=8, init='k-means++', max_iter=300,
       n_init=10, n_jobs=1, random_state=None, tol=0.0001, verbose=0)
Kmean = Kmean.fit(Y)


'n_jobs' was deprecated in version 0.23 and will be removed in 1.0 (renaming of 0.25).



In [31]:
# Creating a name for each cluster group
name = []
for i in r_len(Kmean.cluster_centers_):
    n = 'Cluster ' + str(i+1)
    name.append(n)

In [34]:
# 2D plot for word vectors using t-SNE & clusters
fig = px.scatter(x=Y[:,0], y=Y[:,1], text=list(N_words))
fig.update_traces(textposition='top center', mode='text')
fig.update_layout(title_text='Word2Vector visualisation using t-SNE and clusters')

df = pd.DataFrame(columns=['x','y'], index=r_len(name))
for i in r_len(name):
    df.iloc[i,0] = Kmean.cluster_centers_[i,0]
    df.iloc[i,1] = Kmean.cluster_centers_[i,1]

fig2 = px.scatter(df, x='x', y='y', hover_name=name)
fig2.update_traces(marker=dict(size=50,
                              line=dict(width=1,
                                        color='black'),
                              color='#FEDA15'),
                   opacity=0.8)
fig.add_trace(fig2.data[0])

fig.show()