## Importování dat

In [1]:
import pandas as pd

# Load the records CSV into a DataFrame and preview its last rows.
df = pd.read_csv('psp_records_list.csv')
df.tail()

Unnamed: 0,Line
19816,"pan,poslanec,mikuláš,ferjenčík,faktickou,pozná..."
19817,"já,se,také,pokusím,vystoupit,naposledy,každopá..."
19818,"pan,předseda,kalousek"
19819,"rámci,konsenzuální,diskuse,navrhuji,kompromis,..."
19820,"tuto,chvíli,nemám,nikoho,přihlášeného,do,obecn..."


## Listy vět sestavené z jednotlivých slov

In [2]:
def get_sentences(df):
    """Split the first column of every row into a list of words.

    Args:
        df: DataFrame whose first column holds comma-separated words.

    Returns:
        A list containing one list of words per row of ``df``.
    """
    # Walk the rows themselves rather than ``range(df.size)``: ``df.size`` is
    # rows * columns, so it only equals the row count for a one-column frame
    # and over-runs the row list (IndexError) as soon as a column is added.
    return [row[0].split(',') for row in df.values.tolist()]

# One list of words per row of df.
all_sentences = get_sentences(df)

## Lemmatizace

In [3]:
import simplemma

# Czech language data handed to the lemmatizer on every call.
langdata = simplemma.load_data('cs')

# Lemmatize and lower-case each token, keeping the sentence grouping.
lemma = [
    [simplemma.lemmatize(token, langdata).lower() for token in sentence]
    for sentence in all_sentences
]

## Zbavení se diakritiky

In [4]:
# lemma = all_sentences.copy()

In [5]:
import unicodedata

def deaccent(unistr):
    """Return ``unistr`` with all combining marks (diacritics) removed.

    The string is decomposed with NFD so that an accented letter becomes a
    base character followed by combining marks; the marks are then dropped.
    """
    decomposed = unicodedata.normalize("NFD", unistr)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

# Strip diacritics from every lemma. Slice assignment rewrites the contents
# of each inner list in place, so the list objects inside `lemma` are kept.
for sentence in lemma:
    sentence[:] = [deaccent(token) for token in sentence]

## Zbavení se nejfrekventovanějších slov

In [6]:
from nltk.probability import FreqDist

def frequent(lst):
    """Count how often each word occurs across all sentences.

    Args:
        lst: List of sentences, each a list of words.

    Returns:
        A FreqDist mapping every word to its total number of occurrences.
    """
    fdist = FreqDist()
    for sentence in lst:
        for token in sentence:
            fdist[token] += 1
    return fdist
            
# Word frequencies before any words are removed.
frequency_bef = frequent(lemma)

In [7]:
# Number of top-frequency words treated as noise and removed from the corpus.
N_MOST_COMMON = 21

# Ask for the top N in a single call: calling most_common() with no argument
# inside a loop re-sorts the whole distribution on every iteration, and
# indexing into it raises IndexError when there are fewer than N distinct words.
most_common = [word for word, _count in frequency_bef.most_common(N_MOST_COMMON)]

In [8]:
def remove_common_words(lst, words=None):
    """Remove unwanted words from every sentence, in place.

    Args:
        lst: List of sentences, each a list of words. The inner lists are
            modified in place; nothing is returned.
        words: Iterable of words to drop. When omitted, the module-level
            ``most_common`` list is used, read at call time.
    """
    if words is None:
        words = most_common
    # A set gives constant-time membership tests, and one filtering pass per
    # sentence replaces the repeated ``while word in ...: remove`` scans.
    unwanted = set(words)
    for sentence in lst:
        # Slice assignment keeps the same list object, preserving the
        # in-place contract for callers that hold references to it.
        sentence[:] = [word for word in sentence if word not in unwanted]
                
# Drop the most frequent words from every sentence of `lemma`, in place.
remove_common_words(lemma)

In [9]:
# Word frequencies after removing only the most frequent words.
frequency_aft1 = frequent(lemma)

## Odstranění tzv. stopwords

In [10]:
stopwords = pd.read_json("stop_words_czech.json")

# Lemmatize, lower-case and de-accent each stop word so it can match the
# preprocessed corpus tokens, then add it to the removal list if new.
# Iterate over the first column itself: ``stopwords.size`` counts cells
# (rows * columns), so using it as a row count breaks for multi-column data.
for raw_word in stopwords.iloc[:, 0]:
    word = simplemma.lemmatize(raw_word, langdata)
    word = deaccent(word.lower())
    if word not in most_common:
        most_common.append(word)

In [11]:
# Second removal pass: by now `most_common` also holds the normalised stop words.
remove_common_words(lemma)

In [12]:
# Word frequencies after removing both the most frequent words and the stop words.
frequency_aft = frequent(lemma)

In [13]:
# Total number of word tokens at each stage (sum of all per-word counts).
x = sum(frequency_bef.values())   # before any removal
z = sum(frequency_aft1.values())  # after removing the most frequent words
y = sum(frequency_aft.values())   # after also removing the stop words

# These totals count words, not characters, so the labels say "words".
print("\nNumber of words before removing words:",x,
      "\nNumber of words after removing most_common:",z,
      "\nNumber of words after removing most_common+stopwords:",y,
      "\nFinal difference:",x-y)


Number of characters before removing words: 2897666 
Number of characters after removing most_common: 2037704 
Number of characters after removing most_common+stopwords: 1618770 
Final difference: 1278896


# Word2Vec - knihovna gensim

In [14]:
from gensim.models import Word2Vec

In [15]:
# Train the Word2Vec model on the cleaned sentences in `lemma`.
# NOTE(review): `size` and `iter` look like the gensim 3.x keyword names
# (gensim 4 calls them `vector_size` and `epochs`) — confirm the pinned version.
# NOTE(review): no `seed` is passed and workers=10, so the vectors and
# similarity scores will presumably differ between runs — confirm if
# reproducibility matters.
word2vec = Word2Vec(lemma, size=200, min_count=4, window=10, alpha=0.025, workers=10, iter=10)

In [16]:
# Vocabulary of the trained model.
# NOTE(review): `wv.vocab` appears to be the gensim 3.x attribute (gensim 4
# exposes `wv.key_to_index`) — confirm against the installed version.
vocabulary = word2vec.wv.vocab

## Slovo reprezentováno jako vektor

In [17]:
# Display the embedding vector learned for the token 'zeman'.
word2vec.wv['zeman']

array([ 3.51175576e-01, -2.79717555e-04, -1.82915241e-01, -3.40548217e-01,
       -2.60031611e-01,  1.36953562e-01, -1.55284822e-01, -1.02680236e-01,
        6.49239868e-02, -2.79741406e-01, -2.80437857e-01, -3.54300551e-02,
        1.30780235e-01,  8.73002261e-02, -2.61862546e-01, -2.65694857e-01,
       -1.35317042e-01, -1.14354350e-01, -2.34881222e-01,  3.78472544e-03,
        2.45107915e-02,  3.59781235e-01,  4.16202009e-01, -1.10081986e-01,
       -1.75781980e-01,  1.56864107e-01, -1.40179351e-01,  2.98176616e-01,
        1.03017040e-01, -1.08725980e-01, -9.24001262e-02, -2.30912685e-01,
        1.46904543e-01,  4.91378307e-02, -3.79550338e-01, -5.52398637e-02,
       -5.00715196e-01,  2.60691345e-01,  9.81825814e-02, -4.20340598e-01,
        1.84053704e-01,  5.07472344e-02, -1.56234711e-01, -3.18947703e-01,
        3.73416208e-02, -4.18067724e-01, -5.11574864e-01,  1.25493437e-01,
       -3.88484634e-02,  3.11053246e-01, -4.06919658e-01, -2.38618225e-01,
        4.99025136e-01,  

## Nalezení n nejpodobnějších slov

In [18]:
# Lower-case, lemmatize and de-accent the query so it can match the
# preprocessed tokens the model was trained on.
search_word = input("> ").lower()
search_word = deaccent(simplemma.lemmatize(search_word, langdata))

try:
    most_similar = word2vec.wv.most_similar(search_word)
    print(*most_similar, sep='\n')
except KeyError:
    # Only a word missing from the vocabulary is expected here; a bare
    # `except:` would also hide real errors and KeyboardInterrupt.
    print("Slovo se nenachází ve slovníku.")

> Miloš
('zeman', 0.8373407125473022)
('general', 0.7359596490859985)
('zemana', 0.728776752948761)
('babis', 0.7067705392837524)
('prezident', 0.6905307769775391)
('andrej', 0.6855937242507935)
('vacka', 0.6632065773010254)
('minsk', 0.6614105701446533)
('lukasenkovi', 0.6601101756095886)
('eduard', 0.6591899394989014)


In [19]:
# Lower-case, lemmatize and de-accent the query so it can match the
# preprocessed tokens the model was trained on.
search_word = input("> ").lower()
search_word = deaccent(simplemma.lemmatize(search_word, langdata))

try:
    most_similar = word2vec.wv.most_similar(search_word)
    print(*most_similar, sep='\n')
except KeyError:
    # Only a word missing from the vocabulary is expected here; a bare
    # `except:` would also hide real errors and KeyboardInterrupt.
    print("Slovo se nenachází ve slovníku.")

> Zeman
('milos', 0.8373407125473022)
('zemana', 0.714470624923706)
('babis', 0.6769558787345886)
('garrigue', 0.66814124584198)
('bakala', 0.6578290462493896)
('andrej', 0.655814528465271)
('dr', 0.6424289345741272)
('masaryk', 0.6303917169570923)
('vacka', 0.6233670711517334)
('plukovnik', 0.6230596303939819)


In [20]:
# Lower-case, lemmatize and de-accent the query so it can match the
# preprocessed tokens the model was trained on.
search_word = input("> ").lower()
search_word = deaccent(simplemma.lemmatize(search_word, langdata))

try:
    most_similar = word2vec.wv.most_similar(search_word)
    print(*most_similar, sep='\n')
except KeyError:
    # Only a word missing from the vocabulary is expected here; a bare
    # `except:` would also hide real errors and KeyboardInterrupt.
    print("Slovo se nenachází ve slovníku.")

> ANO
('trikolora', 0.6071336269378662)
('spd', 0.4940462112426758)
('cssd', 0.48995640873908997)
('blm', 0.46352341771125793)
('komunista', 0.45486903190612793)
('ozyvat', 0.4403122365474701)
('pravda', 0.4396103620529175)
('lavice', 0.43255114555358887)
('ods', 0.43176037073135376)
('smich', 0.4316036105155945)


In [21]:
# Lower-case, lemmatize and de-accent the query so it can match the
# preprocessed tokens the model was trained on.
search_word = input("> ").lower()
search_word = deaccent(simplemma.lemmatize(search_word, langdata))

try:
    most_similar = word2vec.wv.most_similar(search_word)
    print(*most_similar, sep='\n')
except KeyError:
    # Only a word missing from the vocabulary is expected here; a bare
    # `except:` would also hide real errors and KeyboardInterrupt.
    print("Slovo se nenachází ve slovníku.")

> ODS
('top', 0.83887779712677)
('cssd', 0.8099815845489502)
('kscm', 0.797346830368042)
('lidovec', 0.7641809582710266)
('piratu', 0.7255417108535767)
('predkladali', 0.7247452735900879)
('opozicni', 0.6996139883995056)
('komunista', 0.6667767763137817)
('obcansti', 0.662468671798706)
('levicova', 0.6528711318969727)
