## Importování dat

In [1]:
import pandas as pd

# Load the pre-tokenised records. Per the preview below, each row's `Line`
# value is one record stored as a single comma-separated string of words.
df = pd.read_csv('psp_records_list.csv')
# Show the last rows as a quick sanity check of the load.
df.tail()

Unnamed: 0,Line
19816,"pan,poslanec,mikuláš,ferjenčík,faktickou,pozná..."
19817,"já,se,také,pokusím,vystoupit,naposledy,každopá..."
19818,"pan,předseda,kalousek"
19819,"rámci,konsenzuální,diskuse,navrhuji,kompromis,..."
19820,"tuto,chvíli,nemám,nikoho,přihlášeného,do,obecn..."


## Listy vět sestavené z jednotlivých slov

In [2]:
def get_sentences(df):
    """Split every record in the first column of ``df`` into a list of words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds one record per row, stored as a
        comma-separated string of words.

    Returns
    -------
    list[list[str]]
        One list of words per row, in row order. Rows whose first-column
        value is not a string (e.g. a missing value) yield an empty list so
        the result stays aligned with the rows of ``df``.
    """
    # Iterate over the rows of the first column directly. ``df.size`` counts
    # cells (rows x columns), so using it as a row count overruns the data as
    # soon as the frame has more than one column.
    return [
        record.split(',') if isinstance(record, str) else []
        for record in df.iloc[:, 0]
    ]

# One list of words per record of df.
all_sentences = get_sentences(df)

## Lemmatizace

In [3]:
import majka
# Open the Majka morphological analyser on the local `majka.w-lt` dictionary file.
morph = majka.Majka('./majka.w-lt')

# NOTE(review): presumably turns off tag processing so results carry only the
# lemma — confirm against the majka binding documentation.
morph.tags = False
# NOTE(review): presumably limits find() to a single analysis per word; the
# cells below only ever read element [0] of the result.
morph.first_only = True

In [4]:
# Copy every inner list as well: list.copy() is shallow, so writing lemmas
# into a shallow copy would silently overwrite the tokens in all_sentences.
lemma = [list(sentence) for sentence in all_sentences]
for sentence in lemma:
    for position, token in enumerate(sentence):
        # Look the (upper-cased) token up once and reuse the result.
        analyses = morph.find(token.upper())
        if len(analyses) > 0:
            # Replace the token with the lemma of the first analysis;
            # tokens the analyser does not know are left unchanged.
            sentence[position] = analyses[0]['lemma']

## Zbavení se diakritiky

In [5]:
import unicodedata

def deaccent(unistr):
    """Return ``unistr`` with all combining marks (diacritics) removed.

    The string is decomposed to Unicode NFD form so that accented letters
    become a base character followed by combining marks; the combining marks
    are then dropped and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize("NFD", unistr)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

# Lower-case every lemma and strip its diacritics. Slice assignment rewrites
# each sentence list in place, so the same list objects are kept.
for sentence in lemma:
    sentence[:] = [deaccent(token.lower()) for token in sentence]

## Zbavení se nejfrekventovanějších slov

In [6]:
from nltk.probability import FreqDist

def frequent(lst):
    """Count how often each token occurs across all sentences.

    ``lst`` is a list of sentences, each sentence being a list of tokens.
    Returns a ``FreqDist`` mapping every token to its number of occurrences.
    """
    counts = FreqDist()
    for sentence in lst:
        for token in sentence:
            counts[token] += 1
    return counts
            
# Baseline token frequencies, taken before any words are removed.
frequency_bef = frequent(lemma)

In [7]:
# The 20 most frequent lemmas (fewer if the corpus has fewer distinct tokens).
# Asking most_common() for the top 20 once avoids re-sorting the whole
# vocabulary on every iteration and cannot index past the end of the ranking.
most_common = [word for word, _count in frequency_bef.most_common(20)]

In [8]:
def remove_common_words(lst, words=None):
    """Remove unwanted words from every sentence of ``lst`` in place.

    Parameters
    ----------
    lst : list[list[str]]
        Sentences to filter; each inner list is modified in place.
    words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``most_common`` list,
        which is what the original one-argument call uses.

    Returns
    -------
    None
    """
    # A set gives constant-time membership tests, so each sentence is filtered
    # in a single pass instead of repeated `in` / `remove` scans per word.
    blocked = set(most_common if words is None else words)
    for sentence in lst:
        # Slice assignment keeps the same list object, as callers rely on the
        # in-place update.
        sentence[:] = [token for token in sentence if token not in blocked]
                
# Strip the words currently listed in most_common from every sentence of lemma.
remove_common_words(lemma)

In [9]:
# Token frequencies after removing the most frequent words, before stop-word removal.
frequency_aft1 = frequent(lemma)

## Odstranění tzv. stopwords

In [10]:
stopwords = pd.read_json("stop_words_czech.json")

# Extend most_common with the stop words, normalised the same way as the
# corpus tokens: upper-cased Majka lookup, then lower-cased and de-accented.
# A set mirrors the list so the duplicate check does not rescan it each time.
already_listed = set(most_common)
# Iterate over the rows of the first column; DataFrame.size counts cells
# (rows x columns) and is only a valid row count for a single-column frame.
for s_word in stopwords.iloc[:, 0]:
    analyses = morph.find(s_word.upper())
    if len(analyses) > 0:
        word = deaccent(analyses[0]['lemma'].lower())
    else:
        # Unknown to the analyser: fall back to the raw stop word.
        word = deaccent(s_word.lower())
    if word not in already_listed:
        already_listed.add(word)
        most_common.append(word)

In [11]:
# most_common was extended with the stop words in the previous cell, so this
# second pass removes those from lemma as well.
remove_common_words(lemma)

In [12]:
# Token frequencies after removing both the most frequent words and the stop words.
frequency_aft = frequent(lemma)

In [13]:
# Total number of tokens (word occurrences) at each stage of the clean-up:
# x = before any removal, z = after dropping the most frequent words,
# y = after additionally dropping the stop words.
x = sum(frequency_bef.values())
y = sum(frequency_aft.values())
z = sum(frequency_aft1.values())

# The totals are sums of per-word frequencies, i.e. word counts, so the
# labels say "words" rather than "characters".
print("\nNumber of words before removing words:",x,
      "\nNumber of words after removing most_common:",z,
      "\nNumber of words after removing most_common+stopwords:",y,
      "\nFinal difference:",x-y)


Number of characters before removing words: 2897666 
Number of characters after removing most_common: 2141137 
Number of characters after removing most_common+stopwords: 1666126 
Final difference: 1231540


# Word2Vec - knihovna gensim

In [14]:
from gensim.models import Word2Vec

In [15]:
# Train the embedding model on the cleaned, lemmatised sentences.
# NOTE(review): `size=` and `iter=` look like the gensim 3.x keyword names
# (gensim 4 renamed them to `vector_size` / `epochs`) — confirm which gensim
# version this notebook is pinned to before upgrading.
# NOTE(review): with workers=10 the trained vectors are presumably not
# identical from run to run, so the similarity scores shown below may differ
# slightly after a re-run — confirm if exact reproducibility is required.
word2vec = Word2Vec(lemma, size=200, min_count=8, window=10, alpha=0.025, workers=10, iter=10)

In [16]:
# Keep a handle on the model's vocabulary.
# NOTE(review): `wv.vocab` looks like the gensim 3.x attribute; `vocabulary`
# is not used again in the cells visible here.
vocabulary = word2vec.wv.vocab

## Slovo reprezentováno jako vektor

In [17]:
# Display the learned vector for the token 'zeman' (tokens were lower-cased
# and de-accented by the earlier cleaning cells).
word2vec.wv['zeman']

array([-0.43386754,  0.74826026, -0.00257947, -0.06355972, -0.10731003,
       -0.17768575, -0.5169839 , -0.12985788, -0.20703898, -0.03963907,
       -0.20919181,  1.2892985 , -0.6796307 ,  0.46543524,  0.04570214,
        0.45597324,  0.27578112,  0.14600573,  0.14410338,  0.10679486,
        0.5053489 , -0.00880062,  0.1137164 , -0.08536599,  0.07035284,
        0.87398916, -0.09384713,  0.30372128, -0.63399464, -0.07319361,
        0.30825043,  0.25326455, -0.10992944,  0.17186844, -0.5212144 ,
       -0.66902685,  0.4599228 ,  0.6616107 ,  0.1336313 ,  0.6063353 ,
        0.51631314,  0.21274175,  0.04280899, -0.91724056,  0.3544108 ,
       -0.49956453, -0.5216537 ,  0.37241638, -0.17816351, -0.51408416,
       -0.49368367, -0.3833033 ,  0.653759  ,  0.23066974,  0.07694104,
        0.2932137 ,  0.20498553,  0.04653297, -0.10810111, -0.13792792,
       -0.66973644,  0.07798081, -0.4647603 , -0.16538948, -0.37987795,
       -0.01009958, -0.69191754, -0.0076103 , -0.81763124,  1.23

## Nalezení n nejpodobnějších slov

In [18]:
# Read a query word and normalise it like the corpus tokens: upper-cased
# Majka lookup, then lower-cased and de-accented.
search_word = input("> ").strip().upper()
analyses = morph.find(search_word)
if len(analyses) > 0:
    search_word = deaccent(analyses[0]['lemma'].lower())
else:
    # Unknown to the analyser: query the raw word instead.
    search_word = deaccent(search_word.lower())

try:
    most_similar = word2vec.wv.most_similar(search_word)
except KeyError:
    # Only an out-of-vocabulary word is expected here; any other error
    # (including an interrupt) should surface instead of being swallowed.
    print("Slovo se nenachází ve slovníku.")
else:
    print(*most_similar, sep='\n')

> Miloš
('zeman', 0.8559572696685791)
('general', 0.7472396492958069)
('vacek', 0.7243351936340332)
('prezident', 0.7154787182807922)
('babisovi', 0.6910240054130554)
('zlodejna', 0.681834876537323)
('zlodej', 0.6817171573638916)
('tehdejsi', 0.6758270263671875)
('soudruh', 0.6694729328155518)
('tvrdik', 0.6689713001251221)


In [19]:
# Read a query word and normalise it like the corpus tokens: upper-cased
# Majka lookup, then lower-cased and de-accented.
search_word = input("> ").strip().upper()
analyses = morph.find(search_word)
if len(analyses) > 0:
    search_word = deaccent(analyses[0]['lemma'].lower())
else:
    # Unknown to the analyser: query the raw word instead.
    search_word = deaccent(search_word.lower())

try:
    most_similar = word2vec.wv.most_similar(search_word)
except KeyError:
    # Only an out-of-vocabulary word is expected here; any other error
    # (including an interrupt) should surface instead of being swallowed.
    print("Slovo se nenachází ve slovníku.")
else:
    print(*most_similar, sep='\n')

> Zeman
('milos', 0.8559572696685791)
('general', 0.6801844835281372)
('krecek', 0.662620484828949)
('vacek', 0.6537578105926514)
('lukasenka', 0.6450490951538086)
('zlodejna', 0.6441500782966614)
('statnost', 0.6429609060287476)
('alexandra', 0.6413673758506775)
('tvrdik', 0.632576584815979)
('babisovi', 0.6288683414459229)


In [22]:
# Read a query word and normalise it like the corpus tokens: upper-cased
# Majka lookup, then lower-cased and de-accented.
search_word = input("> ").strip().upper()
analyses = morph.find(search_word)
if len(analyses) > 0:
    search_word = deaccent(analyses[0]['lemma'].lower())
else:
    # Unknown to the analyser: query the raw word instead.
    search_word = deaccent(search_word.lower())

try:
    most_similar = word2vec.wv.most_similar(search_word)
except KeyError:
    # Only an out-of-vocabulary word is expected here; any other error
    # (including an interrupt) should surface instead of being swallowed.
    print("Slovo se nenachází ve slovníku.")
else:
    print(*most_similar, sep='\n')

> ANO
('trikolora', 0.6527727246284485)
('protikorupcni', 0.5108012557029724)
('spd', 0.49982357025146484)
('cssd', 0.49099281430244446)
('bds', 0.4617321491241455)
('ozyvat', 0.45251235365867615)
('kscm', 0.43097496032714844)
('plena', 0.4118102192878723)
('nesroz', 0.38261961936950684)
('komunista', 0.3783489167690277)


In [21]:
# Read a query word and normalise it like the corpus tokens: upper-cased
# Majka lookup, then lower-cased and de-accented.
search_word = input("> ").strip().upper()
analyses = morph.find(search_word)
if len(analyses) > 0:
    search_word = deaccent(analyses[0]['lemma'].lower())
else:
    # Unknown to the analyser: query the raw word instead.
    search_word = deaccent(search_word.lower())

try:
    most_similar = word2vec.wv.most_similar(search_word)
except KeyError:
    # Only an out-of-vocabulary word is expected here; any other error
    # (including an interrupt) should surface instead of being swallowed.
    print("Slovo se nenachází ve slovníku.")
else:
    print(*most_similar, sep='\n')

> TOP
('ods', 0.8299254775047302)
('lidovec', 0.7952907085418701)
('kscm', 0.7395617365837097)
('pirat', 0.6912156343460083)
('kdu', 0.6887631416320801)
('csl', 0.6845287680625916)
('cssd', 0.6718961000442505)
('senatorsky', 0.6696460843086243)
('stan', 0.654228687286377)
('jmenem', 0.640989363193512)
