In [2]:
import spacy
import pandas as pd
import nltk
import re
from collections import Counter
import numpy as np
import os

In [3]:
os.makedirs('./Results/MK', exist_ok = True)

In [4]:
def flat_list(var):
    flat = [item for sublist in var for item in sublist]
    return flat

In [5]:
df = pd.read_csv('./Data/Koneski_final_finished_dataset.csv')

In [6]:
stihovi_mkd = df['Stih']
pesni_mkd = df['Pesna']

In [7]:
stihovi_list = stihovi_mkd.to_list()
pesni_list = pesni_mkd.to_list()
pesni_list = set(pesni_list)
pesni_list = (list(pesni_list))
pesni_list = [x for x in pesni_list if str(x) != 'nan']

In [8]:
nlp = spacy.load('mk_core_news_lg')

# Stihovi

## Word count without lematization and with stop words

In [109]:
stihovi_cleaned = []
stihovi_sentences = []
ids = []
for i, stih in enumerate(stihovi_list):
    if type(stih) is not float:
        stih = stih.lower()
        stih = stih.split()
        stih = [re.sub(r'[^\w\s]','',word) for word in stih]
        stih = [word for word in stih if word.isalnum()]
        if len(stih) == 0:
            continue
        stihovi_sentences.append(' '.join(stih))
        stihovi_cleaned.append(stih)
        ids.append(i)

In [110]:
godina_od_list = df.iloc[ids, 1]
godina_do_list = df.iloc[ids, 2]
ime_na_pesni = df.iloc[ids, :]['Pesna_Ime']
ime_na_zbirka = df.iloc[ids, :]['Zbirka']
ime_na_podzbirka = df.iloc[ids, :]['Podzbirka']

In [111]:
sentences_df = pd.DataFrame({'sentence': stihovi_sentences, 
                             'year_from': godina_od_list, 
                             'year_to': godina_do_list, 
                             'song_name':ime_na_pesni, 
                             'zbirka_name':ime_na_zbirka, 
                             'podzbirka_name':ime_na_podzbirka})

In [112]:
sentences_df.to_csv('./Results/MK/sentences_without_lem_with_stop.csv')

### Average words in stih

In [113]:
number_of_words = 0
for stih in stihovi_cleaned:
    number_of_words += len(stih)
print(number_of_words/len(stihovi_cleaned))
stihovi_cleaned_flat = flat_list(stihovi_cleaned)

4.416950959488273


In [114]:
stihovi_cleaned_flat = flat_list(stihovi_cleaned)

In [115]:
stih_count = Counter(stihovi_cleaned_flat)

In [116]:
stih_count_df = pd.DataFrame(stih_count.most_common())
stih_count_df.columns = ['words', 'count']

In [117]:
stih_count_df.to_csv('./Results/MK/stih_word_count_without_lem_with_stopwords.csv')

In [118]:
stih_count_df_10 = stih_count_df[stih_count_df['count'] > 10]
stih_count_df_10.to_csv('./Results/MK/stih_word_count_lem_with_stopwords_10.csv')

In [263]:
potential_stopwords = stih_count_df[stih_count_df['count'] > stih_count_df['count'].quantile(.80)]

In [264]:
potential_stopwords.to_csv('./Results/MK/potential_stopwords.csv')

In [269]:
potential_stopwords.head(70)

Unnamed: 0,words,count
0,и,1512
1,да,1237
2,на,1049
3,се,1034
4,во,765
...,...,...
65,нас,56
66,би,56
67,нè,55
68,зошто,55


In [272]:
potential_stopwords = potential_stopwords[potential_stopwords['count']>50]['words']

In [273]:
potential_stopwords = potential_stopwords.tolist()

## Word count with lematization and removed stop words

In [276]:
stih = stihovi_list[2]
print(stih)
stih = stih.lower()
stih = nltk.word_tokenize(stih)
stih = [re.sub(r'[^\w\s]','',word) for word in stih]
stih = [word for word in stih if not word in potential_stopwords]
stih = [word for word in stih if word.isalpha()]
print(stih)

како било кално –
['било', 'кално']


In [258]:
if 'за' in potential_stopwords:
    print("yes")

In [277]:
stihovi_cleaned = []
stihovi_sentences = []
ids = []
for i, stih in enumerate(stihovi_list):
    if i % 1000 == 0:
        print(i)
    if type(stih) is not float and type(stih) is not int:
        stih = stih.lower()
        stih = nltk.word_tokenize(stih)
        stih = [re.sub(r'[^\w\s]','',word) for word in stih]
        stih = [word for word in stih if not word in potential_stopwords]
        stih = [word for word in stih if word.isalpha()]
        processed_stih = []
        for word in stih:
            doc = nlp(word)
            for each_word in doc:
                lemma = each_word.lemma_ 
            processed_stih.append(lemma)
        if len(processed_stih) == 0:
            continue
        stihovi_sentences.append(' '.join(processed_stih))
        stihovi_cleaned.append(processed_stih)
        ids.append(i)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [278]:
godina_od_list = df.iloc[ids, 1]
godina_do_list = df.iloc[ids, 2]
ime_na_pesni = df.iloc[ids, :]['Pesna_Ime']
ime_na_zbirka = df.iloc[ids, :]['Zbirka']
ime_na_podzbirka = df.iloc[ids, :]['Podzbirka']

In [279]:
sentences_df = pd.DataFrame({'sentence': stihovi_sentences, 
                             'year_from': godina_od_list, 
                             'year_to': godina_do_list, 
                             'song_name':ime_na_pesni, 
                             'zbirka_name':ime_na_zbirka, 
                             'podzbirka_name':ime_na_podzbirka})

In [280]:
sentences_df.to_csv('./Results/MK/sentences_with_lem_without_stop.csv')

### Average words per stih

In [281]:
number_of_words = 0
for stih in stihovi_cleaned:
    number_of_words += len(stih)
print(number_of_words/len(stihovi_cleaned))
stihovi_cleaned_flat = flat_list(stihovi_cleaned)

2.6036406978004116


In [282]:
stihovi_cleaned_flat = flat_list(stihovi_cleaned)

In [283]:
stih_count = Counter(stihovi_cleaned_flat)

In [284]:
stih_count_df = pd.DataFrame(stih_count.most_common())
stih_count_df.columns = ['words', 'count']
stih_count_df.to_csv('./Results/MK/stih_word_count.csv')

In [285]:
stih_count_df_10 = stih_count_df[stih_count_df['count'] > 10]
stih_count_df_10.to_csv('./Results/MK/stih_word_count_10.csv')

## Word count without lematization and removed stop words

In [286]:
stihovi_cleaned = []
stihovi_sentences = []
ids = []
for i, stih in enumerate(stihovi_list):
    if i % 1000 == 0:
        print(i)
    if type(stih) is not float and type(stih) is not int:
        stih = stih.lower()
        stih = nltk.word_tokenize(stih)
        stih = [re.sub(r'[^\w\s]','',word) for word in stih]
        stih = [word for word in stih if not word in potential_stopwords]
        stih = [word for word in stih if word.isalpha()]
        processed_stih = []
        for word in stih:
            processed_stih.append(word)
        if len(processed_stih) == 0:
            continue
        stihovi_sentences.append(' '.join(processed_stih))
        stihovi_cleaned.append(processed_stih)
        ids.append(i)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [287]:
godina_od_list = df.iloc[ids, 1]
godina_do_list = df.iloc[ids, 2]
ime_na_pesni = df.iloc[ids, :]['Pesna_Ime']
ime_na_zbirka = df.iloc[ids, :]['Zbirka']
ime_na_podzbirka = df.iloc[ids, :]['Podzbirka']

In [288]:
sentences_df = pd.DataFrame({'sentence': stihovi_sentences, 
                             'year_from': godina_od_list, 
                             'year_to': godina_do_list, 
                             'song_name':ime_na_pesni, 
                             'zbirka_name':ime_na_zbirka, 
                             'podzbirka_name':ime_na_podzbirka})

In [289]:
sentences_df.to_csv('./Results/sentences_without_lem_without_stop.csv')

### Average words per stih

In [290]:
number_of_words = 0
for stih in stihovi_cleaned:
    number_of_words += len(stih)
print(number_of_words/len(stihovi_cleaned))
stihovi_cleaned_flat = flat_list(stihovi_cleaned)

2.6036406978004116


In [291]:
stihovi_cleaned_flat = flat_list(stihovi_cleaned)

In [292]:
stih_count = Counter(stihovi_cleaned_flat)

In [293]:
stih_count_df = pd.DataFrame(stih_count.most_common())
stih_count_df.columns = ['words', 'count']
stih_count_df.to_csv('./Results/MK/stih_word_count_without_lem.csv')

In [294]:
stih_count_df_10 = stih_count_df[stih_count_df['count'] > 10]
stih_count_df_10.to_csv('./Results/MK/stih_word_count_without_lem_10.csv')

# Pesni

## Word count with lematization and removed stop words

In [295]:
pesni_cleaned = []
pesni_sentences = []
ids = []
for i, pesna in enumerate(pesni_list):
    if i % 1000 == 0:
        print(i)
    if type(pesna) is not float and type(pesna) is not int:
        pesna = pesna.lower()
        pesna = nltk.word_tokenize(pesna)
        pesna = [re.sub(r'[^\w\s]','',word) for word in pesna]
        pesna = [word for word in pesna if not word in potential_stopwords]
        pesna = [word for word in pesna if word.isalpha()]
        processed_pesna = []
        for word in pesna:
            doc = nlp(word)
            for each_word in doc:
                lemma = each_word.lemma_ 
            processed_pesna.append(lemma)
        if len(processed_pesna) == 0:
            continue
        pesni_sentences.append(' '.join(processed_pesna))
        pesni_cleaned.append(processed_pesna)
        ids.append(i)

0


In [296]:
godina_od_list = df.iloc[ids, 1]
godina_do_list = df.iloc[ids, 2]
ime_na_pesni = df.iloc[ids, :]['Pesna_Ime']
ime_na_zbirka = df.iloc[ids, :]['Zbirka']
ime_na_podzbirka = df.iloc[ids, :]['Podzbirka']

In [297]:
sentences_df = pd.DataFrame({'pesna': pesni_sentences, 
                             'year_from': godina_od_list, 
                             'year_to': godina_do_list, 
                             'song_name':ime_na_pesni, 
                             'zbirka_name':ime_na_zbirka, 
                             'podzbirka_name':ime_na_podzbirka})

In [298]:
sentences_df.to_csv('./Results/MK/pesni_lem_without_stopwords.csv')

### Average words per pesna

In [299]:
number_of_words = 0
pesna_len = []
for pesna in pesni_cleaned:
    number_of_words += len(pesna)
    pesna_len.append(len(pesna))
print(number_of_words/len(pesni_cleaned))

46.03256704980843


In [300]:
pesna_len_counter = Counter(pesna_len)

In [301]:
pesna_len_counter_df = pd.DataFrame.from_dict(pesna_len_counter, orient='index').reset_index()

In [302]:
pesna_len_counter_df = pesna_len_counter_df.rename(columns={'index':'words_in_pesna', 0:'count'})

In [303]:
pesna_len_counter_df.to_csv('./Results/MK/word_count_in_pesna_lem_without_stop.csv')

## Word count without lematization and removed stop words

In [304]:
pesni_cleaned = []
pesni_sentences = []
ids = []
for i, pesna in enumerate(pesni_list):
    if i % 1000 == 0:
        print(i)
    if type(pesna) is not float and type(pesna) is not int:
        pesna = pesna.lower()
        pesna = nltk.word_tokenize(pesna)
        pesna = [re.sub(r'[^\w\s]','',word) for word in pesna]
        pesna = [word for word in pesna if not word in potential_stopwords]
        pesna = [word for word in pesna if word.isalpha()]
        processed_pesna = []
        for word in pesna:
            processed_pesna.append(word)
        if len(processed_pesna) == 0:
            continue
        pesni_sentences.append(' '.join(processed_pesna))
        pesni_cleaned.append(processed_pesna)
        ids.append(i)

0


In [305]:
godina_od_list = df.iloc[ids, 1]
godina_do_list = df.iloc[ids, 2]
ime_na_pesni = df.iloc[ids, :]['Pesna_Ime']
ime_na_zbirka = df.iloc[ids, :]['Zbirka']
ime_na_podzbirka = df.iloc[ids, :]['Podzbirka']

In [306]:
sentences_df = pd.DataFrame({'pesna': pesni_sentences, 
                             'year_from': godina_od_list, 
                             'year_to': godina_do_list, 
                             'song_name':ime_na_pesni, 
                             'zbirka_name':ime_na_zbirka, 
                             'podzbirka_name':ime_na_podzbirka})

In [307]:
sentences_df.to_csv('./Results/MK/pesni_without_lem_without_stopwords.csv')

### Average words per pesna

In [308]:
number_of_words = 0
pesna_len = []
for pesna in pesni_cleaned:
    number_of_words += len(pesna)
    pesna_len.append(len(pesna))
print(number_of_words/len(pesni_cleaned))

46.03256704980843


In [309]:
pesna_len_counter = Counter(pesna_len)

In [310]:
pesna_len_counter_df = pd.DataFrame.from_dict(pesna_len_counter, orient='index').reset_index()

In [311]:
pesna_len_counter_df = pesna_len_counter_df.rename(columns={'index':'words_in_pesna', 0:'count'})

In [312]:
pesna_len_counter_df.to_csv('./Results/MK/word_count_in_pesna_no_lem_without_stop.csv')

## Word count without lematization and without removed stop words

In [313]:
pesni_cleaned = []
pesni_sentences = []
ids = []
for i, pesna in enumerate(pesni_list):
    if i % 1000 == 0:
        print(i)
    if type(pesna) is not float and type(pesna) is not int:
        pesna = pesna.lower()
        pesna = nltk.word_tokenize(pesna)
        pesna = [re.sub(r'[^\w\s]','',word) for word in pesna]
        pesna = [word for word in pesna if word.isalpha()]
        processed_pesna = []
        for word in pesna:
            processed_pesna.append(word)
        if len(processed_pesna) == 0:
            continue
        pesni_sentences.append(' '.join(processed_pesna))
        pesni_cleaned.append(processed_pesna)
        ids.append(i)

0


In [314]:
godina_od_list = df.iloc[ids, 1]
godina_do_list = df.iloc[ids, 2]
ime_na_pesni = df.iloc[ids, :]['Pesna_Ime']
ime_na_zbirka = df.iloc[ids, :]['Zbirka']
ime_na_podzbirka = df.iloc[ids, :]['Podzbirka']

In [315]:
sentences_df = pd.DataFrame({'pesna': pesni_sentences, 
                             'year_from': godina_od_list, 
                             'year_to': godina_do_list, 
                             'song_name':ime_na_pesni, 
                             'zbirka_name':ime_na_zbirka, 
                             'podzbirka_name':ime_na_podzbirka})

In [316]:
sentences_df.to_csv('./Results/MK/pesni_without_lem_with_stopwords.csv')

### Average words per pesna

In [317]:
number_of_words = 0
pesna_len = []
for pesna in pesni_cleaned:
    number_of_words += len(pesna)
    pesna_len.append(len(pesna))
print(number_of_words/len(pesni_cleaned))

79.2183908045977


In [318]:
pesna_len_counter = Counter(pesna_len)

In [319]:
pesna_len_counter_df = pd.DataFrame.from_dict(pesna_len_counter, orient='index').reset_index()

In [320]:
pesna_len_counter_df = pesna_len_counter_df.rename(columns={'index':'words_in_pesna', 0:'count'})

In [321]:
pesna_len_counter_df.to_csv('./Results/MK/word_count_in_pesna_no_lem_with_stop.csv')

# NER

In [9]:
sentences = pd.read_csv('./Results/MK/sentences_with_lem_without_stop.csv')

In [10]:
sentences_list = sentences['sentence']

In [11]:
sentences_list

0                    лаеле пците
1                     било кален
2                    сето наваса
3             темничиштево пален
4                   крваво време
                  ...           
9224                поништи ужас
9225                       човек
9226                  може сфати
9227     проклета слабост зборне
9228    поживинченост ако молчиш
Name: sentence, Length: 9229, dtype: object

In [12]:
sentences_list[1]

'било кален'

In [13]:
doc = nlp("пес за живот")
print(doc.ents)
for ent in doc.ents:
    print("doc.ents")

()


In [14]:
words = []
entities = []
ids = []
for i, sentence in enumerate(sentences_list):
    doc = nlp(sentence)
    for ent in doc.ents:
        words.append(ent.text)
        entities.append(ent.label_)
        ids.append(i)

In [15]:
godina_od_list = df.iloc[ids, 1]
godina_do_list = df.iloc[ids, 2]
ime_na_pesni = df.iloc[ids, :]['Pesna_Ime']
ime_na_zbirka = df.iloc[ids, :]['Zbirka']
ime_na_podzbirka = df.iloc[ids, :]['Podzbirka']

In [16]:
words_entities_mk = pd.DataFrame({'word': words, 
                             'entity': entities,
                             'year_from': godina_od_list, 
                             'year_to': godina_do_list, 
                             'song_name':ime_na_pesni, 
                             'zbirka_name':ime_na_zbirka, 
                             'podzbirka_name':ime_na_podzbirka})

In [17]:
words_entities_mk

Unnamed: 0,word,entity,year_from,year_to,song_name,zbirka_name,podzbirka_name
3,темничиштево,NORP,1941,1945,ПЕСНА ЗА ЖИВОТОТ,ОД СТАРИОТ НОТЕС (1941‡1945),ОД СТАРИОТ НОТЕС (1941‡1945)
9,ииии,PERSON,1941,1945,ПЕСНА ЗА ЖИВОТОТ,ОД СТАРИОТ НОТЕС (1941‡1945),ОД СТАРИОТ НОТЕС (1941‡1945)
27,горок,GPE,1941,1945,КАМБАНИ,ОД СТАРИОТ НОТЕС (1941‡1945),ОД СТАРИОТ НОТЕС (1941‡1945)
30,ноќта,TIME,1941,1945,НОЌНА ПЕСНА,ОД СТАРИОТ НОТЕС (1941‡1945),ОД СТАРИОТ НОТЕС (1941‡1945)
33,опијани,GPE,1941,1945,НОЌНА ПЕСНА,ОД СТАРИОТ НОТЕС (1941‡1945),ОД СТАРИОТ НОТЕС (1941‡1945)
...,...,...,...,...,...,...,...
9138,секавица,GPE,1993,1993,МРТВА ПРИРОДА,ЦРН ОВЕН (1993),ОTPОР
9149,сврдле,GPE,1993,1993,ЏИЏЕ,ЦРН ОВЕН (1993),ОTPОР
9176,еднаш,TIME,1993,1993,ОБРОК,ЦРН ОВЕН (1993),ОTPОР
9214,допрва,ORDINAL,1993,1993,АНТИЧКА ТРАГЕДИЈА,ЦРН ОВЕН (1993),ОTPОР


In [18]:
words_entities_mk.to_csv('./Results/MK/ner.csv')