## _Grebanje veba_ (_web scraping_) 

### lyricsmaster

In [None]:
from lyricsmaster import LyricWiki

# LyricWiki provider instance used to fetch the lyrics.
provider = LyricWiki()

# List of artists whose lyrics are downloaded.
izvodjaci = [
    'Bajaga',
    'Bajaga & Instruktori',
    'Bebi Dol',
    'Bijelo Dugme',
    'Dino Merlin',
    'Divlje Jagode',
    'Elektricni Orgazam',
    'Galija',
    'Goran Bregovic',
    'Goran Karan',
    'Hari Mata Hari',
    'Haustor',
    'Idoli',
    'Indexi',
    'Josipa Lisac',
    'Kerber',
    'Madame Piano',
    'Mirzino Jato',
    'Negative (RS)',
    'Neverne Bebe',
    'Nina Badric',
    'Oktobar 1864',
    'Partibrejkers',
    'Prljavo Kazaliste',
    'Rani Mraz',
    'Smak',
    'Van Gogh',
    'Vesna Pisarovic',
    'YU Grupa',
    'Zabranjeno Pusenje',
    'Zana',
    'Zdravko Colic',
    'Zeljko Joksimovic',
]

def korpus(izvodjaci):
    """Download, print and save the lyrics of every artist in *izvodjaci*.

    For each artist name the discography is fetched through the module-level
    ``provider``; every album title, song title and song text is printed, and
    the discography is saved to disk once.

    Args:
        izvodjaci: Iterable of artist names passed to ``provider.get_lyrics``.

    Artists whose download or processing fails are skipped (best effort),
    e.g. when the site lists only a song title but no lyrics text.
    """
    for izvodjac in izvodjaci:
        try:
            discography = provider.get_lyrics(izvodjac)
            saved = False
            for album in discography:
                print('Album: ', album.title)
                for song in album:
                    print('Song: ', song.title)
                    print('Lyrics: ', song.lyrics)
                    if not saved:
                        # save() stores the lyrics under
                        # {user}/Documents/lyricsmaster/. It is called on the
                        # whole discography, so once per artist is enough;
                        # the original re-saved it for every single song.
                        discography.save()
                        saved = True
        except Exception:
            # Skip this artist but, unlike a bare ``except:``, let
            # KeyboardInterrupt / SystemExit stop a long-running download.
            continue

# Download and save the lyrics for every artist in the list.
korpus(izvodjaci)

### Beautiful Soup

In [None]:
from pyquery import PyQuery as pq
from lxml import etree
import requests
from bs4 import BeautifulSoup

# Visit each artist page, follow every element matched by the ".title"
# selector (presumably the links to the individual songs — verify against the
# page markup) and collect the text of the ".verse" elements in
# metrolyrics.txt.
# The output encoding is fixed to UTF-8 so that non-ASCII lyrics do not depend
# on the platform's default encoding.
with open('metrolyrics.txt', 'w', encoding='utf-8') as m:
    linkovi = ['http://www.metrolyrics.com/azra-lyrics.html', 'http://www.metrolyrics.com/riblja-corba-lyrics.html', 'http://www.metrolyrics.com/ekv-lyrics.html']
    for link in linkovi:
        # A timeout keeps an unresponsive server from blocking the run forever.
        response = requests.get(link, timeout=30)  # fetch the artist page
        doc = pq(response.content)  # parse the page content
        titles = doc('.title')  # elements carrying the "title" class
        for title in titles:
            href = title.attrib.get('href')
            if not href:
                # A matched element without a link cannot be followed.
                continue
            response_title = requests.get(href, timeout=30)  # fetch the song page
            doc2 = pq(response_title.content)  # parse the song page
            verse = doc2('.verse')  # elements holding the song text
            lyrics = verse.text()
            print(lyrics)  # show the lyrics
            # Whitespace is collapsed to single spaces; the trailing newline
            # keeps the last word of one song from fusing with the first word
            # of the next one.
            m.write(' '.join(lyrics.split()) + '\n')

## Automatska XML anotacija -- yattag

In [None]:
import os
from os import listdir 
from os.path import isfile, join
import html
from yattag import Doc, indent

def replace_char_entities(s):
    """Return *s* with ``&``, ``<``, ``>`` and both quote characters HTML-escaped."""
    escaped = html.escape(s, quote=True)
    return escaped

# Root folder that is scanned for author sub-folders.
myroot = '/Users/ljudmilapetkovic/Documents/LyricsMaster'

# yattag handles used to build the XML document.
doc, tag, text = Doc().tagtext()

# Every entry directly under the root that is not a regular file is treated
# as an author folder.
authors = []
for entry in listdir(myroot):
    if not isfile(join(myroot, entry)):
        authors.append(entry)

# Build the XML tree: exYuPesme > autor > album > pesma > li, with one <li>
# element per line of a song file.
with tag('exYuPesme'):

    for author in authors:

        curr_path = '{}/{}'.format(myroot, author)
        # Sub-folders of the author folder are the albums.
        albums = [d for d in listdir(curr_path) if not isfile(join(curr_path, d))]

        with tag('autor', ime=author, brojAlbuma=len(albums), pol=""):

            for album in albums:
                with tag('album', naziv=album, godina=""):

                    album_path = '{}/{}'.format(curr_path, album)
                    songs = [f for f in listdir(album_path) if isfile(join(album_path, f))]
                    for song in songs:
                        if song.startswith('.DS_S'):
                            # Skip macOS .DS_Store metadata files.
                            continue
                        # song[:-4] drops the last four characters of the file
                        # name (presumably the ".txt" extension).
                        with tag('pesma', naslovPesme=song[:-4]):
                            song_path = '{}/{}'.format(album_path, song)
                            # Close the file deterministically instead of
                            # leaking the handle.
                            with open(song_path, encoding="utf8", errors='ignore') as song_file:
                                stihovi = song_file.read().split('\n')
                            # Drop only the empty piece left by a trailing
                            # newline; a final verse without one is kept.
                            if stihovi[-1] == '':
                                stihovi.pop()
                            for stih in stihovi:
                                with tag('li'):
                                    # text() escapes &, < and > itself, so the
                                    # verse is passed unescaped; escaping it
                                    # beforehand produced "&amp;amp;" and
                                    # "&amp;#x27;" in the output.
                                    text(stih)
            

# Pretty-print the generated markup with 4-space indentation and CRLF line
# breaks.
result = indent(
    doc.getvalue(),
    indentation=' ' * 4,
    newline='\r\n'
)

print(result)
# The encoding is fixed to UTF-8 (the song files are read as UTF-8) so the
# output does not depend on the platform default. newline='' writes the CRLF
# line breaks exactly as generated instead of letting text mode translate
# "\n" again, which would give "\r\r\n" on Windows.
with open('exyuxml (bez .DS-Store).xml', 'w', encoding='utf-8', newline='') as x:
    x.write(result)

## NLTK

### Sufiksi na -ija

In [None]:
import nltk
import cyrtranslit

def sufiksi():
    """Print the distinct corpus tokens that end in "ija".

    Reads ``ex-yu-korpus.txt``, converts the text to Latin script with
    ``cyrtranslit``, tokenizes it with NLTK, lower-cases the tokens and prints
    the number of distinct tokens ending in "ija" followed by the
    alphabetically sorted list.
    """
    with open('ex-yu-korpus.txt', 'r', encoding='utf-8') as f:
        # Read the raw text. str(f.readlines()) would tokenize the repr of a
        # list (brackets, quotes, literal "\n" sequences), so words at the end
        # of a line would never match the suffix test.
        data = f.read()

    data = cyrtranslit.to_latin(data)
    tokens = {token.lower() for token in nltk.word_tokenize(data)}
    lista_tokena = sorted(w for w in tokens if w.endswith('ija'))
    print('Lista tokena:', len(lista_tokena), '\n\n', lista_tokena)
        
        
# List the corpus tokens ending in "ija".
sufiksi()

### Konkordanca

In [None]:
from nltk.tokenize import word_tokenize
from nltk.text import Text 
import nltk.corpus, cyrtranslit 

def konkordanca(rec):
    """Print the concordance of *rec* in the corpus.

    Reads ``ex-yu-korpus.txt``, converts the text to Latin script with
    ``cyrtranslit``, tokenizes it with NLTK and calls
    ``nltk.Text.concordance`` for the given word.

    Args:
        rec: The word whose occurrences are shown in context.
    """
    with open('ex-yu-korpus.txt', 'r', encoding='utf-8') as f:
        # Read the raw text. str(f.readlines()) would feed the repr of a list
        # (brackets, quotes, literal "\n" sequences) to the tokenizer and
        # pollute the displayed context.
        data = f.read()

    data = cyrtranslit.to_latin(data)
    tokens = nltk.word_tokenize(data)
    conc = nltk.Text(tokens)
    conc.concordance(rec)
        
# Show the concordance for the word "partija".
konkordanca('partija')

## Generisanje pojedinačnih .txt fajlova tekstova pesama

### ...za svakog autora

In [None]:
import os
from os import listdir 
from os.path import isfile, join
import html

def replace_char_entities(s):
    """Return *s* with ``&``, ``<``, ``>`` and both quote characters HTML-escaped."""
    escaped = html.escape(s, quote=True)
    return escaped

# Root folder that is scanned for author sub-folders.
myroot = '/Users/ljudmilapetkovic/Documents/LyricsMaster'

# Every entry directly under the root that is not a regular file is treated
# as an author folder.
authors = []
for entry in listdir(myroot):
    if not isfile(join(myroot, entry)):
        authors.append(entry)

def autor_pesme(a, datoteka):
    """Write all lyrics of one author to a text file and echo them to stdout.

    For every album folder of the author, each song file contributes its title
    (the file name without its last four characters) followed by a blank line
    and then its HTML-escaped verse lines, one per line.

    Args:
        a: Name of the author's folder directly under ``myroot``.
        datoteka: Path of the output file. It is created or truncated even
            when no author folder matches.
    """
    with open(datoteka, 'w', encoding='utf8') as x:
        for author in authors:
            # Compare the folder name exactly. A suffix test on the path would
            # also accept any other author whose folder name merely ends with
            # the requested name.
            if author != a:
                continue
            curr_path = '{}/{}'.format(myroot, author)
            albums = [d for d in listdir(curr_path) if not isfile(join(curr_path, d))]
            for album in albums:
                album_path = '{}/{}'.format(curr_path, album)
                songs = [f for f in listdir(album_path) if isfile(join(album_path, f))]
                for song in songs:
                    if song.startswith('.DS_S'):
                        # Skip macOS .DS_Store metadata files.
                        continue
                    song_path = '{}/{}'.format(album_path, song)
                    x.write(song[:-4] + '\n\n')
                    print(song[:-4], '\n\n')
                    # Close the song file deterministically instead of leaking
                    # the handle.
                    with open(song_path, encoding="utf8", errors='ignore') as song_file:
                        stihovi = song_file.read().split('\n')
                    # Drop only the empty piece left by a trailing newline; a
                    # final verse without one is kept.
                    if stihovi[-1] == '':
                        stihovi.pop()
                    for stih in stihovi:
                        # NOTE(review): the verses are HTML-escaped although
                        # the target is a plain-text file — confirm that this
                        # is intended.
                        line = replace_char_entities(stih)
                        print(line)
                        x.write(line + '\n')

# Export every song of the chosen author into "<author>.txt".
a = 'Madame-Piano'
autor_pesme(a, '{}.txt'.format(a))

### ...i za svaki album određenog autora

In [None]:
import os
from os import listdir 
from os.path import isfile, join
import html

def replace_char_entities(s):
    """Return *s* with ``&``, ``<``, ``>`` and both quote characters HTML-escaped."""
    escaped = html.escape(s, quote=True)
    return escaped

# Root folder that is scanned for author sub-folders.
myroot = '/Users/ljudmilapetkovic/Documents/LyricsMaster'

def autor_album(a, al, datoteka):
    """Write the lyrics of one album of one author to a text file and echo them.

    Each song file of the album contributes its title (the file name without
    its last four characters) on one line, followed by its HTML-escaped verse
    lines, one per line.

    Args:
        a: Name of the author's folder directly under ``myroot``.
        al: Name of the album folder inside the author's folder.
        datoteka: Path of the output file. It is created or truncated even
            when nothing matches.
    """
    with open(datoteka, 'w', encoding='utf8') as x:
        authors = [d for d in listdir(myroot) if not isfile(join(myroot, d))]
        for author in authors:
            # Compare folder names exactly. A suffix test on the path would
            # also accept other folders whose name merely ends with the
            # requested one.
            if author != a:
                continue
            curr_path = '{}/{}'.format(myroot, author)
            albums = [d for d in listdir(curr_path) if not isfile(join(curr_path, d))]
            for album in albums:
                if album != al:
                    continue
                album_path = '{}/{}'.format(curr_path, album)
                songs = [f for f in listdir(album_path) if isfile(join(album_path, f))]
                for song in songs:
                    if song.startswith('.DS_S'):
                        # Skip macOS .DS_Store metadata files.
                        continue
                    print(song[:-4], '\n')
                    x.write(song[:-4] + '\n')
                    song_path = '{}/{}'.format(album_path, song)
                    # Close the song file deterministically instead of leaking
                    # the handle.
                    with open(song_path, encoding="utf8", errors='ignore') as song_file:
                        stihovi = song_file.read().split('\n')
                    # Drop only the empty piece left by a trailing newline; a
                    # final verse without one is kept.
                    if stihovi[-1] == '':
                        stihovi.pop()
                    for stih in stihovi:
                        # NOTE(review): the verses are HTML-escaped although
                        # the target is a plain-text file — confirm that this
                        # is intended.
                        line = replace_char_entities(stih)
                        # Terminate every verse; without the newline all
                        # verses of a song ran together on a single line.
                        x.write(line + '\n')
                        print(line)

# Export one album of the chosen author into "<author>_<album>.txt".
a = 'Bajaga-Instruktori'
al = 'Zmaj-Od-Nocaja'
autor_album(a, al, '_'.join((a, al)) + '.txt')