# Metody wstępnego przetwarzania / czyszczenia danych

In [61]:
import pandas as pd
import nltk

In [62]:
df = pd.read_csv('df_eng_songs.csv', header=0, names=['title_pl', 'txt_pl', 'txt_en'], index_col=0)
df.head()

Unnamed: 0,title_pl,txt_pl,txt_en
0,Abba Ojcze,Ty wyzwoliłeś nas Panie z kajdan i samych sie...,"You delivered us from chains and ourselves, an..."
1,Alleluja (Niech zabrzmi Panu),"Alleluja, Alleluja, Alleluja, Alleluja. Nie...","Alleluia, Alleluia, Alleluia, Alleluia. Let th..."
2,"Alleluja, Alleluja, Amen Amen, Alleluja","Alleluja, Alleluja, Amen, Amen, Alleluja. ...","Alleluia, Alleluia, Amen, Amen, Alleluia. Let ..."
3,"Blisko, blisko, blisko jesteś","Blisko, blisko, blisko Jesteś Panie mój Blisk...","Close, close, close You are my Lord, Close to ..."
4,Bo góry mogą ustąpić,Bo góry mogą ustąpić i pagórki się zachwiać. ...,Because the mountains can give way and the hil...


## Tokenizacja

In [63]:
import string
import re

def clean_text3(text):
    text = "".join([word for word in text if word not in string.punctuation]) # interpunkcja
    tokens = nltk.word_tokenize(text) # tokenizacja
    return tokens

In [64]:
for column in df.columns:
    df[column + '_tokenized'] = df[column].apply(lambda x: clean_text3(x))
df.head()

Unnamed: 0,title_pl,txt_pl,txt_en,title_pl_tokenized,txt_pl_tokenized,txt_en_tokenized
0,Abba Ojcze,Ty wyzwoliłeś nas Panie z kajdan i samych sie...,"You delivered us from chains and ourselves, an...","[Abba, Ojcze]","[Ty, wyzwoliłeś, nas, Panie, z, kajdan, i, sam...","[You, delivered, us, from, chains, and, oursel..."
1,Alleluja (Niech zabrzmi Panu),"Alleluja, Alleluja, Alleluja, Alleluja. Nie...","Alleluia, Alleluia, Alleluia, Alleluia. Let th...","[Alleluja, Niech, zabrzmi, Panu]","[Alleluja, Alleluja, Alleluja, Alleluja, Niech...","[Alleluia, Alleluia, Alleluia, Alleluia, Let, ..."
2,"Alleluja, Alleluja, Amen Amen, Alleluja","Alleluja, Alleluja, Amen, Amen, Alleluja. ...","Alleluia, Alleluia, Amen, Amen, Alleluia. Let ...","[Alleluja, Alleluja, Amen, Amen, Alleluja]","[Alleluja, Alleluja, Amen, Amen, Alleluja, Pok...","[Alleluia, Alleluia, Amen, Amen, Alleluia, Let..."
3,"Blisko, blisko, blisko jesteś","Blisko, blisko, blisko Jesteś Panie mój Blisk...","Close, close, close You are my Lord, Close to ...","[Blisko, blisko, blisko, jesteś]","[Blisko, blisko, blisko, Jesteś, Panie, mój, B...","[Close, close, close, You, are, my, Lord, Clos..."
4,Bo góry mogą ustąpić,Bo góry mogą ustąpić i pagórki się zachwiać. ...,Because the mountains can give way and the hil...,"[Bo, góry, mogą, ustąpić]","[Bo, góry, mogą, ustąpić, i, pagórki, się, zac...","[Because, the, mountains, can, give, way, and,..."


## Stopwords

In [65]:
import codecs 

polish_stops = codecs.open("polishStopWords",'r','utf-8')
stopwords_en = nltk.corpus.stopwords.words('english')
stopwords_pl = polish_stops.read().split('\n')
polish_stops.close()

In [66]:
def lower_nostops(text):
    text = [word.lower() for word in text if (word not in stopwords_pl) and (word not in stopwords_en)]
    return text

In [67]:
for column in df.columns[-3:]:
    df[column + '_lower_nostops'] = df[column].apply(lambda x: lower_nostops(x))
df.head()

Unnamed: 0,title_pl,txt_pl,txt_en,title_pl_tokenized,txt_pl_tokenized,txt_en_tokenized,title_pl_tokenized_lower_nostops,txt_pl_tokenized_lower_nostops,txt_en_tokenized_lower_nostops
0,Abba Ojcze,Ty wyzwoliłeś nas Panie z kajdan i samych sie...,"You delivered us from chains and ourselves, an...","[Abba, Ojcze]","[Ty, wyzwoliłeś, nas, Panie, z, kajdan, i, sam...","[You, delivered, us, from, chains, and, oursel...","[abba, ojcze]","[ty, wyzwoliłeś, panie, kajdan, samych, siebie...","[you, delivered, us, chains, christ, becoming,..."
1,Alleluja (Niech zabrzmi Panu),"Alleluja, Alleluja, Alleluja, Alleluja. Nie...","Alleluia, Alleluia, Alleluia, Alleluia. Let th...","[Alleluja, Niech, zabrzmi, Panu]","[Alleluja, Alleluja, Alleluja, Alleluja, Niech...","[Alleluia, Alleluia, Alleluia, Alleluia, Let, ...","[alleluja, niech, zabrzmi, panu]","[alleluja, alleluja, alleluja, alleluja, niech...","[alleluia, alleluia, alleluia, alleluia, let, ..."
2,"Alleluja, Alleluja, Amen Amen, Alleluja","Alleluja, Alleluja, Amen, Amen, Alleluja. ...","Alleluia, Alleluia, Amen, Amen, Alleluia. Let ...","[Alleluja, Alleluja, Amen, Amen, Alleluja]","[Alleluja, Alleluja, Amen, Amen, Alleluja, Pok...","[Alleluia, Alleluia, Amen, Amen, Alleluia, Let...","[alleluja, alleluja, amen, amen, alleluja]","[alleluja, alleluja, amen, amen, alleluja, pok...","[alleluia, alleluia, amen, amen, alleluia, let..."
3,"Blisko, blisko, blisko jesteś","Blisko, blisko, blisko Jesteś Panie mój Blisk...","Close, close, close You are my Lord, Close to ...","[Blisko, blisko, blisko, jesteś]","[Blisko, blisko, blisko, Jesteś, Panie, mój, B...","[Close, close, close, You, are, my, Lord, Clos...","[blisko, blisko, blisko, jesteś]","[blisko, blisko, blisko, jesteś, panie, blisko...","[close, close, close, you, lord, close, i, fee..."
4,Bo góry mogą ustąpić,Bo góry mogą ustąpić i pagórki się zachwiać. ...,Because the mountains can give way and the hil...,"[Bo, góry, mogą, ustąpić]","[Bo, góry, mogą, ustąpić, i, pagórki, się, zac...","[Because, the, mountains, can, give, way, and,...","[bo, góry, ustąpić]","[bo, góry, ustąpić, pagórki, zachwiać, ale, mi...","[because, mountains, give, way, hills, get, sw..."


## Lemmatyzacja / Stemming

### Lemmatyzacja (lemmatisation) 
Algorytm znajdowania lemmy, formy podstawowej wyrazu w obszarze części mowy którą reprezentuje. Inaczej: wyszukiwanie formy kanonicznej leksemu. W komputerowej analizie języka naturalnego lemmatyzacja jest operacją bardziej precyzyjną niż stemmatyzacja. 

In [59]:
wn = nltk.WordNetLemmatizer()
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

### Stemmatyzacja (stemming) 
Sprowadzenie wyrazu reprezentującego dowolną część mowy do stemu, rdzenia postaci źródłowej, najczęściej rzeczownika, bądź niekiedy nawet sekwencji znaków nie mającej samodzielnego znaczenia w języku naturalnym. 

In [58]:
ps = nltk.PorterStemmer()
def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

In [68]:
for column in df.columns[-1:]:
    df[column + '_stemmed'] = df[column].apply(lambda x: stemming(x))
    df[column + '_lemmatized'] = df[column].apply(lambda x: lemmatizing(x))
df.head()

Unnamed: 0,title_pl,txt_pl,txt_en,title_pl_tokenized,txt_pl_tokenized,txt_en_tokenized,title_pl_tokenized_lower_nostops,txt_pl_tokenized_lower_nostops,txt_en_tokenized_lower_nostops,txt_en_tokenized_lower_nostops_stemmed,txt_en_tokenized_lower_nostops_lemmatized
0,Abba Ojcze,Ty wyzwoliłeś nas Panie z kajdan i samych sie...,"You delivered us from chains and ourselves, an...","[Abba, Ojcze]","[Ty, wyzwoliłeś, nas, Panie, z, kajdan, i, sam...","[You, delivered, us, from, chains, and, oursel...","[abba, ojcze]","[ty, wyzwoliłeś, panie, kajdan, samych, siebie...","[you, delivered, us, chains, christ, becoming,...","[you, deliv, us, chain, christ, becom, brother...","[you, delivered, u, chain, christ, becoming, b..."
1,Alleluja (Niech zabrzmi Panu),"Alleluja, Alleluja, Alleluja, Alleluja. Nie...","Alleluia, Alleluia, Alleluia, Alleluia. Let th...","[Alleluja, Niech, zabrzmi, Panu]","[Alleluja, Alleluja, Alleluja, Alleluja, Niech...","[Alleluia, Alleluia, Alleluia, Alleluia, Let, ...","[alleluja, niech, zabrzmi, panu]","[alleluja, alleluja, alleluja, alleluja, niech...","[alleluia, alleluia, alleluia, alleluia, let, ...","[alleluia, alleluia, alleluia, alleluia, let, ...","[alleluia, alleluia, alleluia, alleluia, let, ..."
2,"Alleluja, Alleluja, Amen Amen, Alleluja","Alleluja, Alleluja, Amen, Amen, Alleluja. ...","Alleluia, Alleluia, Amen, Amen, Alleluia. Let ...","[Alleluja, Alleluja, Amen, Amen, Alleluja]","[Alleluja, Alleluja, Amen, Amen, Alleluja, Pok...","[Alleluia, Alleluia, Amen, Amen, Alleluia, Let...","[alleluja, alleluja, amen, amen, alleluja]","[alleluja, alleluja, amen, amen, alleluja, pok...","[alleluia, alleluia, amen, amen, alleluia, let...","[alleluia, alleluia, amen, amen, alleluia, let...","[alleluia, alleluia, amen, amen, alleluia, let..."
3,"Blisko, blisko, blisko jesteś","Blisko, blisko, blisko Jesteś Panie mój Blisk...","Close, close, close You are my Lord, Close to ...","[Blisko, blisko, blisko, jesteś]","[Blisko, blisko, blisko, Jesteś, Panie, mój, B...","[Close, close, close, You, are, my, Lord, Clos...","[blisko, blisko, blisko, jesteś]","[blisko, blisko, blisko, jesteś, panie, blisko...","[close, close, close, you, lord, close, i, fee...","[close, close, close, you, lord, close, i, fee...","[close, close, close, you, lord, close, i, fee..."
4,Bo góry mogą ustąpić,Bo góry mogą ustąpić i pagórki się zachwiać. ...,Because the mountains can give way and the hil...,"[Bo, góry, mogą, ustąpić]","[Bo, góry, mogą, ustąpić, i, pagórki, się, zac...","[Because, the, mountains, can, give, way, and,...","[bo, góry, ustąpić]","[bo, góry, ustąpić, pagórki, zachwiać, ale, mi...","[because, mountains, give, way, hills, get, sw...","[becaus, mountain, give, way, hill, get, sway,...","[because, mountain, give, way, hill, get, sway..."


## Zapis do pliku csv

In [69]:
df.to_csv(path_or_buf='prep_songs.csv', header=df.columns)