In [1]:
import pandas as pd
import unicodedata
import re
import nltk

In [2]:
data_df = pd.read_csv(r"C:\Users\MalindaPieris\Documents\MscResearch\Sinhala-Audio-Classfication-notebooks\notebooks\nlp\dataset-metadata -with-lyrics.csv")
data_df = pd.DataFrame(data_df)

In [3]:
def lineBreakReplace(text):
    return text.replace("\r\n", " ")

In [4]:
data_df['Lyrics'] =data_df.apply(lambda x: lineBreakReplace(x.Lyrics), axis=1)

In [5]:
def removeEscapSequences(text):
    return "".join(ch for ch in text if unicodedata.category(ch)[0]!="C")

In [6]:
data_df['Lyrics'] =data_df.apply(lambda x: removeEscapSequences(x.Lyrics), axis=1)

In [7]:
def remove_digits(text):
    return ''.join([i for i in text if not i.isdigit()])

In [8]:
data_df['Lyrics'] =data_df.apply(lambda x: removeEscapSequences(x.Lyrics), axis=1)

In [9]:
def removeHtmlTags(text):
    p = re.compile(r'<.*?>')
    return p.sub('', text)

In [10]:
data_df['Lyrics'] = data_df.apply(lambda x: removeHtmlTags(x.Lyrics), axis=1)

In [11]:
data_df.head()

Unnamed: 0,song_id,song_name,Artist,Lyrics,Type
0,C 01,Kaulu Piyan Path Wahanna,Kasun Kalhara,කවුළු පියන්පත් වහන්න සඳළුතලාවේ ඔබ දුරයි නෙතට ම...,Calm
1,C 02,Ruk Aththana Mala Mudune\n,Nanda Malini,රුක් අත්තන මල මුදුනේ බඹරු නටන හැන්දෑවේ… සැදෑ ...,Calm
2,C 03,Samanala Mudune,H R Jothipala and Latha Walpola,සමනළ මුදුනේ සිරිපද සිඹ සිඹ උදා ඉරක් පායයි සැන...,Calm
3,C 04,Niwan Dutu Himi,Victor Rathnayaka,නිවන් දුටු හිමි රැවන් පිළිරැව පමණි මට අද ශේෂ ව...,Calm
4,C 05,Suwanda Danee Danee Danenawa,Rookantha Gunathilake,සුවඳ දැනී දැනී දැනෙනවා... උදා හිරු එළියේ... අළ...,Calm


In [12]:
def removeSpecialCharacters(sentence,keep_apostrophes=True):
    sentence = sentence.strip()
    if keep_apostrophes:
        PATTERN = r'[?|$|&|*|%|@|(|)|~]'
        filtered_sentence = re.sub(PATTERN, r'', sentence)
        return filtered_sentence
    return sentence

In [13]:
def replace_full_stops(sentence):
    return sentence.replace("."," ")

In [14]:
data_df['Lyrics'] = data_df.apply(lambda x: removeSpecialCharacters(x.Lyrics), axis=1)
data_df['Lyrics'] = data_df.apply(lambda x: replace_full_stops(x.Lyrics), axis=1)

In [15]:
def remove_english_letters(sentence):
    non_english_sentence = re.sub(r'[a-zA-Z]', '', sentence)
    return non_english_sentence

In [16]:
data_df['Lyrics']  = data_df['Lyrics'] .apply(lambda x: remove_english_letters(x))

In [17]:
lines = list(open('stop-words-sinhala.txt',encoding="utf8"))
sinhala_stop_words = [re.sub("\n","",x) for x in lines]

In [18]:
def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    return tokens

In [19]:
data_df['token_list']  = data_df['Lyrics'] .apply(lambda x: tokenize_text(x))

In [20]:
def remove_sinhala_stop_words(tokens):
    filtered_tokens = [token for token in tokens if token not in sinhala_stop_words]
    return filtered_tokens

In [21]:
data_df['token_list_after_sw'] = data_df.apply(lambda x: remove_sinhala_stop_words(x.token_list), axis=1)

In [22]:
def remove_two_letter_words(tokenized_words):
    # Use list comprehension to filter out words with length 2
    filtered_words = [word for word in tokenized_words if len(word) > 3]
    return filtered_words

In [23]:
data_df['token_list_after_sw'] = data_df.apply(lambda x: remove_two_letter_words(x.token_list_after_sw), axis=1)

In [24]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
data_df['pre_processed_text'] = data_df.apply(lambda x: TreebankWordDetokenizer().detokenize(x.token_list_after_sw), axis=1)

In [25]:
from collections import Counter
import numpy as np
list_of_words = ' '.join(data_df['pre_processed_text']).split()
most_common_words = [word for word, count in Counter(list_of_words).items() if count in np.arange(25,100)]
unique_words = [word for word, count in Counter(list_of_words).items() if count in [1,2,3,4,5,6,7,8,9]]
all_words_to_remove = most_common_words+unique_words;
def remove_most_common_unique_words(tokens):
    filtered_tokens = [token for token in tokens if token not in all_words_to_remove]
    return filtered_tokens

In [26]:
data_df['token_list_after_common_words'] = data_df.apply(lambda x: remove_most_common_unique_words(x.token_list_after_sw), axis=1)

In [27]:
data_df['pre_processed_text'] = data_df.apply(lambda x: TreebankWordDetokenizer().detokenize(x.token_list_after_common_words), axis=1)

In [28]:
list_of_words = ' '.join(data_df['pre_processed_text']).split()
Counter(list_of_words)

Counter({'නෙතට': 10,
         'තලාවේ': 10,
         'මුදුනේ': 10,
         'බඹරු': 10,
         'අඳුර': 11,
         'කවුරුද': 14,
         'එන්නේ…': 10,
         'නුවන්': 23,
         'මුතු': 24,
         'පිණි': 11,
         'හිරු': 14,
         'පුරා': 15,
         'දැහැමි': 10,
         'දැයක්': 10,
         'පිබිදෙයි': 10,
         'නිවන්': 10,
         'දුටු': 15,
         'මැනවී': 17,
         'බුදු': 14,
         'කරැණාවෙන්': 21,
         'අතින්': 12,
         'මලක්': 14,
         'දැනී': 10,
         'එළියේ': 24,
         'හැඟුම්': 18,
         'ලස්සන': 20,
         'ඇවිදින්': 21,
         'දිගේ': 19,
         'ජීවිතේ': 13,
         'සුන්දර': 11,
         'නොවී': 12,
         'කල්පනා': 13,
         'සැලෙන': 15,
         'රහසේ': 13,
         'පහන්': 20,
         'හදින්': 12,
         'ලොවක්': 10,
         'අහසේ': 12,
         'යහනේ': 10,
         'පැන්': 11,
         'පොදක්': 16,
         'ආදරෙයි': 18,
         'සිරි': 17,
         'ඔබගේ': 20,
         'මතකයි': 20,
         'කි

In [29]:
data_df['word_count'] = data_df['pre_processed_text'].str.split().str.len()
data_df.head(5)

Unnamed: 0,song_id,song_name,Artist,Lyrics,Type,token_list,token_list_after_sw,pre_processed_text,token_list_after_common_words,word_count
0,C 01,Kaulu Piyan Path Wahanna,Kasun Kalhara,කවුළු පියන්පත් වහන්න සඳළුතලාවේ ඔබ දුරයි නෙතට ම...,Calm,"[කවුළු, පියන්පත්, වහන්න, සඳළුතලාවේ, ඔබ, දුරයි,...","[කවුළු, පියන්පත්, වහන්න, සඳළුතලාවේ, දුරයි, නෙත...",නෙතට තලාවේ නෙතට තලාවේ නෙතට තලාවේ නෙතට තලාවේ නෙ...,"[නෙතට, තලාවේ, නෙතට, තලාවේ, නෙතට, තලාවේ, නෙතට, ...",10
1,C 02,Ruk Aththana Mala Mudune\n,Nanda Malini,රුක් අත්තන මල මුදුනේ බඹරු නටන හැන්දෑවේ… සැදෑ ...,Calm,"[රුක්, අත්තන, මල, මුදුනේ, බඹරු, නටන, හැන්දෑවේ…...","[රුක්, අත්තන, මුදුනේ, බඹරු, හැන්දෑවේ…, සැදෑ, අ...",මුදුනේ බඹරු අඳුර කවුරුද එන්නේ… නුවන් කවුරුද එන...,"[මුදුනේ, බඹරු, අඳුර, කවුරුද, එන්නේ…, නුවන්, කව...",38
2,C 03,Samanala Mudune,H R Jothipala and Latha Walpola,සමනළ මුදුනේ සිරිපද සිඹ සිඹ උදා ඉරක් පායයි සැනස...,Calm,"[සමනළ, මුදුනේ, සිරිපද, සිඹ, සිඹ, උදා, ඉරක්, පා...","[සමනළ, මුදුනේ, සිරිපද, ඉරක්, පායයි, සැනසිලි, ම...",මුදුනේ දැහැමි දැයක් පිබිදෙයි දැහැමි දැයක් පිබි...,"[මුදුනේ, දැහැමි, දැයක්, පිබිදෙයි, දැහැමි, දැයක...",35
3,C 04,Niwan Dutu Himi,Victor Rathnayaka,නිවන් දුටු හිමි රැවන් පිළිරැව පමණි මට අද ශේෂ ව...,Calm,"[නිවන්, දුටු, හිමි, රැවන්, පිළිරැව, පමණි, මට, ...","[නිවන්, දුටු, හිමි, රැවන්, පිළිරැව, පමණි, වුයේ...",නිවන් දුටු මැනවී බුදු කරැණාවෙන් කරැණාවෙන් කරැණ...,"[නිවන්, දුටු, මැනවී, බුදු, කරැණාවෙන්, කරැණාවෙන...",33
4,C 05,Suwanda Danee Danee Danenawa,Rookantha Gunathilake,සුවඳ දැනී දැනී දැනෙනවා උදා හිරු එළියේ අළ...,Calm,"[සුවඳ, දැනී, දැනී, දැනෙනවා, උදා, හිරු, එළියේ, ...","[සුවඳ, දැනී, දැනී, දැනෙනවා, හිරු, එළියේ, අළුත්...",දැනී දැනී හිරු එළියේ හැඟුම් දැනී දැනී ලස්සන ඇව...,"[දැනී, දැනී, හිරු, එළියේ, හැඟුම්, දැනී, දැනී, ...",23


In [30]:
def total_unique_words(words):
    return nltk.FreqDist(words)

print("Total number of unique words",total_unique_words(list_of_words))

Total number of unique words <FreqDist with 152 samples and 2089 outcomes>


In [31]:
def total_words(words):
    return(len(words))
print("Total number of words",total_words(list_of_words))

Total number of words 2089


In [32]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [33]:
X= data_df['pre_processed_text']
y=label_encoder.fit_transform(data_df['Type'])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

countvectorizer = CountVectorizer(analyzer= 'word')
tfidfvectorizer = TfidfVectorizer(analyzer='word')
count_wm = countvectorizer.fit_transform(X_train)
tfidf_wm = tfidfvectorizer.fit_transform(X_train)
count_tokens = countvectorizer.get_feature_names()
tfidf_tokens = tfidfvectorizer.get_feature_names()

In [36]:
df_countvect = pd.DataFrame(data = count_wm.toarray(),columns = count_tokens)
df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),columns = tfidf_tokens)
print("Count Vectorizer\n")
df_tfidfvect.head()

Count Vectorizer



Unnamed: 0,අත,අතර,අඳ,අප,අභ,අම,අහස,ආදර,ආදරණ,ආදරය,...,වන,වර,වලක,වසන,සන,හඬන,හද,හමය,හස,ළම
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.089896,0.0,0.0,0.0,...,0.062296,0.0,0.0,0.0,0.040715,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:

# tfidfvectorizer_csv = TfidfVectorizer(analyzer='word')
# tfidf_wm_tocsv = tfidfvectorizer_csv.fit_transform(X)
# tfidf_tokens_csv = tfidfvectorizer_csv.get_feature_names()
# df_tfidfvect = pd.DataFrame(data = tfidf_wm_tocsv.toarray(),columns = tfidf_tokens_csv)

# df_tfidfvect.to_csv("tfidf_all.csv")

In [38]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(df_tfidfvect, y_train)
#pred_forest = random_forest.predict(xtest)

RandomForestClassifier()

In [39]:
####testing

In [40]:
df_countvect_test = countvectorizer.transform(X_test)
df_tfidfvect_test = tfidfvectorizer.transform(X_test)

In [41]:
pred_forest = random_forest.predict(df_tfidfvect_test)

In [42]:
pred_forest

array([0, 0, 2, 0, 2, 2, 0, 2, 2, 2, 0, 0, 0, 1, 2, 2, 2, 0, 1, 2, 1, 2,
       2, 1, 0, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 2, 2, 1,
       2, 2, 2, 2, 1, 2, 1, 2, 2, 0])

In [43]:
y_test

array([2, 1, 2, 1, 1, 0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 2, 1, 1, 0, 1, 0,
       0, 0, 0, 2, 1, 1, 2, 1, 2, 1, 1, 2, 0, 2, 0, 0, 1, 1, 0, 2, 1, 1,
       2, 0, 1, 0, 2, 1, 2, 0, 1, 1])

In [44]:
from sklearn import metrics

In [45]:
print("Accuracy:",metrics.accuracy_score(y_test, pred_forest)*100)
#print("Precision:",metrics.precision_score(y_test, pred_forest))
#print("Recall:",metrics.recall_score(y_test, pred_forest))

Accuracy: 27.77777777777778


In [46]:
##decsion tree

In [50]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [51]:
classifier = MultinomialNB()
classifier.fit(df_tfidfvect, y_train)

MultinomialNB()

In [52]:
y_pred = classifier.predict(df_tfidfvect_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

Accuracy: 31.48148148148148


In [53]:
##################EndofScript###################