# Data Cleaning

## Cleaning The Data

In [None]:
import pandas as pd

# Read the raw lyrics table. The first CSV column becomes the row index,
# which later cells use for label lookups such as 'David_Bowie'.
data_df = pd.read_csv(
    "lyrics.csv",
    index_col=0,
)
data_df

In [None]:
import nltk

# Fetch every NLTK resource the notebook needs *before* the first cell that
# uses it, so that "Restart Kernel -> Run All" works on a fresh environment:
#   - stopwords                      -> stopwords.words('english')
#   - punkt / punkt_tab              -> word_tokenize (newer NLTK releases look
#                                       up the *_tab name)
#   - wordnet / omw-1.4              -> WordNetLemmatizer
#   - averaged_perceptron_tagger(_eng) -> pos_tag (newer NLTK releases look up
#                                       the *_eng name)
# nltk.download() reports a problem and returns False instead of raising, so
# a resource name unknown to the installed version does not stop the loop.
for resource in (
    'stopwords',
    'punkt',
    'punkt_tab',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def _contraction_pattern(stop_words):
    """Compile a regex matching the stop words that contain an apostrophe (e.g. "don't").

    Returns None when the stop list has no such entries.
    """
    # Longest first so that the alternation prefers the longest contraction.
    contractions = sorted((w for w in stop_words if "'" in w), key=len, reverse=True)
    if not contractions:
        return None
    # Accept both the ASCII and the typographic apostrophe inside a contraction.
    alternatives = "|".join(
        "['’]".join(re.escape(part) for part in w.split("'"))
        for w in contractions
    )
    # The look-arounds keep the match from starting or ending inside a longer token.
    return re.compile(r"(?<![\w'’])(?:%s)(?![\w'’])" % alternatives)


def clean_text_1(text, stop_words=None):
    """Normalize one lyrics string for text analysis.

    Steps, in order: lowercase; drop bracketed annotations such as
    ``[chorus]``; drop stop-word contractions; drop punctuation; drop tokens
    containing digits; drop typographic quotes and ellipses; replace newlines
    with spaces; tokenize and drop stop words and one-character tokens.

    Parameters
    ----------
    text : str
        Raw lyrics.
    stop_words : iterable of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The kept tokens joined by single spaces (empty string if none remain).
        The result has no leading space.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests in the filter below.
    stop_words = {w.lower() for w in stop_words}

    # Lowercase
    text = text.lower()
    # Remove special text in brackets ([chorus],[guitar],etc)
    text = re.sub(r'\[.*?\]', '', text)
    # Remove stop-word contractions while their apostrophe is still present:
    # once punctuation is stripped "don't" becomes "dont", which is no longer
    # found in the stop list and cannot be told apart from a real word.
    contraction_re = _contraction_pattern(stop_words)
    if contraction_re is not None:
        text = contraction_re.sub(' ', text)
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # Remove quotes
    text = re.sub('[‘’“”…]', '', text)
    # Remove new line \n
    text = re.sub('\n', ' ', text)
    # Remove stop words and one-character tokens
    kept = [w for w in word_tokenize(text) if w not in stop_words and len(w) > 1]
    return " ".join(kept)
    

In [None]:
# Apply the first cleaning pass to every lyrics entry and keep the result
# as a one-column DataFrame (column name 'lyrics', same index as data_df).
data_clean = data_df['lyrics'].apply(clean_text_1).to_frame()

In [None]:
# First 500 characters of the raw lyrics for one artist
data_df.loc['David_Bowie', 'lyrics'][:500]

In [None]:
# Same artist after cleaning, for comparison with the raw text
data_clean.loc['David_Bowie', 'lyrics'][:500]

## Stemming / Lemmatization

In [None]:
import nltk

from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [None]:
# Sample words, grouped by part of speech, used to compare the stemmers
# and the lemmatizer in the cells below.
verb_list = [
    "was", "were", "am",
    "general", "generalize", "generalizing",
    "insurance", "insured",
]
noun_list = [
    "dogs", "feet", "insurance", "knowledge",
]
adjec_list = [
    "harder", "better", "faster", "stronger",
]

In [None]:
porter = PorterStemmer()
lancaster = LancasterStemmer()

# Print a three-column table: the word, its Porter stem and its Lancaster stem.
# The three word lists are walked in the same order as before
# (verbs, then nouns, then adjectives).
stem_row = "%-20s %-20s %-20s"
print(stem_row % ("Word", "Porter Stemmer", "lancaster Stemmer"))
for word in verb_list + noun_list + adjec_list:
    print(stem_row % (word, porter.stem(word), lancaster.stem(word)))

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer 

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Called without a part-of-speech hint, so the lemmatizer's default is used.
print(lemmatizer.lemmatize("generalized")) # try pos='v':
#POS: part of speech ADJ=a, ADJ_SAT=s, ADV=r, NOUN=n, VERB=v

# One format for the header and the rows so the two columns line up.
lemma_row = "%-20s %-20s"
print(lemma_row % ("Word", "WordNet Lemmatizer"))
# Each list is lemmatized with the part-of-speech code that matches its words.
for words, pos in ((verb_list, 'v'), (noun_list, 'n'), (adjec_list, 'a')):
    for word in words:
        print(lemma_row % (word, lemmatizer.lemmatize(word, pos=pos)))

In [None]:
import nltk

# Resources for word_tokenize and pos_tag. Newer NLTK releases look these up
# under the 'punkt_tab' and 'averaged_perceptron_tagger_eng' names, older ones
# under 'punkt' and 'averaged_perceptron_tagger', so all four are requested.
# nltk.download() reports a problem and returns False instead of raising.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

In [None]:
# POS tagging

from nltk import word_tokenize, pos_tag

# Tokenize a sample lyric, then tag each token with its part of speech.
txt = "Remember when you were young, you shone like the sun Shine on you crazy diamond"
tokens = word_tokenize(txt)
pos_tag(tokens)

POS tag list:

- CC	coordinating conjunction
- CD	cardinal number
- DT	determiner
- EX	existential there (like: "there is" ... think of it like "there exists")
- FW	foreign word
- IN	preposition/subordinating conjunction
- JJ	adjective	'big'
- JJR	adjective, comparative	'bigger'
- JJS	adjective, superlative	'biggest'
- LS	list marker	1)
- MD	modal	could, will
- NN	noun, singular 'desk'
- NNS	noun plural	'desks'
- NNP	proper noun, singular	'Harrison'
- NNPS	proper noun, plural	'Americans'
- PDT	predeterminer	'all the kids'
- POS	possessive ending	parent\'s
- PRP	personal pronoun	I, he, she
- PRP\$ 	possessive pronoun	my, his, hers
- RB	adverb	very, silently,
- RBR	adverb, comparative	better
- RBS	adverb, superlative	best
- RP	particle	give up
- TO	to	go 'to' the store.
- UH	interjection	errrrrrrrm
- VB	verb, base form	take
- VBD	verb, past tense	took
- VBG	verb, gerund/present participle	taking
- VBN	verb, past participle	taken
- VBP	verb, sing. present, non-3rd	take
- VBZ	verb, 3rd person sing. present	takes
- WDT	wh-determiner	which
- WP	wh-pronoun	who, what
- WP\$	possessive wh-pronoun	whose
- WRB	wh-adverb	where, when

In [None]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()


def _wordnet_pos(tag):
    """Map a POS tag from pos_tag to a WordNet part-of-speech code."""
    # tag[:1] rather than tag[0] so an empty tag does not raise.
    first = tag[:1].lower()
    if first == 'j':
        return 'a'  # adjective tags (JJ, JJR, JJS)
    if first in ('n', 'v'):
        return first  # noun and verb tags use the same letter in WordNet
    if tag.upper().startswith('RB'):
        return 'r'  # adverb tags (RB, RBR, RBS)
    return 'n'  # everything else: the lemmatizer's default


def lemmatize_tag(text):
    """Lemmatize every token of ``text`` using its part-of-speech tag.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    each token is then lemmatized as an adjective, noun, verb or adverb
    according to its tag, and as a noun for any other tag.

    Parameters
    ----------
    text : str
        Text to lemmatize.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    lemmas = [
        wnl.lemmatize(word, _wordnet_pos(tag))
        for word, tag in pos_tag(word_tokenize(text))
    ]
    return ' '.join(lemmas)



In [None]:
# Raw lyrics again, as a reference point before lemmatization
data_df.loc['David_Bowie', 'lyrics'][:500]

In [None]:
# Current contents of data_clean for the same artist
data_clean.loc['David_Bowie', 'lyrics'][:500]

In [None]:
# Derive the lemmatized frame from the raw lyrics (clean, then lemmatize)
# instead of from data_clean itself, so re-running this cell always yields
# the same result rather than lemmatizing already-lemmatized text again.
data_clean = pd.DataFrame(
    data_df.lyrics.apply(clean_text_1).apply(lemmatize_tag)
)

In [None]:
# Lemmatized lyrics for the same artist
data_clean.loc['David_Bowie', 'lyrics'][:500]

### Save clean data

In [None]:
# Write the cleaned lyrics to disk; the row index is written as the first
# column (index=True is the to_csv default, spelled out here).
data_clean.to_csv('lyrics_clean.csv', index=True)

## Question
1. What further cleaning can be applied to the text?