# Calculating word frequencies inside a song text

In [91]:
# -*- coding: utf-8 -*-
import pandas as pd
from pprint import pprint
import codecs
import re
from itertools import islice

In [92]:
songtext_file = 'Adentro.txt'
song = ''

with codecs.open(songtext_file, 'r', encoding='utf-8') as f:
    # importing file content to a string and
    # transforming all text to lowercase, because we're interested in unique words
    song = f.read().lower()
    # removing U+2028 (unicode line separator) to avoid later matching errors
    # https://stackoverflow.com/questions/2201633/replace-newlines-in-a-unicode-string
    song = ''.join(song.splitlines())

# looking at a section of the resulting song string
print song[0:1000]

se que mis rimas aveces causan disgustos, cuando mis neuronas corren hasta yo mismo me asusto mis respuestas pueden ser tan agresivas, que hasta las letras me huyen porque tienen miedo de que las escriba no tengo rifles pa matarte solo basta con la pista convierto letras en ideas como un ilusionista en una linea te mato te fracturo te lesiono y en en la siguiente te resucito cuando te menciono eso es parte de mi arte que todo el mundo sepa, que estas rimas son pa ti sin tener que mencionarte y no lo hago para evitar el roce lo hago para no hacerte famoso en los países donde nadie te conoce después de ver como se mueven las guerras y las guerrillas tu crees que le voy a tener miedo a tu pandilla? dispara cuando quieras raperito maleantoso aquí no gana el mas maleante gana el mas ingenioso en tu cabeza tu eres un narco buscado por la policía y tus pistolas son como los unicornios de fantasía no hay problema en que tengas enemigos imaginarios pero si en que los chamaquitos crean que eres 

In [93]:
# removing special characters that are not part of words, such as ( and ) in "(coro)"
trytext = u"lalála,!? 3345 (coro), moró - lala."

print trytext
# removing some common special characters, punctuation, and numbers,
# while keeping what's necessary for representing unicode characters!

print re.sub(r'[\d(),.!?/-]', '', trytext)

lalála,!? 3345 (coro), moró - lala.
lalála  coro moró  lala


In [94]:
song = re.sub(r'[\d(),.!?/-]', '', song)
print song[0:100]

se que mis rimas aveces causan disgustos cuando mis neuronas corren hasta yo mismo me asusto mis res


In [95]:
# getting a list containing each word as an entry (splitting the string at ' ')
all_words_raw = song.split(' ')
# how many are there?
len(all_words_raw)

638

In [96]:
# remove empty string entries (only whitespaces)
# that come to be from removed special characters, such as " - " -> " "
# removes also "0" - but numbers are anyways removed and not of interest for vocabulary
# https://stackoverflow.com/questions/3845423/remove-empty-strings-from-a-list-of-strings
all_words = filter(None, all_words_raw)
# how many are there now? (2 removed)
len(all_words)

636

In [97]:
# checking whether we did the right thing here
for word in all_words_raw:
    if word not in all_words:
        print "removed: ", word, " <-"

removed:    <-
removed:    <-


In [98]:
# show a few - see the internal representation of unicode characters

all_words[0:5]

[u'se', u'que', u'mis', u'rimas', u'aveces']

In [99]:
spanish_unicode_characters = [u'í', u'á', u'é', u'ó', u'ú', u'ñ', u'ü']
# according to: http://character-code.com/spanish-html-codes.php

# get the index of one word containing a relevant unicode character
# https://stackoverflow.com/questions/6531482/how-to-check-if-a-string-contains-an-element-from-a-list-in-python
index = 0
for word in all_words:
    index += 1
    if any(u_char in spanish_unicode_characters for u_char in word):
        example_unicode_word = word
        unicode_index = index - 1
        break

# https://stackoverflow.com/questions/3235386/python-using-format-on-a-unicode-escaped-string
print u"The first word containing a Spanish unicode character, {}, \
is found at index {}".format(example_unicode_word, unicode_index)

The first word containing a Spanish unicode character, países, is found at index 108


In [100]:
all_words[unicode_index]

u'pa\xedses'

In [101]:
# another example that life is good!
# when using print, the internal unicode representation gets outputted correctly formatted
print all_words[unicode_index]

países


In [102]:
# now I need to count how often do words appear in the song text
word_frequencies_in_text = {}

for word in all_words:
    # avoiding that sentence beginner words are counted extra
    word = word.lower()
    # avoiding the empty thingies that come from removing some special chars and nums
    if word != '':
        # this counts them all
        if word not in word_frequencies_in_text:
            word_frequencies_in_text[word] = 0
        word_frequencies_in_text[word] += 1

# looking at a few of the frequency dict entries
# https://stackoverflow.com/questions/7971618/python-return-first-n-keyvalue-pairs-from-dict
n = 5
pprint(list(islice(word_frequencies_in_text.iteritems(), n)))

[(u'tratar', 1),
 (u'estudiante', 1),
 (u'rifles', 1),
 (u'duelen', 1),
 (u'bien', 1)]


In [103]:
# getting a list of the unique words in the songtext
unique_words = set(all_words)
print "There are {} unique words in the song".format(len(unique_words))

There are 331 unique words in the song


Okay, so this gives us a frequency calculation for the words in one specific song text, and a short overview of the song's content. Next step is to see how they relate to the most frequent words in Spanish.

---

# Which of the 5000 most common words are served by these lyrics?

In [104]:
# importing the frequency list as a pandas dataframe
es_word_frequencies = pd.read_csv('es.csv')

es_word_frequencies.head()

Unnamed: 0,word,frequency
0,de,3405234
1,que,3349162
2,no,3166057
3,a,2368719
4,la,2288023


In [105]:
# checking whether we're dealing with unicode strings
es_word_frequencies.iloc[36]
# yep we are.

word            más
frequency    377638
Name: 36, dtype: object

In [108]:
# this list is very long
print len(es_word_frequencies)

535232


In [109]:
top_5000 = es_word_frequencies[0:5000]
print "A new df with only the top {} most frequent Spanish words.".format(len(top_5000))
top_5000.head()

A new df with only the top 5000 most frequent Spanish words.


Unnamed: 0,word,frequency
0,de,3405234
1,que,3349162
2,no,3166057
3,a,2368719
4,la,2288023


In [110]:
# adding a third column that will serve as a checkpoint whether the word is
# already served by a song lyric
# initially this column will be set to "False" for each word
# https://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas
top_5000 = top_5000.assign(in_lyrics = pd.Series(False, index=top_5000.index))
top_5000.head()

Unnamed: 0,word,frequency,in_lyrics
0,de,3405234,False
1,que,3349162,False
2,no,3166057,False
3,a,2368719,False
4,la,2288023,False


In [111]:
# now for the nitty-gritty.
# checking through each word of the lyrics
for song_word in word_frequencies_in_text.keys():
    # if it has a corresponding entry in the frequency df
    # (if the row of the df
    #     where the field entry in the "word" column is the same as the songtext word
    # is NOT empty)
    if not top_5000[top_5000.word == song_word].empty:
        # then set the corresponding field in the "in_lyrics" column to "True"
        top_5000.in_lyrics[top_5000.word == song_word] = True
    
    ##### ATTENTION ####
    # if the df is empty then this means the word in the lyrics does NOT exist in the frequency df
    # this is probably due to flexion of a verb, or plural version of a noun
    # TODO: how to properly deal with this (probably through lemmas)
    # for now, just print a warning and the word for closer inspection
    else:
        print song_word, "<---- is not in frequency list"

  result = lib.scalar_compare(x, y, op)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


rifles <---- is not in frequency list
duelen <---- is not in frequency list
ingenioso <---- is not in frequency list
corren <---- is not in frequency list
aveces <---- is not in frequency list
escriba <---- is not in frequency list
arrepentí <---- is not in frequency list
policía <---- is not in frequency list
también <---- is not in frequency list
sicario <---- is not in frequency list
excursión <---- is not in frequency list
venden <---- is not in frequency list
ilusionista <---- is not in frequency list
suenan <---- is not in frequency list
robarte <---- is not in frequency list
algún <---- is not in frequency list
fobia <---- is not in frequency list
exploten <---- is not in frequency list
pistolas <---- is not in frequency list
sesos <---- is not in frequency list
desigualdades <---- is not in frequency list
cabrón <---- is not in frequency list
boxeadores <---- is not in frequency list
aquí <---- is not in frequency list
acá <---- is not in frequency list
tirarme <---- is not in 

In [112]:
# checking the edited df
top_5000.head()

Unnamed: 0,word,frequency,in_lyrics
0,de,3405234,True
1,que,3349162,True
2,no,3166057,True
3,a,2368719,True
4,la,2288023,True


In [113]:
# while the most frequent words seem to be included, the least frequent are not
top_5000.tail()

Unnamed: 0,word,frequency,in_lyrics
4995,regresará,1421,False
4996,cayendo,1421,False
4997,positivo,1420,False
4998,entro,1420,False
4999,chofer,1420,False


In [114]:
# and here are the results for this songtext:
words_served_by_song = top_5000[top_5000.in_lyrics == True]
print "The lyrics of {} consists of {} unique words that include {} of \
the 5000 most common Spanish words.".format(songtext_file, len(unique_words), len(words_served_by_song))

The lyrics of Adentro.txt consists of 331 unique words that include 208 of the 5000 most common Spanish words.


# Attempting to resolve the issue with día et. al.

In [120]:
# trying to put also the dict inot a df, maybe they have the same representation then
#words = pd.Series(word_frequencies_in_text.keys())

df = pd.DataFrame.from_items(key, value in word_frequencies_in_text.iteritems())
df

NameError: name 'key' is not defined