# this notebook contains:
* loaded corpora from json files
* all text analysis of lyrics, under corresponding headings

In [1]:
import json

In [2]:
winners_chart = json.load(open('data/winning_song_corpus.json'))
noms = json.load(open('data/nominees_corpus.json'))

In [3]:
%run "functions.ipynb"

In [4]:
%run "keyness_function-Copy1.ipynb"

In [5]:
iris = next((song for song in noms if song['song_title'] == 'Iris'), None)
iris['lyrics'] = open('data/iris_lyrics.txt').read()

## modules

In [6]:
import os
import random
import re
import string
from collections import Counter

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

%matplotlib inline

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# pos tagging
from nltk import pos_tag, pos_tag_sents, FreqDist, ConditionalFreqDist



[nltk_data] Downloading package stopwords to /Commjhub/jupyterhub/comm
[nltk_data]     318_fall2019/nelkassabany/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
to_strip = ',.\xa0:-()\';$"/?][!`Ą@Ś§¨’–“”…ï‘>&\\%˝˘*'

## frequency lists

In [8]:
win_word_freq = Counter()
win_bigram_freq = Counter()

remove = '()[]:;.-!?,'
win_ttr = []

for song in range(len(winners_chart)):
    #isolate lyrics
    #tokenize/clean
    #run thru counter
    lyrics_raw = winners_chart[song]['lyrics']
    
    song_toks = tokenize(lyrics_raw, lowercase = True, strip_chars = remove)
    
    win_word_freq.update(song_toks)
    
    #repeat for bigrams
    bigrams = get_ngram_tokens(song_toks, 2)
    win_bigram_freq.update(bigrams)
    
    #calculate TTR, add to list
    ttr = (len(set(song_toks)) / len(song_toks)) * 100
    win_ttr.append(ttr)
    
    
win_word_freq.most_common(50), win_bigram_freq.most_common(50)


([('you', 631),
  ('the', 580),
  ('i', 543),
  ('and', 406),
  ('a', 349),
  ('to', 325),
  ('me', 268),
  ('it', 246),
  ('my', 236),
  ('in', 189),
  ('your', 164),
  ('that', 162),
  ('oh', 161),
  ('we', 161),
  ('on', 161),
  ('be', 153),
  ('of', 148),
  ("don't", 125),
  ('verse', 121),
  ('chorus', 120),
  ("i'm", 117),
  ('love', 111),
  ('with', 109),
  ('are', 109),
  ('for', 108),
  ('so', 106),
  ('just', 100),
  ('like', 97),
  ('all', 96),
  ('know', 90),
  ("it's", 90),
  ('make', 88),
  ('what', 87),
  ('yeah', 85),
  ('can', 85),
  ('if', 83),
  ('but', 83),
  ('now', 80),
  ('every', 78),
  ('when', 76),
  ('is', 76),
  ('got', 76),
  ('go', 70),
  ('never', 66),
  ('world', 63),
  ("i'll", 63),
  ('no', 61),
  ('heart', 59),
  ("you're", 59),
  ('do', 55)],
 [('oh oh', 81),
  ('in the', 49),
  ('we are', 49),
  ('verse 2', 46),
  ('verse 1', 45),
  ('and i', 45),
  ('are the', 44),
  ('the world', 40),
  ('yeah yeah', 40),
  ("that's what", 35),
  ('if you', 34),
 

In [9]:
nom_word_freq = Counter()
nom_bigram_freq = Counter()
nom_ttr = []

remove = '()[]:;.-!?,'

for song in range(len(noms)):
    #isolate lyrics
    #tokenize/clean
    #run thru counter
    lyrics_raw = noms[song]['lyrics']
    
    song_toks = tokenize(lyrics_raw, lowercase = True, strip_chars = remove)
    
    nom_word_freq.update(song_toks)
    
    # repeat for bigrams    
    bigrams = get_ngram_tokens(song_toks, 2)
    nom_bigram_freq.update(bigrams)
    
    #calculate and store TTR
    ttr = (len(set(song_toks)) / len(song_toks)) * 100
    nom_ttr.append(ttr)
    
    
nom_word_freq.most_common(50), nom_bigram_freq.most_common(50)

([('i', 3121),
  ('you', 3043),
  ('the', 2676),
  ('and', 1925),
  ('to', 1645),
  ('a', 1310),
  ('me', 1309),
  ('my', 1235),
  ('it', 1035),
  ('in', 966),
  ('that', 832),
  ('of', 718),
  ('be', 697),
  ('your', 691),
  ('love', 657),
  ('all', 633),
  ('for', 619),
  ("i'm", 610),
  ('on', 599),
  ('oh', 595),
  ('just', 570),
  ('is', 556),
  ('know', 520),
  ('yeah', 514),
  ('chorus', 507),
  ('but', 496),
  ('verse', 488),
  ('we', 480),
  ("don't", 477),
  ('when', 423),
  ('can', 416),
  ('like', 416),
  ('no', 413),
  ('with', 410),
  ("it's", 402),
  ('so', 374),
  ('do', 374),
  ('if', 356),
  ('this', 342),
  ('what', 324),
  ('got', 302),
  ('up', 294),
  ('way', 293),
  ("you're", 292),
  ('was', 277),
  ('one', 277),
  ('see', 263),
  ('say', 262),
  ('baby', 255),
  ('go', 255)],
 [('in the', 284),
  ('and i', 227),
  ('yeah yeah', 227),
  ('na na', 225),
  ('oh oh', 221),
  ('verse 1', 189),
  ('verse 2', 185),
  ('i know', 176),
  ("i don't", 165),
  ('i can', 14

## keyness analysis

In [10]:
## key words in the WINNING batch, how they compare with noms, keyness
calculate_keyness(win_word_freq, nom_word_freq, top = 20) 

WORD                     Corpus Freq.RC Freq.  Keyness
worry                    33        10        72.914
black                    33        16        59.931
every                    78        112       58.899
watching                 23        9         46.074
happy                    31        23        43.812
are                      109       235       42.034
world                    63        107       37.511
hmm                      17        6         35.509
rolling                  17        7         33.317
songs                    27        24        33.307
whole                    23        19        30.057
carry                    17        9         29.511
single                   20        14        29.407
let's                    26        26        28.997
make                     88        208       27.485
making                   13        5         26.231
write                    23        23        25.651
man                      48        88        25.228
rose     

In [11]:
## same frequency sets from above, just reversed. these terms are key in noms
calculate_keyness(nom_word_freq, win_word_freq, top = 20) 

WORD                     Corpus Freq.RC Freq.  Keyness
wanna                    142       7         22.058
long                     114       5         19.393
i                        3121      543       16.532
'cause                   220       21        14.690
la                       135       10        13.621
is                       556       76        13.523
off                      102       6         13.478
hey                      136       11        12.095
was                      277       32        11.858
not                      245       29        9.788
all                      633       96        9.369
live                     77        5         9.149
around                   106       9         8.731
feel                     178       20        8.213
been                     195       23        7.871
this                     342       48        7.435
way                      293       40        7.163
no                       413       61        7.045
some              

i look at the keyness tables and it looks like for the winners, the key terms in the corpus aren't distributed very widely over all the songs. take `worry` for example, the 1989 winner was "Don't WORRY Be Happy." similarly, `rolling` and `single` are among the top 20. The songs "ROLLING in the deep" and "SINGLE ladies" won. So I think these words appear as key because they're in the title, meaning they're repeated in the chorus


### type token ratios
potential measure of wordiness across corpora

In [12]:
# song at index 2 is an instrumental track, lyrics listed as 'none' so TTR is 100
# it would actually be undefined bc of dividing by zero issue, so removing from list 
win_ttr.remove(100.0)

In [13]:
# browsing thru the list, i see some very 'clean' values like 25.0, 50.0, 100.0
# i want to check the list to make sure these are not errors

for idx,song_val in enumerate(nom_ttr):
    if song_val in [25.0, 50.0, 100.0]:
        print('{}, {}\n'.format(noms[idx],idx))

# found another error in lyrics where iris by the goo goo dolls is listed as an instrumental track
# fixing this at the top of the notebook and re-running
## **********don't forget to update this in building/cleaning corpus notebook!!!!!********

## after iris lyrics were fixed, 2 instrumental songs remain. deleting before avg TTR calculation
nom_ttr.remove(nom_ttr[6])
nom_ttr.remove(nom_ttr[59])

{'year': '1960', 'nominees': 'Paul Francis Webster & André Previn', 'song_title': 'Like Young', 'performing_artist': 'André Previn', 'lyrics': 'none'}, 6

{'year': '1965', 'nominees': 'Henry Mancini, Ray Evans & Jay Livingston', 'song_title': 'Dear Heart', 'performing_artist': 'Andy Williams', 'lyrics': "Dear heart wish you were here to warm this night\nMy dear heart, seems like a year since you've been out of my sight\nA single room, a table for one\nIt's a lonesome town all right\nBut soon I'll kiss you hello at our front door\nAnd dear heart I want you to know\nI'll leave your arms never more\n\n(A single room, a table for one)\nIt's a lonesome town all right\nBut soon I'll kiss you hello at our front door\nAnd dear heart I want you to know\nI'll leave your arms never more"}, 25

{'year': '1973', 'nominees': 'Alan and Marilyn Bergman & Michel Legrand', 'song_title': 'The Summer Knows', 'performing_artist': 'Michel Legrand', 'lyrics': 'Instrumental'}, 59

{'year': '1999', 'nominees':

In [14]:
print('the average type-token ratio among winning songs is {}'.format(sum(win_ttr) / len(win_ttr)))
print('the average type-token ratio among nominated songs is {}'.format(sum(nom_ttr) / len(nom_ttr)))

the average type-token ratio among winning songs is 40.784626607625476
the average type-token ratio among nominated songs is 39.211339585603504


### pos tagging

In [15]:
winners_tagged = []

for song in range(len(winners_chart)):
    #isolate string of lyrics
    text = tokenize(winners_chart[song]['lyrics'], lowercase= True, strip_chars = to_strip)
    winners_tagged.append(nltk.pos_tag(text))

In [16]:
## picking out verbs across winning songs
winners_verbs = []

for song in range(len(winners_tagged)): #each iteration is now working on a list of lists
    for tags in range(len(winners_tagged[song])): #now each iteration is working on the 2-item list of word,tag
        
        if winners_tagged[song][tags][1].startswith('V'):
            winners_verbs.append(winners_tagged[song][tags][0])


In [17]:
## picking out adjectives across winning songs
winners_adjs = []

for song in range(len(winners_tagged)): #each iteration is now working on a list of lists
    for tags in range(len(winners_tagged[song])): #now each iteration is working on the 2-item list of word,tag
        
        if winners_tagged[song][tags][1].startswith('J'):
            winners_adjs.append(winners_tagged[song][tags][0])


In [18]:
## using isolated verbs to see which ones occur most often
win_verb_freq = Counter(winners_verbs)
win_verb_freq.most_common(20)

[('be', 153),
 ('i', 136),
 ('are', 109),
 ('know', 90),
 ('make', 88),
 ('is', 76),
 ('go', 67),
 ('got', 65),
 ('dont', 64),
 ('do', 55),
 ('were', 51),
 ('have', 49),
 ('see', 48),
 ('love', 47),
 ('get', 44),
 ('say', 38),
 ('im', 38),
 ('want', 36),
 ('take', 33),
 ('tell', 33)]

In [19]:
# most used adjectives for winning songs
win_adj_freq = Counter(winners_adjs)
win_adj_freq.most_common(20)
## i suspect that ill here is actually "i'll" with the apostrophe removed during tokenization

[('i', 142),
 ('oh', 65),
 ('good', 37),
 ('ive', 34),
 ('black', 33),
 ('happy', 31),
 ('im', 28),
 ('ill', 24),
 ('whole', 23),
 ('beautiful', 22),
 ('own', 21),
 ('new', 20),
 ('better', 20),
 ('single', 20),
 ('young', 18),
 ('bad', 18),
 ('little', 17),
 ('verse', 17),
 ('same', 16),
 ('true', 16)]

In [20]:
## what percentage of all tokens in winning corpus is verbs/adjectives
## divide sum of verb counter by sum of total tokens counter

win_verb_perc = (sum(win_verb_freq.values()) / sum(win_word_freq.values()))* 100
win_adj_perc = (sum(win_adj_freq.values()) / sum(win_word_freq.values()))* 100

print('{} percent of the tokens in all winning songs were verbs'.format(win_verb_perc))
print('{} percent of the tokens in all winning songs were adjectives'.format(win_adj_perc))

20.969660916121356 percent of the tokens in all winning songs were verbs
8.869720404521118 percent of the tokens in all winning songs were adjectives


In [21]:
top_win_verb = win_verb_freq.most_common(30)
top_win_adj = win_adj_freq.most_common(30)

#### repeating process for nominees

In [22]:
nom_tagged = []

for song in range(len(noms)):
    #isolate string of lyrics
    text = tokenize(noms[song]['lyrics'], lowercase= True, strip_chars = to_strip)
    nom_tagged.append(nltk.pos_tag(text))

In [23]:
## picking out verbs across nominees
nom_verbs = []

for song in range(len(nom_tagged)): #each iteration is now working on a list of lists
    for tags in range(len(nom_tagged[song])): #now each iteration is working on the 2-item list of word,tag
       
        if nom_tagged[song][tags][1].startswith('V'):
            nom_verbs.append(nom_tagged[song][tags][0])
 

In [24]:
## picking out adjectives across nominees
nom_adjs = []

for song in range(len(nom_tagged)): #each iteration is now working on a list of lists
    for tags in range(len(nom_tagged[song])): #now each iteration is working on the 2-item list of word,tag
       
        if nom_tagged[song][tags][1].startswith('J'):
            nom_adjs.append(nom_tagged[song][tags][0])
 

In [25]:
nom_verb_freq = Counter(nom_verbs)
nom_verb_freq.most_common(30)

[('be', 703),
 ('i', 591),
 ('is', 557),
 ('know', 516),
 ('do', 372),
 ('got', 302),
 ('was', 277),
 ('say', 259),
 ('go', 258),
 ('see', 258),
 ('love', 242),
 ('are', 235),
 ('dont', 227),
 ('want', 218),
 ('let', 201),
 ('have', 200),
 ('make', 200),
 ('get', 199),
 ('tell', 199),
 ('were', 195),
 ('been', 195),
 ('take', 195),
 ('feel', 168),
 ('come', 150),
 ('im', 139),
 ('think', 133),
 ('need', 121),
 ('cant', 115),
 ('had', 106),
 ('give', 104)]

In [26]:
nom_adj_freq = Counter(nom_adjs)
nom_adj_freq.most_common(30)

[('i', 819),
 ('oh', 201),
 ('im', 147),
 ('ive', 130),
 ('little', 104),
 ('good', 99),
 ('verse', 92),
 ('new', 87),
 ('bad', 71),
 ('more', 70),
 ('old', 69),
 ('better', 68),
 ('ill', 66),
 ('dont', 63),
 ('hard', 63),
 ('free', 60),
 ('best', 60),
 ('true', 59),
 ('long', 58),
 ('big', 57),
 ('cant', 53),
 ('alive', 52),
 ('same', 51),
 ('wrong', 45),
 ('high', 44),
 ('mum', 44),
 ('own', 43),
 ('much', 42),
 ('yeah', 42),
 ('gonna', 41)]

In [27]:
## what percentage of all tokens in nominees corpus is verbs/adjectives
## divide sum of verb counter by sum of total tokens counter

nom_verb_perc = (sum(nom_verb_freq.values()) / sum(nom_word_freq.values()))* 100
nom_adj_perc = (sum(nom_adj_freq.values()) / sum(nom_word_freq.values()))* 100

print('{} percent of the tokens in all nominated songs were verbs'.format(nom_verb_perc))
print('{} percent of the tokens in all nominated songs were adjectives'.format(nom_adj_perc))

21.864803606565545 percent of the tokens in all nominated songs were verbs
8.575556053699968 percent of the tokens in all nominated songs were adjectives


In [28]:
top_nom_verb = nom_verb_freq.most_common(30)
top_nom_adj = nom_adj_freq.most_common(30)

## log likelihood

doing log likelihood for a subset of frequently occurring love- and time-related words

In [29]:
## storing size of corpora for normalization
win_size = sum(win_word_freq.values())
nom_size = sum(nom_word_freq.values())

In [30]:
## list to enter into log likelihood. needs to be a counter
log_list = ['love', 'heart', 'always', 'never', 'forever']
win_entries = []

for word in log_list:
    key = word
    value = win_word_freq[key]
    
    word_count = (key, value)
    win_entries.append(word_count)
    
win_entries

[('love', 111), ('heart', 59), ('always', 26), ('never', 66), ('forever', 10)]

In [31]:
row_template = "{: <15}{: <8}{:0.2f}\t\t{: <10}{:0.2f}\t{: 0.2f}"

for word, freq in win_entries: #top acad is a counter, make word list a counter
    win = freq
    nom = nom_word_freq.get(word,0)
    norm_win = win/win_size * 1000
    norm_nom = nom/nom_size * 1000

    LL = 0 if nom==0 else log_likelihood(win, win_size, nom, nom_size)
    print(row_template.format(word, win, norm_win, nom, norm_nom, LL))

love           111     6.60		657       8.18	-4.59
heart          59      3.51		193       2.40	 6.04
always         26      1.55		110       1.37	 0.30
never          66      3.93		229       2.85	 4.93
forever        10      0.59		53        0.66	-0.09


okay, so none of these normalized frequencies show that these words are much more likely to appear in either corpus. let's look at the most common toks and see if anything surprising pops up. 

In [32]:
top_win = win_word_freq.most_common(50)

for word, freq in top_win: #top acad is a counter, make word list a counter
    win = freq
    nom = nom_word_freq.get(word,0)
    norm_win = win/win_size * 1000
    norm_nom = nom/nom_size * 1000

    LL = 0 if nom==0 else log_likelihood(win, win_size, nom, nom_size)
    print(row_template.format(word, win, norm_win, nom, norm_nom, LL))

you            631     37.54		3043      37.90	-0.05
the            580     34.50		2676      33.33	 0.57
i              543     32.30		3121      38.87	-16.53
and            406     24.15		1925      23.97	 0.02
a              349     20.76		1310      16.31	 15.27
to             325     19.33		1645      20.49	-0.92
me             268     15.94		1309      16.30	-0.11
it             246     14.63		1035      12.89	 3.12
my             236     14.04		1235      15.38	-1.68
in             189     11.24		966       12.03	-0.73
your           164     9.76		691       8.61	 2.03
that           162     9.64		832       10.36	-0.72
oh             161     9.58		595       7.41	 7.94
we             161     9.58		480       5.98	 24.66
on             161     9.58		599       7.46	 7.55
be             153     9.10		697       8.68	 0.28
of             148     8.80		718       8.94	-0.03
don't          125     7.44		477       5.94	 4.78
verse          121     7.20		488       6.08	 2.68
chorus         120     7.1

In [33]:
## display abbreviated list with words of interest
display = ['we', 'are', 'world', 'make', 'every']
win_sel = []

## making tuple list to pass through LL table
for word in display:
    key = word
    value = win_word_freq[key]
    
    word_count = (key, value)
    win_sel.append(word_count)
    
## printing display table
for word, freq in win_sel: 
    win = freq
    nom = nom_word_freq.get(word,0)
    norm_win = win/win_size * 1000
    norm_nom = nom/nom_size * 1000

    LL = 0 if nom==0 else log_likelihood(win, win_size, nom, nom_size)
    print(row_template.format(word, win, norm_win, nom, norm_nom, LL))

we             161     9.58		480       5.98	 24.66
are            109     6.48		235       2.93	 42.03
world          63      3.75		107       1.33	 37.51
make           88      5.23		208       2.59	 27.48
every          78      4.64		112       1.39	 58.90


In [34]:
## get counts for we, are, world, make in "We are the world"
usa_africa = Counter()

temp = next((song for song in winners_chart if song['year'] == '1986'), None)
usa_africa.update(tokenize(temp['lyrics'], lowercase = True, strip_chars = to_strip))

usa_africa_display = ['we', 'are', 'world', 'make']
for word in usa_africa_display:
    print('"{}" occurs {} times in the song'.format(word,usa_africa.get(word)))
    percentage = (usa_africa.get(word)/win_word_freq.get(word))*100
    print('this comprises {} percent of occurrences across all winning songs\n'.format(percentage))


"we" occurs 40 times in the song
this comprises 24.84472049689441 percent of occurrences across all winning songs

"are" occurs 35 times in the song
this comprises 32.11009174311927 percent of occurrences across all winning songs

"world" occurs 12 times in the song
this comprises 19.047619047619047 percent of occurrences across all winning songs

"make" occurs 23 times in the song
this comprises 26.136363636363637 percent of occurrences across all winning songs



In [35]:
## get count for every in "every breath you take"
breath = Counter()

police = next((song for song in winners_chart if song['year'] == '1984'), None)
breath.update(tokenize(police['lyrics'], lowercase= True, strip_chars = to_strip))

print('"{}" occurs {} times in the song'.format('every', breath.get('every')))
percentage = (breath.get('every') / win_word_freq.get('every'))*100
print('this comprises {} percent of occurrences across all winning songs'.format(percentage))

"every" occurs 52 times in the song
this comprises 66.66666666666666 percent of occurrences across all winning songs


the same thing that happened with the keyness chart seems to be happening here as well. the words with LL values that suggest occurrence in one corpus over another are associated with the title of a song and i think that's why they have sway. i don't think it's because they're being used in a particular way, but because they are repeated so often. especially for the smaller corpus of winning songs, with only 63 entries.

`we`, `are`, `world`, and `make` come from the 1986 winner, "We Are The World" performed by USA for Africa. The lyrics of the chorus go: `We are the world, We are the children, We are the ones to make a brighter day...`

`every` is likely coming from the 1984 winner, "Every Breath You Take" performed by the Police. Most of the lines in this song contain the word `every` two times

# concordance
### love words

In [36]:
love_conc = ['love', 'heart']

for word in love_conc:
    for song in range(len(winners_chart)):
        song_toks = tokenize(winners_chart[song]['lyrics'])
        
        select_conc = make_kwic(word, song_toks)
        print_kwic(sort_kwic(select_conc, order = ['L1']))

                       Who never fell in  love  It seems that I'm
                         can't I fall in  love  Like any other man
                         can't I fall in  love  Like any other man
                whispered empty words of  love  That left me alone
                         into my eyes My  love  and see All the
                        All the joy that  love  can bring I will
                   ensemble [Bridge 1] I  love  you, I love you,
                           I love you, I  love  you, I love you
                           I love you, I  love  you That's all I
                           what I mean I  love  you [Instrumental Verse: Guitar
                          end of time My  love  The first time, ever
                      is there Don't you  love  farce? My fault, I
                      write the songs of  love  and special things I
                      write the songs of  love  and special things I
                      write the songs of  love  and spec

In [37]:
for word in love_conc:
    for song in range(len(noms)):
        song_toks = tokenize(noms[song]['lyrics'])
        
        select_conc = make_kwic(word, song_toks)
        print_kwic(sort_kwic(select_conc, order = ['L1']))

                       day [Verse 1] For  love  may come and tap
                       day [Verse 1] For  love  may come and tap
                         know how much I  love  you Never know how
                         The joy of your  love  My head is high,
                  down Funny, 'cause I'd  love  to go traveling Small
                        way I do? Though  love  is blind, make up
                          tell me do you  love  me true Or is
                           us to fall in  love  Hey baby what's your
                     We're gonna fall in  love  We're on the road
                    are those who'll bet  love  comes but once and
                    are those who'll bet  love  comes but once and
                you think, perhaps, that  love  like yours Is wasted
                      time you hear your  love  song sung Makes you
                  their dreams All their  love  And the sweet secret
                 their dreams, all their  love     
             

                love Where's that higher  love  I keep thinking of?
                       Bring me a higher  love  Bring me a higher
                       Bring me a higher  love  Bring me a higher
                       Bring me a higher  love  Bring me a higher
                       Bring me a higher  love  Bring me a higher
                       Bring me a higher  love  Bring me a higher
                       Bring me a higher  love  Bring me a higher
                        Let me feel that  love  come over me Let
                     And she said losing  love  Is like a window
                        And I see losing  love  Is like a window
                        have it all When  love  was all we had
                        have it all When  love  was all we had
                      best of times When  love  was young and new?
                        have it all When  love  was all we had
                   You know you'll never  love  that way again Didn't
               

                       your best you are  love  And because I fall
                       dark stop And let  love  light the way Like
                 JAY-Z] I apologize, our  love  was one for the
                      believe me I could  love  you just like that
                         got the kind of  love  It takes to solve
                         got the kind of  love  It takes to solve
                         got the kind of  love  (You got 'em too)
                  Love, let's talk about  love  Is it anything and
                  Love, let's talk about  love  Is it anything and
                  Love, let's talk about  love  Is it anything and
                      Head over heels in  love  Right in front of
                      I wanna build this  love  And everything you want,
                     "Only partly I only  love  my bed and my
                        She say, "Do you  love  me?" I tell her,
                       I won't need your  love  when I'm gone Don't
  

                        It can turn your  heart  black you can trust
                         it deep in your  heart  And deep down you
                            a song in my  heart  Just like oil on
                            a song in my  heart  Just like oil on
                    Rihanna] You have my  heart  And we'll never be
                Together we'll mend your  heart  Because [Chorus: Rihanna] When
                   circles Waiting as my  heart  drops And my back
                       hear And my heavy  heart  sinks deep down under
                         to stay If your  heart  is nowhere in it
                       one that's on his  heart  [Pre-Chorus] Oh, oh-oh, oh,
                      the valley of your  heart  The sun, it rises
                      thief you stole my  heart  And I your willing
                         the scars on my  heart  You’re not broken, just
                      Never wanna put my  heart  on the line (ooh!)
                         a 

### time words

In [38]:
time_conc = ['always', 'never', "forever"]

for word in time_conc:
    for song in range(len(winners_chart)):
        song_toks = tokenize(winners_chart[song]['lyrics'])
        
        select_conc = make_kwic(word, song_toks)
        print_kwic(sort_kwic(select_conc, order = ['L1']))

                         meet me And I'm  always  late But she sits
                        of your hair You  always  have my unspoken passion
                      know that you will  always  be The same old
                          seems to be Is  always  better than nothing Than
                          seems to be Is  always  better than nothing Than
                          seems to be Is  always  better than nothing Than
                      away to where I've  always  heard it could be
                      away to where I've  always  heard it could be
                      away to where I've  always  heard it could be
                 blind [Chorus] You were  always  on my mind You
                        my mind You were  always  on my mind [Verse
                   [Chorus] And you were  always  on my mind You
                        my mind You were  always  on my mind [Bridge]
                       time But you were  always  on my mind You
                        my mi

In [39]:
for word in time_conc:
    for song in range(len(noms)):
        song_toks = tokenize(noms[song]['lyrics'])
        
        select_conc = make_kwic(word, song_toks)
        print_kwic(sort_kwic(select_conc, order = ['L1']))

                         ill? He knows I  always  will... As long as
                   that thought he would  always  be true Colour it
                   more For wives should  always  be lovers, too Run
                  again For wives should  always  be lovers, too Run
                     office And men will  always  be men Don't send
                           a man who has  always  had the wand'ring ways
                       that your door is  always  open And your path
                         I cried She was  always  young at heart Kinda
                        miss it now, you  always  got next time... And
                          a friend But I  always  thought that I'd see
                          a friend But I  always  thought that I'd see
                          a friend But I  always  thought that I'd see
                    deep And there won't  always  be someone there to
                      "I want you" She's  always  a lady Just like
                    lif

                   afraid of waking That  never  takes the chance It's
                    afraid of dyin' That  never  learns to live When
                     years I thought I'd  never  find you You have
                          in a city That  never  sleeps And find I'm
                      inside you know we  never  know why The road
                    you got dreams he'll  never  take away In the
                      your mind and they  never  give you credit It's
                      your mind and they  never  give you credit It's
                       your mind and you  never  get the credit It's
                 eyes Rosanna, Rosanna I  never  thought that a girl
           compromise Rosanna, Rosanna I  never  thought that losing you
                       you is now you'll  never  ever have to compromise
                         Is a place most  never  see It's a hard-won
                she's dancing like she's  never  danced before She's a
                she's danci

### love collocation

In [40]:
win_love_colls = Counter()
for song in range(len(winners_chart)):
    win_love_colls.update(collocates(tokenize(winners_chart[song]['lyrics']),'love', [3,3]))

In [41]:
win_love_colls.most_common(20)

[('I', 31),
 ('you', 29),
 ('with', 20),
 ('love', 16),
 ('of', 15),
 ("What's", 15),
 ('in', 14),
 ('to', 14),
 ('that', 11),
 ('and', 10),
 ('the', 10),
 ('got', 10),
 ('do,', 10),
 ('we', 10),
 ('[Chorus]', 9),
 ('And', 8),
 ('second-hand', 8),
 ('way', 7),
 ('out', 7),
 ('your', 6)]

In [42]:
nom_love_colls = Counter()
for song in range(len(noms)):
    nom_love_colls.update(collocates(tokenize(noms[song]['lyrics']),'love', [3,3]))

In [43]:
nom_love_colls.most_common(20)

[('I', 203),
 ('you', 200),
 ('the', 98),
 ('to', 73),
 ('And', 71),
 ('a', 65),
 ('in', 62),
 ('that', 53),
 ('me', 51),
 ('is', 50),
 ('and', 49),
 ('for', 43),
 ('my', 41),
 ('do', 38),
 ('can', 34),
 ('anything', 33),
 ('your', 32),
 ('like', 30),
 ('know', 29),
 ('with', 27)]

## sentiment analyses

In [44]:
sid = SentimentIntensityAnalyzer()

In [45]:
win_sent_scores = []

for song in range(len(winners_chart)):    
    #calculate score
    score = sid.polarity_scores(winners_chart[song]['lyrics'])
    
    #add to list to calculate avg later
    win_sent_scores.append(score['compound'])
    
    #print results
    print("{} by {} {}\n".format(winners_chart[song]['song_title'],
                                                                   winners_chart[song]['performing_artist'], score))

Volare by Domenico Modugno {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

The Battle of New Orleans by Johnny Horton {'neg': 0.087, 'neu': 0.882, 'pos': 0.031, 'compound': -0.9841}

Theme of Exodus by Instrumental (Various Artists) {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Moon River by Henry Mancini {'neg': 0.0, 'neu': 0.912, 'pos': 0.088, 'compound': 0.6369}

What Kind of Fool Am I? by Sammy Davis Jr. {'neg': 0.232, 'neu': 0.677, 'pos': 0.092, 'compound': -0.967}

Days of Wine and Roses by Henry Mancini {'neg': 0.055, 'neu': 0.796, 'pos': 0.149, 'compound': 0.8176}

Hello, Dolly! by Louis Armstrong {'neg': 0.012, 'neu': 0.868, 'pos': 0.12, 'compound': 0.9489}

The Shadow of Your Smile by Tony Bennett {'neg': 0.041, 'neu': 0.727, 'pos': 0.233, 'compound': 0.9786}

Michelle by The Beatles {'neg': 0.0, 'neu': 0.848, 'pos': 0.152, 'compound': 0.9747}

Up, Up, and Away by The 5th Dimension {'neg': 0.021, 'neu': 0.653, 'pos': 0.326, 'compound': 0.9969}

Little Green Ap

In [46]:
win_sent_score = sum(win_sent_scores) / len(win_sent_scores)

In [47]:
nom_sent_scores = []

for song in range(len(noms)): 
    #calculate score
    score = sid.polarity_scores(noms[song]['lyrics'])
    
    #add to list to calculate avg later
    nom_sent_scores.append(score['compound'])
    
    #print results
    print("{} by {} {}\n".format(noms[song]['song_title'], noms[song]['performing_artist'], score))

Catch a Falling Star by Perry Como {'neg': 0.112, 'neu': 0.768, 'pos': 0.12, 'compound': 0.93}

Fever by Peggy Lee {'neg': 0.033, 'neu': 0.815, 'pos': 0.152, 'compound': 0.988}

Theme to Gigi by Various Artists {'neg': 0.147, 'neu': 0.754, 'pos': 0.1, 'compound': -0.7922}

Witchcraft by Frank Sinatra {'neg': 0.107, 'neu': 0.823, 'pos': 0.071, 'compound': -0.7096}

High Hopes by Frank Sinatra {'neg': 0.095, 'neu': 0.818, 'pos': 0.088, 'compound': -0.34}

I Know by Perry Como {'neg': 0.183, 'neu': 0.694, 'pos': 0.123, 'compound': -0.5951}

Like Young by André Previn {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Small World by Johnny Mathis {'neg': 0.0, 'neu': 0.687, 'pos': 0.313, 'compound': 0.9933}

He'll Have to Go by Jim Reeves {'neg': 0.087, 'neu': 0.732, 'pos': 0.181, 'compound': 0.9646}

Nice 'n' Easy by Frank Sinatra {'neg': 0.05, 'neu': 0.534, 'pos': 0.416, 'compound': 0.995}

The Second Time Around by Andy Williams {'neg': 0.014, 'neu': 0.694, 'pos': 0.292, 'compound': 


I Just Called to Say I Love You by Stevie Wonder {'neg': 0.116, 'neu': 0.676, 'pos': 0.209, 'compound': 0.9903}

Time after Time by Cyndi Lauper {'neg': 0.081, 'neu': 0.891, 'pos': 0.028, 'compound': -0.913}

Money for Nothing by Dire Straits {'neg': 0.02, 'neu': 0.767, 'pos': 0.213, 'compound': 0.9983}

The Boys of Summer by Don Henley {'neg': 0.032, 'neu': 0.821, 'pos': 0.147, 'compound': 0.9904}

Everytime You Go Away by Paul Young {'neg': 0.045, 'neu': 0.907, 'pos': 0.048, 'compound': -0.4636}

I Want to Know What Love Is by Foreigner {'neg': 0.033, 'neu': 0.726, 'pos': 0.241, 'compound': 0.9973}

Sledgehammer by Peter Gabriel {'neg': 0.011, 'neu': 0.832, 'pos': 0.157, 'compound': 0.9942}

Addicted to Love by Robert Palmer {'neg': 0.027, 'neu': 0.753, 'pos': 0.221, 'compound': 0.9933}

Higher Love by Steve Winwood {'neg': 0.064, 'neu': 0.611, 'pos': 0.325, 'compound': 0.999}

Graceland by Paul Simon {'neg': 0.075, 'neu': 0.859, 'pos': 0.066, 'compound': 0.0258}

La Bamba by Los Lo

Girl Crush by Little Big Town {'neg': 0.17, 'neu': 0.636, 'pos': 0.195, 'compound': 0.1779}

See You Again by Wiz Khalifa featuring Charlie Puth {'neg': 0.021, 'neu': 0.872, 'pos': 0.106, 'compound': 0.9869}

Formation by Beyoncé {'neg': 0.136, 'neu': 0.717, 'pos': 0.146, 'compound': 0.4171}

I Took a Pill in Ibiza by Mike Posner {'neg': 0.116, 'neu': 0.75, 'pos': 0.134, 'compound': 0.6116}

Love Yourself by Justin Bieber {'neg': 0.117, 'neu': 0.73, 'pos': 0.152, 'compound': 0.986}

7 Years by Lukas Graham {'neg': 0.041, 'neu': 0.836, 'pos': 0.123, 'compound': 0.9919}

Despacito by Luis Fonsi & Daddy Yankee ft. Justin Bieber {'neg': 0.059, 'neu': 0.801, 'pos': 0.14, 'compound': 0.9622}

4:44 by Jay-Z {'neg': 0.135, 'neu': 0.738, 'pos': 0.128, 'compound': -0.5802}

Issues by Julia Michaels {'neg': 0.117, 'neu': 0.763, 'pos': 0.12, 'compound': 0.0258}

1-800-273-8255 by Logic ft. Alessia Cara & Khalid {'neg': 0.195, 'neu': 0.689, 'pos': 0.116, 'compound': -0.9944}

All the Stars by Kendr

In [48]:
nom_sent_score = sum(nom_sent_scores) / len(nom_sent_scores)