In [1]:
import re
import json
import codecs
import string
import copy as cp
import numpy as np
import nltk
import sklearn.feature_selection as fs
from nltk.corpus import stopwords as st
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Binarizer
from sklearn.pipeline import Pipeline
from sklearn import naive_bayes
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.cross_validation import KFold, LeaveOneOut, StratifiedKFold
from sklearn.grid_search import GridSearchCV
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.base import clone
# from ranker import create_occ_matrix, create_rank_matrix



In [None]:
# nltk.download()


In [15]:
import pyphen

In [21]:
hyp = pyphen.Pyphen(lang='en')

In [22]:
hyp.inserted("language")

'lan-guage'

In [25]:
stwords = st.words('english')

def get_preprocessor(suffix=''):
    def preprocess(unicode_text):
        return unicode(unicode_text.strip().lower() + suffix)
    return preprocess


def preprocess_data(X, n, suffix='', binarize=True):
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1),
                                 preprocessor=get_preprocessor(suffix))
    X = vectorizer.fit_transform(X)
    X = Binarizer(copy=False).fit_transform(X) if binarize else X
    return X


def preprocess_lyric(lyric):
    new_lyric = cp.deepcopy(lyric)
#     pct = ((string.punctuation).replace("_", "")).replace("'", "")    
    pct = (string.punctuation).replace("_", "")
    new_lyric = new_lyric.translate(str.maketrans('','', pct))
#     new_lyric = new_lyric.decode("utf8")
    new_lyric = new_lyric.replace("_", " _ ")
    new_lyric = new_lyric.replace("'", " ")
    new_lyric = new_lyric.lower()
    new_lyric = new_lyric.split()
    return new_lyric

In [26]:
def load_data(filename='sm-vs-all-lyrics.txt'):
    lyrics, y = [], []

    with codecs.open(filename, 'r', encoding="utf-8") as f:
        for line in f:
            aux = line.split("\t")
            if len(aux) != 2:
                print("aux", aux)
            else:
                lyr, label = aux[0], aux[1]
                lyrics.append(preprocess_lyric(lyr))
                y.append(int(label))

    lyrics, y = np.array(lyrics), np.array(y, dtype=np.int)
    return lyrics, y

In [27]:
x, y = load_data()

In [57]:
def merge_verses(x):
    new_x = []
    for i, lyric in enumerate(x):
        new_x.append((" ".join(x[i])) + " $tay") # append the entire lyrics
    return new_x    
        

In [47]:
# (" ".join(x[0])), x[0]

In [58]:
new_x = merge_verses(x)

In [59]:
new_x[0]

'david the wind blows _ the wind blows _ bits of your life away _ your friends all say _ where is our boy oh weve lost our boy _ but they should know _ where youve gone _ because again and again youve explained that _ youre going to _ _ oh youre going to _ yeah yeah yeah yeah _ england for the english _ england for the english _ _ david the winds blow _ the winds blow _ all of my dreams away _ and i still say _ where is our boy _ ah weve lost our boy _ but i should know _ why youve gone _ because again and again youve explained _ youve gone to the _ _ national ah _ to the national _ theres a country you dont live there _ but one day you would like to _ and if you show them what youre made of _ oh then you might do _ _ but david we wonder _ we wonder if the thunder _ is ever really gonna begin _ begin begin _ your mom says _ ive lost my boy _ but she should know _ why youve gone _ because again and again youve explained _ youve gone to the _ _ national _ to the national _ to the nationa

In [52]:
import codecs

In [60]:
with codecs.open("smiths_merged.txt", "w", encoding="utf-8") as f:
    for lyric in new_x:
        f.write(lyric + "\n")

In [56]:
from textgenrnn import textgenrnn

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [61]:
textgen = textgenrnn()

In [62]:
textgen.train_from_file('smiths_merged.txt', num_epochs=1)
textgen.generate()

2,407 texts collected.
Training on 2,360,441 character sequences.
Epoch 1/1
####################
Temperature: 0.2
####################
dededmdeeded oh how i say _ i can see the stars _ the light in the stars _ and the stars of the stars _ and i know its all on the love _ i want to be the one _ and i can see the control _ and its a floor _ _ i want to be the world _ when i dont know it was in the one _ i am a body so longer _ _ i 

edddeded sun _ _ and i want to be all _ _ i want to say _ _ i can see it _ she says i love you _ _ i want to be a longer of a long _ and i dont want to say _ i want to be a longer _ i dont care i could be a longer _ _ i want to be a longer _ and i can see the stars _ _ and i want to be a longer _ 

meedmede _ and its all the love _ i cant see the stars _ and i can see the stars _ and i cant wait to be the world _ its not a line _ _ i want to be a longer _ and i want to be a world _ i want to be a longer _ _ i dont want to be a longer _ i cant see the world _ 

In [71]:
print(textgen.generate())

dmeeaeeled deep secret _ and i wanna say im alone _ _ its not gone _ _ no one of me _ _ to him _ i can take me to the tops _ what has no longer _ no more lie _ _ i think im without me _ the first to the look about the room _ its need to end _ _ i want to go _ hey i cant look at my princes _ and i 

None


In [None]:
print(textgen.generate(3, temperature=0.1))

In [29]:
ct = 0
for i in range(len(y)):
    if(y[i] == 1 ):
        ct += 1
ct

274

In [147]:
bb = []
for word in x[0]:
    if word != "_":
        bb.append(str(len(hyp.inserted(word).split("-"))))
    else:
        bb.append("_")
        
b = (" ".join(bb)).split("_")
# b
# len(x[0]) == len(bb)

In [195]:
def hyphenate(lyric):
    '''hyphenate lyrics of a song'''
    hyp_lyric = []
    for word in lyric:
        if word != "_":
            hyp_lyric.append(hyp.inserted(word).split("-"))
        else:
            hyp_lyric.append(word)
#     print("hyp_lyric", hyp_lyric)
    return hyp_lyric

def count_syllables(lyric):
    syl_lyric, pat_lyric = [], []
#     print("lyr", lyric)
    for word in lyric:
        if word != "_":
            syl_lyric.append(str(len(word))) # split word into syllables and count
        else:
            syl_lyric.append(word)
    syl_lyric = (" ".join(syl_lyric)) # unite word syllable counts into verses
    print("syl", syl_lyric)
    
    for i in range(len(syl_lyric)): # for each verse
        if syl_lyric[i] != "_":
            pat_lyric.append(str(sum([int(c) for c in syl_lyric[i].split()])))
        else:
            pat_lyric.append("_")
    pat_lyric = (" ".join(pat_lyric)).split("_")
    
    print ("syl, pat", syl_lyric, pat_lyric)
    return syl_lyric, pat_lyric


def get_verse_patterns(lyrics):
    
    hyp_lyrics, syl_lyrics = [], []
    for lyric in lyrics:
        hyp_lyric = hyphenate(lyric) 
        hyp_lyrics.append(hyp_lyric)  # save
        
        syl_lyric = count_syllables(hyp_lyric)
        syl_lyrics.append(syl_lyric)  # save
             
    print(len(hyp_lyrics) == len(lyrics), len(syl_lyrics) == len(lyrics))

            
        
    

In [196]:
# print(b[0], sum([int(c) for c in b[0].split()]))
xx = hyphenate(x[0])
count_syllables(xx)

syl 1 1 1 1 _ 1 1 1 _ 1 1 1 1 1 _ 1 1 1 1 _ 1 1 1 1 1 1 1 1 1 _ 1 1 1 1 _ 1 2 1 _ 2 1 1 1 2 2 1 _ 1 2 1 _ _ 1 1 2 1 _ 1 1 1 1 _ 2 1 1 2 _ 2 1 1 2 _ _ 1 1 1 1 _ 1 1 1 _ 1 1 1 1 1 _ 1 1 1 1 _ 1 1 1 1 _ 1 1 1 1 1 _ 1 1 1 1 _ 1 2 1 _ 2 1 1 1 2 2 _ 2 1 1 1 _ _ 3 1 _ 1 1 3 _ 1 1 2 1 1 1 1 _ 1 1 1 1 1 1 1 _ 1 1 1 1 1 1 1 1 1 _ 1 1 1 1 1 _ _ 1 1 1 2 _ 1 2 1 1 2 _ 1 2 3 1 2 _ 2 2 _ 1 1 1 _ 1 1 1 1 _ 1 1 1 1 _ 1 2 1 _ 2 1 1 1 2 2 _ 2 1 1 1 _ _ 3 _ 1 1 3 _ 1 1 3 1 2 _ 2 1 1 1 1 1 1 2 _ 1 1 1 1 1 1 2 _ 1 1 1 1 1 1 2 _ 1 2 2 1 1 _ _ 1 1 3 2 1
syl, pat 1 1 1 1 _ 1 1 1 _ 1 1 1 1 1 _ 1 1 1 1 _ 1 1 1 1 1 1 1 1 1 _ 1 1 1 1 _ 1 2 1 _ 2 1 1 1 2 2 1 _ 1 2 1 _ _ 1 1 2 1 _ 1 1 1 1 _ 2 1 1 2 _ 2 1 1 2 _ _ 1 1 1 1 _ 1 1 1 _ 1 1 1 1 1 _ 1 1 1 1 _ 1 1 1 1 _ 1 1 1 1 1 _ 1 1 1 1 _ 1 2 1 _ 2 1 1 1 2 2 _ 2 1 1 1 _ _ 3 1 _ 1 1 3 _ 1 1 2 1 1 1 1 _ 1 1 1 1 1 1 1 _ 1 1 1 1 1 1 1 1 1 _ 1 1 1 1 1 _ _ 1 1 1 2 _ 1 2 1 1 2 _ 1 2 3 1 2 _ 2 2 _ 1 1 1 _ 1 1 1 1 _ 1 1 1 1 _ 1 2 1 _ 2 1 1 1 2 2 _ 2 1 1 1 _ _ 3 _ 1 1 3 _ 1 1 3 1 2

('1 1 1 1 _ 1 1 1 _ 1 1 1 1 1 _ 1 1 1 1 _ 1 1 1 1 1 1 1 1 1 _ 1 1 1 1 _ 1 2 1 _ 2 1 1 1 2 2 1 _ 1 2 1 _ _ 1 1 2 1 _ 1 1 1 1 _ 2 1 1 2 _ 2 1 1 2 _ _ 1 1 1 1 _ 1 1 1 _ 1 1 1 1 1 _ 1 1 1 1 _ 1 1 1 1 _ 1 1 1 1 1 _ 1 1 1 1 _ 1 2 1 _ 2 1 1 1 2 2 _ 2 1 1 1 _ _ 3 1 _ 1 1 3 _ 1 1 2 1 1 1 1 _ 1 1 1 1 1 1 1 _ 1 1 1 1 1 1 1 1 1 _ 1 1 1 1 1 _ _ 1 1 1 2 _ 1 2 1 1 2 _ 1 2 3 1 2 _ 2 2 _ 1 1 1 _ 1 1 1 1 _ 1 1 1 1 _ 1 2 1 _ 2 1 1 1 2 2 _ 2 1 1 1 _ _ 3 _ 1 1 3 _ 1 1 3 1 2 _ 2 1 1 1 1 1 1 2 _ 1 1 1 1 1 1 2 _ 1 1 1 1 1 1 2 _ 1 2 2 1 1 _ _ 1 1 3 2 1',
 ['1 0 1 0 1 0 1 0 ',
  ' 0 1 0 1 0 1 0 ',
  ' 0 1 0 1 0 1 0 1 0 1 0 ',
  ' 0 1 0 1 0 1 0 1 0 ',
  ' 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 ',
  ' 0 1 0 1 0 1 0 1 0 ',
  ' 0 1 0 2 0 1 0 ',
  ' 0 2 0 1 0 1 0 1 0 2 0 2 0 1 0 ',
  ' 0 1 0 2 0 1 0 ',
  ' 0 ',
  ' 0 1 0 1 0 2 0 1 0 ',
  ' 0 1 0 1 0 1 0 1 0 ',
  ' 0 2 0 1 0 1 0 2 0 ',
  ' 0 2 0 1 0 1 0 2 0 ',
  ' 0 ',
  ' 0 1 0 1 0 1 0 1 0 ',
  ' 0 1 0 1 0 1 0 ',
  ' 0 1 0 1 0 1 0 1 0 1 0 ',
  ' 0 1 0 1 0 1 0 1 0 ',
