In [70]:
import csv
import pathlib
import time

In [16]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.corpora.textcorpus import TextCorpus
from gensim.models import LsiModel, LdaModel
from gensim import similarities
from gensim import utils

In [24]:
class CSVCorpus(TextCorpus):
    def getstream(self):
        with open(pathlib.Path(self.input)) as csv_file:
            reader = csv.DictReader(csv_file)
            for row in reader:
                yield row["text"]
    def get_docs(self):
        for doc in self.getstream():
            tokens = preprocess_string(utils.to_unicode(doc))
            yield tokens
        return
    def __len__(self):
        self.length = sum(1 for _ in self.get_texts())
        return self.length

In [6]:
datapath = pathlib.Path("data/songdata.csv")

In [25]:
cc = CSVCorpus(datapath)

In [26]:
print("Number of documents: {}".format(len(cc)))
print("Number of unique items in dictionary: {}".format(len(cc.dictionary)))

57650

In [30]:
t0 = time.time()
lsi = LsiModel(cc, id2word=cc.dictionary, num_topics=150, onepass=False)
t1 = time.time()
print("Trained LSI model in: {} minutes".format((t1-t0)/60.0)

In [31]:
t0 = time.time()
lda = LdaModel(cc, id2word=cc.dictionary, num_topics=150, passes=5)
t1 = time.time()
print("Trained LSI model in: {} minutes".format((t1-t0)/60.0)

In [65]:
index = similarities.MatrixSimilarity(lda[cc])

In [66]:
index.save("data/lyrics_lda.index")

In [74]:
sims = index[lda[cc.dictionary.doc2bow(preprocess_string(data[0]["text"]))]]

In [75]:
ordered_sims = sorted(enumerate(sims), key = lambda k: k[1], reverse=True)

In [76]:
for i, e in enumerate(ordered_sims):
    print(i, e)
    if i == 9:
        break

0 (0, 0.78802884)
1 (52886, 0.7080367)
2 (48223, 0.70101875)
3 (24588, 0.6954764)
4 (23119, 0.68913555)
5 (16756, 0.6842629)
6 (10546, 0.6799623)
7 (92, 0.6615739)
8 (13608, 0.6531278)
9 (5485, 0.6299102)


In [77]:
print(data[52886]["artist"], data[52886]["song"])
print("-"*80)
print(data[52886]["text"])

The Temptations My Girl
--------------------------------------------------------------------------------
I've got sunshine on a cloudy day  
When it's cold outside, I've got the month of May  
I guess you'd say  
What can make me feel this way?  
  
My girl, my girl, my girl  
Talkin' 'bout my girl  
My girl  
  
I've got so much honey, the bees envy me  
I've got a sweeter song than the birds in the trees  
Well, I guess you'd say  
What can make me feel this way?  
  
My girl, my girl, my girl  
Talkin' 'bout my girl  
My girl  
Ooh  
  
Hey, hey, hey  
Hey, hey, hey  
Yeah  
  
I don't need no money, fortune or fame  
I got all the riches baby, one man can claim  
Well, I guess you'd say  
What can make me feel this way?  
  
My girl, my girl, my girl  
Talkin' 'bout my girl  
My girl  
  
I've got sunshine on a cloudy day  
With my girl  
I've even got the month of may  
With my girl  
  
Talkin' 'bout, talkin' 'bout  
Talkin' 'bout my girl, my girl  
That's all I can talk about  


In [78]:
print(data[0]["artist"], data[0]["song"])
print("-"*80)
print(data[0]["text"])

ABBA Ahe's My Kind Of Girl
--------------------------------------------------------------------------------
Look at her face, it's a wonderful face  
And it means something special to me  
Look at the way that she smiles when she sees me  
How lucky can one fellow be?  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?  
  
And when we go for a walk in the park  
And she holds me and squeezes my hand  
We'll go on walking for hours and talking  
About all the things that we plan  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?


