In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD,NMF
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from gensim import models, matutils

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row; each row must support
        ``argsort()``.
    feature_names : sequence
        Looked up with the positions returned by ``argsort()``; the selected
        entries are joined with ", ".
    no_top_words : int
        Number of largest-weight entries listed for each topic.
    topic_names : sequence, optional
        Per-topic labels. When the sequence is missing/empty, or the entry
        for a topic is falsy, the numeric topic index is printed instead.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print(f"\nTopic: ' {label} '")
        else:
            print(f"\nTopic  {topic_idx}")
        # argsort() is ascending: reverse it, then keep the leading entries.
        top_positions = weights.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[pos] for pos in top_positions))

In [None]:
# Load the cleaned corpus and pull out the three columns used downstream:
# book titles, authors, and the cleaned text that gets vectorised.
fr = pd.read_csv('fr_clean.csv')
titles, authors, test = fr['title'], fr['author'], fr['clean']

In [2]:
# Creating an author-title label ("<title>; par <author>") for my Flask app.
# The original note had no leading '#', which made this cell a SyntaxError.
ids = titles + '; par ' + authors

In [2]:
# Stops - uninformative or overly common words
# Custom stop-word list; it is passed as `stop_words=stops` to the
# CountVectorizer built in the next cell, so none of these tokens become
# features.
# NOTE(review): the entries look like character first names, titles/ranks and
# a few filler words, some in stemmed form ('mariu', 'arami', 'clotild') --
# presumably tokens that dominated earlier topic runs; confirm with the author.
# NOTE: 'luis' is listed twice; the second occurrence is redundant.
stops = ['mariu','adj','aliocha','cosette','gwynplaine','andrée','ll','ivan','anne','marie','mme','clotild','pierrette',
         'césar','aramis','marguerite','pascal','mouret','catherine','luis','carolin','caroline','vladimir','mary','max',
         'li','ka','luis','rouletabille','bis','madeleine','marat','norine','rosa','samuel','odette','frédérique','darcy',
         'monique','peter','tadeo','toqui','mitia','arami','hortense','fantine','salvator','guiche','vallière','gavroche',
         'rouble','angélique','reine','louis','antoinette','germain',
         'duc','comtesse','comte','baron','citoyen','prince','sire','sir','chevalier','monseigneur','princesse','abbé',
         'duchesse','colonel','assemblée','dauphin','maréchal','miss','tante','xv','juif','empereur','cardinal','national',
         'curé','lord','forçat','préfet','président','commandant','vaisseau','tuilerie','ti','marquis','navire','évêque',
         'république','candeur','barricade','moine','don','indien','capitaine','docteur','police','oncle','chasseur',
         'mademoiselle','prisonnier','pretre','fusil','cousin','révolution','île','propriété','répliquer','répliqua',
         'ministre','prêtre','politique','argot','million','égout','toutefois','professeur','gentleman','russe','paver',
         'pavé','lady','exactement','mètre','répéta','vicomte','baronne','nièce','maman','papa','ta','ba','mé','in',
         'particulièrement']

In [10]:
# Bag-of-words counts over the cleaned texts. Float min_df / max_df are
# document-frequency proportions: keep terms found in at least 5% and at most
# 65% of the documents, and drop the custom stop words.
cv = CountVectorizer(
    stop_words=stops,
    min_df=0.05,
    max_df=0.65,
)
cv_test = cv.fit_transform(test)

# LSA Model
The resulting topics are only okay-ish.

In [12]:
# LSA: 10-component truncated SVD of the document-term count matrix.
lsa = TruncatedSVD(n_components=10, n_iter=10, random_state=8)
doc_topic = lsa.fit_transform(cv_test)
# Cell output: total share of variance explained by the 10 components.
sum(lsa.explained_variance_ratio_)

0.2790740493787146

In [13]:
# Show the 15 strongest terms of each LSA component.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when it exists and fall back on older releases.
vocab_terms = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
               else cv.get_feature_names())
display_topics(lsa, vocab_terms, 15)


Topic  0
interrompit, troupe, royal, valet, cadavre, canon, amant, bureau, palais, cavalier, ouvrier, lieutenant, bourgeois, instruction, gentilhomme

Topic  1
troupe, canon, matelot, camp, royal, nation, guerrier, cavalier, rocher, lieutenant, équipage, sabre, balle, armer, bâtiment

Topic  2
royal, gentilhomme, valet, décret, carrosse, nation, constitution, suisse, serment, vive, patrie, patriote, député, amant, tribune

Topic  3
maire, bourgeois, angle, boulevard, ouvrier, cabaret, grille, ténèbre, bonhomme, couvent, émeute, passant, sabre, chandelle, sergent

Topic  4
carrosse, gentilhomme, fée, épée, palais, amant, éper, comédien, coeur, valet, écu, philosophe, poète, cavalier, marbre

Topic  5
principe, social, gouvernement, etc, poète, nation, souverain, commerce, produit, article, classe, province, religion, ouvrier, système

Topic  6
matelot, équipage, lieutenant, marin, carrosse, bâtiment, port, frégate, royal, bateau, mât, philosophe, marine, embarcation, diamant

Topic  7


# NMF
The topics are pretty good, so this is the model I used in the end!

In [14]:
# NMF with 17 components on the same count matrix; the seed is fixed so the
# factorisation is repeatable.
nmf = NMF(n_components=17, shuffle=True, random_state=8)
doc_topic = nmf.fit_transform(cv_test)

In [15]:
# Show the 15 strongest terms of each NMF component.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when it exists and fall back on older releases.
vocab_terms = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
               else cv.get_feature_names())
display_topics(nmf, vocab_terms, 15)


Topic  0
thé, chère, témoigner, chéri, espère, persuader, demoiselle, relation, agitation, entretien, allusion, sérieusement, prie, sympathie, sincère

Topic  1
guerrier, troupe, camp, rocher, cavalier, major, prairie, tribu, bandit, nation, ours, plaine, vengeance, espagnol, balle

Topic  2
royal, nation, décret, patrie, constitution, canon, patriote, tribune, vive, député, août, serment, autel, municipal, sabre

Topic  3
loup, neige, tabac, patte, coeur, ours, bateau, vache, ventre, pipe, comédien, chat, bouteille, paille, hé

Topic  4
assassin, agent, revolver, complice, inspecteur, bureau, auto, bandit, patron, reporter, cadavre, instruction, secrétaire, directeur, enquête

Topic  5
principe, social, gouvernement, produit, souverain, système, etc, matière, canadien, ouvrier, institution, commerce, classe, théorie, publier

Topic  6
matelot, équipage, marin, bâtiment, lieutenant, canot, port, bateau, nègre, frégate, canon, embarcation, mât, rivage, rocher

Topic  7
maire, couvent, 

In [16]:
# Document-topic weights rounded to 5 decimals, one row per book (indexed by
# the author-title label) and one hand-named column per NMF component.
topicd = pd.DataFrame(
    data=doc_topic.round(5),
    index=ids,
    columns=[
        'romdram',
        'war',
        'revolution',
        'countryside',
        'spy_detect',
        'pol_econ',
        'boats',
        'drama_life',
        'foreign_exp',
        'bus_commerce',
        'relig',
        'francoprussian',
        'fairies',
        'arts_bohe',
        'knights',
        'law_crime',
        'candide',
    ],
)

In [19]:
# Write the document-topic table to disk (the index is written as well).
topicd.to_csv(path_or_buf='topic_fr.csv')

# LDA Model
LDA doesn't work well here because, for the most part, my books are not organised around a single topic - many of them mix several topics.

In [157]:
# Transposed count matrix (terms as rows, documents as columns), built for
# the gensim corpus wrapper in the following cell.
cvt = cv_test.T

In [158]:
# Wrap the transposed sparse matrix as a gensim corpus.
corpus = matutils.Sparse2Corpus(cvt)
# Invert the vectoriser's term -> column-index mapping into index -> term.
id2word = {index: term for term, index in cv.vocabulary_.items()}

In [159]:
# 13-topic LDA trained with 100 passes over the corpus; seed fixed for
# repeatability.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=13,
    passes=100,
    random_state=8,
)

In [160]:
# Cell output: list of (topic id, weighted top-terms string) pairs for the
# fitted LDA model, using the library's default topic/word counts.
lda.print_topics()

[(0,
  '0.004*"revolver" + 0.003*"auto" + 0.003*"assassin" + 0.003*"complice" + 0.002*"bureau" + 0.002*"reporter" + 0.002*"inspecteur" + 0.002*"agent" + 0.002*"cadavre" + 0.002*"secrétaire"'),
 (1,
  '0.002*"chère" + 0.002*"témoigner" + 0.002*"thé" + 0.002*"espère" + 0.001*"entretien" + 0.001*"répondi" + 0.001*"persuader" + 0.001*"consolation" + 0.001*"sympathie" + 0.001*"chéri"'),
 (2,
  '0.005*"interrompit" + 0.004*"instruction" + 0.004*"agent" + 0.004*"avocat" + 0.003*"commissaire" + 0.003*"bureau" + 0.003*"assassin" + 0.003*"banquier" + 0.002*"magistrat" + 0.002*"veuve"'),
 (3,
  '0.005*"bureau" + 0.003*"ouvrier" + 0.003*"café" + 0.003*"etc" + 0.003*"boutique" + 0.003*"tabac" + 0.002*"bonhomme" + 0.002*"commerce" + 0.002*"bouteille" + 0.002*"patron"'),
 (4,
  '0.004*"coeur" + 0.004*"loup" + 0.004*"neige" + 0.003*"fée" + 0.002*"hé" + 0.002*"patte" + 0.002*"bâton" + 0.002*"vache" + 0.002*"sentier" + 0.002*"rocher"'),
 (5,
  '0.005*"amant" + 0.004*"artiste" + 0.003*"poète" + 0.002*"pr