<a href="https://colab.research.google.com/github/liyueling13/Predicting-Banned-Books/blob/main/5)_Banned_Books_Topic_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 5) Banned Books - Topic Modelling

Now we can perform topic modelling to break down our descriptions into their principal component parts.



## Setup

In [1]:
# this performs line wrapping on output text in Colab

from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Data Science/Springboard assignments/Capstone Three/Banned Books')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
import spacy
import re

In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

## Topic Modelling with CountVectorizer

In [7]:
books_df = pd.read_csv('all_books_cleaned.csv')
books_df

Unnamed: 0,author,title,label,description
0,Arundhati Roy,The God of Small Things,1,debut novel affluent indian family fateful day...
1,Kevin Noble Maillard,Fry Bread,1,winner robert sibert informational book medal ...
2,Oge Mora,Saturday,1,special saturday plan mother way time
3,Jeffery Deaver,The Bone Collector,0,miss television series lincoln rhyme hunt bone...
4,Jennifer Probst,The Marriage Bargain,0,order selfish need bookstore owner billionaire...
...,...,...,...,...
1472,Arvin Ahmadi,How It All Blew Up,1,simon homo sapiens agenda italy ahmadi newest ...
1473,Jaye Robin Brown,The Key to You and Me,1,sweet funny lgbtq romance perfect fan becky al...
1474,Victoria Jamieson,When Stars Are Scattered,1,national book award finalist remarkable graphi...
1475,Megan Atwood,Raise the Stakes,1,audisee audio combine professional narration s...


In [8]:
books_df['label'].value_counts()

1    938
0    539
Name: label, dtype: int64

In [9]:
# let's initialize a CountVectorizer object
# only include terms that show up in 5ish descriptions

cv = CountVectorizer(min_df = .003)
X = cv.fit_transform(books_df['description'])
dtm = pd.DataFrame(X.toarray(), columns = cv.get_feature_names_out())

In [10]:
dtm.head()

Unnamed: 0,19th,21st,aaron,ability,able,abortion,about,absent,absolute,abuse,...,yellow,york,yorker,you,young,younger,youngest,your,youth,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
dtm.shape

(1477, 2674)

In [12]:
# now we'll perform svd
lsa = TruncatedSVD(7)
lsa.fit(dtm)

In [13]:
lsa_by_topic = pd.DataFrame(lsa.components_,
                index = ["component_1", "component_2", "component_3",
                        "component_4",
                         "component_5", "component_6", "component_7"],
                        #  "component_8", "component_9", 'component_10',
                        #  'component_11', 'component_12', 'component_13',
                        #  'component_14', 'component_15', 'component_16',
                        #  'component_17', 'component_18', 'component_19',
                        #  'component_20'],
                columns = cv.get_feature_names_out())

lsa_by_topic

Unnamed: 0,19th,21st,aaron,ability,able,abortion,about,absent,absolute,abuse,...,yellow,york,yorker,you,young,younger,youngest,your,youth,zone
component_1,0.001027,0.002184,0.003328,0.009392,0.010741,0.00133,0.001602,0.00187,0.002326,0.002044,...,0.001293,0.204987,0.003334,0.002651,0.080956,0.007796,0.002682,0.001875,0.010505,0.001665
component_2,0.001059,0.002547,-0.007584,-0.000395,-0.014721,-0.00069,-8.5e-05,0.001736,-0.000341,-0.003209,...,-0.002505,0.070187,0.000744,0.000542,0.001796,-0.00581,-0.001805,3.8e-05,0.00863,-0.001002
component_3,0.001066,0.001023,0.00984,-0.001007,0.004444,0.002551,-0.002953,0.003211,-0.002082,0.003287,...,0.000588,-0.336957,-0.003127,0.003336,0.05729,0.010104,0.00566,0.002059,0.01437,-0.002297
component_4,-0.000869,-0.00386,-0.001986,0.001766,0.004745,-0.000653,0.000351,0.00202,-0.001626,-0.001956,...,-0.001759,-0.002317,-0.004857,0.002315,-0.033079,-0.007769,-0.004264,-0.000431,-0.001112,0.001273
component_5,-0.003034,-0.007082,0.011125,-0.006484,0.004133,-0.001285,0.00086,0.001949,-0.002295,-0.000212,...,0.000598,0.099369,0.000136,-0.000172,-0.065266,0.00117,0.005295,-0.004835,-0.002135,-0.001213
component_6,-0.001917,-0.001415,-0.007306,-0.019352,0.008841,-6.8e-05,9e-05,0.004447,0.003704,0.005867,...,0.000766,-0.00346,0.000442,0.001307,-0.055866,0.005488,0.001229,-0.00556,-0.01156,-0.001673
component_7,-0.002368,-0.003871,-0.004833,-0.009197,-0.009417,-0.003237,3.8e-05,-0.000497,-0.000625,0.001183,...,0.004184,-0.029455,-0.002749,-1e-05,0.015993,-0.006209,-0.002063,-0.002249,-0.019708,0.002297


In [14]:
# Function to display the top n terms in each topic
def display_topics(model, feature_names, num_top_words, topic_names = None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix + 1)
        else:
            print("\nTopic: ", topic_names[ix])
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))
    print("\n")
    return model, feature_names, num_top_words

In [15]:
lsa_top_seven = display_topics(lsa, cv.get_feature_names_out(), 10)
lsa_top_seven


Topic  1
book, new, time, life, york, story, year, author, family, novel

Topic  2
book, time, award, york, child, picture, best, library, winner, national

Topic  3
book, family, child, school, story, girl, life, gender, parent, friend

Topic  4
school, girl, friend, best, new, high, thing, boy, student, friendship

Topic  5
family, new, child, home, york, school, parent, mother, day, father

Topic  6
year, best, life, friend, book, old, family, secret, girl, mother

Topic  7
story, year, novel, boy, best, author, girl, love, award, world




(TruncatedSVD(n_components=7),
 array(['19th', '21st', 'aaron', ..., 'your', 'youth', 'zone'],
       dtype=object),
 10)

In [None]:
# these are pretty generic and indistinguishable! disappointing

In [16]:
# let's try a different kind of modelling
nmf = NMF(10)
nmf.fit(dtm)

In [17]:
nmf_by_topic = pd.DataFrame(nmf.components_,
                index = ["component_1", "component_2", "component_3",
                        "component_4",
                         "component_5", "component_6", "component_7",
                         "component_8", "component_9", 'component_10'],
                        #  'component_11', 'component_12', 'component_13',
                        #  'component_14', 'component_15', 'component_16',
                        #  'component_17', 'component_18', 'component_19',
                        #  'component_20'],
                columns = cv.get_feature_names_out())

nmf_by_topic

Unnamed: 0,19th,21st,aaron,ability,able,abortion,about,absent,absolute,abuse,...,yellow,york,yorker,you,young,younger,youngest,your,youth,zone
component_1,0.0,0.0,0.0,0.062449,0.024041,0.0,0.031517,0.0,0.028173,0.0,...,0.0,4.349282,0.041633,0.0,0.0,0.0,0.0,0.0,0.0,0.013178
component_2,0.008542,0.019418,0.0,0.009256,0.0,0.002356,0.0,0.020815,0.001562,0.0,...,0.0,0.086796,0.006368,0.017031,0.413668,0.013054,0.001985,0.0,0.085462,0.0
component_3,0.018369,0.030647,0.0,0.011232,0.104934,0.01962,0.0,0.0,0.033244,0.005411,...,0.001713,0.0,0.029235,0.0,0.431148,0.104505,0.01902,0.0,0.025319,0.0
component_4,0.008428,0.0,0.064535,0.062196,0.01798,0.011167,0.0,0.020774,0.0,0.0,...,0.0,0.0,0.0,0.025774,0.075382,0.015583,0.0,0.0,0.064929,0.000452
component_5,0.0,0.0,0.09279,0.0,0.047696,0.004037,0.0,0.00659,0.0,0.013317,...,0.0,0.0,0.003898,0.000172,0.017962,0.06044,0.054721,0.0,0.038999,0.0
component_6,0.004371,0.026455,0.0,0.117437,0.0,0.0,0.002726,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.347114,0.021747,0.0,0.022201,0.047053,0.017839
component_7,0.0,0.0,0.019884,0.010817,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025962,0.0,0.011734,0.0,0.403457,0.0,0.009465,0.0,0.0,0.0
component_8,0.000342,0.006847,0.0,0.034082,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.270493,0.0,0.020811,0.11385,0.106538,0.0
component_9,0.0,0.0,0.0,0.0,0.124127,0.0,0.008178,0.012075,0.001339,0.028288,...,0.009167,0.0,0.0,0.0,0.05762,0.039584,0.0,0.0,0.0,0.0
component_10,0.0,0.0,0.0,0.0,0.023806,0.020602,0.0,0.0,0.000278,0.042641,...,0.011529,0.025009,0.0,0.053353,0.318069,0.0,0.0,0.0,0.0,0.025424


In [18]:
nmf_top_ten = display_topics(nmf, cv.get_feature_names_out(), 10)


Topic  1
new, time, york, author, review, series, first, way, the, post

Topic  2
book, best, award, child, picture, library, winner, review, national, honor

Topic  3
life, woman, secret, young, question, relationship, mother, year, god, choice

Topic  4
school, student, high, new, child, kid, racial, first, black, parent

Topic  5
family, child, home, parent, mother, different, full, father, house, many

Topic  6
world, people, american, history, black, war, work, right, series, man

Topic  7
story, author, day, boy, reader, little, tale, love, young, night

Topic  8
gender, identity, people, body, teen, way, transgender, guide, reader, sexuality

Topic  9
year, girl, friend, best, old, love, boy, thing, secret, everything

Topic  10
novel, author, award, love, man, woman, winning, debut, character, war




In [None]:
# these topics are more promising
# went back and tried ten topics on lsa but still not as good

# topics seem something like
# topic 1  new york times bestselling author
# topic 2  award winning children's book
# topic 3  young woman's journey
# topic 4  high school and race
# topic 5  family
# topic 6  unclear
# topic 7  boy storybook
# topic 8  gender and sexuality
# topic 9  girl and boy friendship and love
# topic 10  unclear

In [19]:
# can we get a banned/non-banned split?
nmf_2 = NMF(2)
nmf_2.fit(dtm)

nmf_top_two = display_topics(nmf_2, cv.get_feature_names_out(), 10)


Topic  1
new, life, time, story, novel, year, family, author, world, york

Topic  2
book, time, new, york, best, award, child, author, picture, winner




In [None]:
# not really!! very generic

## Topic modelling with TFIDF

TFIDF (term frequency inverse document frequency) weights words differently than CountVectorizer. Words that appear repeatedly in single documents but not the whole corpus are weighted more than in CountVectorizer.

In [20]:
tfidf = TfidfVectorizer(min_df = .003)
X = tfidf.fit_transform(books_df['description'])
tfidf_df = pd.DataFrame(X.toarray(), columns = tfidf.get_feature_names_out())
tfidf_df

Unnamed: 0,19th,21st,aaron,ability,able,abortion,about,absent,absolute,abuse,...,yellow,york,yorker,you,young,younger,youngest,your,youth,zone
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.071043,0.0,0.0,0.082961,0.00000,0.0,0.0,0.000000,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.042751,0.0,0.0,0.000000,0.00000,0.0,0.0,0.084516,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.090745,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1472,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0
1473,0.0,0.0,0.0,0.0,0.107457,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0
1474,0.0,0.0,0.0,0.0,0.099008,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.050374,0.0,0.0,0.058824,0.10646,0.0,0.0,0.000000,0.0
1475,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0


In [21]:
tfidf_df.shape

(1477, 2674)

In [22]:
# since we had better results with nmf above, let's try that again
nmf = NMF(10)
nmf.fit(tfidf_df)

In [23]:
nmf_by_topic = pd.DataFrame(nmf.components_,
                index = ["component_1", "component_2", "component_3",
                        "component_4",
                         "component_5", "component_6", "component_7",
                         "component_8", "component_9", 'component_10'],
                        #  'component_11', 'component_12', 'component_13',
                        #  'component_14', 'component_15', 'component_16',
                        #  'component_17', 'component_18', 'component_19',
                        #  'component_20'],
                columns = cv.get_feature_names_out())

nmf_by_topic

Unnamed: 0,19th,21st,aaron,ability,able,abortion,about,absent,absolute,abuse,...,yellow,york,yorker,you,young,younger,youngest,your,youth,zone
component_1,0.0,0.0,0.0,0.051758,0.021825,0.0,0.028474,0.0,0.013702,0.0,...,0.0,1.092194,0.028503,0.0,0.017155,0.0,0.0,0.00609,0.0,0.005984
component_2,0.0,0.0,0.019085,0.023874,0.048573,0.030893,0.0,0.0012,0.002216,0.0,...,0.0,0.0,0.0,0.014086,0.000674,0.0,0.0,0.0,0.012255,0.00807
component_3,0.000385,0.004061,0.004866,0.0,0.0,0.0,0.0,0.015371,0.0,0.0,...,0.0,0.007665,0.003449,0.026896,0.187895,0.031534,0.009555,0.0,0.025093,0.0
component_4,0.001081,0.004492,0.0,0.009094,0.0,0.0,0.0,0.0,0.0,0.013738,...,0.0,0.0,0.0,0.0,0.037574,0.0,0.002094,0.028,0.033025,0.0
component_5,0.0,0.0,0.024757,0.001852,0.043563,0.0,0.0,0.006817,0.001322,0.025274,...,0.002193,0.0,0.004453,0.0,0.00346,0.029657,0.026485,0.0,0.011579,0.0
component_6,0.0088,0.005304,0.016847,0.020733,0.0,0.01666,0.0,0.003011,0.0,0.0,...,0.0,0.0,0.006433,0.0,0.039347,0.0,0.0,0.001665,0.077301,0.0
component_7,0.0,0.0,0.0,0.0,0.00092,0.0,0.0,0.024158,0.0,0.0,...,0.013704,0.0,0.0,0.000511,0.0,0.04594,0.0,0.00491,0.0,0.0
component_8,0.010359,0.0,0.0,0.0,0.017039,0.021994,0.0,0.0,0.000485,0.0,...,0.003073,0.0,0.004088,0.0,0.169078,0.01778,0.0,0.0,0.0,0.0
component_9,0.010296,0.012646,0.008741,0.007995,0.002501,0.0,0.0,0.0,0.013409,0.009214,...,0.003698,0.0,0.004769,0.009603,0.098753,0.0,0.0,0.003893,0.00204,0.006104
component_10,0.0,0.0,0.0,0.001647,0.006713,0.0,0.007697,0.0,0.0,0.014438,...,0.007491,0.0,0.0,0.0,0.193081,0.005069,0.020309,0.0,0.0,0.0


In [24]:
nmf_top_ten = display_topics(nmf, tfidf.get_feature_names_out(), 10)
nmf_top_ten


Topic  1
time, new, york, author, novel, review, book, series, the, post

Topic  2
school, friend, high, best, thing, love, friendship, life, student, secret

Topic  3
book, child, award, picture, best, honor, library, illustrator, winning, winner

Topic  4
gender, identity, people, sexuality, sex, teen, body, binary, transgender, reader

Topic  5
family, home, child, parent, mother, house, life, brother, dad, different

Topic  6
american, black, african, white, racial, right, civil, history, race, america

Topic  7
year, old, mother, fifteen, father, seventeen, life, twelve, secret, sixteen

Topic  8
woman, life, first, men, young, medicine, secret, novel, career, printing

Topic  9
novel, world, classic, story, war, work, edition, man, human, life

Topic  10
girl, boy, story, something, little, town, sister, young, day, dress




(NMF(n_components=10),
 array(['19th', '21st', 'aaron', ..., 'your', 'youth', 'zone'],
       dtype=object),
 10)

In [None]:
# way better!!

# topics seem something like
# topic 1  new york times bestselling author
# topic 2  high school love and friendship
# topic 3  award winning children's book
# topic 4  gender and sexuality
# topic 5  families, parents, home
# topic 6  race and america
# topic 7  growing up
# topic 8  young woman protagonist
# topic 9  classic book or series
# topic 10  story about a girl or boy

In [25]:
# what about more categories, can we capture more complexity?
nmf = NMF(14)
nmf.fit(tfidf_df)

In [26]:
nmf_top_fourteen = display_topics(nmf, tfidf.get_feature_names_out(), 10)
nmf_top_fourteen


Topic  1
time, new, york, review, author, book, the, post, today, day

Topic  2
school, high, student, kid, group, teacher, classroom, senior, middle, new

Topic  3
book, child, picture, award, best, honor, library, illustrator, winner, young

Topic  4
gender, identity, body, binary, sexuality, sex, guide, reader, self, female

Topic  5
family, home, child, parent, mother, house, dad, different, brother, life

Topic  6
american, black, african, white, civil, right, racial, america, history, race

Topic  7
year, old, fifteen, mother, father, seventeen, twelve, life, sixteen, thirteen

Topic  8
woman, life, first, young, men, medicine, career, printing, southern, trace

Topic  9
classic, edition, introduction, penguin, work, reader, literature, note, life, story

Topic  10
boy, girl, story, something, little, young, town, sister, dress, day

Topic  11
world, series, war, human, man, vampire, power, fantasy, death, secret

Topic  12
people, gay, community, lgbt, transgender, teen, lesbia

(NMF(n_components=14),
 array(['19th', '21st', 'aaron', ..., 'your', 'youth', 'zone'],
       dtype=object),
 10)

In [None]:
# of all the numbers that I tried, I think that I like this one the best!
# it captures a bit more complexity than 10 topics

# topics seem something like
# topic 1  new york times bestselling author
# topic 2  middle and high school
# topic 3  award winning children's book
# topic 4  gender and sexuality
# topic 5  families, parents, home
# topic 6  race and america
# topic 7  growing up
# topic 8  young woman meeting a man
# topic 9  classic book or series
# topic 10  boy/girl story
# topic 11  some kind of book series
# topic 12  lgbtq sexuality
# topic 13  friendship
# topic 14  prize winning novel

## Building Feature Columns

Now we will build a feature column for each of these 14 columns and merge with our original dataframe.

In [27]:
doc_topic_matrix = nmf.transform(tfidf_df)

In [28]:
doc_topic_nmf_tfidf = pd.DataFrame(doc_topic_matrix.round(5),
                            index = books_df['title'], columns = ["nyt_author",
                                                               "middle_and_high_school",
                                                               "award_winning_childrens",
                                                               "gender_and_sexuality",
                                                               "families_and_home",
                                                               "race_and_america",
                                                               "growing_up",
                                                               "young_woman_meets_man",
                                                               "classics",
                                                               "boy_girl_story",
                                                               "book_series",
                                                               "lgbtq_sexuality",
                                                               "friendship",
                                                               "prize_novel"])

In [29]:
doc_topic_nmf_tfidf.head()

Unnamed: 0_level_0,nyt_author,middle_and_high_school,award_winning_childrens,gender_and_sexuality,families_and_home,race_and_america,growing_up,young_woman_meets_man,classics,boy_girl_story,book_series,lgbtq_sexuality,friendship,prize_novel
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
The God of Small Things,0.03021,0.0,0.0,0.0,0.08918,0.0,0.04075,0.0,0.05655,0.0154,0.0105,0.0,0.01384,0.10782
Fry Bread,0.03985,0.01887,0.1976,0.0,0.03409,0.02503,0.00571,0.0,0.0,0.0,0.0,0.0,0.0,0.00016
Saturday,0.04611,0.00214,0.0,0.0,0.03452,0.0,0.01626,0.0,0.0,0.00769,0.0,0.00306,0.01487,0.0
The Bone Collector,0.05074,0.00196,0.0,0.0,0.0,0.00357,0.0,0.0,0.01026,0.00323,0.06275,0.0,0.0,0.01957
The Marriage Bargain,0.0,0.0,0.0,0.0,0.0,0.0,0.06071,0.0,0.0,0.0,0.0,0.0,0.10148,0.00222


In [30]:
doc_topic_nmf_tfidf.shape

(1477, 14)

In [31]:
books_df.shape

(1477, 4)

In [35]:
books_df

Unnamed: 0,author,title,label,description
0,Arundhati Roy,The God of Small Things,1,debut novel affluent indian family fateful day...
1,Kevin Noble Maillard,Fry Bread,1,winner robert sibert informational book medal ...
2,Oge Mora,Saturday,1,special saturday plan mother way time
3,Jeffery Deaver,The Bone Collector,0,miss television series lincoln rhyme hunt bone...
4,Jennifer Probst,The Marriage Bargain,0,order selfish need bookstore owner billionaire...
...,...,...,...,...
1472,Arvin Ahmadi,How It All Blew Up,1,simon homo sapiens agenda italy ahmadi newest ...
1473,Jaye Robin Brown,The Key to You and Me,1,sweet funny lgbtq romance perfect fan becky al...
1474,Victoria Jamieson,When Stars Are Scattered,1,national book award finalist remarkable graphi...
1475,Megan Atwood,Raise the Stakes,1,audisee audio combine professional narration s...


In [36]:
prediction_df = pd.merge(books_df, doc_topic_nmf_tfidf, how='inner', on='title')
prediction_df

Unnamed: 0,author,title,label,description,nyt_author,middle_and_high_school,award_winning_childrens,gender_and_sexuality,families_and_home,race_and_america,growing_up,young_woman_meets_man,classics,boy_girl_story,book_series,lgbtq_sexuality,friendship,prize_novel
0,Arundhati Roy,The God of Small Things,1,debut novel affluent indian family fateful day...,0.03021,0.00000,0.00000,0.00000,0.08918,0.00000,0.04075,0.00000,0.05655,0.01540,0.01050,0.00000,0.01384,0.10782
1,Kevin Noble Maillard,Fry Bread,1,winner robert sibert informational book medal ...,0.03985,0.01887,0.19760,0.00000,0.03409,0.02503,0.00571,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00016
2,Oge Mora,Saturday,1,special saturday plan mother way time,0.04611,0.00214,0.00000,0.00000,0.03452,0.00000,0.01626,0.00000,0.00000,0.00769,0.00000,0.00306,0.01487,0.00000
3,Jeffery Deaver,The Bone Collector,0,miss television series lincoln rhyme hunt bone...,0.05074,0.00196,0.00000,0.00000,0.00000,0.00357,0.00000,0.00000,0.01026,0.00323,0.06275,0.00000,0.00000,0.01957
4,Jennifer Probst,The Marriage Bargain,0,order selfish need bookstore owner billionaire...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.06071,0.00000,0.00000,0.00000,0.00000,0.00000,0.10148,0.00222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496,Arvin Ahmadi,How It All Blew Up,1,simon homo sapiens agenda italy ahmadi newest ...,0.00737,0.00000,0.00000,0.01545,0.03822,0.01567,0.09490,0.02039,0.00000,0.00000,0.00716,0.00206,0.06154,0.02668
1497,Jaye Robin Brown,The Key to You and Me,1,sweet funny lgbtq romance perfect fan becky al...,0.00872,0.00000,0.00616,0.00000,0.00000,0.00021,0.00000,0.00000,0.00000,0.04664,0.00000,0.00235,0.12218,0.03016
1498,Victoria Jamieson,When Stars Are Scattered,1,national book award finalist remarkable graphi...,0.02666,0.03437,0.01576,0.00000,0.09213,0.00000,0.00724,0.01363,0.00000,0.02665,0.00188,0.00375,0.00623,0.10011
1499,Megan Atwood,Raise the Stakes,1,audisee audio combine professional narration s...,0.00365,0.00000,0.00000,0.01220,0.00931,0.00000,0.00791,0.00322,0.00882,0.01792,0.01367,0.03065,0.00817,0.00476


In [37]:
prediction_df.drop_duplicates(inplace=True)
# I'm not sure why there are duplicates

In [38]:
prediction_df.shape

(1477, 18)

In [39]:
prediction_df.to_csv('prediction_df.csv', index=False)