<a href="https://colab.research.google.com/github/liyueling13/Predicting-Banned-Books/blob/main/5)_Banned_Books_Topic_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 5) Banned Books - Topic Modelling

Now we can perform topic modelling to break down our descriptions into their principal component parts.



## Setup

In [None]:
# this performs line wrapping on output text in Colab

from IPython.display import HTML, display

def set_css():
  """Emit a <style> rule that makes <pre> output wrap instead of overflowing."""
  wrap_rule = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_rule))

# hook set_css into the 'pre_run_cell' event so it runs ahead of every cell
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# mount drive
import os
from google.colab import drive

drive.mount('/content/drive')

# work from the project folder so the relative CSV paths used later resolve
project_dir = '/content/drive/My Drive/Data Science/Springboard assignments/Capstone Three/Banned Books'
os.chdir(project_dir)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
import spacy
import re

In [None]:
# load the small English spaCy pipeline downloaded in the previous cell
# NOTE(review): nlp is not referenced in the rest of this notebook as shown — confirm it is still needed
nlp = spacy.load('en_core_web_sm')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

## Topic Modelling with CountVectorizer

In [None]:
# load the cleaned book table; its 'description' text and 'label' column are used below
# (the file name is relative to the working directory set in the Setup section)
books_df = pd.read_csv('all_books_cleaned.csv')
books_df

Unnamed: 0,author,title,label,description
0,Arundhati Roy,The God of Small Things,1,debut novel affluent indian family fateful day...
1,Kevin Noble Maillard,Fry Bread,1,winner robert sibert informational book medal ...
2,Oge Mora,Saturday,1,special saturday plan mother way time
3,Jeffery Deaver,The Bone Collector,0,miss television series lincoln rhyme hunt bone...
4,Jennifer Probst,The Marriage Bargain,0,order selfish need bookstore owner billionaire...
...,...,...,...,...
1696,Arvin Ahmadi,How It All Blew Up,1,simon homo sapiens agenda italy ahmadi newest ...
1697,Jaye Robin Brown,The Key to You and Me,1,sweet funny lgbtq romance perfect fan becky al...
1698,Victoria Jamieson,When Stars Are Scattered,1,national book award finalist remarkable graphi...
1699,Megan Atwood,Raise the Stakes,1,audisee audio combine professional narration s...


In [None]:
books_df['label'].value_counts()

1    1013
0     688
Name: label, dtype: int64

In [None]:
# let's initialize a CountVectorizer object
# min_df is a float here, i.e. a fraction of the documents: .003 of the 1701
# descriptions is about 5, so terms found in fewer descriptions than that are dropped

cv = CountVectorizer(min_df = .003)
X = cv.fit_transform(books_df['description'])
# densify the counts into a document-term DataFrame, one column per vocabulary term
dtm = pd.DataFrame(X.toarray(), columns = cv.get_feature_names_out())

In [None]:
dtm.head()

Unnamed: 0,19th,21st,aaron,ability,able,abortion,about,absent,absolute,abuse,...,york,yorker,you,young,younger,youngest,your,youth,zoey,zone
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,3,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
dtm.shape

(1701, 2706)

In [None]:
# now we'll perform svd
# TruncatedSVD's default solver is randomized, so pin random_state to keep the
# components (and the topics printed from them) reproducible across kernel restarts
lsa = TruncatedSVD(n_components=7, random_state=42)
lsa.fit(dtm)

In [None]:
# one row per LSA component, one column per vocabulary term; the row labels
# component_1..component_k are derived from the fitted model so they stay in
# sync when the number of components is changed
lsa_by_topic = pd.DataFrame(
    lsa.components_,
    index=[f"component_{k}" for k in range(1, lsa.components_.shape[0] + 1)],
    columns=cv.get_feature_names_out(),
)

lsa_by_topic

Unnamed: 0,19th,21st,aaron,ability,able,abortion,about,absent,absolute,abuse,...,york,yorker,you,young,younger,youngest,your,youth,zoey,zone
component_1,0.000992,0.001835,0.002324,0.009879,0.008617,0.000916,0.001271,0.001353,0.002039,0.001371,...,0.208274,0.003857,0.002329,0.091038,0.006276,0.002372,0.001762,0.010803,0.001671,0.001745
component_2,0.00042,0.001516,-0.0022,0.000362,-0.005396,0.000234,-0.001065,0.001807,-0.001859,-0.001082,...,-0.101585,-0.001461,-0.00027,0.052717,-0.002573,-3.8e-05,-0.000158,0.018775,-0.001801,-0.001288
component_3,-0.000381,0.000323,-0.007906,0.003242,-0.012649,-0.00213,0.000932,-0.001501,-0.00044,-0.00335,...,0.209472,0.002461,-0.002569,-0.046447,-0.011395,-0.004268,-0.001527,-0.004832,-0.005557,-0.00102
component_4,0.001959,0.003214,0.007132,-0.002595,-0.003938,0.001267,-0.002129,-0.000305,-0.000987,0.001985,...,-0.23083,-0.005784,0.001339,0.057024,0.005655,0.003485,0.004895,0.006666,-3.3e-05,-0.001197
component_5,-0.000266,-0.002418,0.003954,0.015541,0.000931,9e-05,-0.00139,0.001147,-0.001645,-0.000581,...,-0.127112,-0.006588,0.001478,-0.009176,-0.00567,-0.000952,0.000611,0.002788,0.011183,0.003493
component_6,0.003951,0.007649,-0.002855,0.032056,-0.00225,0.000705,-0.00023,-0.001849,-0.001579,-0.003472,...,-0.005807,0.00141,0.000659,0.000321,-0.002612,-0.003356,0.005003,0.019254,-0.000313,-0.002227
component_7,-0.001331,-0.005399,0.011309,0.01152,-0.001121,-0.000116,-0.001023,0.000682,-0.004768,-0.001745,...,0.064327,-0.001299,-0.002559,-0.07497,0.001679,0.002358,-0.001923,0.011498,-0.00115,-0.002909


In [None]:
# Function to display the top n terms in each topic
def display_topics(model, feature_names, num_top_words, topic_names = None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``; each row is argsorted to rank terms.
    feature_names : indexable of term labels aligned with the columns of
        ``components_``.
    num_top_words : how many terms to print per topic.
    topic_names : optional sequence of headings; when it is missing, or the
        entry for a topic is falsy, the 1-based topic number is printed instead.

    Returns
    -------
    tuple
        ``(model, feature_names, num_top_words)`` -- the arguments, unchanged.
    """
    for position, weights in enumerate(model.components_):
        heading = topic_names[position] if topic_names else None
        if heading:
            print("\nTopic: ", heading)
        else:
            print("\nTopic ", position + 1)
        # indices ordered from largest weight to smallest, cut to the requested count
        ranked = weights.argsort()[::-1][:num_top_words]
        print(", ".join(feature_names[idx] for idx in ranked))
    print("\n")
    return model, feature_names, num_top_words

In [None]:
# print the 10 highest-weighted terms of each LSA component
# display_topics returns its own arguments, so the tuple echoed after the
# printout is just (model, feature names, 10)
lsa_top_seven = display_topics(lsa, cv.get_feature_names_out(), 10)
lsa_top_seven


Topic  1
book, new, time, york, life, story, author, year, novel, world

Topic  2
book, child, award, library, picture, best, winner, gender, notable, honor

Topic  3
time, york, jordan, book, robert, new, series, wheel, dragon, reborn

Topic  4
world, jordan, robert, child, family, novel, story, series, tower, wheel

Topic  5
school, friend, girl, jordan, best, high, robert, boy, student, gender

Topic  6
people, school, life, american, racial, gender, history, race, student, identity

Topic  7
family, child, school, new, racial, american, parent, student, black, race




(TruncatedSVD(n_components=7),
 array(['19th', '21st', 'aaron', ..., 'youth', 'zoey', 'zone'],
       dtype=object),
 10)

In [None]:
# not perfect but seems promising
# something like --
# topic 1  new york times bestseller book?
# topic 2  award winning children's book (award, honor, winner, library)
# topic 3  some kind of fantasy novel (wheel, dragon, series)
# topic 4  similar fantasy (world, tower, wheel)
# topic 5  friendship and gender
# topic 6  race and america, especially with respect to history
# topic 7  race and america, especially the black race

In [None]:
# let's try a different kind of modelling
# random_state is pinned so the ten topics come out the same on every run
nmf = NMF(n_components=10, random_state=42)
nmf.fit(dtm)

In [None]:
# one row per NMF topic, one column per CountVectorizer term; the row labels
# component_1..component_k follow the fitted model instead of a hand-typed list
nmf_by_topic = pd.DataFrame(
    nmf.components_,
    index=[f"component_{k}" for k in range(1, nmf.components_.shape[0] + 1)],
    columns=cv.get_feature_names_out(),
)

nmf_by_topic

Unnamed: 0,19th,21st,aaron,ability,able,abortion,about,absent,absolute,abuse,...,york,yorker,you,young,younger,youngest,your,youth,zoey,zone
component_1,0.0,0.0,0.0,0.066438,0.021681,0.0,0.026641,0.0,0.018538,0.0,...,4.565165,0.082593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.002582,0.008584,0.0,0.00513,0.0,0.000205,0.0,0.013674,0.0,0.0,...,0.10998,0.000403,0.007718,0.75631,0.0,0.0,0.0,0.11602,0.0,0.0
component_3,0.007429,0.01092,0.0,0.0,0.073116,0.014456,0.00094,0.001746,0.017475,0.006884,...,0.0,0.01388,0.041645,0.401265,0.12047,0.029547,0.000382,0.0,0.0,0.0
component_4,0.001473,0.0,0.007108,0.064371,0.0,0.0,0.0,0.0,0.0,0.0,...,0.800337,0.0,0.002569,0.0,0.0,0.003163,0.008378,0.0,0.0,0.0
component_5,0.0,0.0,0.0,0.0,0.152953,0.006552,0.003738,0.009093,0.013216,0.041173,...,0.0,0.0,0.01481,0.324482,0.049249,0.004666,0.0,0.0,0.084143,0.038175
component_6,0.016452,0.0,0.043405,0.275386,0.0,0.008648,0.0,0.02123,0.0,0.0,...,0.0,0.0,0.019738,0.048732,0.011829,0.0,0.0,0.084507,0.020173,0.0
component_7,0.0,0.0,0.094659,0.0,0.050709,0.011462,0.0,0.009857,0.0,0.017471,...,0.0,0.0,0.00066,0.094821,0.08008,0.052131,0.0,0.060923,0.0,0.0
component_8,0.000383,0.0,0.019972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.007582,0.0,0.300664,0.0,0.003998,0.005015,0.0,0.0,0.014443
component_9,0.0,0.018749,0.0,0.052895,0.019433,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.031587,0.0,0.021578,0.108801,0.215564,0.021307,0.0
component_10,0.025235,0.05901,0.0,0.033574,0.013743,0.006452,0.007181,0.0,0.021128,0.007277,...,0.0,0.0,0.00502,0.650838,0.0,0.0,0.008969,0.0,0.0,0.016458


In [None]:
# print the 10 highest-weighted terms of each NMF topic fitted on the count matrix
nmf_top_ten = display_topics(nmf, cv.get_feature_names_out(), 10)


Topic  1
new, time, york, author, the, people, year, way, review, post

Topic  2
book, best, award, child, library, young, picture, winner, review, honor

Topic  3
life, woman, year, secret, young, question, truth, mother, journey, face

Topic  4
time, jordan, robert, wheel, novel, series, world, dragon, great, reborn

Topic  5
girl, friend, best, year, novel, love, author, old, thing, boy

Topic  6
school, student, racial, high, new, conversation, race, courageous, group, black

Topic  7
family, child, parent, home, mother, year, different, old, father, american

Topic  8
story, boy, author, day, love, little, reader, first, night, tale

Topic  9
gender, people, identity, way, transgender, teen, body, sexuality, community, guide

Topic  10
world, novel, series, classic, reader, war, american, dark, history, woman




In [None]:
# these topics are way better!
# went back and tried ten topics on lsa but still not as good

# topics seem something like
# topic 1  new york times bestselling author
# topic 2  award winning children's book
# topic 3  some kind of search: a secret, a journey, a question, a truth, a mother
# topic 4  some kind of fantasy novel
# topic 5  friendship, love, and gender
# topic 6  race and high school, especially black (americans)
# topic 7  families, parents, home
# topic 8  little children's book about a boy
# topic 9  gender and sexuality
# topic 10  classic book or series about america or war or history

In [None]:
# can we get a banned/non-banned split?
# random_state is pinned so the two printed topics are reproducible
nmf_2 = NMF(n_components=2, random_state=42)
nmf_2.fit(dtm)

nmf_top_two = display_topics(nmf_2, cv.get_feature_names_out(), 10)


Topic  1
new, time, life, york, novel, story, world, author, year, family

Topic  2
book, child, best, award, new, school, young, library, story, time




In [None]:
# not really!! very generic

## Topic modelling with TFIDF

TF-IDF (term frequency–inverse document frequency) weights words differently than CountVectorizer. Instead of using raw counts, it up-weights words that appear often within a single description but rarely across the whole corpus, and down-weights words that appear in most descriptions.

In [None]:
# same min_df cut-off as the CountVectorizer section, applied to the same descriptions
tfidf = TfidfVectorizer(min_df = .003)
# NOTE: this rebinds X, which held the raw-count matrix until now
X = tfidf.fit_transform(books_df['description'])
# dense document-term DataFrame of tf-idf weights, one column per vocabulary term
tfidf_df = pd.DataFrame(X.toarray(), columns = tfidf.get_feature_names_out())
tfidf_df

Unnamed: 0,19th,21st,aaron,ability,able,abortion,about,absent,absolute,abuse,...,york,yorker,you,young,younger,youngest,your,youth,zoey,zone
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.072469,0.0,0.0,0.083840,0.000000,0.0,0.0,0.00000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.030976,0.0,0.0,0.107510,0.000000,0.0,0.0,0.06318,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.089177,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1696,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0
1697,0.0,0.0,0.0,0.0,0.108939,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0
1698,0.0,0.0,0.0,0.0,0.099173,0.0,0.0,0.0,0.000000,0.0,...,0.048623,0.0,0.0,0.056253,0.105728,0.0,0.0,0.00000,0.0,0.0
1699,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0


In [None]:
tfidf_df.shape

(1701, 2706)

In [None]:
# since we had good results with nmf above, let's try that again
# random_state is pinned so the ten tf-idf topics come out the same on every run
nmf = NMF(n_components=10, random_state=42)
nmf.fit(tfidf_df)

In [None]:
# one row per NMF topic, one column per term. This model was fitted on the
# tf-idf matrix, so the column labels must come from the tfidf vectorizer
# (not cv); the row labels component_1..component_k follow the fitted model.
nmf_by_topic = pd.DataFrame(
    nmf.components_,
    index=[f"component_{k}" for k in range(1, nmf.components_.shape[0] + 1)],
    columns=tfidf.get_feature_names_out(),
)

nmf_by_topic

Unnamed: 0,19th,21st,aaron,ability,able,abortion,about,absent,absolute,abuse,...,york,yorker,you,young,younger,youngest,your,youth,zoey,zone
component_1,0.0,0.0,0.0,0.049973,0.025028,0.0,0.026952,0.0,0.020439,0.0,...,1.034508,0.046578,0.0,0.046485,0.0,0.0,0.007911,0.0,0.0,0.007273
component_2,0.0,0.0,0.013635,0.014944,0.041681,0.023027,0.0,0.0,0.001135,0.00616,...,0.0,0.0,0.012192,0.066094,0.005373,0.00739,0.0,0.0,0.049422,0.00676
component_3,0.000762,0.001746,0.003362,0.002419,0.0,0.0,0.0,0.013943,0.0,0.0,...,0.034669,0.001378,0.024492,0.268482,0.027275,0.010701,0.0,0.036252,0.0,0.0
component_4,0.011564,0.0,0.0,0.0,0.012774,0.016594,0.0,0.0,0.0,0.000786,...,0.0,0.000564,0.0,0.296813,0.021093,0.0,0.0,0.0,0.0,0.0
component_5,0.000498,0.004392,0.0,0.00795,0.0,0.0,0.0,0.0,0.0,0.015857,...,0.0,0.0,0.0,0.034166,0.0,0.001544,0.026722,0.041495,0.0,0.000561
component_6,0.008027,0.00326,0.017079,0.024047,0.003295,0.017226,0.0,0.000341,0.0,0.0,...,0.0,0.007038,0.0,0.028508,0.0,0.0,0.0,0.080091,0.0,0.0
component_7,0.0,0.0,0.027114,0.000367,0.043992,0.020211,0.0,0.010416,0.0,0.026124,...,0.0,0.0,0.0,0.017496,0.033608,0.033652,0.0,0.014079,0.0,0.0
component_8,0.0,0.0,0.0,0.0,0.013155,0.0,0.0,0.023375,0.0,0.003807,...,0.0,0.0,0.001296,0.027194,0.044435,0.0,0.005151,0.0,0.0,0.0
component_9,0.014096,0.013804,0.002382,0.0,0.0,0.0,6.1e-05,0.0,0.018446,0.004092,...,0.0,0.002583,0.008792,0.095138,0.0,0.0,0.003177,0.0,0.0,0.006147
component_10,0.0,0.000603,0.009104,0.027653,0.0,0.0,0.001839,0.001573,0.0,0.004384,...,0.117677,0.0,0.00018,0.0,0.0,0.0,0.000559,0.0,0.012366,0.000171


In [None]:
# print the 10 highest-weighted terms of each NMF topic fitted on the tf-idf matrix
# (the tuple echoed afterwards is display_topics returning its own arguments)
nmf_top_ten = display_topics(nmf, tfidf.get_feature_names_out(), 10)
nmf_top_ten


Topic  1
new, time, york, author, novel, life, love, the, way, day

Topic  2
school, girl, friend, boy, high, thing, love, best, friendship, story

Topic  3
book, child, award, picture, best, library, honor, young, winner, illustrator

Topic  4
woman, life, young, men, first, secret, novel, girl, man, face

Topic  5
gender, identity, people, transgender, sexuality, teen, body, sex, binary, guide

Topic  6
american, black, african, white, racial, people, right, civil, history, race

Topic  7
family, child, parent, home, mother, story, life, father, house, brother

Topic  8
year, old, mother, fifteen, seventeen, father, secret, life, friend, twelve

Topic  9
classic, novel, edition, work, world, story, penguin, introduction, war, reader

Topic  10
series, jordan, sookie, robert, vampire, wheel, time, world, dragon, tower




(NMF(n_components=10),
 array(['19th', '21st', 'aaron', ..., 'youth', 'zoey', 'zone'],
       dtype=object),
 10)

In [None]:
# similar topics to the CountVectorizer run above,
# but probably a bit clearer

# topics seem something like
# topic 1  new york times bestselling author
# topic 2  high school love and friendship
# topic 3  award winning children's book
# topic 4  young woman protagonist
# topic 5  gender and sexuality
# topic 6  race and america
# topic 7  families, parents, home
# topic 8  growing up (twelve, fifteen, seventeen, year, old)
# topic 9  classic book or series
# topic 10  some kind of fantasy novel

In [None]:
# what about more categories, can we capture more complexity?
# random_state is pinned because the hand-written topic labels and feature
# column names further down assume the topics come out in the same order on
# every run; re-check those labels whenever this cell's settings change
nmf = NMF(n_components=14, random_state=42)
nmf.fit(tfidf_df)

In [None]:
# print the 10 highest-weighted terms of each of the 14 tf-idf NMF topics
# (the tuple echoed afterwards is display_topics returning its own arguments)
nmf_top_fourteen = display_topics(nmf, tfidf.get_feature_names_out(), 10)
nmf_top_fourteen


Topic  1
new, time, york, author, novel, the, day, review, world, life

Topic  2
friend, love, best, life, thing, secret, everything, summer, friendship, heart

Topic  3
book, child, award, picture, best, library, honor, winner, young, illustrator

Topic  4
woman, life, young, first, men, novel, man, secret, world, face

Topic  5
gender, identity, body, sexuality, binary, sex, guide, reader, self, topic

Topic  6
american, black, african, white, racial, civil, race, right, history, america

Topic  7
family, old, year, mother, father, parent, life, home, child, brother

Topic  8
school, high, student, kid, group, senior, day, middle, home, new

Topic  9
classic, edition, novel, work, penguin, introduction, world, war, literature, story

Topic  10
sookie, series, vampire, stackhouse, blood, waitress, human, cocktail, louisiana, original

Topic  11
boy, story, something, world, indian, team, day, young, men, name

Topic  12
jordan, robert, wheel, time, dragon, tower, reborn, series, roma

(NMF(n_components=14),
 array(['19th', '21st', 'aaron', ..., 'youth', 'zoey', 'zone'],
       dtype=object),
 10)

In [None]:
# of all the numbers that I tried, I think that I like this one the best!
# it captures a bit more complexity than 10 topics

# topics seem something like
# topic 1  new york times bestselling author
# topic 2  school/summer friendship and love
# topic 3  award winning children's book
# topic 4  young woman protagonist
# topic 5  gender and sexuality
# topic 6  race and america
# topic 7  families, parents, home
# topic 8  middle and high school
# topic 9  classic book or series
# topic 10  vampire novel
# topic 11  boy protagonist
# topic 12  fantasy novel
# topic 13  lgbtq sexuality
# topic 14  girl protagonist

## Building Feature Columns

Now we will build a feature column for each of these 14 topics and merge them with our original dataframe.

In [None]:
# project every description onto the fitted NMF topics: one row per book, one column per topic
doc_topic_matrix = nmf.transform(tfidf_df)

In [None]:
# hand-assigned names for the 14 topics, listed in the order the topics were printed
topic_feature_names = [
    "nyt_author",
    "school_friendship_and_love",
    "award_winning_childrens",
    "young_woman_protag",
    "gender_and_sexuality",
    "race_and_america",
    "families_and_home",
    "middle_and_high_school",
    "classics",
    "vampire_novel",
    "boy_protagonist",
    "fantasy",
    "lgbtq_sexuality",
    "girl_protagonist",
]

# topic weights rounded to 5 decimals, indexed by book title
doc_topic_nmf_tfidf = pd.DataFrame(
    doc_topic_matrix.round(5),
    index=books_df['title'],
    columns=topic_feature_names,
)

In [None]:
doc_topic_nmf_tfidf.head()

Unnamed: 0_level_0,nyt_author,school_friendship_and_love,award_winning_childrens,young_woman_protag,gender_and_sexuality,race_and_america,families_and_home,middle_and_high_school,classics,vampire_novel,boy_protagonist,fantasy,lgbtq_sexuality,girl_protagonist
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
The God of Small Things,0.03352,0.02454,0.01418,0.00123,0.0,0.0,0.0886,0.0,0.10248,0.0,0.02749,0.0,0.0,0.00032
Fry Bread,0.01759,0.0,0.2783,0.0,0.0,0.01119,0.00848,0.00853,0.0,0.0,0.0,0.01092,0.0,0.0
Saturday,0.03628,0.00376,0.0,0.0,0.00089,0.0,0.03001,0.00353,0.0,0.0,0.0,0.00965,0.0,0.01952
The Bone Collector,0.0518,0.0,0.0,0.00295,0.0,0.00543,0.0,0.00103,0.01385,0.04776,0.00305,0.00497,0.0,0.0033
The Marriage Bargain,0.0,0.08728,0.0,0.0,0.0,0.0,0.0191,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
doc_topic_nmf_tfidf.shape

(1701, 14)

In [None]:
books_df.shape

(1701, 4)

In [None]:
# Titles are not unique in books_df, so merging on 'title' pairs every copy of a
# repeated title with every matching topic row and inflates the table
# (1701 books became 1735 rows). The topic rows were computed from books_df in
# order, so attach them by position instead of by title.
prediction_df = pd.concat(
    [books_df.reset_index(drop=True), doc_topic_nmf_tfidf.reset_index(drop=True)],
    axis=1,
)
# guard against any future change that breaks the one-row-per-book alignment
assert len(prediction_df) == len(books_df), "row count changed while attaching topic features"
prediction_df

Unnamed: 0,author,title,label,description,nyt_author,school_friendship_and_love,award_winning_childrens,young_woman_protag,gender_and_sexuality,race_and_america,families_and_home,middle_and_high_school,classics,vampire_novel,boy_protagonist,fantasy,lgbtq_sexuality,girl_protagonist
0,Arundhati Roy,The God of Small Things,1,debut novel affluent indian family fateful day...,0.03352,0.02454,0.01418,0.00123,0.00000,0.00000,0.08860,0.00000,0.10248,0.0,0.02749,0.00000,0.00000,0.00032
1,Kevin Noble Maillard,Fry Bread,1,winner robert sibert informational book medal ...,0.01759,0.00000,0.27830,0.00000,0.00000,0.01119,0.00848,0.00853,0.00000,0.0,0.00000,0.01092,0.00000,0.00000
2,Kevin Noble Maillard,Fry Bread,1,winner robert sibert informational book medal ...,0.01759,0.00000,0.27830,0.00000,0.00000,0.01119,0.00848,0.00853,0.00000,0.0,0.00000,0.01092,0.00000,0.00000
3,Kevin Noble Maillard,Fry Bread,1,winner robert sibert informational book medal ...,0.01759,0.00000,0.27830,0.00000,0.00000,0.01119,0.00848,0.00853,0.00000,0.0,0.00000,0.01092,0.00000,0.00000
4,Kevin Noble Maillard,Fry Bread,1,winner robert sibert informational book medal ...,0.01759,0.00000,0.27830,0.00000,0.00000,0.01119,0.00848,0.00853,0.00000,0.0,0.00000,0.01092,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730,Arvin Ahmadi,How It All Blew Up,1,simon homo sapiens agenda italy ahmadi newest ...,0.01078,0.07607,0.00000,0.01401,0.01718,0.01964,0.08449,0.00000,0.00000,0.0,0.00000,0.00000,0.00000,0.00000
1731,Jaye Robin Brown,The Key to You and Me,1,sweet funny lgbtq romance perfect fan becky al...,0.00678,0.10552,0.01222,0.00000,0.00000,0.00244,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,0.00000,0.09311
1732,Victoria Jamieson,When Stars Are Scattered,1,national book award finalist remarkable graphi...,0.03275,0.00535,0.02473,0.01480,0.00000,0.00000,0.07097,0.03407,0.02942,0.0,0.01365,0.00000,0.00000,0.03165
1733,Megan Atwood,Raise the Stakes,1,audisee audio combine professional narration s...,0.00454,0.00977,0.00000,0.00000,0.01535,0.00000,0.00895,0.00000,0.01380,0.0,0.00867,0.00000,0.02394,0.02217


In [None]:
prediction_df.shape

(1735, 18)

In [None]:
# save the combined table for later use; index=False keeps the row index out of the file
prediction_df.to_csv('prediction_df.csv', index=False)