Objective: build a model that predicts whether a book will be popular or not.

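The dataset contains the following columns (plus `title` and `description`, which also appear in the data preview below):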
* `price`
* `popularity` (target variable)
* `review/summary`
* `review/text`
* `review/helpfulness`
* `authors`
* `categories`

In [91]:
# pandas provides the DataFrame used throughout the notebook
import pandas as pd

# Load the books dataset from disk
BOOKS_CSV = "data/books.csv"
books = pd.read_csv(BOOKS_CSV)

# Show the first five rows as a quick sanity check
books.head()

Unnamed: 0,title,price,review/helpfulness,review/summary,review/text,description,authors,categories,popularity
0,We Band of Angels: The Untold Story of America...,10.88,2/3,A Great Book about women in WWII,I have alway been a fan of fiction books set i...,"In the fall of 1941, the Philippines was a gar...",'Elizabeth Norman','History',Unpopular
1,Prayer That Brings Revival: Interceding for Go...,9.35,0/0,Very helpful book for church prayer groups and...,Very helpful book to give you a better prayer ...,"In Prayer That Brings Revival, best-selling au...",'Yong-gi Cho','Religion',Unpopular
2,The Mystical Journey from Jesus to Christ,24.95,17/19,Universal Spiritual Awakening Guide With Some ...,The message of this book is to find yourself a...,THE MYSTICAL JOURNEY FROM JESUS TO CHRIST Disc...,'Muata Ashby',"'Body, Mind & Spirit'",Unpopular
3,Death Row,7.99,0/1,Ben Kincaid tries to stop an execution.,The hero of William Bernhardt's Ben Kincaid no...,"Upon receiving his execution date, one of the ...",'Lynden Harris','Social Science',Unpopular
4,Sound and Form in Modern Poetry: Second Editio...,32.5,18/20,good introduction to modern prosody,There's a lot in this book which the reader wi...,An updated and expanded version of a classic a...,"'Harvey Seymour Gross', 'Robert McDowell'",'Poetry',Unpopular


In [92]:
# Class balance of the target: number of rows per popularity label
books["popularity"].value_counts()

Unpopular    10490
Popular       5229
Name: popularity, dtype: int64

In [93]:
# Keep only the rows whose category occurs more than 100 times; rows with a
# missing category are dropped as well because they match no counted category.
category_sizes = books["categories"].value_counts()
frequent_categories = category_sizes[category_sizes > 100].index
books = books[books["categories"].isin(frequent_categories)]

# Remaining categories and their sizes
books["categories"].value_counts()

'Fiction'                        3520
'Religion'                       1053
'Biography & Autobiography'       852
'Juvenile Fiction'                815
'History'                         754
'Business & Economics'            707
'Computers'                       513
'Family & Relationships'          486
'Health & Fitness'                414
'Cooking'                         398
'Body, Mind & Spirit'             360
'Music'                           306
'Juvenile Nonfiction'             280
'Self-Help'                       263
'Political Science'               246
'Sports & Recreation'             245
'Social Science'                  241
'Education'                       214
'Psychology'                      206
'Science'                         191
'Humor'                           177
'Crafts & Hobbies'                173
'Nature'                          147
'Philosophy'                      145
'Language Arts & Disciplines'     144
'Performing Arts'                 136
'Pets'      

In [94]:
# One-hot encode the category labels; drop_first=True omits the first level so
# it acts as the baseline and the indicator columns are not redundant.
categories = pd.get_dummies(books["categories"], drop_first=True)

In [95]:
# Preview the first five rows of the one-hot encoded categories
categories.head(5)

Unnamed: 0,'Biography & Autobiography',"'Body, Mind & Spirit'",'Business & Economics','Comics & Graphic Novels','Computers','Cooking','Crafts & Hobbies','Education','Family & Relationships','Fiction','Foreign Language Study','Health & Fitness','History','Humor','Juvenile Fiction','Juvenile Nonfiction','Language Arts & Disciplines','Music','Nature','Performing Arts','Pets','Philosophy','Poetry','Political Science','Psychology','Reference','Religion','Science','Self-Help','Social Science','Sports & Recreation','Travel','Young Adult Fiction'
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [96]:
# Swap the raw `categories` column for its one-hot indicator columns. The
# indicators were built from `books`, so both frames share the same index and
# the column-wise concat lines up row for row.
books = pd.concat([books.drop(columns=["categories"]), categories], axis=1)


In [97]:
# Preview the first five rows with the category indicators attached
books.head(5)

Unnamed: 0,title,price,review/helpfulness,review/summary,review/text,description,authors,popularity,'Biography & Autobiography',"'Body, Mind & Spirit'",'Business & Economics','Comics & Graphic Novels','Computers','Cooking','Crafts & Hobbies','Education','Family & Relationships','Fiction','Foreign Language Study','Health & Fitness','History','Humor','Juvenile Fiction','Juvenile Nonfiction','Language Arts & Disciplines','Music','Nature','Performing Arts','Pets','Philosophy','Poetry','Political Science','Psychology','Reference','Religion','Science','Self-Help','Social Science','Sports & Recreation','Travel','Young Adult Fiction'
0,We Band of Angels: The Untold Story of America...,10.88,2/3,A Great Book about women in WWII,I have alway been a fan of fiction books set i...,"In the fall of 1941, the Philippines was a gar...",'Elizabeth Norman',Unpopular,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Prayer That Brings Revival: Interceding for Go...,9.35,0/0,Very helpful book for church prayer groups and...,Very helpful book to give you a better prayer ...,"In Prayer That Brings Revival, best-selling au...",'Yong-gi Cho',Unpopular,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,The Mystical Journey from Jesus to Christ,24.95,17/19,Universal Spiritual Awakening Guide With Some ...,The message of this book is to find yourself a...,THE MYSTICAL JOURNEY FROM JESUS TO CHRIST Disc...,'Muata Ashby',Unpopular,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Death Row,7.99,0/1,Ben Kincaid tries to stop an execution.,The hero of William Bernhardt's Ben Kincaid no...,"Upon receiving his execution date, one of the ...",'Lynden Harris',Unpopular,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,Sound and Form in Modern Poetry: Second Editio...,32.5,18/20,good introduction to modern prosody,There's a lot in this book which the reader wi...,An updated and expanded version of a classic a...,"'Harvey Seymour Gross', 'Robert McDowell'",Unpopular,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [98]:
# `review/helpfulness` holds strings such as "17/19". Split on the slash once
# and convert each side to an integer: the part before the slash becomes
# `num_helpful`, the part after it becomes `num_review`.
helpfulness_parts = books["review/helpfulness"].str.split("/", expand=True)
books["num_review"] = helpfulness_parts[1].astype(int)
books["num_helpful"] = helpfulness_parts[0].astype(int)

# The raw string column is no longer needed once both counts are extracted
books = books.drop(columns=["review/helpfulness"])

In [99]:
# Ratio of `num_helpful` to `num_review`. Rows where both counts are 0
# (e.g. a helpfulness of "0/0") divide to NaN, which is replaced by 0.
# The filled result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas 2.x and leaves `books` unchanged when Copy-on-Write is
# enabled, which would let NaNs slip into the feature matrix.
books["perc_helpful_reviews"] = (
    books["num_helpful"] / books["num_review"]
).fillna(0)

In [100]:
# Lower-case the three free-text columns so word matching is case-insensitive
text_columns = ["review/summary", "review/text", "description"]
books[text_columns] = books[text_columns].apply(lambda column: column.str.lower())

In [101]:
from sklearn.feature_extraction.text import CountVectorizer

# Words treated as markers of positive sentiment
positive_words = [
    "great",
    "excellent",
    "good",
    "interesting",
    "enjoy",
    "helpful",
    "useful",
    "like",
    "love",
    "beautiful",
    "fantastic",
    "perfect",
    "wonderful",
    "impressive",
    "amazing",
    "outstanding",
    "remarkable",
    "brilliant",
    "exceptional",
    "positive",
    "thrilling",
]

# Restrict the vectorizer to the fixed vocabulary above, so each output column
# is the count of one positive word.
vectorizer = CountVectorizer(vocabulary=positive_words)

# Count positive words in the full review text; missing reviews are treated
# as empty strings.
review_text = books["review/text"]
text_transformed = vectorizer.fit_transform(review_text.fillna(""))

In [102]:
# Count positive words in the review summaries and the book descriptions,
# in that order, treating missing values as empty strings.
review_summary, description = books["review/summary"], books["description"]
summary_transformed, description_transformed = (
    vectorizer.fit_transform(column.fillna(""))
    for column in (review_summary, description)
)


In [103]:
# Collapse each word-count matrix to one number per row (the total count of
# positive words) and store it as a sentiment feature on the DataFrame.
positive_counts = {
    "positive_words_text": text_transformed,
    "positive_words_summary": summary_transformed,
    "positive_words_description": description_transformed,
}
for feature_name, count_matrix in positive_counts.items():
    books[feature_name] = count_matrix.sum(axis=1).reshape(-1, 1)

In [104]:
# The raw text columns have been summarised as positive-word counts, so drop them
books = books.drop(columns=["review/text", "review/summary", "description"])

# Feature matrix: every remaining column except the identifiers and the target
non_feature_columns = ["title", "authors", "popularity"]
X = books.drop(columns=non_feature_columns).to_numpy()

# Target as a single-column 2-D array of popularity labels
y = books[["popularity"]].to_numpy()


In [105]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [106]:
from sklearn.ensemble import RandomForestClassifier

In [107]:
# Random forest settings: balanced class weights, fixed seed for repeatability
forest_params = {
    "n_estimators": 120,
    "max_depth": 50,
    "min_samples_split": 5,
    "random_state": 42,
    "class_weight": "balanced",
}
clf = RandomForestClassifier(**forest_params)

# y_train is a single-column 2-D array; flatten it to 1-D for fitting
clf.fit(X_train, y_train.ravel())

In [108]:
# Score the fitted classifier on the training split and on the held-out test
# split; the test score is kept in `model_accuracy`.
train_accuracy = clf.score(X_train, y_train)
model_accuracy = clf.score(X_test, y_test)

# Report training accuracy first, then test accuracy
print(train_accuracy)
print(model_accuracy)

0.9617126389460683
0.7090036014405763
