In [7]:
# Import pandas
import pandas as pd
# Import the required dependencies from sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Widen the per-cell display limit so long text columns (descriptions, reviews)
# are shown up to 200 characters before being truncated.
pd.set_option('display.max_colwidth', 200)

In [8]:
# Read the Goodreads book metadata from the Resources folder into a DataFrame,
# then preview its first five rows.
goodreads_df = pd.read_csv(filepath_or_buffer='Resources/goodreads_dataset.csv')
goodreads_df.head(n=5)

Unnamed: 0,title,titleComplete,description,genres,isbn,publisher,author,characters,places,ratingHistogram,ratingsCount,reviewsCount,numPages,language
0,Project Hail Mary,Project Hail Mary,"Ryland Grace is the sole survivor on a desperate, last-chance mission—and if he fails, humanity and the earth itself will perish.Except that right now, he doesn’t know that. He can’t even remember...","['Science Fiction Fantasy', 'Audiobook', 'Fantasy', 'Adventure', 'Mystery', 'Adult', 'Fiction', 'Space', 'Science Fiction', 'Thriller']",593135202.0,Ballantine Books,['Andy Weir'],"['Ryland Grace', 'Rocky']","['Tau Ceti System', 'Outer Space']","[1917, 5775, 29742, 116572, 266669]",420675.0,53538.0,476.0,English
1,The Talented Mr. Ripley,"The Talented Mr. Ripley (Ripley, #1)","Since his debut in 1955, Tom Ripley has evolved into the ultimate bad boy sociopath, influencing countless novelists and filmmakers. In this first novel, we are introduced to suave, handsome Tom R...","['Novels', 'Noir', 'Classics', 'Italy', 'Suspense', 'Mystery Thriller', 'Fiction', 'Mystery', 'Crime', 'Thriller']",393332144.0,W. W. Norton Company,['Patricia Highsmith'],"['Freddie Miles', 'Tom Ripley', 'Dickie Greenleaf', 'Marge Sherwood']","['Italy', 'New York City, New York', 'Italian Riviera']","[1483, 3902, 17161, 34467, 24270]",81283.0,5146.0,288.0,English
2,More Than This,More Than This,"A boy drowns, desperate and alone in his final moments. He dies. Then he wakes, naked and bruised and thirsty, but alive. How can this be? And what is this strange deserted place?As he struggles t...","['Queer', 'Fantasy', 'Contemporary', 'LGBT', 'Young Adult', 'Teen', 'Fiction', 'Dystopia', 'Mystery', 'Science Fiction']",1406350486.0,Walker Books Ltd,['Patrick Ness'],['Seth Wearing'],,"[1441, 3672, 12295, 23873, 21208]",62489.0,8194.0,480.0,English
3,After Forever Ends,After Forever Ends,"Orphaned by her mother and brushed off by her dad, fifteen year old Silvia Cotton had lived a lonely life. That is until 1985 when her father moved the family from the Highlands of Scotland to the...","['Chick Lit', 'Fantasy', 'Coming Of Age', 'Contemporary', 'Contemporary Romance', 'Womens Fiction', 'Young Adult', 'Romance', 'Fiction', 'Literary Fiction']",,Gingersnap Press,['Melodie Ramone'],,,"[81, 119, 205, 365, 750]",1520.0,241.0,564.0,English
4,A Bird Without Wings,A Bird Without Wings,"After an impoverished and indigent childhood, Callie Dahl is interested in one thing: money enough to buy her own home. Love and marriage are impractical pursuits, and hold zero attraction for her...","['Contemporary', 'Contemporary Romance', 'Romance']",,Smashwords,['Roberta Pearce'],,,"[7, 6, 26, 49, 91]",179.0,31.0,,English


In [3]:
# Read the book reviews from the Resources folder into a DataFrame,
# then preview its first five rows.
reviews_df = pd.read_csv(filepath_or_buffer='Resources/book_reviews.csv')
reviews_df.head(n=5)



Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,671551345,Night World: Daughters Of Darkness,,ADB0JID2XRFYR,Harmony-Faith Charisma Izabela Jazmyn McDonague,1/3,5.0,1076457600,BEST BOOK EVER!!,"This is 1 of da bst books dat i have EVER read! @ my school, we are doing a play on this & im playin Mary-Lynette. i cant wait 2 get to the last chapters when they finally give in 2 each other! Gr..."
1,671551345,Night World: Daughters Of Darkness,,,,1/3,5.0,1043971200,one of the best night world books!!!!,first of all i thought that this was one of lj smith's best books she has written adn also the funniest. i love all the characters but my fave one in the book is Ash. he's really a hottie and a ba...
2,671551345,Night World: Daughters Of Darkness,,,,1/3,3.0,960422400,three sisters to die for.......,"Once started I couldn't put it down, literally. I didn't stop til I'd read it through.Three sisters on the run from the Night Worlds patriachal society, they visit Oregon. Their brother finds out ..."
3,671551345,Night World: Daughters Of Darkness,,A1V0SFB3AXM8JK,"K. Davis ""The Rose Bride""",0/2,1.0,1177718400,Disappointing to say the least,"This book is probably, in my opinion, one of (if not THE) worst in the Night World Series. It is Ash's story this time, who's soulmate just happens to be a human. (which Ironically was shadowed up..."
4,671551345,Night World: Daughters Of Darkness,,,,0/0,5.0,889920000,"The most charming, captivating work from LJ Smith!","The plot and characters are incredible. Everyone that likes the supernatural should read this book, and all the other Night World books. I think I'm in love with Ash!"


In [4]:
# Check the reviews data for missing values: info() lists each column's
# non-null count and dtype, so any column whose non-null count is below the
# total number of entries contains missing values.
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17158 entries, 0 to 17157
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Id                  17158 non-null  object 
 1   Title               17158 non-null  object 
 2   Price               10897 non-null  float64
 3   User_id             14573 non-null  object 
 4   profileName         14572 non-null  object 
 5   review/helpfulness  17158 non-null  object 
 6   review/score        17158 non-null  float64
 7   review/time         17158 non-null  int64  
 8   review/summary      17155 non-null  object 
 9   review/text         17158 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 1.3+ MB


In [9]:
# Check the Goodreads data for missing values: info() lists each column's
# non-null count and dtype, so any column whose non-null count is below the
# total number of entries contains missing values.
goodreads_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14712 entries, 0 to 14711
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            14712 non-null  object 
 1   titleComplete    14712 non-null  object 
 2   description      14712 non-null  object 
 3   genres           14712 non-null  object 
 4   isbn             11738 non-null  object 
 5   publisher        14042 non-null  object 
 6   author           14710 non-null  object 
 7   characters       7177 non-null   object 
 8   places           5971 non-null   object 
 9   ratingHistogram  14709 non-null  object 
 10  ratingsCount     14709 non-null  float64
 11  reviewsCount     14692 non-null  float64
 12  numPages         14467 non-null  float64
 13  language         14358 non-null  object 
dtypes: float64(3), object(11)
memory usage: 1.6+ MB


In [5]:
# Count how many reviews were given each score. value_counts() sorts by
# frequency, so the most common score is listed first.
reviews_df['review/score'].value_counts()

review/score
5.0    10556
4.0     3265
3.0     1458
1.0      979
2.0      900
Name: count, dtype: int64

In [10]:
# Count how often each distinct ratingsCount value occurs in the Goodreads data.
# NOTE: this cell only tallies values; no OneHotEncoder (or any other grouping/
# encoding step) is applied here.
goodreads_df['ratingsCount'].value_counts()

ratingsCount
24.0        21
27.0        15
16.0        15
21.0        14
31.0        14
            ..
17507.0      1
42202.0      1
121956.0     1
52778.0      1
4865.0       1
Name: count, Length: 11493, dtype: int64

In [6]:
# Features: the free-text review summary. A text vectorizer such as the imported
# TfidfVectorizer needs string documents as input, so the text column (not the
# numeric score) must be X. A few summaries are missing (see reviews_df.info()),
# and TfidfVectorizer rejects NaN documents, so replace them with empty strings.
X = reviews_df['review/summary'].fillna('')
# Target: the review score, i.e. the class label the classifier learns to predict.
y = reviews_df['review/score']

# Hold out 33% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Build a text-classification pipeline:
#   1. 'tfidf' converts each raw text document into a TF-IDF weighted term
#      vector, dropping English stop words.
#   2. 'clf' trains a linear support vector classifier on those vectors.
# TfidfVectorizer expects X_train to hold raw text documents (strings).
text_vectorize = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LinearSVC()),
])

# Fit both steps on the training split only, so the vocabulary and the
# classifier are learned without seeing the test data.
text_vectorize.fit(X_train, y_train)

In [None]:
# Report mean accuracy on the training and held-out test splits; a large gap
# between the two numbers indicates overfitting.
print(f'Train Accuracy: {text_vectorize.score(X_train, y_train):.3f}')
print(f'Test Accuracy: {text_vectorize.score(X_test, y_test):.3f}')