In [3]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from surprise import Dataset, Reader, accuracy, NormalPredictor, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering
from surprise.accuracy import rmse
from surprise.prediction_algorithms import SVD, SVDpp, NMF, BaselineOnly, NormalPredictor
from IPython.core.display import HTML

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.collocations import *
import re
import nltk

%matplotlib inline

In [4]:
user_reviews_df = pd.read_csv("./Data/user_reviews.csv")

In [5]:
# Looking at the data from the 
user_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598158 entries, 0 to 598157
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  598158 non-null  int64 
 1   user_id     598158 non-null  int64 
 2   recipe_id   598158 non-null  int64 
 3   date        598158 non-null  object
 4   rating      598158 non-null  int64 
 5   review      598154 non-null  object
dtypes: int64(4), object(2)
memory usage: 27.4+ MB


In [6]:
#Drop 4 missing reviews
user_reviews_df.dropna(inplace=True)

In [7]:
user_reviews_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,recipe_id,date,rating,review
0,8,76535,134728,2005-09-02,4,Very good!
1,11,190375,134728,2007-03-09,5,These taste absolutely wonderful!! My son-in-...
2,12,468945,134728,2008-02-20,0,Made my own buttermilk w/ vinegar and milk. U...
3,13,255338,134728,2008-04-11,5,First time using liquid smoke in a recipe. Mad...
4,14,1171894,134728,2009-04-21,5,MMMMM! This is so good! I actually soaked the ...


In [8]:
# Drop Unnamed: 0 column
user_reviews_df.drop("Unnamed: 0", axis=1, inplace=True)
user_reviews_df

Unnamed: 0,user_id,recipe_id,date,rating,review
0,76535,134728,2005-09-02,4,Very good!
1,190375,134728,2007-03-09,5,These taste absolutely wonderful!! My son-in-...
2,468945,134728,2008-02-20,0,Made my own buttermilk w/ vinegar and milk. U...
3,255338,134728,2008-04-11,5,First time using liquid smoke in a recipe. Mad...
4,1171894,134728,2009-04-21,5,MMMMM! This is so good! I actually soaked the ...
...,...,...,...,...,...
598153,496803,249924,2011-08-21,5,I really loved this! I cut the recipe back to...
598154,143592,82303,2010-07-26,5,I will never buy hot fudge again. This is so q...
598155,140132,82303,2010-10-01,5,This is fabulous. I made it for our dessert t...
598156,199020,82303,2013-03-18,5,5 stars for taste! I had a hard time getting m...


In [9]:
#Check no missing values
user_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 598154 entries, 0 to 598157
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    598154 non-null  int64 
 1   recipe_id  598154 non-null  int64 
 2   date       598154 non-null  object
 3   rating     598154 non-null  int64 
 4   review     598154 non-null  object
dtypes: int64(3), object(2)
memory usage: 27.4+ MB


## Visulizing Ratings

In [10]:
#Creating a bar chart to see the distribution of ratings
#rating_count = user_reviews_df["rating"].value_counts().sort_index(ascending=False)

#plt.bar(rating_count.index, rating_count.values);

In [11]:
# Numerical breakdown
#rating_count

77% of our new dataset includes 5 star reviews, which heavily skews the data and will affect the recommendation portal. 

We also notice that 2.4% of our dataset includes a 0 rating. It would be important to see if these are actually 0s or if they are imputed value for missing ratings. 

In [12]:
#Looking at the 0 ratings to see if they are actually 0s or missing ratings
#user_reviews_df.loc[user_reviews_df["rating"] == 0]

It looks like the 0s are both a mix of positive and negative (or neutral reviews). While it is a small subset of our dataset it could be important to include these reviews considering the skew of our data. We could impute ratings based on the language used in the "review" column. To do this we will need to undertake Natural Language Processing (NLP). 

 ## Preprocessing Review Data

In [13]:
user_reviews_df["review"] = user_reviews_df["review"].astype(str)

In [14]:
# Need to remove capitalization, apostrophes, and numbers from the reviews

# Create a tokenizer that will take all words with three or more letters
pattern = r"(?u)\w{3,}"
tokenizer = RegexpTokenizer(pattern)

# Create list of stopwords in English (language of the reviews)
# Remove "very" from the list of stopwords 
stopwords_list = stopwords.words("english")
#add recipe to stopwords list
stopwords_list.extend("recipe")
# Remove "very" from the list of stopwords 
stopwords_list.remove("very")
stopwords_list += list(string.punctuation)
stopwords_list += ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

# Create an instance of nltk's WordNetLemmatizer with the variable name `lemmatizer`
lemmatizer = WordNetLemmatizer()

In [15]:
def preprocess_text(text, tokenizer, stopwords_list, lemmatizer):
    # Standardize case (lowercase the text)
    lowered_text = text.lower()
    
    # Tokenize text using `tokenizer`
    tokens = tokenizer.tokenize(lowered_text)
    
    # Remove stopwords using `stopwords_list`
    stopped_tokens = [word for word in tokens if word not in stopwords_list]
    
    # Stem the tokenized text using `stemmer`
    lemmatized_text = [lemmatizer.lemmatize(token) for token in stopped_tokens]
    
    # Return the preprocessed text
    return lemmatized_text

In [16]:
# Apply the preprocess function to the entire dataset
review_text_data = user_reviews_df.review.apply(lambda x: preprocess_text(x, tokenizer, stopwords_list, lemmatizer))
review_text_data

0                                              [very, good]
1         [taste, absolutely, wonderful, son, law, love,...
2         [made, buttermilk, vinegar, milk, used, defros...
3         [first, time, using, liquid, smoke, recipe, ma...
4         [mmmmm, good, actually, soaked, chicken, chick...
                                ...                        
598153    [really, loved, cut, recipe, back, use, three,...
598154    [never, buy, hot, fudge, quick, easy, would, m...
598155    [fabulous, made, dessert, tonight, practically...
598156    [star, taste, hard, time, getting, mine, thick...
598157    [amazingly, delicious, change, made, cook, bit...
Name: review, Length: 598154, dtype: object

In [17]:
#Add preprocessed lemmatized text to the dataframe
user_reviews_df["lem_review_text"] = review_text_data
user_reviews_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,review,lem_review_text
0,76535,134728,2005-09-02,4,Very good!,"[very, good]"
1,190375,134728,2007-03-09,5,These taste absolutely wonderful!! My son-in-...,"[taste, absolutely, wonderful, son, law, love,..."
2,468945,134728,2008-02-20,0,Made my own buttermilk w/ vinegar and milk. U...,"[made, buttermilk, vinegar, milk, used, defros..."
3,255338,134728,2008-04-11,5,First time using liquid smoke in a recipe. Mad...,"[first, time, using, liquid, smoke, recipe, ma..."
4,1171894,134728,2009-04-21,5,MMMMM! This is so good! I actually soaked the ...,"[mmmmm, good, actually, soaked, chicken, chick..."


In [18]:
# Set up figure and axes
#fig, axes = plt.subplots(nrows=6, figsize=(12, 12))

# Empty dict to hold words that have already been plotted and their colors
#plotted_words_and_colors = {}
# Establish color palette to pull from
# (If you get an error message about popping from an empty list, increase this #)
#color_palette = sns.color_palette('cividis', n_colors=38)

# Creating a plot for each unique genre
#data_by_rating = [y for _, y in user_reviews_df.groupby('rating', as_index=False)]
#for idx, rating_df in enumerate(data_by_rating):
    # Find top 10 words in this genre
    #all_words_in_rating = rating_df.preprocessed_review_text.explode()
    #top_10 = all_words_in_rating.value_counts()[:10]
    
    # Select appropriate colors, reusing colors if words repeat
    #colors = []
   # for word in top_10.index:
        #if word not in plotted_words_and_colors:
            #new_color = color_palette.pop(0)
            #plotted_words_and_colors[word] = new_color
        #colors.append(plotted_words_and_colors[word])
    
    # Select axes, plot data, set title
    #ax = axes[idx]
    #ax.bar(top_10.index, top_10.values, color=colors)
    #ax.set_title(rating_df.iloc[0].rating)
    
#fig.tight_layout()

In [19]:
#Train/Test Split (save off as new datasets)
#Turn 0s to nulls
#Vectorize
#KNN Imputer (count and TFIDF)
#train/Test Surprise
# Basic Model on Train set 
# Next Model, etc.
# Transform Test
# Retrain on the full set
# Deployment 

In [20]:
#user_reviews_df.head()

In [21]:
user_reviews_df["rating"] = user_reviews_df["rating"].astype("float")

In [22]:
user_reviews_df["user_id"] = user_reviews_df["user_id"].astype("str")

In [23]:
user_reviews_df["recipe_id"] = user_reviews_df["recipe_id"].astype("str")

In [24]:
user_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 598154 entries, 0 to 598157
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   user_id          598154 non-null  object 
 1   recipe_id        598154 non-null  object 
 2   date             598154 non-null  object 
 3   rating           598154 non-null  float64
 4   review           598154 non-null  object 
 5   lem_review_text  598154 non-null  object 
dtypes: float64(1), object(5)
memory usage: 31.9+ MB


In [25]:
#Vectorize preprocessed_review_text
#from sklearn.feature_extraction.text import TfidfVectorizer
#t_vec = TfidfVectorizer(ngram_range=(2,2), max_features=2000)
# matrix of ngrams
#ngrams = t_vec.fit_transform(joined_df[joined_df['Rating'] == 1]['String'])

In [26]:
# Change preprocced_review_text to string and call new variable
user_reviews_df["review_text_string"] = user_reviews_df["lem_review_text"].str.join(" ")



In [27]:
# Since we dropped values we need to reset the index before we perform the vectorization and imputing 
# so that new table index will match existing index
user_reviews_df.reset_index(inplace=True)
user_reviews_df

Unnamed: 0,index,user_id,recipe_id,date,rating,review,lem_review_text,review_text_string
0,0,76535,134728,2005-09-02,4.0,Very good!,"[very, good]",very good
1,1,190375,134728,2007-03-09,5.0,These taste absolutely wonderful!! My son-in-...,"[taste, absolutely, wonderful, son, law, love,...",taste absolutely wonderful son law love reques...
2,2,468945,134728,2008-02-20,0.0,Made my own buttermilk w/ vinegar and milk. U...,"[made, buttermilk, vinegar, milk, used, defros...",made buttermilk vinegar milk used defrosted fr...
3,3,255338,134728,2008-04-11,5.0,First time using liquid smoke in a recipe. Mad...,"[first, time, using, liquid, smoke, recipe, ma...",first time using liquid smoke recipe made dire...
4,4,1171894,134728,2009-04-21,5.0,MMMMM! This is so good! I actually soaked the ...,"[mmmmm, good, actually, soaked, chicken, chick...",mmmmm good actually soaked chicken chicken bou...
...,...,...,...,...,...,...,...,...
598149,598153,496803,249924,2011-08-21,5.0,I really loved this! I cut the recipe back to...,"[really, loved, cut, recipe, back, use, three,...",really loved cut recipe back use three thigh l...
598150,598154,143592,82303,2010-07-26,5.0,I will never buy hot fudge again. This is so q...,"[never, buy, hot, fudge, quick, easy, would, m...",never buy hot fudge quick easy would made sund...
598151,598155,140132,82303,2010-10-01,5.0,This is fabulous. I made it for our dessert t...,"[fabulous, made, dessert, tonight, practically...",fabulous made dessert tonight practically lick...
598152,598156,199020,82303,2013-03-18,5.0,5 stars for taste! I had a hard time getting m...,"[star, taste, hard, time, getting, mine, thick...",star taste hard time getting mine thicken thou...


In [28]:
user_reviews_df.drop("index", axis=1, inplace=True)

In [29]:
user_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598154 entries, 0 to 598153
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             598154 non-null  object 
 1   recipe_id           598154 non-null  object 
 2   date                598154 non-null  object 
 3   rating              598154 non-null  float64
 4   review              598154 non-null  object 
 5   lem_review_text     598154 non-null  object 
 6   review_text_string  598154 non-null  object 
dtypes: float64(1), object(6)
memory usage: 31.9+ MB


In [30]:
#Create new dataframe with rating and review_text_string
#new_df = old_df[['col1','col2']].

ratings_text_df = user_reviews_df[["rating", "review_text_string"]]
#ratings_text_df.to_csv("./Data/ratings_text_df.csv")


In [31]:
#ratings_text_df = pd.read_csv("./Data/ratings_text_df.csv")

#ratings_text_df.drop("Unnamed: 0", axis=1, inplace=True)

In [32]:
#ratings_text_df.head()
#ratings_text_df.drop("Unnamed: 0", axis=1, inplace=True)
#ratings_text_df.dropna(inplace=True)
#ratings_text_df.info()


In [33]:
ratings_text_df.head()

Unnamed: 0,rating,review_text_string
0,4.0,very good
1,5.0,taste absolutely wonderful son law love reques...
2,0.0,made buttermilk vinegar milk used defrosted fr...
3,5.0,first time using liquid smoke recipe made dire...
4,5.0,mmmmm good actually soaked chicken chicken bou...


In [34]:
#Instantiate the TFIDF vectorize using bigrams  
tfidf_vec = TfidfVectorizer(ngram_range=(2,2), max_features=2000)
# Fit the Vectorizer on review_text_string
review_text_vectorized = tfidf_vec.fit_transform(ratings_text_df["review_text_string"])


In [35]:
#Turn the 0s to nulls for KNN Imputer
ratings_text_df["rating"].replace(0, np.nan)

0         4.0
1         5.0
2         NaN
3         5.0
4         5.0
         ... 
598149    5.0
598150    5.0
598151    5.0
598152    5.0
598153    5.0
Name: rating, Length: 598154, dtype: float64

In [36]:
#review_text_vectorized.todense()

In [37]:
vec_df = pd.DataFrame(review_text_vectorized.toarray(), columns=tfidf_vec.get_feature_names())
vec_df

Unnamed: 0,123 hit,350 degree,9x13 pan,able make,absolutely delicious,absolutely fantastic,absolutely love,absolutely loved,absolutely wonderful,across recipe,...,yum yum,yummy easy,yummy made,yummy recipe,yummy thanks,yummy used,yummy yummy,zaar star,zaar tag,zaar world
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50787,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
#sanity check
len(ratings_text_df) == len(vec_df)

True

In [39]:
#Join the tables 
rating_imputer_df = ratings_text_df.join(vec_df)
rating_imputer_df

Unnamed: 0,rating,review_text_string,123 hit,350 degree,9x13 pan,able make,absolutely delicious,absolutely fantastic,absolutely love,absolutely loved,...,yum yum,yummy easy,yummy made,yummy recipe,yummy thanks,yummy used,yummy yummy,zaar star,zaar tag,zaar world
0,4.0,very good,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.0,taste absolutely wonderful son law love reques...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,made buttermilk vinegar milk used defrosted fr...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,first time using liquid smoke recipe made dire...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,mmmmm good actually soaked chicken chicken bou...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598149,5.0,really loved cut recipe back use three thigh l...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598150,5.0,never buy hot fudge quick easy would made sund...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598151,5.0,fabulous made dessert tonight practically lick...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598152,5.0,star taste hard time getting mine thicken thou...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
#KNN Imputer on review_text_vectorized
imputer = KNNImputer(missing_values=np.nan, n_neighbors=3, weights="uniform")

In [None]:
#fit and transform imputer on the dataframe
imputer.fit_transform(rating_imputer_df)

In [None]:
review_text_knn["rating"].value_counts()