# NLP Prep on Datasets

This notebook contains code to:
- Execute Text Mining / NLP on reviews
- Parse item and rating datasets

This can be used to run models such as:
- Factorization Machines
- Content Based Filtering Methods

In [None]:
import numpy as np
import pandas as pd
import datetime
import time

In [None]:
# Record the wall-clock start so the elapsed load time can be printed at the end of this cell.
start_time = time.time()

import gzip

def parse(path):
  """Yield one evaluated record per line of a gzip-compressed file.

  Args:
    path: Location of the gzip file; it is opened in binary mode.

  Yields:
    The result of evaluating each line as a Python expression.
  """
  # The context manager closes the file once the generator is exhausted or
  # closed, instead of leaving the handle open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # NOTE(security): eval() executes whatever expression the line contains.
      # Only run this on trusted files; if every line is a plain literal,
      # ast.literal_eval is the safer choice.
      yield eval(l)

def getDF(path):
  """Collect every record from parse(path) into a DataFrame, one row per record.

  Rows are labelled 0, 1, 2, ... in the order the records are read.
  """
  records_by_row = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_row, orient='index')

# Read both files into DataFrames: reviews first, then item metadata.
rating, meta = (
    getDF(file_name)
    for file_name in ('reviews_Digital_Music.json.gz', 'meta_Digital_Music.json.gz')
)

# Report how long the load took.
print(f"{time.time() - start_time} seconds")

Inspecting the datasets: dimensions and a few sample rows

In [None]:
# Show the (rows, columns) shape of each raw dataset under its label.
print("\nmetadata dimensions", meta.shape, sep="\n")
print("\nrating dimensions", rating.shape, sep="\n")

In [None]:
# Working names for the rest of the notebook. These are references to the
# same objects as `rating` and `meta`, not copies.
df_ratings, df_items = rating, meta

In [None]:
# Preview the first three rows of the item metadata.
df_items.head(3)

In [None]:
# Preview the first three rows of the ratings.
df_ratings.head(3)

In [None]:
# Count the reviews per item (asin) and order the items from most to least reviewed.
# reset_index(name='count') names the count column directly. The previous
# approach wrote into `columns.values`, which mutates the Index's backing
# array and is not a supported way to rename a column.
item_filter = (
    df_ratings.groupby('asin')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

In [None]:
# Keep the 1000 most-reviewed items. item_filter is sorted by review count in
# descending order, so those rows are at the head; tail() picked the 1000
# least-reviewed items instead.
item_filter = item_filter.head(n=1000)

In [None]:
# Review-count summary for the selected items: mean, max, min, then the
# number of items, one value per line.
print(
    item_filter['count'].mean(),
    item_filter['count'].max(),
    item_filter['count'].min(),
    len(item_filter),
    sep="\n",
)
item_filter.head()

In [None]:
# Only the item ids are needed for the filtering join; drop the count column.
item_filter = item_filter.loc[:, ['asin']]

In [None]:
# Inner join on asin: keeps only the ratings whose item is in item_filter.
df_ratings = df_ratings.merge(item_filter, on='asin')

In [None]:
# Preview the ratings that remain after the join with item_filter.
df_ratings.head()

In [None]:
# (rows, columns) of the filtered ratings.
df_ratings.shape

# Starting Text Parsing

### Removing StopWords through NLTK

In [None]:
import nltk
# Download the NLTK "stopwords" resource; a later cell reads it through nltk.corpus.stopwords.
nltk.download("stopwords")

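The cell above downloads NLTK's standard stopword lists; the English list is used below to filter common words out of the reviews.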

### Combining it all together

In [None]:
import re
from bs4 import BeautifulSoup 
from nltk.corpus import stopwords # Import the stop word list



# Cache for the English stop-word set so it is built once, not once per review.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, building it on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words( raw_review ):
    """Convert a raw review into a cleaned, space-separated string of words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case, split on whitespace, drop English stop words, and
    join the remaining words with single spaces.

    Args:
        raw_review: The review text. Values that are not str/bytes
            (for example None or NaN for a missing review) yield "".

    Returns:
        The cleaned review as a single string; "" when nothing remains.
    """
    # Missing reviews carry no text; return an empty string instead of
    # letting BeautifulSoup fail on a non-string value.
    if not isinstance(raw_review, (str, bytes)):
        return ""

    # 1. Remove HTML. Naming the parser gives the same result on every
    #    machine and avoids bs4's "no parser was explicitly specified" warning.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace every non-letter character with a space
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 4. Look up the cached stop-word set (set membership is fast, and the
    #    set is only built on the first call)
    stops = _english_stopwords()

    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]

    # 6. Join the words back into one string separated by space
    return " ".join(meaningful_words)

In [None]:
# Add a 'cleanedreviews' column: review_to_words applied to each reviewText value.
df_ratings['cleanedreviews'] = df_ratings['reviewText'].apply(review_to_words)

In [None]:
# Preview the ratings with the new 'cleanedreviews' column.
df_ratings.head(3)

In [None]:
# (rows, columns) after adding 'cleanedreviews'.
df_ratings.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from pandas import DataFrame 
reviews = df_ratings["cleanedreviews"]
# Count word occurrences per review, limited to a 500-term vocabulary.
countVector = CountVectorizer(max_features = 500, stop_words='english')
transformedReviews = countVector.fit_transform(reviews)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name on older releases.
if hasattr(countVector, "get_feature_names_out"):
    feature_names = countVector.get_feature_names_out()
else:
    feature_names = countVector.get_feature_names()

# One column per vocabulary term. Reusing the ratings index keeps each feature
# row paired with its review when the two frames are joined on the index.
dfReviews = DataFrame(
    transformedReviews.toarray(),
    index=reviews.index,
    columns=feature_names,
)
dfReviews = dfReviews.astype(int)

In [None]:
# (reviews, vocabulary terms) of the word-count matrix.
dfReviews.shape

In [None]:
# Preview the first rows of the word-count matrix.
dfReviews.head()

In [None]:
# Binarize the counts: a word that appears at least once in a review becomes 1.
# Assigning 0 here cleared every non-zero count, leaving an all-zero matrix.
dfReviews[dfReviews > 0] = 1

### Adding the review word features to the ratings dataframe

In [None]:
# Attach the word features to the ratings row by row (join on the row index).
# If a vocabulary word has the same name as a ratings column (e.g. "overall"),
# the default suffixes would rename the ratings column to "<name>_x". These
# suffixes keep ratings columns unchanged and rename only the word feature.
df_ratings2 = pd.merge(
    df_ratings,
    dfReviews,
    left_index=True,
    right_index=True,
    suffixes=("", "_word"),
)

In [None]:
# Show the combined frame's (rows, columns), then preview its first three rows.
print(df_ratings2.shape)
df_ratings2.head(3)

### Saving the combined ratings dataframe (all columns are written; no column selection is applied yet)

In [None]:
# Write the combined ratings + word features to CSV, without the row index.
df_ratings2.to_csv("NLP_ratings_all_top1000.csv", index=False)

## Treating the MetaData Dataframe

In [None]:
# Preview the first three rows of the item metadata before it is saved.
df_items.head(3)

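Saving the item metadata to CSV. Note: normalizing the price and selecting the relevant columns (price and ids) are not done in the cell below; every column is written as-is.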

In [None]:
# Write the item metadata to CSV, without the row index.
df_items.to_csv("NLP_metadata.csv", index=False)