In [2]:
import sys
sys.path
sys.path.append('../')

import numpy as np 
import pandas as pd

from data import RandomData, AmazonBooks, ToyData, MovieLensData
from model import SimpleMeanModel, UserMeanModel, ProductMeanModel, CombinedMeanModel

from nlu_feature_extractor import *

%matplotlib inline

### Load Data

In [4]:
# ds = MovieLensData(min_user_ratings=5).get_dataset(verbose=True)
ds = AmazonBooks(min_user_ratings=10).get_dataset(verbose=True)
train = ds['train']
val = ds['val']

loading preprocessed dataset from disk


In [40]:
print(train.keys())
train['product_reviews'].head()

dict_keys(['user_product_ratings', 'product_descriptions', 'product_reviews'])


Unnamed: 0,product_id,review
0,6155,for getting your kid introduced to his/her ABC...
1,6155,"This Book is funny and is full of B words, lik..."
2,6155,A favorite Berenstain book of my children I wa...
3,6155,This book is quite funny. Especially when you...
4,6155,Teaching the next generation to love books! M...


## create matrix: product_id -> all text about that product

all text includes all reviews about it and the description of that product

In [115]:
# grouped_reviews = train['product_reviews'].groupby(['product_id']).get_group(6155)
# grouped_reviews['review'].agg([np.concatenate])
grouped_reviews = train['product_reviews'].groupby('product_id')['product_id', 'review'].aggregate(\
        {'product_id':['mean'], 'review':lambda x: list(x)})
grouped_reviews .columns = ['product_id', 'review']

In [116]:
grouped_reviews.head()

Unnamed: 0_level_0,product_id,review
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,2,"[When I lost my best furry friend of 16 years,..."
3,3,[I'm reading this for the second time. The bo...
4,4,[This book had an excellent theme: ugly duckli...
5,5,[I have always enjoyed romance novels and Donn...
6,6,"[This is the second level of ""Speed Secrets"". ..."


In [250]:
# print(list(grouped_reviews.loc[grouped_reviews['product_id'] == 6155]['review'])[0])

In [500]:
combined = train['product_descriptions'].merge(grouped_reviews, on='product_id')
combined['all_text'] = combined['review']
combined['all_text'] = combined['description'].apply(lambda x: [x]) +  combined['all_text']
combined.head()

Unnamed: 0,description,product_id,review,all_text
0,"""Fairy phobic or trapped in a true-life fairy ...",12572,[Dream come true or worst nightmare? Grace Mac...,"[""Fairy phobic or trapped in a true-life fairy..."
1,"Welcome back, Mr Bond. We've been waiting for...",4397,"[Okay 007 fans, this book was a waste of time....","[Welcome back, Mr Bond. We've been waiting fo..."
2,"""A heart-wrenching and uplifting story of surv...",4175,"[Amazing book!It is very detailed, telling the...","[""A heart-wrenching and uplifting story of sur..."
3,A fast-paced and engrossing story. A definite ...,9448,[Lori-Bryant Woolridge is one of my favorite a...,[A fast-paced and engrossing story. A definite...
4,Gayle Jackson Sloan is a native of Philadelphi...,14535,[Wednesday's Woes was an excellent piece of ar...,[Gayle Jackson Sloan is a native of Philadelph...


In [504]:
combined['all_text_parsed'] = combined['all_text'].apply(lambda x: re.sub("[,.;?!():\[\]\"\"]",""," ".join(x)).lower())
combined['all_text_parsed_words_separate'] = combined['all_text_parsed'].apply(lambda x: x.split(" "))
print(combined.head())
# first join all comments reviews, etcetera into 1 big string
# then remove all punctuation
# then split on spaces to create words

                                         description  product_id  \
0  "Fairy phobic or trapped in a true-life fairy ...       12572   
1  Welcome back, Mr Bond.  We've been waiting for...        4397   
2  "A heart-wrenching and uplifting story of surv...        4175   
3  A fast-paced and engrossing story. A definite ...        9448   
4  Gayle Jackson Sloan is a native of Philadelphi...       14535   

                                              review  \
0  [Dream come true or worst nightmare? Grace Mac...   
1  [Okay 007 fans, this book was a waste of time....   
2  [Amazing book!It is very detailed, telling the...   
3  [Lori-Bryant Woolridge is one of my favorite a...   
4  [Wednesday's Woes was an excellent piece of ar...   

                                            all_text  \
0  ["Fairy phobic or trapped in a true-life fairy...   
1  [Welcome back, Mr Bond.  We've been waiting fo...   
2  ["A heart-wrenching and uplifting story of sur...   
3  [A fast-paced and engrossin

## extract  vocab

In [507]:
vocab = get_vocab(combined['all_text_parsed_words_separate'].tolist(), 5000)
print(len(vocab))
print(vocab)

5001


## create document, vocab matrix

In [509]:
combined_words = combined[['product_id','all_text_parsed']]
for v in vocab:
    combined_words[v] = combined_words['all_text_parsed'].apply(lambda x: x.count(" " + v + " "))
combined_words

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [510]:
np.max(np.max(combined_words[vocab]))

2166

### save vocab matrix

In [511]:
combined_words.to_pickle("vocab_matrix_2")  # where to save it, usually as a .pkl
# Then you can load it back using: df = pd.read_pickle(file_name)

## convert to tfidf

In [513]:
from sklearn.feature_extraction.text import TfidfTransformer
tf = TfidfTransformer()
tfidf_words = tf.fit_transform(combined_words[vocab])
tfidf_words

<6568x5001 sparse matrix of type '<class 'numpy.float64'>'
	with 1745315 stored elements in Compressed Sparse Row format>

In [514]:
tfidf_df = pd.SparseDataFrame(tfidf_words)
tfidf_df.columns = vocab
tfidf_df["product_id"] = combined_words["product_id"]
tfidf_df = tfidf_df[["product_id"] + vocab]
tfidf_df.head()

Unnamed: 0,product_id,Unnamed: 2,Unnamed: 3,paperback,#1,#2,$UNK,&,&#34the,&amp,...,young,younger,youngest,your,yours,yourself,youth,yr,zen,zero
0,12572,0.24981,,,0.036692,,,,,,...,0.020471,,,0.023859,,,,,,
1,4397,0.271908,,0.028557,0.03562,,,,,,...,,,,,,,,,,0.038753
2,4175,0.083713,,,,,,,,,...,0.045276,,,0.05277,,,0.079512,,,
3,9448,0.051491,,,,,,0.026269,,,...,,,,,,,,,,
4,14535,0.177852,,,,,,,,,...,,,,,,,,,,


### save tfidf

In [679]:
tfidf_df.to_pickle("tfidf_matrix")  # where to save it, usually as a .pkl

In [683]:
pickle.dump( tf.get_params(), open( "tf_params.p", "wb" ) )