In [116]:
import pandas as pd
import re
from nltk.corpus import stopwords 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the review CSV (path is relative to the notebook's working directory)
# into a DataFrame.
df = pd.read_csv('yelp_review.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


In [5]:
# Work with just the first 200 rows of the full frame (all columns kept).
df_in = df.head(200)

In [6]:
# Summarise the subset: row count, per-column non-null counts and dtypes.
df_in.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 9 columns):
review_id      200 non-null object
user_id        200 non-null object
business_id    200 non-null object
stars          200 non-null int64
date           200 non-null object
text           200 non-null object
useful         200 non-null int64
funny          200 non-null int64
cool           200 non-null int64
dtypes: int64(4), object(5)
memory usage: 14.1+ KB


# Extracting only the star ratings and review text

In [35]:
# Narrow the subset to the two columns used below: star rating and review text.
review_df = df_in.loc[:, ['stars', 'text']]

In [36]:
# Confirm that only the `stars` and `text` columns remain and none are null.
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
stars    200 non-null int64
text     200 non-null object
dtypes: int64(1), object(1)
memory usage: 3.2+ KB


# Data Cleaning

In [85]:
# DataFrame.as_matrix() was deprecated and later removed from pandas (gone in
# 1.0), so it raises AttributeError on current installs.  `.values` yields the
# same 2-D array of the frame's contents and exists in every pandas version.
review_arr = review_df.values

In [86]:
# Sanity-check the array dimensions (rows, columns).
review_arr.shape

(200, 2)

In [99]:
# Allocate an object array with the same shape as review_arr; it serves as the
# output buffer that data_clean fills.
clean_ar = np.empty(review_arr.shape, dtype=object)

In [100]:
# Inspect one slot of the freshly allocated buffer to see what it holds
# before any cleaning has been written into it.
type(clean_ar[0, 1])

NoneType

In [101]:
def data_clean(arr, clean_ar, stop_words=None):
    """Normalise the text column of ``arr`` and write the result into ``clean_ar``.

    For each row, the text in column 1 has every character that is not an
    ASCII letter replaced by a space, is lower-cased, split on whitespace,
    filtered against the stop-word list and re-joined with single spaces.
    Column 0 is copied through unchanged.

    Parameters
    ----------
    arr : 2-D indexable (e.g. ndarray or list of lists)
        Input rows. ``arr[i][1]`` must be a ``str``; ``arr[i][0]`` may be
        anything and is carried over as-is.
    clean_ar : 2-D indexable with at least ``len(arr)`` rows of 2 slots
        Output buffer. It is modified in place.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, NLTK's English stop-word
        list is used (requires the NLTK ``stopwords`` corpus to be installed).

    Returns
    -------
    The same ``clean_ar`` object that was passed in, now filled.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # Build the lookup set and compile the pattern once, not once per row.
    stop = set(stop_words)
    non_letters = re.compile("[^a-zA-Z]")

    for i in range(len(arr)):
        words = non_letters.sub(" ", arr[i][1]).lower().split()
        clean_ar[i][1] = " ".join(w for w in words if w not in stop)
        clean_ar[i][0] = arr[i][0]

    return clean_ar

In [102]:
# Run the cleaning pass.  data_clean writes into clean_ar and returns that same
# object, so `data` and `clean_ar` refer to one array.
data = data_clean(arr=review_arr, clean_ar=clean_ar)
# Wrap the cleaned array in a DataFrame; columns get the default labels 0 and 1.
clean_df = pd.DataFrame(data=data)

In [103]:
# Preview the cleaned frame: column 0 is the copied-through value, column 1 the
# normalised text.
clean_df.head()

Unnamed: 0,0,1
0,5,super simple place amazing nonetheless around ...
1,5,small unassuming place changes menu every ofte...
2,5,lester located beautiful neighborhood since kn...
3,4,love coming yes place always needs floor swept...
4,4,chocolate almond croissant amazing light butte...


In [105]:
# Check row count, non-null counts and dtypes of the cleaned frame.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
0    200 non-null object
1    200 non-null object
dtypes: object(2)
memory usage: 3.2+ KB


# Vectorization

In [111]:
# TF-IDF over unigrams and bigrams with sublinear term-frequency scaling and
# L2-normalised rows.  min_df=1 keeps every term, exactly as the former
# min_df=0 did (any term in the vocabulary occurs in at least one document),
# but is accepted by scikit-learn's parameter validation, which rejects an
# integer min_df below 1.
vectorizer = TfidfVectorizer(
    norm='l2',
    min_df=1,
    use_idf=True,
    smooth_idf=False,
    sublinear_tf=True,
    ngram_range=(1, 2),
)

# Learn the vocabulary and IDF weights from the cleaned text (column 1 of
# clean_df), then convert the sparse result to a dense array.
train_vect = vectorizer.fit_transform(clean_df.iloc[:, 1])
train_vect = train_vect.toarray()
print(train_vect.shape)

(200, 17853)


In [114]:
# Project the same cleaned text through the already-fitted vectorizer (no
# re-fitting) and densify the result in one step.
comp_vect = vectorizer.transform(clean_df.iloc[:, 1]).toarray()
print(comp_vect.shape)

(200, 17853)


In [118]:
# Pairwise cosine similarity between every row of comp_vect and every row of
# train_vect.
csr_sim = cosine_similarity(X=comp_vect, Y=train_vect)