In [1]:
import pandas as pd
import math
import re
from collections import Counter
from itertools import combinations

In [2]:
# Absolute, machine-specific location of the input CSV; adjust it to wherever
# the file lives before running the notebook on another machine.
path = r'D:\Yelp\vegas_2010.csv'

In [3]:
# Load the CSV at `path`, using its first column as the row index.
df = pd.read_csv(path, index_col=0)

## Similarity

In [4]:
# Token pattern: each match is a maximal run of word characters (`\w+`), so
# punctuation and whitespace act as separators and are dropped.
WORD = re.compile(r"\w+")


def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight mappings.

    Args:
        vec1: Mapping from term to numeric weight (e.g. a ``Counter``).
        vec2: Mapping from term to numeric weight (e.g. a ``Counter``).

    Returns:
        The dot product over the terms present in both mappings divided by
        the product of the two Euclidean norms, as a float. Returns ``0.0``
        when that product of norms is zero (for example, an empty mapping).
    """
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    magnitude = norm1 * norm2

    # Guard against division by zero when either vector has zero length.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0


def text_to_vector(text):
    """Turn a string into a bag-of-words ``Counter``.

    Tokens are the matches of the module-level ``WORD`` pattern in ``text``;
    the returned ``Counter`` maps each token to how often it occurs. Tokens
    are counted exactly as matched (no case folding is applied).
    """
    return Counter(WORD.findall(text))


# Quick sanity check of the helpers on two short example sentences.
text1 = "This is a foo bar sentence ."
text2 = "This sentence is similar to a foo bar sentence ."

vector1, vector2 = text_to_vector(text1), text_to_vector(text2)
cosine = get_cosine(vector1, vector2)

print("Cosine:", cosine)

Cosine: 0.8616404368553293


In [5]:
# Inspect which columns the loaded frame provides.
df.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars_business', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'review_id', 'user_id',
       'stars_review', 'useful', 'funny', 'cool', 'text', 'date'],
      dtype='object')

In [32]:
# Small working sample: the reviewer id and review text of the first 10 rows.
group_df = df[['user_id', 'text']].head(10)

In [33]:
# Display the sample frame.
group_df

Unnamed: 0,user_id,text
85,W7DHyQlY_kXls2iXt-_2Ag,Do you believe in love at first sight? I wasn...
87,6_GviXBB7Bx3lFoveIfFwQ,I give this store a 5 because -\n1. They had a...
93,gUkqdzdD0jhqsjM-AZAWVw,located in the wynn\n\nplus one star...solely ...
110,n9RiwO6NIhB_Uf4xHJtz8A,The shoes are out of this world. Service is t...
112,YBa116q7RFrXZq6a0yVlCQ,Obviously there are a lot of place to purchase...
117,YDkC5VVT8s9NlIZtAr8NUA,So why would I be giving a Fast Food chain loc...
122,FIk4lQQu1eTe2EpzQ4xhBA,"Great burgers, chicken sandwiches, salads, and..."
131,dLrMDYQ9TwfodJxYLfaKDQ,You would think that since this Carl's Jr. is ...
159,H4BiIX0gU-Fc2S487g7z1Q,"Red, orange, yellow, green, blue, purple, pink..."
264,5RLxzxLIY3ZXUUksr_dtXA,"This is a great, quality Taco Bell recently bu..."


In [73]:
def get_similarity_of_group(df):
    """Summarise pairwise cosine similarity between the reviews in ``df``.

    Every unordered pair of rows is compared by converting each row's
    ``text`` to a bag-of-words vector (``text_to_vector``) and scoring the
    pair with ``get_cosine``.

    Rows are paired by position rather than looked up by ``user_id``, so a
    user who appears on several rows no longer breaks the computation; each
    of their reviews is treated as a separate item.

    Args:
        df: DataFrame with a ``text`` column holding one review string per
            row.

    Returns:
        A ``(mean, max)`` tuple of the pairwise cosine similarities. Both
        values are NaN when ``df`` has fewer than two rows, because there
        are no pairs to compare.
    """
    # Tokenise each review once instead of once per pair it takes part in.
    vectors = [text_to_vector(text) for text in df['text']]

    # combinations() yields pairs in row order, matching the order in which
    # pairs of index labels were previously enumerated.
    cosines = [get_cosine(vec_a, vec_b) for vec_a, vec_b in combinations(vectors, 2)]

    # An explicit float dtype keeps the empty case well-defined: mean() and
    # max() of an empty float Series are NaN rather than an error.
    scores = pd.Series(cosines, dtype='float64')

    return scores.mean(), scores.max()

In [75]:
# Mean and maximum pairwise cosine similarity across the sampled reviews.
_mean, _max = get_similarity_of_group(group_df)

In [76]:
# Show the (mean, max) similarity pair computed for the sample.
_mean, _max

(0.2761723762763612, 0.49863003035165515)