In [1]:
import pymongo
import pandas as pd
import z534
import SemDiff as sd

In [2]:
# Obtain the database handle from the z534 helper module; the cells below
# query its `businesses` collection through this `db` object.
db = z534.get_db()

In [3]:
# Show a single business document whose `list_of_categories` contains
# 'Restaurants', to see what fields a record carries.
db.businesses.find_one({'list_of_categories': 'Restaurants'})

{'_id': ObjectId('5ca02928a157a4323849ad06'),
 'business_id': 'QXAEGFB4oINsVuTFxEYKFQ',
 'name': 'Emerald Chinese Restaurant',
 'address': '30 Eglinton Avenue W',
 'city': 'Mississauga',
 'state': 'ON',
 'postal_code': 'L5R 3E7',
 'latitude': 43.6054989743,
 'longitude': -79.652288909,
 'stars': 2.5,
 'review_count': 128,
 'is_open': 1,
 'attributes': {'RestaurantsReservations': True,
  'GoodForMeal': {'dessert': False,
   'latenight': False,
   'lunch': True,
   'dinner': True,
   'brunch': False,
   'breakfast': False},
  'BusinessParking': {'garage': False,
   'street': False,
   'validated': False,
   'lot': True,
   'valet': False},
  'Caters': True,
  'NoiseLevel': 'loud',
  'RestaurantsTableService': True,
  'RestaurantsTakeOut': True,
  'RestaurantsPriceRange2': 2,
  'OutdoorSeating': False,
  'BikeParking': False,
  'Ambience': {'romantic': False,
   'intimate': False,
   'classy': False,
   'hipster': False,
   'divey': False,
   'touristy': False,
   'trendy': False,
   'ups

In [4]:
# Show a single business document whose `list_of_categories` contains 'Hotels'.
db.businesses.find_one({'list_of_categories': 'Hotels'})

{'_id': ObjectId('5ca02928a157a4323849ad34'),
 'business_id': 'Jff5hTK1ZMLKS06sOSNWdA',
 'name': 'Radisson Hotel & Conference Centre Calgary Airport',
 'address': '6620 36 Street NE',
 'city': 'Calgary',
 'state': 'AB',
 'postal_code': 'T3J 4C8',
 'latitude': 51.1118915249,
 'longitude': -113.9804801532,
 'stars': 3.5,
 'review_count': 14,
 'is_open': 1,
 'attributes': {'RestaurantsPriceRange2': 2, 'WiFi': 'free'},
 'categories': 'Hotels & Travel, Hotels, Event Planning & Services',
 'hours': None,
 'list_of_categories': ['Hotels & Travel',
  'Hotels',
  'Event Planning & Services'],
 'attributes_flat': {'RestaurantsPriceRange2': 2, 'WiFi': 'free'},
 'attributes_parsed': {'RestaurantsPriceRange2': 2, 'WiFi': 'free'}}

In [5]:
# Show a single business document whose `list_of_categories` contains 'Shopping'.
db.businesses.find_one({'list_of_categories': 'Shopping'})

{'_id': ObjectId('5ca02928a157a4323849ad08'),
 'business_id': 'HhyxOkGAM07SRYtlQ4wMFQ',
 'name': 'Queen City Plumbing',
 'address': '4209 Stuart Andrew Blvd, Ste F',
 'city': 'Charlotte',
 'state': 'NC',
 'postal_code': '28217',
 'latitude': 35.1900119,
 'longitude': -80.8872232,
 'stars': 4.0,
 'review_count': 4,
 'is_open': 1,
 'attributes': {'BusinessAcceptsBitcoin': False,
  'ByAppointmentOnly': True,
  'BusinessAcceptsCreditCards': True},
 'categories': 'Plumbing, Shopping, Local Services, Home Services, Kitchen & Bath, Home & Garden, Water Heater Installation/Repair',
 'hours': {'Monday': '7:0-23:0',
  'Tuesday': '7:0-23:0',
  'Wednesday': '7:0-23:0',
  'Thursday': '7:0-23:0',
  'Friday': '7:0-23:0',
  'Saturday': '7:0-23:0',
  'Sunday': '7:0-23:0'},
 'list_of_categories': ['Plumbing',
  'Shopping',
  'Local Services',
  'Home Services',
  'Kitchen & Bath',
  'Home & Garden',
  'Water Heater Installation/Repair'],
 'attributes_flat': {'BusinessAcceptsBitcoin': False,
  'ByAppoint

In [6]:
# Fetch every business document for each of the three categories compared in
# this notebook. A document matches when its `list_of_categories` field
# contains the category name.
restaurants_cursor = db.businesses.find({'list_of_categories': 'Restaurants'})
hotels_cursor = db.businesses.find({'list_of_categories': 'Hotels'})
store_cursor = db.businesses.find({'list_of_categories': 'Shopping'})

# list() drains a cursor in order, which is exactly what the manual
# append-loops did, without the boilerplate.
restaurants = list(restaurants_cursor)
hotels = list(hotels_cursor)
stores = list(store_cursor)

# Report how many businesses fall in each category (one count per line; the
# stray leading space before "Number of stores" has been removed).
print("Number of hotels: {}\nNumber of restaurants: {}\nNumber of stores: {}"
      .format(len(hotels), len(restaurants), len(stores)))

Number of hotels: 2625
Number of restaurants: 59371
 Number of stores: 31878


In [7]:
# Collect the reviews for each business category through the z534 helper.
# A progress line is printed before each query so the reader can see which
# one is currently running.
print('querying hotel reviews')
hotel_reviews = z534.mongo_utils.aggregate_reviews(db, business_category='Hotels')

print('querying restaurant reviews')
restaurant_reviews = z534.mongo_utils.aggregate_reviews(db, business_category='Restaurants')

print('querying shopping reviews')
store_reviews = z534.mongo_utils.aggregate_reviews(db, business_category='Shopping')

# Summarise how many reviews came back per category.
print(
    "\nNumber of hotel reviews: {}\nNumber of restaurant reviews: {}\nNumber of store reviews: {}".format(
        *map(len, (hotel_reviews, restaurant_reviews, store_reviews))
    )
)

querying hotel reviews
querying restaurant reviews
querying shopping reviews

Number of hotel reviews: 231380
Number of restaurant reviews: 4159944
Number of store reviews: 468741


In [8]:
# Upper bound on the number of reviews kept per category. Using one named
# constant (instead of repeating the literal) keeps the three corpora capped
# at the same size and makes the cap easy to tune.
MAX_REVIEWS_PER_CATEGORY = 230000

# NOTE: slicing keeps the *first* N reviews in the order the query returned
# them; it is a truncation, not a random sample.
hotel_reviews_filt = hotel_reviews[:MAX_REVIEWS_PER_CATEGORY]
restaurant_reviews_filt = restaurant_reviews[:MAX_REVIEWS_PER_CATEGORY]
store_reviews_filt = store_reviews[:MAX_REVIEWS_PER_CATEGORY]

In [9]:
# Wrap each truncated review collection in its own DataFrame
# (hotel, restaurant, store — in that order).
hotel_df, restaurant_df, store_df = (
    pd.DataFrame(reviews)
    for reviews in (hotel_reviews_filt, restaurant_reviews_filt, store_reviews_filt)
)

In [10]:
# Write each review frame to its CSV file under the project data directory.
for _frame, _csv_path in (
    (hotel_df, '~/Documents/semantic_difference/data/hotel_text.csv'),
    (restaurant_df, '~/Documents/semantic_difference/data/restaurant_text.csv'),
    (store_df, '~/Documents/semantic_difference/data/store_text.csv'),
):
    _frame.to_csv(_csv_path)

In [11]:
# Load the previously saved Word2Vec models from ../data — one per business
# category (hotel, restaurant, store), in that order.
from gensim.models import Word2Vec

hotel_model, restaurant_model, store_model = (
    Word2Vec.load(model_path)
    for model_path in (
        "../data/hotel_model.wv",
        "../data/restaurants_model.wv",
        "../data/store_model.wv",
    )
)

In [15]:
# Antonym pairs; each pair is passed as the axis argument of
# sd.project_word_on_axis below.
antonyms = [
    ['good', 'bad'], ['always', 'never'], ['higher', 'lower'],
    ['certain', 'impossible'], ['strong', 'weak'], ['very', 'slightly'],
    ['similar', 'unlike'], ['unexpected', 'expected'], ['outside', 'inside'],
    ['start', 'stop'], ['worst', 'best'], ['happy', 'sad'],
    ['everywhere', 'nowhere'], ['common', 'exceptional'], ['fast', 'slow'],
    ['average', 'poor'], ['man', 'woman'],
]

# Target words to project onto every antonym axis.
words = ['customer', 'host', 'food', 'bed', 'cashier', 'parking', 'room', 'table', 'drink']
models = [hotel_model, restaurant_model]
models_names = ["hotel", "restaurant"]

# Parallel result columns: one entry per (word, antonym pair) combination.
hotel_list, restaurant_list = [], []
ant_names, word_list = [], []

# Walk the pairs word-major (all axes for the first word, then the next word),
# querying the hotel model first and the restaurant model second each time.
# k = 3 is forwarded unchanged to SemDiff — TODO confirm its meaning there.
for word, ant in ((w, pair) for w in words for pair in antonyms):
    hotel_list.append(sd.project_word_on_axis(hotel_model, word, ant, k=3))
    restaurant_list.append(sd.project_word_on_axis(restaurant_model, word, ant, k=3))
    word_list.append(word)
    # Axis label in "first-second" form, e.g. "good-bad".
    ant_names.append("{}-{}".format(*ant))

In [20]:
# Assemble the projection results into one table: a row per
# (word, antonym pair), indexed by the antonym-pair label, with one score
# column per model. Preview the first rows.
df = pd.DataFrame(
    dict(antonym=ant_names, hotel=hotel_list, restaurant=restaurant_list, word=word_list),
    index=ant_names,
)
df.head()

Unnamed: 0,antonym,hotel,restaurant,word
good-bad,good-bad,0.163979,0.176224,client
always-never,always-never,0.020885,0.216944,client
higher-lower,higher-lower,0.016677,0.147421,client
certain-impossible,certain-impossible,-0.016828,-0.124972,client
strong-weak,strong-weak,0.091252,-0.153894,client


In [21]:
# Persist the word/axis projection table as CSV. The index (the antonym-pair
# labels) is written too, as the first column of the file.
df.to_csv('../data/yelp_semantic_word_axes.csv')

In [17]:
# Shared-word list for the hotel and restaurant models, as returned by SemDiff
# (topn_words=10000 is forwarded as-is — TODO confirm its exact meaning).
shared = sd.get_shared_words(hotel_model, restaurant_model, topn_words=10000)

# One similarity frame per target word, in the order the words are listed.
frames = [
    sd.model_term_similarity(hotel_model, restaurant_model, word, shared)
    for word in words
]

# Stack the per-word frames into a single table and save it as CSV.
hotel_restaurant_model = pd.concat(frames)
hotel_restaurant_model.to_csv('../data/hotel_restaurant_interword_similarity.csv')

In [18]:
# Preview the first rows of the concatenated per-word similarity table.
hotel_restaurant_model.head()

Unnamed: 0,x,y,word
0,-0.192539,-0.21243,client
1,0.052198,0.005734,client
2,0.219695,0.18735,client
3,-0.113341,0.016733,client
4,0.32703,0.205634,client
