In [23]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image
import numpy as np
from tqdm import tqdm
import json
from collections import defaultdict
import cv2
import tarfile

In [20]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
ROOT_DIR_PATH =  "drive/MyDrive/yelp_dataset_orig"
PHOTOTS_JSON_PATH = "drive/MyDrive/yelp_dataset_orig/photos.json"
TAR_FILE_PATH = "drive/MyDrive/yelp_dataset_orig/yelp_photos.tar"
PHOTOS_DIR_PATH = "drive/MyDrive/yelp_dataset_orig/photos"
BUSINESS_JSON_PATH = "drive/MyDrive/yelp_dataset_orig/yelp_academic_dataset_business.json"
REVIEW_JSON_PATH = "drive/MyDrive/yelp_dataset_orig/yelp_academic_dataset_review.json"
OUTPUT_CSV_FILE_PATH = "drive/MyDrive/yelp_dataset_orig/feats_resnet18.csv"
PHOTOS_TAR_FILE = "drive/MyDrive/yelp_dataset_orig/yelp_photos.tar"
#!touch "drive/MyDrive/yelp_dataset/yelp_photos/feat_vgg16.csv"

Mounted at /content/drive


In [3]:
UNVISITED_RESTAURANTS_THRESHOLD = 10
RESTAURANT_CITY_COUNT_THRESHOLD = 50

In [4]:
info_per_category = defaultdict(int)
filtered_info_per_category = defaultdict(int)
business_counted = defaultdict(bool)

null = None
businesses_covered_photos = set()

print("\nReading photos data\n")



Reading photos data



In [5]:
with open(PHOTOTS_JSON_PATH, encoding='utf-8') as f_photos:
    for photo_json_obj in tqdm(f_photos):
        photo_json = json.loads(photo_json_obj)
        info_per_category[photo_json["label"]] += 1
        business_id= photo_json["business_id"]
        if photo_json["label"] == "menu": continue
        businesses_covered_photos.add(business_id)

print("Number of businesses covered by photos : " + str(len(businesses_covered_photos)))


200000it [00:03, 61134.21it/s]

Number of businesses covered by photos : 36709





In [6]:
print("Number of businesses covered by photos : " + str(len(businesses_covered_photos)))

info_per_business = defaultdict(str)
print("\nReading business data\n")

null = None


Number of businesses covered by photos : 36709

Reading business data



In [7]:
with open(BUSINESS_JSON_PATH, encoding='utf-8') as f_businesses:
    for business_json_obj in tqdm(f_businesses):
        business_json = json.loads(business_json_obj)
        business_id = business_json["business_id"]
        if business_id in businesses_covered_photos:
            info_per_business[business_json["business_id"]] = business_json["city"]

info_per_user = defaultdict(list)
businesses_per_city = defaultdict(set)

print("\nBusiness data read completed\n")
print("\nReading review data\n")


160585it [00:06, 23933.73it/s]


Business data read completed


Reading review data






In [9]:
with open(REVIEW_JSON_PATH, encoding='utf-8') as f_reviews:
    for review_json_obj in tqdm(f_reviews):
        review_json = json.loads(review_json_obj)
        review_id = review_json["review_id"]
        business_id = review_json["business_id"]
        user_id = review_json["user_id"]
        city = info_per_business[business_id]
        if user_id and business_id and business_id in businesses_covered_photos:
            info_per_user[user_id].append( (business_id, city) )
            businesses_per_city[city].add(business_id)

for user in info_per_user:
    info_per_user[user] = list(set(info_per_user[user]))


8635403it [02:55, 49115.19it/s]


In [10]:
print("\nReview data read completed\n")
print("\nComputing user/city review count\n")

user_city_count_dict = defaultdict(int)

for user, reviews in tqdm(info_per_user.items()):
    for review_info in reviews:
        city = review_info[1]
        user_city_count_dict[(user, city)] += 1

print("\nTotal user city combination count : " + str(len(user_city_count_dict)) + "\n")
print("\nUser city count computation completed\n")
print("\nFiltering user/city review count\n")



Review data read completed


Computing user/city review count



100%|██████████| 1489351/1489351 [00:08<00:00, 176666.70it/s]


Total user city combination count : 2245054


User city count computation completed


Filtering user/city review count






In [11]:
filtered_user_city_count_dict = defaultdict(int)

for k, v in tqdm(user_city_count_dict.items()):
    univisited_restaurants = len(businesses_per_city[k[1]]) - len(info_per_user[k[0]])
    if v >= RESTAURANT_CITY_COUNT_THRESHOLD and univisited_restaurants >= UNVISITED_RESTAURANTS_THRESHOLD:
        filtered_user_city_count_dict[k] = v

print("\nFiltered user city combination count : " + str(len(filtered_user_city_count_dict)) + "\n")
print("\nUser city count filtering completed\n")
user_city_combinations = set(filtered_user_city_count_dict.keys())
filtered_info_per_user = defaultdict(list)
print("\nCreating filtered data\n")


100%|██████████| 2245054/2245054 [00:02<00:00, 796817.33it/s]


Filtered user city combination count : 5260


User city count filtering completed


Creating filtered data






In [12]:
for user, reviews in tqdm(info_per_user.items()):
    for review in reviews:
        city = review[1]
        if (user, city) in user_city_combinations:
            filtered_info_per_user[user].append(review)
print("\nUser city count filtering based user info data extraction completed\n")
print("\nWriting the extracted data in CSV format\n")

count_instances = 0

businesses_covered = set()


100%|██████████| 1489351/1489351 [00:03<00:00, 401538.15it/s]


User city count filtering based user info data extraction completed


Writing the extracted data in CSV format






In [13]:
for user, reviews in tqdm(filtered_info_per_user.items()):
    for review in reviews:
        #review_id = review[0]
        business_id = review[0]
        city = review[1]
        businesses_covered.add(business_id)
        #f.write(str(user) + "," + str(business_id) + "," + str(city) + "\n")
        count_instances += 1

print("Total Number of businesses in extracted filtered data : " + str(len(businesses_covered)))
print("Total number of (user_id, business_id, city) instances : " + str(count_instances))


100%|██████████| 5116/5116 [00:00<00:00, 11882.08it/s]

Total Number of businesses in extracted filtered data : 24213
Total number of (user_id, business_id, city) instances : 487221





In [14]:

# Load the pretrained model
model = models.resnet18(pretrained=True)

# Use the model object to select the desired layer
layer = model._modules.get('avgpool')
print(model._modules)

# Set model to evaluation mode
model.eval()


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

OrderedDict([('conv1', Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)), ('bn1', BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)), ('relu', ReLU(inplace=True)), ('maxpool', MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)), ('layer1', Sequential(
  (0): BasicBlock(
    (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (1): BasicBlock(
    (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Con

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [15]:
# Image transforms
scaler = transforms.Resize((224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
to_tensor = transforms.ToTensor()


In [24]:
tar = tarfile.open(PHOTOS_TAR_FILE, 'r|')

In [43]:
def get_vector(image_name):
    # 1. Load the image with Pillow library
    try:
      img = Image.open(image_name)
    except:
      return None

    '''
    path = "./photos/" + image_name + ".jpg"
    c = tar.extractfile(path).read()

    img = None

    if sys.getsizeof(c) > 266:
      print(sys.getsizeof(c))
      na = np.frombuffer(c, dtype=np.uint8)
      img = cv2.imdecode(na, cv2.IMREAD_COLOR)
    '''

    # 2. Create a PyTorch Variable with the transformed image
    t_img = Variable(normalize(to_tensor(scaler(img))).unsqueeze(0))
    # 3. Create a vector of zeros that will hold our feature vector
    #    The 'avgpool' layer has an output size of 512
    my_embedding = torch.zeros(512)
    # 4. Define a function that will copy the output of a layer
    def copy_data(m, i, o):
        my_embedding.copy_(o.data.reshape(o.data.size(1)))
    # 5. Attach that function to our selected layer
    h = layer.register_forward_hook(copy_data)
    # 6. Run the model on our transformed image
    model(t_img)
    # 7. Detach our copy function from the layer
    h.remove()
    # 8. Return the feature vector
    return my_embedding.numpy()


In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")

#print(str(res.shape))
#print(len(res))

csv_file =  open(OUTPUT_CSV_FILE_PATH, "w")
count = 0

with open(PHOTOTS_JSON_PATH, encoding='utf-8') as f_photos:
    csv_file.write("photo_id" + "," + "business_id" + "," + "label" + "," + "res_18_feats" + "\n")

    for photo_json_obj in tqdm(f_photos):
        photo_json = json.loads(photo_json_obj)
        photo_id = photo_json["photo_id"]
        business_id = photo_json["business_id"]
        label = photo_json["label"]
        path = PHOTOS_DIR_PATH + "/" + photo_id + ".jpg"
        if label != "menu" and business_id in businesses_covered:
            count += 1
            # Extract the feature from the image
            features = get_vector(path)
            if features is None: continue
            features = np.reshape(features, (1,512))
            # Convert to NumPy Array, Reshape it, and save it to features variable
            #temp = features.cpu().detach().numpy().reshape(-1)
            l = features.tolist()
            s = ",".join([str(v) for v in l])
            s = s[1:-1]
            string_to_write = photo_id + "," + business_id + "," + label + "," + s + "\n"
            csv_file.write(string_to_write)
            
print("No. of iterations required : "+ str(count))

3955it [06:28, 12.66it/s]