# 이미지 Feature vector를 활용해보기!

- 여러 논문에서 소개된것처럼 pre-trained CNN으로 image features를 추출해봅니다.
- image features를 비교하여 실제로 비슷한지 판단하고, rating정보와 함께 분석해봅니다.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
path = '/content/drive/MyDrive/추천시스템 강의 (패스트캠퍼스)/data/amazon_reviews'

## Load dataset

1. AMAZON_FASHION_5.json
2. All_Beauty_5.json
3. Luxury_Beauty_5.json

In [9]:
import os, json
import pandas as pd

In [10]:
def load_json(filename):
  data = []
  with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
      for l in f:
        data.append(json.loads(l.strip()))

  df = pd.DataFrame.from_dict(data)

  # get rows which contains image 
  df = df[~df['image'].isnull()]

  return df

In [13]:
fashion_df = load_json('AMAZON_FASHION_5.json')
print(fashion_df.shape)
print(fashion_df.head())

(106, 12)
     overall  verified  ... vote                                              image
164      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
172      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
179      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
192      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
197      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...

[5 rows x 12 columns]


In [14]:
beauty_df = load_json('All_Beauty_5.json')
print(beauty_df.shape)
print(beauty_df.head())

(98, 12)
    overall  verified  ... vote                                              image
19      5.0      True  ...    5  [https://images-na.ssl-images-amazon.com/image...
20      5.0      True  ...    4  [https://images-na.ssl-images-amazon.com/image...
21      5.0      True  ...    4  [https://images-na.ssl-images-amazon.com/image...
34      1.0      True  ...    2  [https://images-na.ssl-images-amazon.com/image...
47      5.0      True  ...   20  [https://images-na.ssl-images-amazon.com/image...

[5 rows x 12 columns]


In [15]:
luxury_df = load_json('Luxury_Beauty_5.json')
print(luxury_df.shape)
print(luxury_df.head())

(617, 12)
     overall  verified  ... vote                                              image
68       5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
75       5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
86       5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
88       5.0      True  ...    9  [https://images-na.ssl-images-amazon.com/image...
104      5.0     False  ...    2  [https://images-na.ssl-images-amazon.com/image...

[5 rows x 12 columns]


## Download images

In [16]:
from tqdm import tqdm
import requests

In [17]:
def download_images(path, df, category):
  folder_path = os.path.join(path, category)

  if not os.path.exists(folder_path):
    os.makedirs(folder_path)

  for index in tqdm(df.index):
    url_list = df['image'].loc[index]
    for url_index, url in enumerate(url_list):
      if not os.path.exists(os.path.join(folder_path, f'{str(index)}_{str(url_index)}.jpg')):
        img_data = requests.get(url).content
        with open(os.path.join(folder_path, f'{str(index)}_{str(url_index)}.jpg'), 'wb') as handler:
          handler.write(img_data)

  print(f'{category}: {len(df.index)} images downloaded or already exist...')

In [18]:
download_images(path, beauty_df, 'beauty')

100%|██████████| 98/98 [00:16<00:00,  5.93it/s]

beauty: 98 images downloaded or already exist...





In [19]:
download_images(path, fashion_df, 'fashion')

100%|██████████| 106/106 [00:16<00:00,  6.45it/s]

fashion: 106 images downloaded or already exist...





In [20]:
download_images(path, luxury_df, 'luxury')

100%|██████████| 617/617 [03:22<00:00,  3.04it/s]

luxury: 617 images downloaded or already exist...





## Use pre-trained CNN

In [21]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image

In [22]:
# Load the pretrained model
model = models.resnet18(pretrained=True)
# Use the model object to select the desired layer
layer = model._modules.get('avgpool')

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


HBox(children=(FloatProgress(value=0.0, max=46830571.0), HTML(value='')))




In [23]:
# Set model to evaluation mode
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [24]:
scaler = transforms.Scale((224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
to_tensor = transforms.ToTensor()



In [25]:
def get_vector(image_name):
    # 1. Load the image with Pillow library
    img = Image.open(image_name)
    # 2. Create a PyTorch Variable with the transformed image
    t_img = Variable(normalize(to_tensor(scaler(img))).unsqueeze(0))
    # 3. Create a vector of zeros that will hold our feature vector
    #    The 'avgpool' layer has an output size of 512
    my_embedding = torch.zeros(512)
    # 4. Define a function that will copy the output of a layer
    def copy_data(m, i, o):
      my_embedding.copy_(o.data.reshape(o.data.size(1)))
    # 5. Attach that function to our selected layer
    h = layer.register_forward_hook(copy_data)
    # 6. Run the model on our transformed image
    model(t_img)
    # 7. Detach our copy function from the layer
    h.remove()
    # 8. Return the feature vector
    return my_embedding.cpu().detach().numpy()

In [None]:
# Test
category = 'beauty'
for image_file in os.listdir(os.path.join(path, category))[:2]:
  print(f"{image_file} feature vectors")
  print(get_vector(os.path.join(path, category+'/'+image_file)))

## Preprocess dataset

- Remove unnecessary columns
- Remove all other columns except `overall`, `reviewerID`, `asin`, `image` 
- Create new column with image filename

In [27]:
def add_image_filenames(category, df):
  # Remove unnecessary columns
  df = df[['overall','reviewerID', 'asin', 'image']]

  filenames = []
  for row_index in df.index:
    each_files = []
    for idx in range(len(df.loc[row_index]['image'])):
      each_files.append(os.path.join(path, category+'/'+f'{row_index}_{idx}.jpg'))
    filenames.append(each_files)

  # Add new column
  df.drop('image', axis=1, inplace=True)
  df['image_filename'] = list(filenames)

  return df


In [28]:
luxury_df = add_image_filenames('luxury', luxury_df)
beauty_df = add_image_filenames('beauty', beauty_df)
fashion_df = add_image_filenames('fashion', fashion_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Combine into one dataframe

- `luxury_df`, `fashion_df`, beauty_df

In [29]:
luxury_df.head()

Unnamed: 0,overall,reviewerID,asin,image_filename
68,5.0,A2BHOZILR7SY9,B000142FVW,[/content/drive/MyDrive/추천시스템 강의 (패...
75,5.0,ACMSQCH1H7JZD,B000142FVW,[/content/drive/MyDrive/추천시스템 강의 (패...
86,5.0,A2L77YQRAEA1YZ,B000142FVW,[/content/drive/MyDrive/추천시스템 강의 (패...
88,5.0,A28W77RPDZK7AZ,B00014351Q,[/content/drive/MyDrive/추천시스템 강의 (패...
104,5.0,A2IV70BWQBUF32,B00014351Q,[/content/drive/MyDrive/추천시스템 강의 (패...


In [30]:
# 이미지 파일 이름 저장한 곳에 맞게 재 변경
data_list = []
dataframe_list = [('luxury',luxury_df), ('beauty', beauty_df), ('fashion', fashion_df)]
for dataframe in dataframe_list:
  category = dataframe[0]
  df = dataframe[1]
  for index, row in df.iterrows():
    for filename in row['image_filename']:
      data_tuple = (category, row['overall'], row['reviewerID'], row['asin'], filename)
      data_list.append(data_tuple)

In [31]:
# 데이터 합치기
combined_df = pd.DataFrame(data=data_list,columns=['category', 'overall', 'reviewerID', 'asin', 'filename'])
print(combined_df.head())
print(combined_df.shape)

  category  ...                                           filename
0   luxury  ...  /content/drive/MyDrive/추천시스템 강의 (패ᄉ...
1   luxury  ...  /content/drive/MyDrive/추천시스템 강의 (패ᄉ...
2   luxury  ...  /content/drive/MyDrive/추천시스템 강의 (패ᄉ...
3   luxury  ...  /content/drive/MyDrive/추천시스템 강의 (패ᄉ...
4   luxury  ...  /content/drive/MyDrive/추천시스템 강의 (패ᄉ...

[5 rows x 5 columns]
(1368, 5)


### Get image feature vectors

In [32]:
combined_df['image_vec'] = combined_df['filename'].apply(lambda x: get_vector(x)) #cnn에서 나온 이미지 백터들을 채워줌
combined_df.to_csv(os.path.join(path, 'image_dataset.csv'), sep='\t')

In [33]:
combined_df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.55111164, 0.8644801, 0.101986505, 0.2385793..."
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.5448725, 1.1352522, 0.14166662, 0.29448575,..."
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.951097, 1.2491941, 0.17508712, 0.6364965, 0..."
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.8647723, 0.4211385, 0.17356627, 1.435112, 0..."
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.2491897, 0.44804502, 1.3037109, 1.6371988, ..."


## K-means clustering

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import numpy as np

In [35]:
# csv 파일 읽은걸 빈칸이 있는거를 0으로 처리, 차원 512 차원인지 확인
def check_vector(vector):
  return np.array([0.0 if str(x) == '' else float(x) for x in vector])[:512]

In [36]:
# 이미지 백터를 float로 바꿈
df = pd.read_csv(os.path.join(path, 'image_dataset.csv'), sep='\t', index_col=0,
                 converters={"image_vec": lambda x: x.strip("[]").replace('\n','').split(" ")})


In [37]:
df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[5.51111639e-01, 8.64480078e-01, 1.01986505e-0..."
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.54487252e+00, 1.13525224e+00, 1.41666621e-0..."
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[9.51097012e-01, 1.24919415e+00, 1.75087124e-0..."
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[8.64772320e-01, 4.21138495e-01, 1.73566267e-0..."
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.24918973e+00, 4.48045015e-01, 1.30371094e+0..."


In [38]:
df['image_vec'] = df['image_vec'].apply(lambda x: check_vector(x))

In [39]:
df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.551111639, 0.864480078, 0.101986505, 0.2385..."
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.54487252, 1.13525224, 0.141666621, 0.294485..."
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.951097012, 1.24919415, 0.175087124, 0.63649..."
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.86477232, 0.421138495, 0.173566267, 1.43511..."
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.24918973, 0.448045015, 1.30371094, 1.637198..."


In [40]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1234)
print(train_df.shape) #1094에 총 3개의 카테고리가 들어가 있음
print(test_df.shape)

(1094, 6)
(274, 6)


In [41]:
X_train = np.array([list(x) for x in train_df['image_vec'].values])

In [42]:
X_train[:3]

array([[0.77896351, 0.42345953, 0.40614811, ..., 0.3404167 , 0.75824285,
        0.12758855],
       [0.18728934, 0.81857067, 0.4344621 , ..., 0.15579861, 0.25018868,
        1.37156582],
       [0.03194894, 1.10710621, 0.90613866, ..., 0.03438341, 0.09282333,
        2.05421519]])

In [43]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_train)

In [44]:
kmeans.labels_

array([1, 2, 0, ..., 1, 2, 1], dtype=int32)

In [45]:
test_df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
266,luxury,5.0,A24YNYNC6QJNBN,B002B4540O,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.854530394, 1.85690022, 1.03045893, 0.185951..."
1088,beauty,5.0,A245UNW3PI53NG,B0009RF9DW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.13625407, 0.0897668004, 0.442606211, 1.3276..."
488,luxury,4.0,A25QBCHO0KFT0P,B00B95PWYE,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.035534475, 1.64337683, 0.0617778897, 0.0301..."
1087,beauty,5.0,A85ENSL5HBBZF,B0009RF9DW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.909578383, 0.4232274, 1.81325567, 0.2379740..."
1152,beauty,4.0,A25QBCHO0KFT0P,B0010ZBORW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.35672152, 0.251045316, 1.9737004, 0.2126619..."


### Evaluation

In [49]:
# kmeans.predict([test_df['image_vec'].iloc[0]])
test_df['prediction'] = test_df['image_vec'].apply(lambda x: kmeans.predict([x])[0])
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec,prediction
266,luxury,5.0,A24YNYNC6QJNBN,B002B4540O,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.854530394, 1.85690022, 1.03045893, 0.185951...",2
1088,beauty,5.0,A245UNW3PI53NG,B0009RF9DW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.13625407, 0.0897668004, 0.442606211, 1.3276...",2
488,luxury,4.0,A25QBCHO0KFT0P,B00B95PWYE,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.035534475, 1.64337683, 0.0617778897, 0.0301...",0
1087,beauty,5.0,A85ENSL5HBBZF,B0009RF9DW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.909578383, 0.4232274, 1.81325567, 0.2379740...",2
1152,beauty,4.0,A25QBCHO0KFT0P,B0010ZBORW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.35672152, 0.251045316, 1.9737004, 0.2126619...",1


In [50]:
test_df.groupby('category')['prediction'].count()

category
beauty      35
fashion     28
luxury     211
Name: prediction, dtype: int64

In [47]:
test_df.groupby('category').count()

Unnamed: 0_level_0,overall,reviewerID,asin,filename,image_vec
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
beauty,35,35,35,35,35
fashion,28,28,28,28,28
luxury,211,211,211,211,211


In [51]:
test_df.groupby('prediction').count() #구분이 잘 안됨. 비슷한 애들끼리 묶어놨기 때문.

Unnamed: 0_level_0,category,overall,reviewerID,asin,filename,image_vec
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,87,87,87,87,87,87
1,79,79,79,79,79,79
2,108,108,108,108,108,108


In [52]:
# 럭셔리로 봤을때 카테고리별로 예측이 어떻게 됐는지
print(test_df[(test_df.prediction == 0) & (test_df.category == 'luxury')].shape)
print(test_df[(test_df.prediction == 1) & (test_df.category == 'luxury')].shape)
print(test_df[(test_df.prediction == 2) & (test_df.category == 'luxury')].shape)

(67, 7)
(64, 7)
(80, 7)


## K-Nearest Neighbors

In [53]:
from sklearn.neighbors import KNeighborsClassifier

In [54]:
neigh = KNeighborsClassifier(n_neighbors=3) #이거는 지도 학습

- `category`가 아닌 `overall`로 간단하게 평점 예측하기

In [55]:
y_train = train_df['overall'].values
y_train

array([4., 4., 5., ..., 5., 4., 5.])

In [56]:
neigh.fit(X_train, y_train) #레이블 부여

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [57]:
test_df['prediction'] = test_df['image_vec'].apply(lambda x: neigh.predict([x])[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [58]:
test_df.head() #비교적 예측이 나쁘지 않음

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec,prediction
266,luxury,5.0,A24YNYNC6QJNBN,B002B4540O,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.854530394, 1.85690022, 1.03045893, 0.185951...",5.0
1088,beauty,5.0,A245UNW3PI53NG,B0009RF9DW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.13625407, 0.0897668004, 0.442606211, 1.3276...",5.0
488,luxury,4.0,A25QBCHO0KFT0P,B00B95PWYE,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.035534475, 1.64337683, 0.0617778897, 0.0301...",4.0
1087,beauty,5.0,A85ENSL5HBBZF,B0009RF9DW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[0.909578383, 0.4232274, 1.81325567, 0.2379740...",5.0
1152,beauty,4.0,A25QBCHO0KFT0P,B0010ZBORW,/content/drive/MyDrive/추천시스템 강의 (패ᄉ...,"[1.35672152, 0.251045316, 1.9737004, 0.2126619...",4.0


In [59]:
test_df[test_df.overall == test_df.prediction].count()

category      162
overall       162
reviewerID    162
asin          162
filename      162
image_vec     162
prediction    162
dtype: int64

In [60]:
len(test_df)

274

In [61]:
174/274

0.635036496350365