In [1]:
import numpy as np
import pandas as pd
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.models import Model
from keras.preprocessing import image
from keras.layers import Flatten, Input, Dense, Lambda
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
from processing import merge_pickles, data_to_pickle
from helper_functions import get_filenames
from sklearn.decomposition import PCA
%matplotlib inline
import config

Using TensorFlow backend.


In [None]:
#Backbone: VGG16 with ImageNet weights and without its fully-connected top
VGG_model = VGG16(include_top=False, weights='imagenet')

#Feed a fixed-size 224x224 RGB input through the backbone and flatten
#the convolutional output into a single vector per image
input_layer = Input(name='image_input', shape=(224, 224, 3))
conv_out = VGG_model(input_layer)
flattened = Flatten()(conv_out)

#Model used below to turn preprocessed images into feature vectors
features_model = Model(outputs=flattened, inputs=input_layer)

In [4]:
#All settings come from the project-level config module.

#Directories: train photos, and where the pickled feature chunks are looked up
TRAIN_DIR, DATA_DIR = config.TRAIN_DIR, config.DATA_DIR

#Table linking every train photo id to its business id
train_photo_to_biz = config.train_photo_to_biz

#Table with the labels of every business
labels = config.labels

#How many images are buffered in RAM before features are flushed to a pickle
max_img_amount = config.max_img_amount

#NOTE: the test-set counterparts (TEST_DIR, test_photo_to_biz) are not set in
#this cell; they used to be hard-coded local paths.

In [None]:
#Extracting train features
#
#Features are buffered in X_train and flushed to a pickle every
#max_img_amount images so the whole set never has to sit in RAM
#(original author's note: 10000 images ~ 8GB RAM).
#
#Every flush passes a range [start, end) with end - start == len(X_train),
#so buffered feature k belongs to row start + k of train_photo_to_biz.
#NOTE(review): this assumes data_to_pickle treats (start, end) as a half-open
#range, as the final call with end == len(train_photo_to_biz) suggests - confirm.

X_train = []
n_train = len(train_photo_to_biz)

for i in tqdm(range(n_train)):

    #Reading the image that corresponds to row i of the photo_to_biz DataFrame
    img_path = TRAIN_DIR + str(train_photo_to_biz.photo_id[i]) + '.jpg'
    img = np.expand_dims(image.img_to_array(image.load_img(img_path, target_size=(224, 224))), axis=0)

    #Preprocessing the image and extracting its features
    X_train.append(features_model.predict(preprocess_input(img)))

    #Flush as soon as the buffer holds exactly max_img_amount rows.
    #(Previously the flush fired at i % max_img_amount == 0, after row i was
    #already appended, which shifted every chunk by one row.)
    if len(X_train) == max_img_amount:
        data_to_pickle(X_train, train_photo_to_biz, labels, i + 1 - max_img_amount, i + 1, True)
        X_train = []

#Flush the trailing partial chunk; skipped when the total is an exact
#multiple of max_img_amount and nothing is left in the buffer
if X_train:
    data_to_pickle(X_train, train_photo_to_biz, labels, n_train - len(X_train), n_train, True)

del X_train

In [None]:
#Extracting test features
#
#The test directory / photo table are not defined by the configuration cell,
#so the original loop failed with NameError on a fresh kernel. Use the notebook
#globals when they exist, otherwise fall back to optional config attributes,
#and skip the extraction when the test set is not configured at all.
try:
    test_dir, test_photos = TEST_DIR, test_photo_to_biz
except NameError:
    test_dir = getattr(config, "TEST_DIR", None)
    test_photos = getattr(config, "test_photo_to_biz", None)

X_test = []

if test_dir is None or test_photos is None:
    print("Test set is not configured (TEST_DIR / test_photo_to_biz); skipping test feature extraction.")
else:
    n_test = len(test_photos)

    for i in tqdm(range(n_test)):

        #Reading the image that corresponds to row i of the photo_to_biz DataFrame
        img_path = test_dir + str(test_photos.photo_id[i]) + '.jpg'
        img = np.expand_dims(image.img_to_array(image.load_img(img_path, target_size=(224, 224))), axis=0)

        #Preprocessing the image and extracting its features
        X_test.append(features_model.predict(preprocess_input(img)))

        #Flush as soon as the buffer holds exactly max_img_amount rows, so the
        #buffer always matches the [start, end) range handed to data_to_pickle.
        #NOTE(review): assumes data_to_pickle treats (start, end) as half-open - confirm.
        if len(X_test) == max_img_amount:
            data_to_pickle(X_test, test_photos, None, i + 1 - max_img_amount, i + 1, False)
            X_test = []

    #Flush the trailing partial chunk (the original never wrote it)
    if X_test:
        data_to_pickle(X_test, test_photos, None, n_test - len(X_test), n_test, False)

#Freeing the buffer, as the train extraction does
del X_test

In [None]:
#Listing everything get_filenames reports for the data directory
data_files = get_filenames(DATA_DIR)

#Keeping only names that end in "csv" and mention "train_data"
filtered_data_files = [
    name for name in data_files
    if name[-3:] == "csv" and "train_data" in name
]

In [None]:
#Start from the first pickled chunk ...
main_df = pd.read_pickle(filtered_data_files[0])

#... and fold every remaining chunk into it, one file at a time
for chunk_path in filtered_data_files[1:]:
    main_df = merge_pickles(chunk_path, main_df)

In [None]:
#Taking mean vector for every business: one row per business_id with the
#mean of its photos' feature vectors
grouped_df = main_df.groupby("business_id")["features"].apply(np.mean).reset_index()

#Attaching labels by key. The previous version sorted the label table and
#assigned it by positional index, which silently mislabels businesses as soon
#as the two tables do not contain exactly the same ids in the same order.
#A left merge keeps every business; those without a label get NaN.
grouped_df = grouped_df.merge(labels[["business_id", "labels"]], on="business_id", how="left")

#Clearing: drop every business that has a missing value in any column
#(replaces Series.nonzero(), which no longer exists in current pandas)
cleared_df = grouped_df.dropna()

#NOTE: the file is a pickle despite its ".csv" name
cleared_df.to_pickle("embeddings.csv")

In [None]:
#Dimensionality reduction
#Loading the per-business embeddings. This cell used to read "result_df.csv",
#a file nothing in this notebook writes; the grouping cell saves the frame with
#the "features" / "labels" / "business_id" columns as "embeddings.csv".
#NOTE(review): if "result_df.csv" is produced by another step, point this back at it.
df = pd.read_pickle("embeddings.csv")

#One row per business: squeeze() drops singleton axes from every stored vector
#(presumably a leading batch axis of size 1 - confirm against data_to_pickle)
arr = np.array([feature.squeeze() for feature in df["features"]])

#A float n_components in (0, 1) asks PCA for the number of components needed
#to explain that share of the variance (here 99.9%)
pca = PCA(.999)

pca.fit(arr)

#Number of components actually kept
print(pca.n_components_)


In [None]:
#Projecting the per-business vectors onto the fitted principal components
new_df = pca.transform(arr)

#Rebuilding the frame: reduced vectors alongside the untouched labels and ids
reduced_columns = {
    "features": list(new_df),
    "labels": df["labels"],
    "business_id": df["business_id"],
}
pd_df = pd.DataFrame(reduced_columns)

#Saved as a pickle despite the ".csv" name
pd_df.to_pickle("reduced_result.csv")