In [3]:
import operator
import sys
import pyspark
import numpy as np
from pyspark.sql import SparkSession
from collections import defaultdict

In [4]:
PREFIX = "gs://book-covers-e6893/"
reviews_file = "Books_5.json"
meta_file = "meta_Books.json"

In [5]:
spark = SparkSession.builder \
    .master("local") \
    .appName("covers") \
    .getOrCreate();
sc = spark.sparkContext

In [6]:
from google.cloud import bigquery
import subprocess
client = bigquery.Client()

In [7]:
sql = """
SELECT *
FROM
    `eecs-e6893-book-cover.book_cover_data.image_data`
"""
df = client.query(sql).to_dataframe()

In [8]:
sdf = spark.createDataFrame(df)

In [9]:
sdf.printSchema()

root
 |-- file_number: string (nullable = true)
 |-- top_color_R: double (nullable = true)
 |-- top_color_G: double (nullable = true)
 |-- top_color_B: double (nullable = true)
 |-- brightness: double (nullable = true)
 |-- colorfullness: double (nullable = true)



In [10]:
data = sdf.rdd.map(lambda x : (x[0], (x[1], x[2], x[3], x[4], x[5]))).collect()

In [11]:
print(len(data))
data[0]

57000


('0557080398', (0.56, 0.56, 0.56, 8.35, 0.0))

In [12]:
test2 = sdf.rdd.map(lambda x : (x[1], x[2], x[3], x[4], x[5])).collect()
test2[3]

(251.97, 251.97, 251.97, 197.87, 0.0)

In [13]:
from sklearn.neighbors import NearestNeighbors

In [14]:
knn = NearestNeighbors()
knn.fit(test2)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [15]:
import keras.backend.tensorflow_backend as tb
from keras.models import load_model

Using TensorFlow backend.


In [16]:
cnn_model = load_model("/tmp/bookcovers/models/cover_cnn_model.h5")

In [17]:
from sklearn.externals import joblib 
from sklearn.neighbors import NearestNeighbors
from google.cloud import bigquery
import pandas as pd 
import numpy as np
from PIL import Image, ImageStat
import numpy as np
import math
import urllib.request



In [18]:
def show_color(colors):
    fig,ax = plt.subplots()
    currentAxis = plt.gca()
    x=0
    y=0
    width = 1/len(colors)
    for rgb in colors:
        colour = binascii.hexlify(bytearray(int(c) for c in rgb)).decode('ascii')
        colour = '#'+colour
        currentAxis.add_patch(Rectangle((x, y), width, 1, alpha=1, facecolor=colour))
        x=x+width

def dominant_colors(ar):
    NUM_CLUSTERS = 5
    codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
    vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
    index_max = scipy.argmax(counts)                    # find most frequent
    peak = codes[index_max]
    return codes , peak

def image_colorfulness(image):
    R = np.array([x[0] for x in image])
    G = np.array([x[1] for x in image])
    B = np.array([x[2] for x in image])
    rg = np.absolute(R - G)
    yb = np.absolute(0.5 * (R + G) - B)
    (rbMean, rbStd) = (np.mean(rg), np.std(rg))
    (ybMean, ybStd) = (np.mean(yb), np.std(yb))
    stdRoot = np.sqrt((rbStd ** 2) + (ybStd ** 2))
    meanRoot = np.sqrt((rbMean ** 2) + (ybMean ** 2))
    return stdRoot + (0.3 * meanRoot)

def image_brightness(im):
   stat = ImageStat.Stat(im)
   r,g,b = stat.mean
   return math.sqrt(0.241*(r**2) + 0.691*(g**2) + 0.068*(b**2))

def image_features(im):
    ar = np.asarray(im)
    shape = ar.shape
    ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
    
    colorfullness = image_colorfulness(ar)
    bright = image_brightness(im)
    codes, peak = dominant_colors(ar)
    
    return [round(peak[0],2),round(peak[2],2),round(peak[2],2),round(bright,2),round(colorfullness,2)]

def read_image_for_cnn(im):
    outputImage = np.zeros((64, 64, 3), dtype="uint8")
    outputImage[0:64, 0:64] = im.resize((64, 64))

    return np.array([outputImage]) / 255.0

In [19]:
def Convert(tup, di): 
    for a, b in tup: 
        di[b] = a
    return di 

In [90]:
indexedBooks = pd.read_csv("/tmp/bookcovers/models/indexed_books.csv")
imageRatings = pd.read_csv("/tmp/bookcovers/data/image_ratings.csv")

booksFeaturesList = list(indexedBooks.itertuples(index=False, name=None))
ratingsList = list(imageRatings.itertuples(index=False, name=None))

ratingsDict = {}
Convert(ratingsList, ratingsDict)

In [57]:
results = []

In [58]:
cnt = 0 

In [None]:
while cnt < len(booksFeaturesList):
    imageNum =  booksFeaturesList[cnt][1]
    imageFile = "/tmp/bookcovers/covers/224x224/" + imageNum +'.jpg'
    im = Image.open(imageFile)
    testImage = image_features(im)
    
    d,i = knn.kneighbors([testImage])
    predictedRating = cnn_model.predict(read_image_for_cnn(im))[0][0] * 5
    rating = ratingsDict.get(imageNum, predictedRating)
    if imageNum not in ratingsDict:
        ratingsDict[imageNum] = rating
        
    tmpList = []
    for val in np.array(data)[i[0]]:
        tmpList.append(val[0])
    results.append((booksFeaturesList[cnt][1], rating, predictedRating, tmpList))

    if cnt % 1000 == 0:
        print("Processing File "+str(cnt)+" of "+str(len(booksFeaturesList))+'\n')
    cnt+=1
    if cnt % 5000 == 0:
        print(results)
        insert_into_bigquery(results)
        results = []

print(results)
insert_into_bigquery(results)

Processing File 38000 of 57000

Processing File 39000 of 57000

[('0785152458', 4.450559020042419, 4.450559020042419, ['0785152458', '0810957310', '0812970284', '0340728566', '0375766316']), ('1598697846', 4.450559020042419, 4.450559020042419, ['1598697846', '1593373236', '1932549188', '1575420112', '0829440917']), ('1435480953', 4.450559020042419, 4.450559020042419, ['1435480953', '187856983X', '144945402X', '0827214715', '1572241837']), ('163215403X', 4.454545454545454, 4.450559020042419, ['163215403X', '0786802162', '1443859915', '0957670567', '9749575547']), ('0829818073', 4.450559020042419, 4.450559020042419, ['0829818073', '143845404X', '0824753461', '1577318889', '0060006277']), ('1629143243', 4.450559020042419, 4.450559020042419, ['1629143243', '1905460376', '1505268575', '0803220359', '1890627313']), ('1416569855', 4.450559020042419, 4.450559020042419, ['1416569855', '1931494991', '1942952481', '0749455160', '1438454562']), ('190314194X', 4.724137931034483, 4.450559020042419, 

In [110]:
def insert_into_bigquery(rows):
    # Instantiates a client
    bigquery_client = bigquery.Client()

    # Prepares a reference to the dataset
    dataset_ref = bigquery_client.dataset('book_cover_results')

    table_ref = dataset_ref.table('ratings_results')
    table = bigquery_client.get_table(table_ref)  # API call

    errors = bigquery_client.insert_rows(table, rows)  # API request
    assert errors == []

In [29]:
from __future__ import print_function
import binascii
import struct
from PIL import Image, ImageStat
import numpy as np
import scipy
import scipy.misc
import scipy.cluster
import os
import tempfile
from google.cloud import storage
from google.cloud.storage import Blob
import io
from matplotlib import pyplot as plt
from matplotlib.patches import Rectangle
import math
from google.cloud import bigquery

In [None]:
bucket = "book-covers-e6893"    # TODO : replace with your own bucket name
output_directory = 'gs://{}/hadoop/tmp/bigquery/pyspark_output/book_results'.format(bucket)

In [49]:
insert_into_bigquery(results)

In [78]:
sql = """
SELECT *
FROM
    `eecs-e6893-book-cover.book_cover_results.predicted_results`
"""
df = client.query(sql).to_dataframe()
sdf = spark.createDataFrame(df)
sdf.printSchema()

root
 |-- asin: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- predicted: double (nullable = true)
 |-- similar: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [95]:
for row in sdf.rdd.collect():
    if row[0] not in ratingsDict:
        ratingsDict[row[0]] = row[1]

In [107]:
rating_r = []

In [109]:
for row in sdf.rdd.collect():
    best = 0
    for book in row[3]:
        if ratingsDict[book] > best:
            best = ratingsDict[book]
    rating_r.append((row[0], row[1], row[2], best))

In [121]:
insert_into_bigquery(rating_r[50000:])