### Running Environment Preparation

In [None]:
from collections import defaultdict
from scipy.stats import itemfreq
from scipy import ndimage as ndi
import matplotlib.pyplot as plt
from skimage import feature
from PIL import Image as IMG
import numpy as np
import pandas as pd 
import operator
import cv2
import os 
from multiprocessing import Pool
import time

### Define features to extract from images

In [None]:
# arbitrarily define light and dark level for pixels and get the dark and light level for the image
# the idea is from kernel:
# https://www.kaggle.com/shivamb/ideas-for-image-features-and-image-quality

def color_analysis(img):
    # obtain the color palatte of the image 
    palatte = defaultdict(int)
    for pixel in img.getdata():
        palatte[pixel] += 1
    
    # sort the colors present in the image 
    sorted_x = sorted(palatte.items(), key=operator.itemgetter(1), reverse = True)
    light_shade, dark_shade, shade_count, pixel_limit = 0, 0, 0, 25
    for i, x in enumerate(sorted_x[:pixel_limit]):
        # dull : too much darkness 
        if all(xx <= 20 for xx in x[0][:3]):
            dark_shade += x[1]
        # bright : too much whiteness 
        if all(xx >= 240 for xx in x[0][:3]): 
            light_shade += x[1]
        shade_count += x[1]
        
    light_percent = round((float(light_shade)/shade_count)*100, 2)
    dark_percent = round((float(dark_shade)/shade_count)*100, 2)
    return light_percent, dark_percent

In [None]:
# utilize the color analysis function and get light score from image
def perform_white_analysis(img):
    try:
        if pd.notnull(img):
            flag="white"
            path = images_path + img +".jpg"
            im = IMG.open(path) #.convert("RGB")

            # cut the images into two halves as complete average may give bias results
            size = im.size
            halves = (size[0]/2, size[1]/2)
            im1 = im.crop((0, 0, size[0], halves[1]))
            im2 = im.crop((0, halves[1], size[0], size[1]))

            try:
                light_percent1, dark_percent1 = color_analysis(im1)
                light_percent2, dark_percent2 = color_analysis(im2)
            except Exception as e:
                return None

            light_percent = (light_percent1 + light_percent2)/2 
            dark_percent = (dark_percent1 + dark_percent2)/2 

            if flag == 'black':
                return dark_percent
            elif flag == 'white':
                return light_percent
            else:
                return None
        else:
            return -999
    except:
        return -999

In [None]:
# utilize the color analysis function and get light score from image
def perform_black_analysis(img):
    try:
        if pd.notnull(img):
            flag="black"
            path = images_path + img +".jpg"
            im = IMG.open(path) #.convert("RGB")

            # cut the images into two halves as complete average may give bias results
            size = im.size
            halves = (size[0]/2, size[1]/2)
            im1 = im.crop((0, 0, size[0], halves[1]))
            im2 = im.crop((0, halves[1], size[0], size[1]))

            try:
                light_percent1, dark_percent1 = color_analysis(im1)
                light_percent2, dark_percent2 = color_analysis(im2)
            except Exception as e:
                return None

            light_percent = (light_percent1 + light_percent2)/2 
            dark_percent = (dark_percent1 + dark_percent2)/2 

            if flag == 'black':
                return dark_percent
            elif flag == 'white':
                return light_percent
            else:
                return None
        else:
            return -999
    except:
        return -999

In [None]:
# get image size
# we thought that length and width are the same feature for image, so we sum them up as size feature
def image_size(img):
    try:
        if pd.notnull(img):
            path = images_path + img +".jpg"
            im = IMG.open(path)    
            wandl=im.size[0]+im.size[1]
            return wandl
        else:
            return -999
    except:
        return -999

In [None]:
# Some images may contain no pixel variation and are entirely uniform.
# Average Pixel Width is a measure which indicates the amount of edges present in the image.
# If this number comes out to be very low, then the image is most likely a uniform image and may not represent right content.
def average_pixel_width(img):
    try:
        if pd.notnull(img):
            path = images_path + img +".jpg"
            im = IMG.open(path)    
            im_array = np.asarray(im.convert(mode='L'))
            edges_sigma1 = feature.canny(im_array, sigma=3)
            apw = (float(np.sum(edges_sigma1)) / (im.size[0]*im.size[1]))
            return apw*100
        else:
            return -999
    except:
        return -999

In [2]:
# colors might influence the deal probability, so we calculated average colors to show the image overall color conditions
def get_average_color(img):
    try:
        if pd.notnull(img):
            path = images_path + img +".jpg"
            img = cv2.imread(path)
            average_color = [img[:, :, i].mean() for i in range(img.shape[-1])]
            return average_color
        else:
            return [-999,-999,-999]
    except:
        return -999

In [None]:
# reference:
# https://www.pyimagesearch.com/2015/09/07/blur-detection-with-opencv/
def get_blurrness_score(img):
    try:
        if pd.notnull(img):
            path = images_path + img +".jpg"
            image = cv2.imread(path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            fm = cv2.Laplacian(image, cv2.CV_64F).var()
            return fm
        else:
            return -999
    except:
        return -999

#### Check all the functions we defined to extract image features

In [3]:
import sys
import inspect
from os.path import join

print ("Current Functions Defined")
dict(inspect.getmembers(sys.modules[__name__],predicate = lambda f: inspect.isfunction(f) and f.__module__ == __name__))

Current Functions Defined


{'average_pixel_width': <function __main__.average_pixel_width>,
 'color_analysis': <function __main__.color_analysis>,
 'get_average_color': <function __main__.get_average_color>,
 'get_blurrness_score': <function __main__.get_blurrness_score>,
 'image_size': <function __main__.image_size>,
 'perform_black_analysis': <function __main__.perform_black_analysis>,
 'perform_white_analysis': <function __main__.perform_white_analysis>}

### Load Data

In [69]:
mydf=pd.read_csv("~/.kaggle/competitions/avito-demand-prediction/test.csv",header=0)
images_path="/home/ubuntu/.kaggle/competitions/avito-demand-prediction/data/competition_files/test_jpg/"
mydf.to_csv("mydf.csv",encoding="utf-8",header=True,index=False)

#### We need to analyze about 2 million images and most analysis is pixel-wise analysis.
#### So, we used a 32-core instance in AWS to handle the task.

#### We use multiprocessing to separate data into 32 parts and analyze them simultaneously.

In [None]:
import numpy as np
from multiprocessing import Pool
import time

time1=time.time()

# Number of CPU cores on your system
cores = 32
# Define as many partitions as you want
partitions = cores

funclist=[perform_white_analysis,perform_black_analysis,image_size,average_pixel_width,get_average_color,get_blurrness_score]

### In each iteration, we extract one feature and output the file into csv.
### The overall run time for training data is around 20 hours (test data: 5 hours) in the 32-core machine.

In [9]:
for myfunc in funclist:
    print ("reading files")
    mydf=pd.read_csv("mydf.csv",header=0,encoding="utf-8")
    images_path="/home/ubuntu/.kaggle/competitions/avito-demand-prediction/data/competition_files/test_jpg/"
    def parallelize_dataframe(data, func):
        print ("parallelizing dataframe")
        data_split = np.array_split(data, partitions)
        pool = Pool(cores)
        data = pd.concat(pool.map(func, data_split))
        pool.close()
        pool.join()
        return data

    def test_func(data):
        print ("applying function")
        data[str(myfunc.__name__)] = data["image"].apply(myfunc)
        return data
    
    mydf = parallelize_dataframe(mydf, test_func)
    mydf.to_csv("mydf.csv",header=True,index=False,encoding="utf-8")
    print (myfunc.__name__," completed")
    print ("running for ",round((time.time()-time1)/60)," mins")

reading files
parallelizing dataframe
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
perform_black_analysis  completed
running for  40  mins
reading files
parallelizing dataframe
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
applying function
appl

#### Feature cleaning

##### get_average_color output the color in 0-255 color range. We can transform it to 0-1 range.

In [10]:
import pandas as pd
mydf=pd.read_csv("mydf.csv",header=0,encoding="utf-8")

In [12]:
mydf["get_average_color"]=mydf["get_average_color"].apply(lambda x: x.strip("[").strip("]").replace(" ","").split(","))

In [13]:
# always replace NA or other errors with -999
def mycolor(x,y):
    try:
        if float(x[y])==-999:
            return -999
        else:
            return round(float(x[y])/255,2)
    except:
        return -999

In [14]:
mydf['average_red'] = mydf['get_average_color'].apply(lambda x: mycolor(x,0))
mydf['average_green'] = mydf['get_average_color'].apply(lambda x: mycolor(x,1))
mydf['average_blue'] = mydf['get_average_color'].apply(lambda x: mycolor(x,2))

In [15]:
mydf.drop(['get_average_color'],axis=1,inplace=True)

### Check Results

In [16]:
mydf.head(5)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,...,image,image_top_1,perform_white_analysis,perform_black_analysis,image_size,average_pixel_width,get_blurrness_score,average_red,average_green,average_blue
0,6544e41a8817,dbe73ad6e4b5,Волгоградская область,Волгоград,Личные вещи,Детская одежда и обувь,Для мальчиков,Обувь,25.0,Отдам бесплатно,...,a8b57acb5ab304f9c331ac7a074219aed4d349d8aef386...,2020.0,23.38,25.64,840,3.831019,313.475352,0.41,0.42,0.42
1,65b9484d670f,2e11806abe57,Свердловская область,Нижняя Тура,Хобби и отдых,Велосипеды,Дорожные,,,Продам велосипед,...,,,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999.0,-999.0
2,8bab230b2ecd,0b850bbebb10,Новосибирская область,Бердск,Бытовая электроника,Аудио и видео,Телевизоры и проекторы,,,BBK,...,8c361112cb049745ef2d1b0ae73594fc5c107286b0c942...,2960.0,6.285,93.715,1000,4.937934,754.22803,0.34,0.3,0.28
3,8e348601fefc,5f1d5c3ce0da,Саратовская область,Саратов,Для дома и дачи,Бытовая техника,Для кухни,Вытяжки,,Вытяжка Jetair 60,...,,,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999.0,-999.0
4,8bd2fe400b89,23e2d97bfc7f,Оренбургская область,Бузулук,Личные вещи,Товары для детей и игрушки,Детские коляски,,,Коляска зима-лето,...,bc3cf6deef10840fc302e38eb48fa7748aa1e28d534f8f...,1002.0,0.0,53.82,840,3.735532,344.755778,0.23,0.27,0.36


### Output results

In [17]:
mydf.to_csv("testwithimagefeatures.csv",header=True,index=False,encoding="utf-8")
print ("completed")

completed


In [None]:
# silimarly we can change the path and file name to analyze both training and test data