In [0]:
import cv2
#import imageio as io
from skimage import io
import skimage
import os
import matplotlib.pyplot as plt
from skimage.color import rgb2hsv
import scipy.misc
from scipy import ndimage
import scipy.misc
from skimage import data, io
from skimage.color import rgb2hsv

import matplotlib.pyplot as plt
import numpy as np
from skimage import exposure
from skimage.filters.rank import entropy
from skimage.morphology import disk
from skimage.color import rgb2gray
import urllib.request
import re
import numpy as np
import pandas as pd

import os
from sklearn.model_selection import train_test_split

import urllib.request
from urllib.error import HTTPError

import PIL
from PIL import Image

from google.colab import files

In [0]:
def read_and_clean_data(path):
    """Load the movie metadata CSV and discard rows that cannot be used.

    Only the ``imdbId``, ``Title``, ``Genre`` and ``Poster`` columns are read
    (as ISO-8859-1 text); ``imdbId`` becomes the index. Rows with any missing
    value are removed, and so is every row whose poster link appears more than
    once. The frame shape is printed after each step.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame indexed by ``imdbId``.
    """
    wanted_columns = ["imdbId", "Title", "Genre", "Poster"]
    movies = pd.read_csv(path, encoding="ISO-8859-1", usecols=wanted_columns)
    movies = movies.set_index(["imdbId"])
    print(f"Shape of the original dataset: {movies.shape}")

    movies = movies.dropna()
    print(f"Shape after dropping rows with missing values: {movies.shape}")

    # keep=False drops *all* rows sharing a poster link, not just the repeats.
    movies = movies.drop_duplicates(subset="Poster", keep=False)
    print(f"Shape after dropping rows with potentially misleading poster link: {movies.shape}\n")
    return movies

def add_year_variable(df):
    """Add a ``year`` column parsed from the ``Title`` column.

    The year is the first four-digit number in parentheses found in the title
    (e.g. ``"Heat (1995)"`` -> ``1995``); titles without such a group get a
    missing value. The column is added to ``df`` in place and the same frame
    is returned. The range of parsed years is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Title`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the extra ``year`` column.
    """
    # Raw string: "\(" and "\d" are invalid escapes in a normal string literal.
    year_pattern = re.compile(r"\((\d{4})\)")

    def _first_year(title):
        # One regex pass per title instead of running findall twice.
        match = year_pattern.search(title)
        return int(match.group(1)) if match else None

    df["year"] = df.Title.map(_first_year)

    known_years = df["year"].dropna()
    if known_years.empty:
        # Nothing to summarise; int(min(...)) would fail on an all-missing column.
        print("No release year could be parsed from any title.\n")
    else:
        print(f"There are movies between {int(known_years.min())} and {int(known_years.max())} available in the dataset.\n")
    return df

def create_boolean_genres(df):
    """Replace the pipe-separated ``Genre`` column with one boolean column per genre.

    Each genre ``G`` found in the data yields a column named ``is_<g>``, where
    ``<g>`` is the lower-cased genre with all non-word characters removed
    (``"Sci-Fi"`` -> ``is_scifi``). ``df`` is modified in place: the flag
    columns are added and ``Genre`` is dropped. The set of genres is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Genre`` column using ``|`` as separator.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself.
    """
    df["Genre"] = df["Genre"].map(lambda raw: raw.split("|"))
    all_genres = {name for names in df["Genre"] for name in names}
    print(f"There are {len(all_genres)} genres in the dataset: {all_genres}\n")

    for genre in all_genres:
        flag_column = "is_" + re.sub(r'\W+', '', genre.lower())
        df[flag_column] = [genre in names for names in df["Genre"]]

    df.drop(["Genre"], axis=1, inplace=True)
    return df

def extract_genre_data(df, genre="Action"):
    """Return an independent copy of the rows flagged as belonging to ``genre``.

    The flag column is looked up as ``is_<g>``, with ``<g>`` being the
    lower-cased genre stripped of non-word characters. The number of matching
    rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the boolean ``is_<g>`` column.
    genre : str
        Genre name as it appears in the raw data.

    Returns
    -------
    pandas.DataFrame
        Copy of the matching rows; ``df`` is left unchanged.
    """
    flag_column = "is_" + re.sub(r'\W+', '', genre.lower())
    genre_mask = df[flag_column]
    df_genre = df.loc[genre_mask].copy()
    print(f"{genre} movies in the dataset: {df_genre.shape[0]}\n")
    return df_genre

def select_years(df, min_year=1950, max_year=2000, add_decades=True):
    """Keep the movies with ``min_year <= year < max_year`` and optionally label decades.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``year`` column.
    min_year : int
        First year to keep (inclusive).
    max_year : int
        First year to exclude.
    add_decades : bool
        When true, add a ``decade`` column such as ``"50s"`` and print the
        number of movies per decade.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows; ``df`` is left unchanged.

    Notes
    -----
    The decade label is built from the third digit of the year only, so the
    same decade of two different centuries receives the same label.
    """
    in_range = (df.year >= min_year) & (df.year < max_year)
    df_range = df.loc[in_range].copy()
    print(f"Movies left between {min_year} and {max_year}: {df_range.shape[0]}")

    if not add_decades:
        return df_range

    df_range["decade"] = df_range.year.apply(lambda year: f"{str(int(year))[2]}0s")
    print(f"Movies per decade in the dataset:\n{df_range.decade.value_counts()}\n")
    return df_range

def sample_same_number_per_decade(df, use_test_sample=False, random_state=None):
    """Draw the same number of movies from every decade.

    The sample size is the size of the smallest decade. With
    ``use_test_sample=True`` it is capped at 40 so a quick trial run stays
    small; it is never larger than the smallest decade, so sampling without
    replacement cannot fail.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``decade`` column.
    use_test_sample : bool
        Cap the per-decade sample at 40 movies.
    random_state : int, numpy.random.RandomState or None
        Seed or random state for reproducible sampling. ``None`` (default)
        keeps the sampling unseeded.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a two-level index: the decade label first, then
        the original index value. The ``decade`` column is kept as well.

    Raises
    ------
    ValueError
        If ``df`` contains no rows with a decade label.
    """
    decade_sizes = df["decade"].value_counts()
    if decade_sizes.empty:
        raise ValueError("Cannot sample per decade: no rows with a decade label.")

    smallest_decade = int(decade_sizes.min())
    min_number = min(40, smallest_decade) if use_test_sample else smallest_decade

    # One shared random state so different decades do not reuse the same draw.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # concat with dict keys builds the (decade, original index) MultiIndex
    # explicitly instead of relying on groupby.apply, whose handling of the
    # grouping column differs between pandas versions.
    samples = {
        decade: group.sample(min_number, random_state=rng)
        for decade, group in df.groupby("decade")
    }
    df_sample = pd.concat(samples, names=["decade"])
    print(f"Sample includes {min_number} movies per decade")
    return df_sample

def create_train_and_test_dfs(df, prop_test=.2, strat = 'decade'):
    """Split ``df`` into a training and a testing frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    prop_test : float
        Passed to ``train_test_split`` as ``test_size``.
    strat : str or None
        Name of the column whose values are used for stratification, or
        ``None`` for an unstratified split.

    Returns
    -------
    dict
        ``{"train": <frame>, "test": <frame>}``. The sizes of both parts are
        printed.
    """
    stratify_labels = None if strat is None else df[strat]
    train, test = train_test_split(df, test_size=prop_test, stratify=stratify_labels)
    print(f"Number of movies in training data: {train.shape[0]}")
    print(f"Number of movies in testing data:  {test.shape[0]}\n")
    return dict(train=train, test=test)

def create_folder_structure(image_folder="movie_posters", splits=["train", "test"], classes=None):
    """Create the ``<image_folder>/<split>/<class>`` directory tree.

    Folders that already exist are reported and left alone.

    Parameters
    ----------
    image_folder : str
        Root folder of the tree.
    splits : sequence of str
        Names of the first-level folders.
    classes : sequence of str or None
        Names of the second-level folders. With ``None`` (the default) only
        the split folders themselves are created instead of failing on the
        iteration over ``None``.
    """
    for s in splits:
        if classes is None:
            leaves = [[image_folder, s]]
        else:
            leaves = [[image_folder, s, c] for c in classes]
        for parts in leaves:
            folder_name = "/".join(parts)
            try:
                os.makedirs(folder_name)
            except FileExistsError:
                print(f"{folder_name} already exists.")
        print("\n")
        
def download_posters(dfs, image_folder="movie_posters"):
    """Download the poster of every movie into ``<image_folder>/<split>/<decade>/<id>.jpg``.

    Posters that are already on disk are skipped. A poster whose download
    fails is recorded and skipped, so one bad link no longer aborts the whole
    batch. After each split, the number of files per leaf folder is printed.

    Parameters
    ----------
    dfs : dict
        Maps a split name (used as folder name) to a DataFrame. Each frame
        needs a ``Poster`` column holding the URL and a two-level index whose
        first level is the decade label and whose second level is the movie id.
    image_folder : str
        Root folder for the downloaded files.
    """
    # Local import: URLError is the parent class of HTTPError and also covers
    # unreachable hosts, missing resources and truncated downloads.
    from urllib.error import URLError

    for k, df in dfs.items():
        print(f"Starting with downloading files for {k}...\n")
        already_downloaded = 0
        http_errors = []
        for index, movie in df.iterrows():
            movie_id = str(index[1])
            movie_decade = index[0]
            target_dir = "/".join([image_folder, k, movie_decade])
            file_path = "/".join([target_dir, movie_id + ".jpg"])
            if os.path.isfile(file_path):
                already_downloaded += 1
                continue
            # Do not depend on the folder tree having been created beforehand.
            os.makedirs(target_dir, exist_ok=True)
            try:
                urllib.request.urlretrieve(movie.Poster, file_path)
            except (URLError, ValueError):
                # ValueError is raised for malformed URLs.
                http_errors.append(movie_id)
                # Drop a partial file so it is not mistaken for a finished
                # download on the next run.
                if os.path.isfile(file_path):
                    os.remove(file_path)
        print(f"{len(http_errors)} posters had an HTTPError.")
        print(f"{already_downloaded} posters were downloaded before.\n")

        count = 0
        for root, dirs, file_names in os.walk("/".join([image_folder, k])):
            # Only leaf folders hold posters.
            if len(dirs) == 0:
                count += len(file_names)
                print(f"Number of pictures in {root}:\t{len(file_names)}")
        print(f"\nTotal number of pictures available for {k}: {count}\n")

def delete_black_and_white_posters(image_folder=None, expected_shape=(268, 182, 3)):
    """Delete every file under ``image_folder`` whose pixel array has an unexpected shape.

    A file is kept only if its image converts to an array of exactly
    ``expected_shape``. With the default ``(268, 182, 3)`` this removes
    single-channel (black and white) posters, and also any poster of a
    different size or channel count. The number of deleted files is printed.

    Parameters
    ----------
    image_folder : str
        Root folder that is walked recursively. It has to be supplied; the
        ``None`` default is not a usable value for ``os.walk``.
    expected_shape : tuple of int
        Required ``(height, width, channels)`` of the decoded image.
    """
    print(f"\nChecking for black and white pictures in {image_folder}...")
    wanted_shape = tuple(expected_shape)
    count = 0
    for root, dirs, files in os.walk(image_folder):
        for f in files:
            file_path = "/".join([root, f])
            # Close the image before deleting: an open handle leaks a file
            # descriptor and can block os.remove on some platforms.
            with Image.open(file_path) as poster:
                poster_shape = np.asarray(poster).shape
            if poster_shape != wanted_shape:
                os.remove(file_path)
                count += 1
    print(f"Files without RGB and therefore deleted: {count}")
    
    
### TO DOWNLOAD ALL ####
def create_all_folders(image_folder="movie_posters",name = 'all', classes=None):
    """Create the ``<image_folder>/<name>/<class>`` folders for the unsplit download.

    Folders that already exist are reported and left alone.

    Parameters
    ----------
    image_folder : str
        Root folder of the tree.
    name : str
        Name of the folder that holds all classes.
    classes : sequence of str or None
        Names of the class folders. With ``None`` (the default) only
        ``<image_folder>/<name>`` is created instead of failing on the
        iteration over ``None``.
    """
    if classes is None:
        leaves = [[image_folder, name]]
    else:
        leaves = [[image_folder, name, c] for c in classes]
    for parts in leaves:
        folder_name = "/".join(parts)
        try:
            os.makedirs(folder_name)
        except FileExistsError:
            print(f"{folder_name} already exists.")
    print("\n")
    
def download_all(df, image_folder="movie_posters", name = 'all'):
    """Download every poster in ``df`` into ``<image_folder>/<name>/<decade>/<id>.jpg``.

    Posters that are already on disk are skipped. A poster whose download
    fails is recorded and skipped, so one bad link no longer aborts the whole
    batch. Afterwards the number of files per leaf folder is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``Poster`` column holding the URL and a two-level index whose
        first level is the decade label and whose second level is the movie id.
    image_folder : str
        Root folder for the downloaded files.
    name : str
        Sub-folder of ``image_folder`` that receives the decade folders.
    """
    # Local import: URLError is the parent class of HTTPError and also covers
    # unreachable hosts, missing resources and truncated downloads.
    from urllib.error import URLError

    base_folder = "/".join([image_folder, name])
    # makedirs also creates image_folder itself when it is missing.
    os.makedirs(base_folder, exist_ok=True)
    print("Starting with all downloading files..\n")
    already_downloaded = 0
    http_errors = []
    for index, movie in df.iterrows():
        movie_id = str(index[1])
        movie_decade = index[0]
        target_dir = "/".join([base_folder, movie_decade])
        file_path = "/".join([target_dir, movie_id + ".jpg"])
        if os.path.isfile(file_path):
            already_downloaded += 1
            continue
        # Do not depend on the decade folders having been created beforehand.
        os.makedirs(target_dir, exist_ok=True)
        try:
            urllib.request.urlretrieve(movie.Poster, file_path)
        except (URLError, ValueError):
            # ValueError is raised for malformed URLs.
            http_errors.append(movie_id)
            # Drop a partial file so it is not mistaken for a finished
            # download on the next run.
            if os.path.isfile(file_path):
                os.remove(file_path)
    print(f"{len(http_errors)} posters had an HTTPError.")
    print(f"{already_downloaded} posters were downloaded before.\n")

    for root, dirs, file_names in os.walk(base_folder):
        # Only leaf folders hold posters.
        if len(dirs) == 0:
            print(f"Number of pictures in {root}:\t{len(file_names)}")

In [0]:
#!unzip 4.zip

In [0]:
cd 

/root


In [0]:
#!unzip /content/AllPoster.zip

In [0]:
def calc_stats(filename):
    """Return size and mean HSV statistics of one image file.

    Parameters
    ----------
    filename : str
        Path of the image; it is read with ``io.imread`` and converted with
        ``rgb2hsv`` (assumes a 3-channel RGB image -- TODO confirm how
        grayscale/RGBA inputs should be handled).

    Returns
    -------
    list
        ``[width, height, mean_hue, mean_saturation, mean_brightness]``, the
        means being taken over all pixels of the respective HSV channel.
    """
    image = io.imread(filename)
    image_height, image_width = image.shape[0], image.shape[1]

    hsv = rgb2hsv(image)
    hue, saturation, value = hsv[:, :, 0], hsv[:, :, 1], hsv[:, :, 2]

    return [
        image_width,
        image_height,
        np.mean(hue, axis=(0, 1)),
        np.mean(saturation, axis=(0, 1)),
        np.mean(value),
    ]

In [0]:
def image_colorfulness(image):
    """Compute a scalar colourfulness score from opponent colour channels.

    Parameters
    ----------
    image : numpy.ndarray
        Three-channel image whose channels are unpacked in B, G, R order
        (the order ``cv2.imread`` delivers).

    Returns
    -------
    float
        ``sqrt(std_rg**2 + std_yb**2) + 0.3 * sqrt(mean_rg**2 + mean_yb**2)``
        with ``rg = |R - G|`` and ``yb = |0.5 * (R + G) - B|``.
    """
    # Work in float so the channel differences cannot wrap around.
    blue, green, red = cv2.split(image.astype("float"))

    # Opponent channels: red-green and yellow-blue.
    red_green = np.absolute(red - green)
    yellow_blue = np.absolute(0.5 * (red + green) - blue)

    rg_mean, rg_std = np.mean(red_green), np.std(red_green)
    yb_mean, yb_std = np.mean(yellow_blue), np.std(yellow_blue)

    std_root = np.sqrt((rg_std ** 2) + (yb_std ** 2))
    mean_root = np.sqrt((rg_mean ** 2) + (yb_mean ** 2))
    return std_root + (0.3 * mean_root)

In [0]:
def calcDGenergy(img):
    """Compute the dual-gradient energy of every pixel of a 3-D image array.

    Adapted from https://stackoverflow.com/a/48974892.

    For each pixel and channel the squared difference between the two
    neighbours along axis 0 and along axis 1 is taken; neighbours wrap around
    at the borders (``np.roll``). The squared differences are summed over the
    channel axis and over both directions.

    Parameters
    ----------
    img : array-like
        Image of shape ``(rows, cols, channels)``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(rows, cols)`` with the energy per pixel.
    """
    # Widen from uint8 to the platform int so differences and squares cannot overflow.
    pixels = np.array(img, dtype = int)

    def _squared_neighbour_difference(axis):
        # (next neighbour - previous neighbour)^2 along the given axis.
        ahead = np.roll(pixels, -1, axis = axis)
        behind = np.roll(pixels, 1, axis = axis)
        return np.square(ahead - behind)

    energy_axis0 = np.sum(_squared_neighbour_difference(0), axis = 2)
    energy_axis1 = np.sum(_squared_neighbour_difference(1), axis = 2)
    return energy_axis0 + energy_axis1

In [0]:
def edginess(filename, hough_threshold=100, min_line_length=10, max_line_gap=0):
    """Compute edge, line and entropy features for one image file.

    Parameters
    ----------
    filename : str
        Path of the image; read with ``io.imread`` (assumes 3-channel RGB --
        TODO confirm handling of grayscale/RGBA files).
    hough_threshold : int
        Accumulator threshold passed to ``cv2.HoughLinesP``.
    min_line_length : int
        ``minLineLength`` passed to ``cv2.HoughLinesP``.
    max_line_gap : int
        ``maxLineGap`` passed to ``cv2.HoughLinesP``.

    Returns
    -------
    tuple
        ``(sum_of_edge_image, mean_of_edge_image, line_count, mean_entropy)``.
        ``sum_of_edge_image`` is the edge-magnitude sum divided by the number
        of pixels; ``line_count`` is 0 when no line segment is detected.

    Notes
    -----
    The earlier version passed ``400`` and ``10`` positionally after the
    threshold. In the Python signature the slot after ``threshold`` is the
    ``lines`` output array, so ``10`` ended up as ``minLineLength`` and
    ``maxLineGap`` stayed at its default. The defaults here reproduce that
    effective configuration, so previously computed feature values are
    unchanged.
    """
    painting = io.imread(filename)

    # Edge magnitude of the brightness (V) channel via Sobel gradients.
    value_img = rgb2hsv(painting)[:, :, 2]
    sobel_x = ndimage.sobel(value_img, axis=0, mode='constant')
    sobel_y = ndimage.sobel(value_img, axis=1, mode='constant')
    edge_image = np.hypot(sobel_x, sobel_y)
    sum_of_edge_image = np.sum(edge_image)/ (edge_image.size)
    mean_of_edge_image = edge_image.mean()

    # Rescale to [-1, 1] and convert to 8 bit for the Hough transform.
    # NOTE(review): img_as_ubyte presumably clips the negative half of this
    # range to 0, which acts like a threshold -- confirm this is intended.
    rescaled = exposure.rescale_intensity(edge_image, out_range=(-1.0, 1.0))
    edges = skimage.img_as_ubyte(np.clip(rescaled, -1, 1))

    # Probabilistic Hough Transform; keyword arguments so each value reaches
    # the parameter it is meant for.
    lines = cv2.HoughLinesP(
        edges,
        1,
        np.pi/180,
        hough_threshold,
        minLineLength=min_line_length,
        maxLineGap=max_line_gap,
    )
    # HoughLinesP yields None instead of an empty array when nothing is found.
    line_count = 0 if lines is None else len(lines)

    # Mean local entropy of the grayscale image in a disk of radius 10.
    gray_img = rgb2gray(painting)
    entr_img = entropy(gray_img, disk(10))

    return sum_of_edge_image, mean_of_edge_image, line_count, entr_img.mean()

In [0]:
# Show the entries of the current working directory; the feature-extraction
# loop further down iterates over os.listdir() of this same directory.
os.listdir()

In [0]:
# Extract all per-poster features for every file in the current working
# directory. Each feature goes into its own list; the lists are kept at the
# same length so they can be turned into DataFrame columns afterwards.
import time
count = 0
start = time.time()
ids = []
widths = []
heights = []
hues = []
saturations = []
brightnesses = []
corners_list = []
colorfulness_list = []
sums_edges = []
means_edges = []
num_lines = []
entropy_means = []
dg_image_energy_mins = []
dg_image_energy_maxs = []
dg_image_energy_means = []
# (file name, error) pairs of the files that could not be processed.
failed = []
for i in os.listdir():
    try:
        width, height, hue, saturation, brightness = calc_stats(i)

        # Colourfulness (computed on the BGR image as loaded by OpenCV)
        img = cv2.imread(i)
        colorfulness = image_colorfulness(img)

        # Harris corners: blockSize=2, ksize=3, k=0.04
        gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
        gray = np.float32(gray)
        dst = cv2.cornerHarris(gray,2,3,0.04)
        dst = cv2.dilate(dst,None)
        ret, dst = cv2.threshold(dst,0.01*dst.max(),255,0)
        dst = np.uint8(dst)
        # find centroids of the connected corner regions
        ret, labels, stats, centroids = cv2.connectedComponentsWithStats(dst)
        # define the criteria to stop and refine the corners
        criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.001)
        corners = cv2.cornerSubPix(gray,np.float32(centroids),(5,5),(-1,-1),criteria)

        sum_edges, mean_edges, lines, mean_ent = edginess(i)

        dgEnergy = calcDGenergy(io.imread(i))
        en_min, en_max, en_mean = dgEnergy.min(),  dgEnergy.max(),  dgEnergy.mean()
    except Exception as exc:
        # Skip files that cannot be processed (non-images, unreadable files),
        # but remember them. `Exception` keeps Ctrl-C / kernel interrupt
        # working during this long-running loop.
        failed.append((i, repr(exc)))
        continue

    # Record the features only after every step succeeded, so all lists
    # always hold exactly one entry per processed file.
    ids.append(i.partition(".")[0])
    widths.append(width)
    heights.append(height)
    hues.append(hue)
    saturations.append(saturation)
    brightnesses.append(brightness)
    colorfulness_list.append(colorfulness)
    corners_list.append(len(corners))
    sums_edges.append(sum_edges)
    means_edges.append(mean_edges)
    num_lines.append(lines)
    entropy_means.append(mean_ent)
    dg_image_energy_mins.append(en_min)
    dg_image_energy_maxs.append(en_max)
    dg_image_energy_means.append(en_mean)
    count += 1

end = time.time()
print("Time Taken: " + str(end - start))
print(f"Processed {count} files, skipped {len(failed)} (see `failed` for details).")

  out_dtype)


Time Taken: 8872.638138771057


In [0]:
# Assemble the per-poster feature lists into one table, one row per poster.
# Column order follows the insertion order of the mapping below.
feature_columns = {
    "imdbId": ids,
    "Width": widths,
    "Height": heights,
    "Hue": hues,
    "Saturation": saturations,
    "Brightness": brightnesses,
    "Colourfulness": colorfulness_list,
    "Corners": corners_list,
    "Edginess": means_edges,
    "Number Of Lines (Hough)": num_lines,
    "Mean Entropy": entropy_means,
    "DG Image Energy Minimum": dg_image_energy_mins,
    "DG Image Energy Maximum": dg_image_energy_maxs,
    "DG Image Energy Mean": dg_image_energy_means,
}
new_df = pd.DataFrame(feature_columns)
new_df

Unnamed: 0,imdbId,Width,Height,Hue,Saturation,Brightness,Colourfulness,Corners,Edginess,Number Of Lines (Hough),Mean Entropy,DG Image Energy Minimum,DG Image Energy Maximum,DG Image Energy Mean
0,268978,182,268,0.289566,0.317307,0.493287,41.523655,48,0.435260,1,5.767977,0,356123,8038.865057
1,30973,182,268,0.268064,0.396460,0.761431,93.641859,169,0.701259,4,5.182376,0,317807,19729.438576
2,23686,182,268,0.182442,0.319706,0.632708,37.933190,152,0.940084,8,6.557065,0,351213,26105.511030
3,2017561,182,268,0.156557,0.440561,0.672142,57.595658,111,0.646836,4,6.454978,3,322862,13146.240323
4,116479,182,268,0.242190,0.442489,0.294204,39.941220,100,0.414149,7,5.008403,0,224340,5427.131745
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30752,82769,182,268,0.486926,0.453615,0.412819,47.597628,73,1.112720,19,6.520898,0,383084,33236.516402
30753,115976,182,268,0.431284,0.527088,0.462169,86.109456,40,0.446330,1,5.877357,0,268066,6600.868132
30754,4311466,182,268,0.218411,0.285973,0.674517,35.157194,83,0.413087,3,5.788535,0,300655,6276.040225
30755,5210048,182,268,0.216408,0.488336,0.499522,62.828648,140,0.769391,6,6.410104,0,343819,19300.933615


In [0]:
# Write the feature table to disk, then hand the file to the browser through
# the Colab `files` helper.
features_csv = "All_Posters_Features.csv"
new_df.to_csv(features_csv)
files.download(features_csv)