In [2]:
# import libraries
import os
import random
from multiprocessing import Pool, cpu_count

import cv2
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import xgboost as xgb
from PIL import Image
from PIL import ImageStat
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import fbeta_score
from tqdm import tqdm



In [3]:
# Seed Python's and NumPy's global random generators; the same value is later
# handed to the XGBoost classifiers through their `seed` argument.
random_seed = 2017
random.seed(random_seed)
np.random.seed(random_seed)

# Load data
# Image directories. The trailing slash is required because image paths are
# built by plain string concatenation (data_path + image_name + '.jpg').
train_path = '../data/train-jpg/train-jpg/'
test_path = '../data/test-jpg/test-jpg/'
# `train` comes from train.csv; `test` is read from the sample-submission CSV.
train = pd.read_csv('../data/train_csv/train.csv')
test = pd.read_csv('../data/sample_submission_csv/sample_submission.csv')

In [None]:
# simple features
def extract_features(df, data_path):
    """Append per-channel colour statistics for every image listed in ``df``.

    For each name in ``df.image_name`` the file ``data_path + name + '.jpg'``
    is opened with PIL and reduced to its first three channels. For each of
    those channels the mean, standard deviation, maximum, minimum, kurtosis
    and skewness of the flattened pixel values are recorded, plus a derived
    ``range`` column (max - min).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``image_name`` column. The frame is copied, never
        modified in place.
    data_path : str
        Prefix concatenated in front of each image name (it has to include
        the trailing path separator).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 21 extra columns named
        ``{r,g,b}_{mean,std,max,min,range,kurtosis,skewness}``, added in that
        statistic order. Channels 0/1/2 are labelled r/g/b, which assumes the
        files decode as RGB(A) images.
    """
    channels = ('r', 'g', 'b')
    # (column suffix, reducer) pairs evaluated on each flattened channel.
    reducers = (
        ('mean', np.mean),
        ('std', np.std),
        ('max', np.max),
        ('min', np.min),
        ('kurtosis', scipy.stats.kurtosis),
        ('skewness', scipy.stats.skew),
    )
    collected = {(ch, stat): [] for stat, _ in reducers for ch in channels}

    im_features = df.copy()
    for image_name in tqdm(im_features.image_name.values, miniters=100):
        # The context manager releases the file handle right away; np.array
        # materialises the pixel data before the file is closed.
        with Image.open(data_path + image_name + '.jpg') as im:
            pixels = np.array(im)[:, :, :3]

        for idx, ch in enumerate(channels):
            flat = pixels[:, :, idx].ravel()
            for stat, reducer in reducers:
                collected[(ch, stat)].append(reducer(flat))

    def attach(stat):
        """Write the r/g/b columns of one statistic into the result frame."""
        for ch in channels:
            im_features[ch + '_' + stat] = collected[(ch, stat)]

    # Columns are attached in a fixed order so the layout of the saved CSVs
    # (and of the feature matrix built from them) stays stable.
    for stat in ('mean', 'std', 'max', 'min'):
        attach(stat)

    for ch in channels:
        im_features[ch + '_range'] = im_features[ch + '_max'] - im_features[ch + '_min']

    for stat in ('kurtosis', 'skewness'):
        attach(stat)

    return im_features

In [None]:
# NOTE(review): in the visible notebook `train_features` is first assigned in
# the next cell, so on a fresh kernel this preview only works once that cell
# has been run.
train_features.head()

In [None]:
# Extract features
# Compute the per-channel colour statistics for every train and test image.
print('Extracting train features')
train_features = extract_features(train, train_path)
print('Extracting test features')
test_features = extract_features(test, test_path)

In [None]:
# Cache the colour-statistic tables on disk, without the row index.
train_features.to_csv("../intermediate/train_simple_feats.csv", index=False)
test_features.to_csv("../intermediate/test_simple_feats.csv", index=False)


In [None]:
def rounded_rect(df, data_path, threshold=127):
    """Count external contours of every image, before and after binarisation.

    Each image ``data_path + image_name + '.jpg'`` is loaded as a single
    channel greyscale array. Two counts are recorded per image: the number of
    external contours OpenCV finds in the greyscale array itself, and the
    number found after a binary threshold has been applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``image_name`` column. The frame is copied, never
        modified in place.
    data_path : str
        Prefix concatenated in front of each image name.
    threshold : int, optional
        Cut-off passed to ``cv2.threshold`` (pixels above it become 255).
        Defaults to 127, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the columns ``no_of_rectangles_grey`` and
        ``no_of_rectangles_threshold``.

    Raises
    ------
    IOError
        If an image cannot be read.
    """
    im_features = df.copy()
    grey_counts = []
    threshold_counts = []
    for image_name in tqdm(im_features.image_name.values, miniters=100):
        image_file = data_path + image_name + '.jpg'
        im = cv2.imread(image_file, 0)
        if im is None:
            # cv2.imread reports a missing/corrupt file by returning None;
            # fail here with the offending path instead of deep inside OpenCV.
            raise IOError('Could not read image: ' + image_file)

        # Threshold first: it must see the pixels before any contour search.
        _, thresh = cv2.threshold(im, threshold, 255, cv2.THRESH_BINARY)

        # [-2] selects the contour list for both the 2-tuple and the 3-tuple
        # return conventions used by different OpenCV major versions.
        grey_contours = cv2.findContours(im, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]
        thresh_contours = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

        grey_counts.append(len(grey_contours))
        threshold_counts.append(len(thresh_contours))

    im_features["no_of_rectangles_grey"] = grey_counts
    im_features["no_of_rectangles_threshold"] = threshold_counts
    return im_features


In [None]:
# Extract features
# NOTE(review): both calls start from the raw `train` / `test` frames and
# rebind train_features / test_features, so the colour statistics produced by
# extract_features are not carried over into these frames.
print('Extracting train features')
train_features = rounded_rect(train, train_path)
print('Extracting test features')
test_features = rounded_rect(test, test_path)

In [None]:
# Cache the contour-count tables on disk, without the row index.
train_features.to_csv("../intermediate/train_simple_feats_v2.csv", index=False)
test_features.to_csv("../intermediate/test_simple_feats_v2.csv", index=False)


In [4]:
# Reload cached feature tables from disk.
# NOTE(review): no visible cell writes the *_v3.csv files (only the
# *_simple_feats.csv and *_simple_feats_v2.csv files are saved above) --
# confirm where they are produced.
train_features = pd.read_csv("../intermediate/train_simple_feats_v3.csv")
test_features = pd.read_csv("../intermediate/test_simple_feats_v3.csv")

In [5]:
def rounded_rect_v2(df, data_path):
    """Add ``no_of_rectangles_grey_v2``: external contour count per image.

    Each image ``data_path + image_name + '.jpg'`` is loaded as a single
    channel greyscale array and the number of external contours OpenCV finds
    in it is stored.

    NOTE(review): the previous version also thresholded the image at 175 and
    counted contours in the binary result, but that count was discarded (its
    column assignment was commented out). The stored value never depended on
    the threshold, so that dead work is removed here and the output is
    unchanged. If the thresholded count at 175 was the intended "v2" feature,
    this function needs to be revisited.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``image_name`` column. The frame is copied, never
        modified in place.
    data_path : str
        Prefix concatenated in front of each image name.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the column ``no_of_rectangles_grey_v2``.

    Raises
    ------
    IOError
        If an image cannot be read.
    """
    im_features = df.copy()
    grey_counts = []
    for image_name in tqdm(im_features.image_name.values, miniters=100):
        image_file = data_path + image_name + '.jpg'
        im = cv2.imread(image_file, 0)
        if im is None:
            # cv2.imread reports a missing/corrupt file by returning None.
            raise IOError('Could not read image: ' + image_file)

        # [-2] selects the contour list for both the 2-tuple and the 3-tuple
        # return conventions used by different OpenCV major versions.
        contours = cv2.findContours(im, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]
        grey_counts.append(len(contours))

    im_features["no_of_rectangles_grey_v2"] = grey_counts
    return im_features

In [6]:
# Extract features
# Adds the no_of_rectangles_grey_v2 column to the current train_features /
# test_features frames (the result is bound back to the same names).
print('Extracting train features')
train_features = rounded_rect_v2(train_features, train_path)
print('Extracting test features')
test_features = rounded_rect_v2(test_features, test_path)

Extracting train features


100%|███████████████████████████████████| 40479/40479 [06:31<00:00, 103.32it/s]


Extracting test features


100%|███████████████████████████████████| 40669/40669 [06:09<00:00, 110.01it/s]


In [None]:
# Number of space-separated tags in each row's `tags` string.
# NOTE(review): for the training frame this value is derived from the target
# labels, and the data-preparation cells drop only `image_name` and `tags`
# before building X -- so this column would leak the target into the features.
# The test frame's `tags` presumably hold placeholder values from the sample
# submission; confirm before using this feature.
train_features["no_of_tags"] = train_features.tags.apply(lambda x : len(x.split(" ")))
test_features["no_of_tags"] = test_features.tags.apply(lambda x : len(x.split(" ")))


In [None]:
# Preview the first rows only instead of rendering the whole feature table.
train_features.head()

In [None]:
def get_features(path):
    """Build a flat feature list for the image stored at ``path``.

    Features are appended in this order:

    * PIL ``ImageStat`` per-band sum, mean, rms, var and stddev;
    * kurtosis, then skewness, of the first three channels of the PIL image;
    * 256-bin histograms of the greyscale image and of each OpenCV colour
      channel (OpenCV decodes colour images in B, G, R order);
    * a 256-bin histogram of channel 3 of the matching ``.tif`` file, or 256
      times ``-1`` when that file cannot be read;
    * per-channel mean and standard deviation (``cv2.meanStdDev``);
    * variance of the Laplacian (greyscale, colour) and of the x/y Sobel
      responses (greyscale, colour);
    * count of greyscale pixels below 30 and above 225.

    Extraction is best effort: if any step fails, the path and the error are
    printed and the features gathered so far are returned, so callers can
    receive a shorter list than usual.

    NOTE(review): ``io`` is not imported anywhere in the visible notebook
    (presumably ``skimage.io``). Until it is, the ``.tif`` branch always takes
    the ``-1`` fallback.

    Parameters
    ----------
    path : str
        Path of the JPEG file. The ``.tif`` path is derived by replacing every
        occurrence of ``'jpg'`` in it with ``'tif'``.

    Returns
    -------
    list
        ``[path, features]``.
    """
    st = []
    try:
        # pillow jpg
        with Image.open(path) as pil_img:
            im_stats_ = ImageStat.Stat(pil_img)
            st += im_stats_.sum
            st += im_stats_.mean
            st += im_stats_.rms
            st += im_stats_.var
            st += im_stats_.stddev
            # Materialise the pixels before the file handle is released.
            rgb = np.array(pil_img)[:, :, :3]

        # Three kurtosis values first, then three skewness values.
        for stat_fn in (scipy.stats.kurtosis, scipy.stats.skew):
            for ch in range(3):
                st.append(stat_fn(rgb[:, :, ch].ravel()))

        # cv2 jpg
        img = cv2.imread(path)
        bw = cv2.imread(path, 0)
        st += list(cv2.calcHist([bw], [0], None, [256], [0, 256]).flatten())  # greyscale
        for ch in range(3):  # channel 0/1/2 = blue/green/red in OpenCV
            st += list(cv2.calcHist([img], [ch], None, [256], [0, 256]).flatten())

        try:
            # skimage tif
            tf = io.imread(path.replace('jpg', 'tif'))[:, :, 3]
            st += list(cv2.calcHist([tf], [0], None, [256], [0, 65536]).flatten())  # near infrared
        except Exception:
            # Keep the feature vector length fixed when the tif is unavailable.
            st += [-1 for i in range(256)]
            print('err', path.replace('jpg', 'tif'))

        m, s = cv2.meanStdDev(img)  # mean and standard deviation
        # meanStdDev returns column vectors; flatten so that plain scalars
        # (not 1-element arrays) are appended to the feature list.
        st += list(m.flatten())
        st += list(s.flatten())
        st += [cv2.Laplacian(bw, cv2.CV_64F).var()]
        st += [cv2.Laplacian(img, cv2.CV_64F).var()]
        st += [cv2.Sobel(bw, cv2.CV_64F, 1, 0, ksize=5).var()]
        st += [cv2.Sobel(bw, cv2.CV_64F, 0, 1, ksize=5).var()]
        st += [cv2.Sobel(img, cv2.CV_64F, 1, 0, ksize=5).var()]
        st += [cv2.Sobel(img, cv2.CV_64F, 0, 1, ksize=5).var()]
        st += [(bw < 30).sum()]
        st += [(bw > 225).sum()]
    except Exception as exc:
        # Still best effort, but show what went wrong instead of hiding it.
        print(path, repr(exc))
    return [path, st]

def normalize_img(paths):
    """Run ``get_features`` over ``paths`` in parallel.

    Despite its name this function does not normalise anything: it only fans
    the feature extraction out over one worker process per CPU and collects
    the results.

    Parameters
    ----------
    paths : iterable of str
        Image paths handed to ``get_features`` one by one.

    Returns
    -------
    list
        One feature list per entry of ``paths``, in the same order.
    """
    # Materialise once so single-pass iterables can be mapped and re-read.
    paths = list(paths)

    p = Pool(cpu_count())
    try:
        ret = p.map(get_features, paths)
    finally:
        # Always release the worker processes, even if a worker raised.
        p.close()
        p.join()

    # get_features echoes its path back; index the results by it.
    imf_d = {path: feats for path, feats in ret}
    return [imf_d[f] for f in paths]

In [None]:

# Prepare data
# Feature matrix: every column except the identifier and the label string.
X = np.array(train_features.drop(['image_name', 'tags'], axis=1))
y_train = []

flatten = lambda l: [item for sublist in l for item in sublist]
# sorted() pins the label -> column mapping; iterating a bare set of strings
# can give a different order on every kernel start.
labels = np.array(sorted(set(flatten([l.split(' ') for l in train_features['tags'].values]))))

label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

# Build the targets from the same frame X was taken from, so rows of X and y
# are aligned by construction, and size them from the labels actually found.
for tags in tqdm(train_features.tags.values, miniters=1000):
    targets = np.zeros(len(labels))
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    y_train.append(targets)

y = np.array(y_train, np.uint8)

print('X.shape = ' + str(X.shape))
print('y.shape = ' + str(y.shape))

n_classes = y.shape[1]

X_test = np.array(test_features.drop(['image_name', 'tags'], axis=1))

# Train and predict with one-vs-all strategy
# Placeholder for the per-class test scores; filled in by the training loop.
y_pred = np.zeros((X_test.shape[0], n_classes))

In [None]:
# Preview the first rows of the prediction matrix.
y_pred[:5]

In [None]:
# Fit a single multi-output extra-trees regressor on all label columns.
etr = ExtraTreesRegressor(n_estimators=50, max_depth=30, n_jobs=-1, random_state=1)
etr.fit(X, y)
print('etr fit...')

# Binarise the predictions for the training rows at 0.21 and report the
# sample-averaged F2 score. This is measured on the data the model was fitted
# on, so it is an in-sample figure.
train_pred = np.where(etr.predict(X) > 0.21, 1.0, 0.0)
print(fbeta_score(y, train_pred, beta=2, average='samples'))

# Raw (unthresholded) scores for the test rows, kept next to the image names.
pred1 = etr.predict(X_test)
print('etr predict...')
etr_test = pd.DataFrame(pred1)
etr_test['image_name'] = test[['image_name']]

In [None]:
# Turn the extra-trees test scores into space-separated tag strings.
# The scores are taken from the regressor itself: when the notebook is run top
# to bottom, `y_pred` is still the all-zero placeholder at this point and
# would produce an empty tag list for every image.
# NOTE(review): the cut-off here is 0.1 while the training-set check used
# 0.21 -- confirm which one is intended.
pred1 = [' '.join(labels[score_row > 0.1]) for score_row in etr.predict(X_test)]

subm = pd.DataFrame()
subm['image_name'] = test_features.image_name.values
subm['tags'] = pred1
subm.to_csv('../output/extra_tress2.csv', index=False)

In [None]:
# Preview a few entries instead of dumping one item per test image.
pred1[:5]

In [None]:
# Preview a few target vectors instead of dumping one per training image.
y_train[:5]

In [7]:

# Prepare data
# Feature matrix: every column except the identifier and the label string.
X = np.array(train_features.drop(['image_name', 'tags'], axis=1))
y_train = []

flatten = lambda l: [item for sublist in l for item in sublist]
# sorted() pins the label -> column mapping; iterating a bare set of strings
# can give a different order on every kernel start.
labels = np.array(sorted(set(flatten([l.split(' ') for l in train_features['tags'].values]))))

label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

# Build the targets from the same frame X was taken from, so rows of X and y
# are aligned by construction, and size them from the labels actually found.
for tags in tqdm(train_features.tags.values, miniters=1000):
    targets = np.zeros(len(labels))
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    y_train.append(targets)

y = np.array(y_train, np.uint8)

print('X.shape = ' + str(X.shape))
print('y.shape = ' + str(y.shape))

n_classes = y.shape[1]

X_test = np.array(test_features.drop(['image_name', 'tags'], axis=1))

# Train and predict with one-vs-all strategy
# Placeholder for the per-class test scores; filled in by the training loop.
y_pred = np.zeros((X_test.shape[0], n_classes))

print('Training and making predictions')
# Hyper-parameters shared by every per-class binary classifier.
xgb_params = dict(
    max_depth=11,
    learning_rate=0.01,
    n_estimators=1000,
    silent=True,
    objective='binary:logistic',
    nthread=-1,
    gamma=0,
    min_child_weight=3,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=0.8,
    colsample_bylevel=0.8,
    reg_alpha=0,
    reg_lambda=4,
    scale_pos_weight=1,
    base_score=0.5,
    seed=random_seed,
    missing=None,
)
# One independent model per label column; column 1 of predict_proba is the
# probability of the positive class.
for class_i in tqdm(range(n_classes), miniters=1):
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X, y[:, class_i])
    y_pred[:, class_i] = model.predict_proba(X_test)[:, 1]



100%|████████████████████████████████| 40479/40479 [00:00<00:00, 371367.90it/s]


X.shape = (40479, 23)
y.shape = (40479, 17)
Training and making predictions


100%|██████████████████████████████████████████| 17/17 [11:38<00:00, 39.66s/it]


In [8]:
# Keep, per test image, every label whose predicted probability exceeds 0.50
# and join the survivors into one space-separated string.
tag_mask = y_pred > 0.50
preds = [' '.join(labels[row_mask]) for row_mask in tag_mask]

subm = pd.DataFrame(
    {'image_name': test_features.image_name.values, 'tags': preds},
    columns=['image_name', 'tags'],
)
subm.to_csv('../output/submission_2_rounded_rect.csv', index=False)

In [9]:
# Persist the final feature tables (written to the current working directory),
# without the row index.
train_features.to_csv("train_v4.csv", index=False)
test_features.to_csv("test_v4.csv", index=False)