In [None]:
import math, re, os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

## Load label and paths

In [None]:
def append_path(pre):
    return np.vectorize(lambda file: os.path.join('/kaggle/input/alaska2-image-steganalysis', pre, file))

In [None]:
sub = pd.read_csv('/kaggle/input/alaska2-image-steganalysis/sample_submission.csv')
train_filenames = np.array(os.listdir("/kaggle/input/alaska2-image-steganalysis/Cover/"))

In [None]:
np.random.seed(0)
positives = train_filenames.copy()
negatives = train_filenames.copy()
np.random.shuffle(positives)
np.random.shuffle(negatives)

jmipod = append_path('JMiPOD')(positives[:2000])
juniward = append_path('JUNIWARD')(positives[10000:12000])
uerd = append_path('UERD')(positives[20000:22000])

# pos_paths = np.concatenate([jmipod, juniward, uerd])

In [None]:
test_paths = append_path('Test')(sub.Id.values)
neg_paths = append_path('Cover')(negatives[:2000])

In [None]:
#Do all the imports
import cv2
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.image as mpimg
%matplotlib inline
import os
import glob
from skimage.feature import hog
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
# from kaggle_datasets import KaggleDatasets
import time
from sklearn.model_selection import train_test_split
from scipy.ndimage.measurements import label

In [None]:
 
#Define functions that will be used later

def color_hist(img, nbins=32, bins_range=(0, 256)):
    # Compute the histogram of the RGB channels separately
    fhist = np.histogram(img[:,:,0], bins=nbins, range=bins_range) #First channel histogram
    shist = np.histogram(img[:,:,1], bins=nbins, range=bins_range) # Second channel histogram
    thist = np.histogram(img[:,:,2], bins=nbins, range=bins_range) # Third channel histogram
    # Generating bin centers
    bin_edges = fhist[1]
    bin_centers = (bin_edges[1:]  + bin_edges[0:len(bin_edges)-1])/2
    # Concatenate the histograms into a single feature vector
    hist_features = np.concatenate((fhist[0], shist[0], thist[0]))
    # Return the individual histograms, bin_centers and feature vector
    return hist_features, fhist, shist, thist, bin_centers

def bin_spatial_features(img, size=(16, 16)):
    # Use cv2.resize().ravel() to create the feature vector
    features = cv2.resize(img, size).ravel() 
    # Return the feature vector
    return features

def get_hog_features(img, orient, pix_per_cell, cell_per_block, vis=False, feature_vec=True):
     if vis == True:   
       features,hog_image = hog(img, orientations=orient, pixels_per_cell=(pix_per_cell, pix_per_cell),
                       cells_per_block=(cell_per_block, cell_per_block), transform_sqrt=False, 
                        visualize = vis,feature_vector=feature_vec)
       return features,hog_image
     else:
        features = hog(img, orientations=orient, pixels_per_cell=(pix_per_cell, pix_per_cell),
                       cells_per_block=(cell_per_block, cell_per_block), transform_sqrt=False, 
                        visualize = vis,feature_vector=feature_vec)
        return features

def extract_features_hog(imgs, cspace='RGB', orient=9, 
                        pix_per_cell=8, cell_per_block=2, hog_channel=0):
    # Create a list to append feature vectors to
    features = []
    # Iterate through the list of images
    for file in imgs:
        # Read in each one by one
        image = cv2.imread(file)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) #convert to RGB color space
        # apply color conversion if other than 'RGB'
        if cspace != 'RGB':
            if cspace == 'HSV':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
            elif cspace == 'LUV':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2LUV)
            elif cspace == 'HLS':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2HLS)
            elif cspace == 'YUV':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2YUV)
            elif cspace == 'YCrCb':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2YCrCb)
        else: feature_image = np.copy(image)      
 
        # Call get_hog_features() with vis=False, feature_vec=True
        if hog_channel == 'ALL':
            hog_features = []
            for channel in range(feature_image.shape[2]):
                hog_features.append(get_hog_features(feature_image[:,:,channel], 
                                    orient, pix_per_cell, cell_per_block, 
                                    vis=True, feature_vec=True))
            hog_features = np.ravel(hog_features)        
        else:
            hog_features = get_hog_features(feature_image[:,:,hog_channel], orient, 
                        pix_per_cell, cell_per_block, vis=True, feature_vec=True)
        # Append the new feature vector to the features list
        features.append(hog_features)
    # Return list of feature vectors
    return features

def color_hist_features(img, nbins=32, bins_range=(0, 256)):
    # Compute the histogram of the color channels separately
    channel1_hist = np.histogram(img[:,:,0], bins=nbins, range=bins_range)
    channel2_hist = np.histogram(img[:,:,1], bins=nbins, range=bins_range)
    channel3_hist = np.histogram(img[:,:,2], bins=nbins, range=bins_range)
    # Concatenate the histograms into a single feature vector
    hist_features = np.concatenate((channel1_hist[0], channel2_hist[0], channel3_hist[0]))
    # Return the individual histograms, bin_centers and feature vector
    return hist_feature

In [None]:
#Look at color features
#Spatial binned features comparison for car and notcar

def bin_spatial(img, color_space='RGB', size=(16, 16)): 
    if color_space!= 'RGB':
        if color_space == 'HSV':
            feature_image = cv2.cvtColor(img,cv2.COLOR_RGB2HSV)
        elif color_space == 'HLS':
            feature_image = cv2.cvtColor(img,cv2.COLOR_RGB2HLS)
        elif color_space == 'LUV':
            feature_image = cv2.cvtColor(img,cv2.COLOR_RGB2LUV)
        elif color_space == 'YUV':
            feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2YUV)
        elif color_space == 'YCrCb':
            feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)
    else: feature_image = np.copy(img) 
    resize = cv2.resize(feature_image, size)
    features = resize.ravel()
    
    # Return the feature vector
    return features

In [None]:
# import pickle
#Function reads images, extract features and returns a list of feature vectors

def extract_features_color(imgs, cspace='HLS',spatial_size=(16, 16),
                        hist_bins=16, hist_range=(0, 256)):
    # Create a list to append feature vectors to
    features = []
    for file in imgs:
        # Read in each one by one
        image = cv2.imread(file)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) #convert to RGB color space
        
        spatial_features = bin_spatial(image, color_space=cspace, size= spatial_size)
        hist_features = color_hist_features(image, nbins=hist_bins, bins_range=hist_range)
    
        features.append(np.concatenate((spatial_features, hist_features)))
 
        
    return features
 
# # car_features_color = extract_features_color(cars, cspace='RGB', spatial_size=(16, 16), hist_bins=32, hist_range=(0, 256))
# # pickle.dump('car_color',car_features_color)
# # notcar_features_color = extract_features_color(notcars, cspace='RGB', spatial_size=(16, 16), hist_bins=32, hist_range=(0, 256))
# # pickle.dump('notcar_color',notcar_features_color)
# print(len(car_features_color), len(notcar_features_color))
# print(len(car_features_color[0]), len(notcar_features_color[0]))

In [None]:
import joblib

In [None]:
#Training using color and HOG features
#Selected color features - cspace = 'RGB' and spatial_size = 16,16
# from numba import jit, cuda
from sklearn.preprocessing import StandardScaler
def extract_features(imgs, color_space='RGB', spatial_size=(16, 16),
                        hist_bins=32, orient=9, 
                        pix_per_cell=8, cell_per_block=2, hog_channel=0,
                        spatial_feat=True, hist_feat=True, hog_feat=True):

    # Create a list to append feature vectors to
    features = []
    num = 0
    # Iterate through the list of images
    for file in imgs:
        file_features = []
        # Read in each one by one
        image = cv2.imread(file)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) #convert to RGB color space
        # apply color conversion if other than 'RGB'
        if color_space != 'RGB':
            if color_space == 'HSV':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
            elif color_space == 'LUV':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2LUV)
            elif color_space == 'HLS':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2HLS)
            elif color_space == 'YUV':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2YUV)
            elif color_space == 'YCrCb':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2YCrCb)
        else: feature_image = np.copy(image)      
        
        if spatial_feat == True:
            spatial_features = bin_spatial_features(feature_image, size=spatial_size)
            file_features.append(spatial_features)
            
        if hist_feat == True:
            # Apply color_hist()
            hist_features = color_hist_features(feature_image, nbins=hist_bins)
            file_features.append(hist_features)
            
        if hog_feat == True:
        # Call get_hog_features() with vis=False, feature_vec=True
            if hog_channel == 'ALL':
                hog_features = []
                for channel in range(feature_image.shape[2]):
                    hog_features.append(get_hog_features(feature_image[:,:,channel], 
                                        orient, pix_per_cell, cell_per_block, 
                                        vis=False, feature_vec=True))
                hog_features = np.ravel(hog_features)        
            else:
                hog_features = get_hog_features(feature_image[:,:,hog_channel], orient, 
                            pix_per_cell, cell_per_block, vis=False, feature_vec=True)
            # Append the new feature vector to the features list
            
            file_features.append(hog_features)
        num+=1
        if num%100==0:
            print('finished',num)
          
        features.append(np.concatenate(file_features))
   
    # Return list of feature vectors
    return features


In [None]:
!pip install dist/jpegio-x.x.x-cp3x-cp3x-win_amd64.whl

In [None]:
# Train classifier using HOG and color features
# from numba import jit, cuda
from multiprocessing import Process
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import time
import joblib 
from sklearn.model_selection import train_test_split

### TODO: Tweak these parameters and see how the results change.
color_space = 'YCrCb' # Can be RGB, HSV, LUV, HLS, YUV, YCrCb
orient = 18  # HOG orientations
pix_per_cell = 64 # HOG pixels per cell #16
cell_per_block = 1 # HOG cells per block #2
hog_channel = 'ALL' # Can be 0, 1, 2, or "ALL"
spatial_size = (16, 16) # Spatial binning dimensions
hist_bins = 16    # Number of histogram bins
spatial_feat = False # Spatial features on or off
hist_feat = False # Histogram features on or off
hog_feat = True # HOG features on or off
y_start_stop = [None, None] # Min and max in y to search in slide_window()

print('Hello')                         
jmipod_features = extract_features(jmipod, color_space=color_space, 
                        spatial_size=spatial_size, hist_bins=hist_bins, 
                        orient=orient, pix_per_cell=pix_per_cell, 
                        cell_per_block=cell_per_block, 
                        hog_channel=hog_channel, spatial_feat=spatial_feat, 
                        hist_feat=hist_feat, hog_feat=hog_feat)

print('Hello')

joblib.dump(jmipod_features,'jmipod_feature.pkl')

cover_features = extract_features(neg_paths, color_space=color_space, 
                        spatial_size=spatial_size, hist_bins=hist_bins, 
                        orient=orient, pix_per_cell=pix_per_cell, 
                        cell_per_block=cell_per_block, 
                        hog_channel=hog_channel, spatial_feat=spatial_feat, 
                        hist_feat=hist_feat, hog_feat=hog_feat)
print('Hello')

joblib.dump(cover_features,'cover_feature.pkl')
uerd_features = extract_features(uerd, color_space=color_space, 
                        spatial_size=spatial_size, hist_bins=hist_bins, 
                        orient=orient, pix_per_cell=pix_per_cell, 
                        cell_per_block=cell_per_block, 
                        hog_channel=hog_channel, spatial_feat=spatial_feat, 
                        hist_feat=hist_feat, hog_feat=hog_feat)
print('Hello')

joblib.dump(uerd_features,'uerd_feature.pkl')
juniward_features = extract_features(juniward, color_space=color_space, 
                        spatial_size=spatial_size, hist_bins=hist_bins, 
                        orient=orient, pix_per_cell=pix_per_cell, 
                        cell_per_block=cell_per_block, 
                        hog_channel=hog_channel, spatial_feat=spatial_feat, 
                        hist_feat=hist_feat, hog_feat=hog_feat)
print('Hello')

joblib.dump(juniward_features,'juniward_feature.pkl')
test_features = extract_features(test_paths, color_space=color_space, 
                        spatial_size=spatial_size, hist_bins=hist_bins, 
                        orient=orient, pix_per_cell=pix_per_cell, 
                        cell_per_block=cell_per_block, 
                        hog_channel=hog_channel, spatial_feat=spatial_feat, 
                        hist_feat=hist_feat, hog_feat=hog_feat)
print('Hello')

joblib.dump(test_features,'test_feature.pkl')

In [None]:
data=pd.DataFrame()
jun = juniward_features
jmi = jmipod_features
uer=uerd_features
cov = cover_features
tes = test_features
data['juniward'] =juniward_features
data['jmipod'] =jmipod_features
data['uerd'] =uerd_features
data['cover'] =cover_features
# data['test'] = test_features

In [None]:
data.to_csv('data.csv',index=False)

In [None]:
# !pip install stegano
# from stegano import lsb
from scipy import spatial

In [None]:
dif1 = []
dif2 = []
dif3 = []
for i in range(data.shape[0]):
    dif1.append(spatial.distance.cosine(cov[i],jun[i]))
    dif2.append(spatial.distance.cosine(cov[i],jmi[i]))
    dif3.append(spatial.distance.cosine(cov[i],uer[i]))
#     print()

In [None]:
data['jun'] = dif1
data['jmi'] = dif2
data['uer'] = dif3


In [None]:
data['mean'] = data[['jun','jmi','uer']].mean(axis = 1)

In [None]:
data.to_csv('data.csv',index = False)

In [None]:
jes = list()
for i in range(data.shape[0]):
#     print(jun[i].shape)
    jes.append(np.sum((jun[i],jmi[i],uer[i]),axis = 0)/3)
#     print()

data['mean_hog']= jes

In [None]:
X=np.concatenate((juniward_features,jmipod_features,uerd_features,cover_features)).astype(np.float64)

In [None]:
X=np.concatenate((jes,cover_features)).astype(np.float64)

In [None]:
X.shape

In [None]:
# Fit a per-column scaler
X_scaler = StandardScaler().fit(X)
# Apply the scaler to X
scaled_X = X_scaler.transform(X)

In [None]:
a = np.full(len(jmipod_features),2)
b = np.full(len(uerd_features),3)
c = np.full(len(juniward_features),1)
# a.fill(1)
# y = np.hstack((c,a,b,np.zeros(len(cover_features))))
y = np.hstack((np.ones(len(jes)),np.zeros(len(cover_features))))

In [None]:
BASE_PATH = "/kaggle/input/alaska2-image-steganalysis"
# train_imageids = pd.Series(os.listdir(BASE_PATH + '/Cover')).sort_values(ascending=True).reset_index(drop=True)
# test_imageids = pd.Series(os.listdir(BASE_PATH + '/Test')).sort_values(ascending=True).reset_index(drop=True)
# cover_images_path = pd.Series(BASE_PATH + '/Cover/' + train_imageids ).sort_values(ascending=True)
# JMIPOD_images_path = pd.Series(BASE_PATH + '/JMiPOD/'+train_imageids).sort_values(ascending=True)
# JUNIWARD_images_path = pd.Series(BASE_PATH + '/JUNIWARD/'+train_imageids).sort_values(ascending=True)
# UERD_images_path = pd.Series(BASE_PATH + '/UERD/'+train_imageids).sort_values(ascending=True)
# test_images_path = pd.Series(BASE_PATH + '/Test/'+test_imageids).sort_values(ascending=True)
ss = pd.read_csv(f'{BASE_PATH}/sample_submission.csv')

In [None]:
! git clone https://github.com/dwgoon/jpegio
!pip install jpegio/.
import jpegio as jio

In [None]:
from lightgbm import LGBMClassifier
lgbm_params =  {
#     'n_estimators':3000,
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 7,
    'metric': ['multi_error'],
    "learning_rate": 0.05,
     "num_leaves": 60,
     "max_depth": 9,
     "feature_fraction": 0.45,
     "bagging_fraction": 0.3,
     "reg_alpha": 0.15,
     "reg_lambda": 0.15,
#      "min_split_gain": 0,
      "min_child_weight": 0
                }

In [None]:
from PIL import Image
coverPixels = np.array(Image.open(cover_images_path[0])).astype('float')
stegoPixels = np.array(Image.open(JUNIWARD_images_path[0])).astype('float')

pixelsDiff = coverPixels - stegoPixels

# So since they are not the same Images the pixels_diff would not be zero
print(len(pixelsDiff[np.where(pixelsDiff!=0)]))
print(np.unique(pixelsDiff))

In [None]:
import lightgbm as lgb
lgtrain = lgb.Dataset(scaled_X,y, categorical_feature= "auto")

In [74]:
from sklearn.svm import SVR as SVC
from sklearn.ensemble import RandomForestClassifier as rfc
# from sklearn.naive_bayes import GaussianNB as nb
# from sklearn.linear_model import LinearRegression as lr
# from sklearn.linear_model import LogisticRegression as lr
# Split up data into randomized training and test sets
rand_state = np.random.randint(0, 100)
X_train, X_test, y_train, y_test = train_test_split(
    scaled_X, y, test_size=0.2, random_state=rand_state)

print('Using:',orient,'orientations',pix_per_cell,
    'pixels per cell and', cell_per_block,'cells per block')
print('Feature vector length:', len(X_train[0]))
# Use a linear SVC 
# svc = rfc(n_estimators=3000,n_jobs=-1,random_state=rand_state)
# svc = LGBMClassifier(**lgbm_params)
svc = SVC()
# svc = lr()
# Check the training time for the SVC
t=time.time()
# svc.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_test,y_test)],
#               verbose=100, early_stopping_rounds=100)
svc.fit(X_train, y_train)
t2 = time.time()
print(round(t2-t, 2), 'Seconds to train SVC...')
# Check the score of the SVC
print('Test Accuracy of SVC = ', round(svc.score(X_test, y_test), 4))
# Check the prediction time for a single sample
t=time.time()
n_predict = 10
print('My SVC predicts: ', svc.predict(X_test[0:n_predict]))
print('For these',n_predict, 'labels: ', y_test[0:n_predict])
t2 = time.time()
print(round(t2-t, 5), 'Seconds to predict', n_predict,'labels with SVC')

#Tried YCrCb, 8, 1 _ good performance 
#Tried YCrCb, 8, 2 _ good performance but very slow


Using: 18 orientations 64 pixels per cell and 1 cells per block
Feature vector length: 3456
34.12 Seconds to train SVC...
Test Accuracy of SVC =  0.8338
My SVC predicts:  [ 0.99219042  0.34137831  0.99800797 -0.20344753 -0.28358846  0.93858582
  0.25476044  0.90285269  1.06855699  1.06101569]
For these 10 labels:  [1. 0. 1. 0. 0. 1. 0. 1. 1. 1.]
0.10559 Seconds to predict 10 labels with SVC


In [75]:
ss['Label'] = 1-svc.predict(test_features)

In [None]:
svc.predict_proba(test_features[0:5])

In [76]:
ss.to_csv('submission.csv',index = False)

In [73]:
ss['Label'].min()

0.0