In [1]:
import os
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
from keras import layers
from keras.preprocessing import image
from keras import Sequential
from keras.layers import Dense,Activation, Conv2D, Flatten, Dropout, MaxPooling2D, GlobalAveragePooling2D, BatchNormalization
from keras.models import Model
from keras.metrics import categorical_crossentropy
from keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,max_error, explained_variance_score,median_absolute_error, accuracy_score,classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,  cross_validate,  cross_val_score 
import random
import seaborn as sns
import matplotlib.pyplot as plt
from skimage import *
import cv2
import math
import joblib
from scipy import stats
import gc


# Nominal image width and height in pixels (square images).
img_w = img_h = 256

In [2]:
# Directories of the train / validation / test splits; each path keeps its trailing slash.
train_path, valid_path, test_path = (
    "../input/seti-data/primary_small/{}/".format(split_name)
    for split_name in ("train", "valid", "test")
)

In [3]:
class ImageData:
    """Load a directory-per-class image set and derive grayscale / binary variants.

    ``filepath`` must contain one sub-directory per class; the sub-directory
    name is used as the label of every image inside it.

    Attributes filled in by the methods:
        X_data        -- list of resized BGR images (see ``split_and_resize``)
        y_data        -- list of class-name labels, parallel to ``X_data``
        X_gray        -- list of grayscale images (see ``grayscale``)
        X_binary      -- list of 0/1 images (see ``create_binary_image``)
        classes       -- sorted list of class (sub-directory) names
        y_categorical -- one-hot labels (see ``create_categorical_label``)
    """

    def __init__(self, filepath):
        self.filepath= filepath
        self.X_data= []
        self.y_data= []
        self.X_gray= []
        self.X_binary=[]
        self.classes=[]
        self.y_categorical= []

    def split_and_resize(self, size=(256, 256)):
        """Read every image under ``filepath`` and resize it.

        Args:
            size: ``(width, height)`` passed to ``cv2.resize``. Defaults to
                ``(256, 256)``, the value that used to be hard-coded.

        Returns:
            ``(images, labels)`` as NumPy arrays, grouped by class in sorted
            class order and, within a class, in sorted file-name order.

        Each call rebuilds ``X_data`` / ``y_data`` from scratch, so calling it
        twice no longer duplicates the data set.
        """
        import warnings

        # Only directories are classes; a stray file next to them would make
        # os.listdir() on it fail.
        self.classes = sorted(
            entry for entry in os.listdir(self.filepath)
            if os.path.isdir(os.path.join(self.filepath, entry))
        )

        self.X_data = []
        self.y_data = []

        for signal_type in self.classes:
            class_dir = os.path.join(self.filepath, signal_type)

            # Sorted so the sample order is reproducible across machines/runs.
            for file_name in sorted(os.listdir(class_dir)):
                full_path = os.path.join(class_dir, file_name)
                img = cv2.imread(full_path)
                if img is None:
                    # cv2.imread returns None instead of raising when the file
                    # cannot be decoded; resizing None would crash.
                    warnings.warn("Skipping unreadable image: {}".format(full_path))
                    continue

                self.X_data.append(cv2.resize(img, size))
                self.y_data.append(signal_type)

        return np.array(self.X_data), np.array(self.y_data)

    def grayscale(self):
        """Rebuild ``X_gray`` as the BGR->gray conversion of every image in ``X_data``."""
        # Rebuilt (not appended to) so repeated calls stay aligned with X_data.
        self.X_gray = [cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in self.X_data]

    def create_binary_image(self, threshold=100):
        """Threshold the grayscale images into 0/1 arrays.

        Args:
            threshold: pixels strictly greater than this value become 1, all
                others 0. Defaults to 100, the value that used to be hard-coded.

        Returns:
            NumPy array stacking one binary image per entry of ``X_data``.
        """
        self.grayscale()
        # Rebuilt on every call so the result always has len(X_data) images.
        self.X_binary = [np.where(gray > threshold, 1, 0) for gray in self.X_gray]
        return np.array(self.X_binary)

    def create_categorical_label(self):
        """One-hot encode ``y_data`` using each label's position in ``classes``.

        Returns:
            Array with one row per label and ``len(classes)`` columns.
        """
        label_indices = [self.classes.index(label) for label in self.y_data]
        # Fix the width explicitly: otherwise it is inferred from the largest
        # index present and shrinks when the last class has no samples.
        self.y_categorical = to_categorical(label_indices, num_classes=len(self.classes))
        return self.y_categorical
    

In [4]:
class FeatureEngineering:
    """Hand-crafted shape features computed from binary (0/1) images.

    ``assemble`` is the entry point: it walks over a stack of 2-D binary
    images and returns one row of features per image. The per-image methods
    (``lin_reg``, ``gap_detector``, ...) each append one value to an
    accumulator list; they read the current image from ``self.temp``, the
    bright-pixel coordinates from ``self.brightpix_df`` (set by ``make_df``)
    and the gap threshold from ``self.thresh``.
    """

    # Order of the columns in the frame returned by ``assemble``.
    FEATURE_COLUMNS = ['RMSE', 'Max Error', 'Spread', 'Slope', 'Is Noisy', 'Mean Squiggliness',
                       'Num Bright Pix', 'Pct Bright Lines', 'Num Gaps', 'Biggest Gap']

    def __init__(self):
        self._reset_accumulators()
        self.Features_DF = pd.DataFrame()

    def _reset_accumulators(self):
        """Empty every per-image accumulator list."""
        self.lr_rmse = []
        self.lr_max_error = []
        self.lr_spread = []
        self.lr_av_slope = []
        self.lr_slope = []
        self.y_int = []

        self.num_gaps = []
        self.mean_squiggly = []
        self.num_bright_pix = []
        self.pct_bright_lines = []
        self.noisy = []
        self.biggest_gap = []

    def make_df(self):
        """Build the coordinate table of bright pixels of ``self.temp``.

        Returns:
            DataFrame with integer columns ``Y`` (row index) and ``X`` (column
            index), one row per pixel equal to 1, in row-major order. Also
            stored as ``self.brightpix_df``; its length is appended to
            ``num_bright_pix``.
        """
        # np.argwhere yields (row, col) pairs in row-major order, which is the
        # order the former nested 256x256 Python loop produced, for any image size.
        coords = np.argwhere(np.asarray(self.temp) == 1)
        self.brightpix_df = pd.DataFrame(coords, columns=['Y', 'X'])
        self.num_bright_pix.append(len(self.brightpix_df))
        return self.brightpix_df

    def lin_reg(self, binary_array):
        """Fit ``Y ~ X`` on the bright pixels and record the fit statistics.

        Args:
            binary_array: the 2-D binary image to analyse. It becomes
                ``self.temp``; pass ``None`` to keep the current ``self.temp``.

        Appends one float to each of ``lr_rmse``, ``lr_max_error``,
        ``lr_av_slope`` (absolute slope), ``lr_slope``, ``y_int`` and
        ``lr_spread`` (max X minus min X). An image with no bright pixel
        cannot be fitted and records 0.0 for all six.
        """
        if binary_array is not None:
            self.temp = binary_array

        binary_df = self.make_df()

        if binary_df.empty:
            # LinearRegression.fit raises on zero samples; record neutral values.
            for accumulator in (self.lr_rmse, self.lr_max_error, self.lr_av_slope,
                                self.lr_slope, self.y_int, self.lr_spread):
                accumulator.append(0.0)
            self.y_pred = np.array([])
            return

        Y = binary_df['Y']
        X = binary_df[['X']]

        LinReg = LinearRegression()
        LinReg.fit(X, Y)

        self.y_pred = LinReg.predict(X)
        slope = float(LinReg.coef_[0])

        self.lr_rmse.append(math.sqrt(mean_squared_error(Y, self.y_pred)))
        self.lr_max_error.append(max_error(Y, self.y_pred))
        self.lr_av_slope.append(abs(slope))
        self.lr_slope.append(slope)
        self.y_int.append(float(LinReg.intercept_))
        # Computed on the column itself: np.max(DataFrame) returns a Series or
        # a scalar depending on the pandas version.
        self.lr_spread.append(float(binary_df['X'].max() - binary_df['X'].min()))

    def gap_detector(self):
        """Append the number of consecutive bright pixels whose row jump exceeds ``thresh``."""
        row_jumps = self.brightpix_df['Y'].diff()
        self.num_gaps.append(int((row_jumps > self.thresh).sum()))

    def mean_squiggliness(self):
        """Append the mean absolute column step between consecutive bright pixels (0 if undefined)."""
        mean_step = self.brightpix_df['X'].diff().abs().mean()
        # NaN never compares equal to anything, itself included, so the old
        # `== np.nan` test could not fire; pd.isna is the correct check.
        self.mean_squiggly.append(0 if pd.isna(mean_step) else mean_step)

    def big_and_most_gap(self):
        """Append the largest row jump between consecutive bright pixels, or 0 if it is not above ``thresh``."""
        # Taking the max of the differences directly avoids relying on the
        # column names produced by value_counts().reset_index(), which differ
        # between pandas versions.
        largest = self.brightpix_df['Y'].diff().max()
        # `largest` is NaN with fewer than two bright pixels; NaN > thresh is False.
        self.biggest_gap.append(largest if largest > self.thresh else 0)

    def find_bright_lines(self):
        """Append the fraction of image rows containing at least one positive pixel."""
        img = np.asarray(self.temp)
        bright_rows = np.count_nonzero(img.max(axis=1) > 0)
        self.pct_bright_lines.append(bright_rows / img.shape[0])

    def detect_noise(self, min_bright_pixels=150):
        """Append 1 if the image has more than ``min_bright_pixels`` bright pixels, else 0.

        Args:
            min_bright_pixels: count above which the image is flagged as noisy
                (default 150, the value that used to be hard-coded).
        """
        self.noisy.append(1 if len(self.brightpix_df) > min_bright_pixels else 0)

    def assemble(self, binary_array, thresh=20):
        """Compute every feature for each image in ``binary_array``.

        Args:
            binary_array: iterable of 2-D binary images.
            thresh: row jump (strictly greater than) that counts as a gap.

        Returns:
            Float DataFrame with one row per image and the columns listed in
            ``FEATURE_COLUMNS``; missing values are replaced by 0. Also stored
            as ``self.Features_DF``.

        The accumulators are cleared first, so the result always describes
        exactly ``binary_array`` even when the instance is reused.
        """
        self.binary_array = binary_array
        self.thresh = thresh
        self._reset_accumulators()

        for img in self.binary_array:
            self.temp = img

            # lin_reg must run first: it calls make_df, which sets the
            # brightpix_df that the following methods read.
            self.lin_reg(img)
            self.big_and_most_gap()
            self.gap_detector()
            self.mean_squiggliness()
            self.find_bright_lines()
            self.detect_noise()

        feature_table = pd.DataFrame({
            'RMSE': self.lr_rmse,
            'Max Error': self.lr_max_error,
            'Spread': self.lr_spread,
            'Slope': self.lr_av_slope,
            'Is Noisy': self.noisy,
            'Mean Squiggliness': self.mean_squiggly,
            'Num Bright Pix': self.num_bright_pix,
            'Pct Bright Lines': self.pct_bright_lines,
            'Num Gaps': self.num_gaps,
            'Biggest Gap': self.biggest_gap,
        }, columns=self.FEATURE_COLUMNS, dtype=float)
        self.Features_DF = feature_table.fillna(0)
        return self.Features_DF
            
        

In [23]:
class EnsembleModel():
    """Two-stage classifier: a saved CNN, plus a random forest for a subset of classes.

    Samples whose true label is in ``CNN_ONLY_CLASSES`` are predicted by the
    CNN's arg-max alone. Samples whose true label is in ``COMBO_CLASSES`` are
    predicted by a random forest trained on the engineered features (without
    ``'Is Noisy'``) concatenated with the CNN's class probabilities.
    ``build_ensemble`` runs both stages and stores the combined predictions,
    the matching true labels and the accuracy.

    Samples are selected by their label, so the inputs need not contain a
    fixed number of images per class. Rows keep their input order within each
    group.
    """

    # Classes predicted by the CNN alone (NB, N, SPNB).
    CNN_ONLY_CLASSES = ('narrowband', 'noise', 'squarepulsednarrowband')
    # Classes predicted by CNN probabilities + features -> random forest (BP, NBDRD, S, SSPNB).
    COMBO_CLASSES = ('brightpixel', 'narrowbanddrd', 'squiggle', 'squigglesquarepulsednarrowband')
    # Names given to the CNN output columns, keyed by class index.
    PROB_COLUMNS = {0: 'BP Prob', 1: 'NB Prob', 2: 'NBDRD Prob', 3: 'N Prob',
                    4: 'SPNB Prob', 5: 'S Prob', 6: 'SSPNB Prob'}

    def __init__(self, X_train , y_train , train_features, X_test, y_test, test_features, cnn_path,
                 random_state=None):
        """Store the data sets and load the CNN.

        Args:
            X_train, X_test: image arrays fed to the CNN, first axis = sample.
            y_train, y_test: class-name labels, parallel to the image arrays.
            train_features, test_features: DataFrames of engineered features,
                one row per sample, containing an ``'Is Noisy'`` column.
            cnn_path: path of a saved Keras model, loaded with
                ``tf.keras.models.load_model``.
            random_state: seed forwarded to the random forest; ``None``
                (default) leaves it unseeded as before.
        """
        # Defined first: the one-hot encoding below must use the same class
        # order that decodes the CNN predictions, not a notebook-level global.
        self.classes= ['brightpixel','narrowband', 'narrowbanddrd', 'noise', 'squarepulsednarrowband', 'squiggle', 'squigglesquarepulsednarrowband']

        self.X_Train_Features= train_features
        self.X_Train_Data= X_train
        self.y_train= y_train

        self.X_Test_Features= test_features
        self.X_Test_Data= X_test
        self.y_test= y_test

        self.random_state= random_state

        # num_classes pins the width to the CNN's label space even if a class
        # is absent from y_train.
        self.ytr= to_categorical([self.classes.index(i) for i in self.y_train],
                                 num_classes=len(self.classes))

        self.cnn_model= tf.keras.models.load_model(cnn_path)

    @staticmethod
    def _label_mask(labels, wanted):
        """Boolean mask of the entries of ``labels`` that belong to ``wanted``."""
        return np.isin(np.asarray(labels), list(wanted))

    def cnn_only_predictions(self):
        """Fit the CNN on the CNN-only classes and predict their test samples.

        Sets ``onlycnn_y_test`` (true labels) and ``pure_cnn_preds``
        (predicted class names).
        """
        train_mask = self._label_mask(self.y_train, self.CNN_ONLY_CLASSES)
        test_mask = self._label_mask(self.y_test, self.CNN_ONLY_CLASSES)

        onlycnn_X_train = np.asarray(self.X_Train_Data)[train_mask]
        onlycnn_y_train = np.asarray(self.ytr)[train_mask]

        onlycnn_X_test = np.asarray(self.X_Test_Data)[test_mask]
        self.onlycnn_y_test = np.asarray(self.y_test)[test_mask]

        self.cnn_model.fit(onlycnn_X_train, onlycnn_y_train)
        onlycnn_preds = self.cnn_model.predict(onlycnn_X_test)

        self.pure_cnn_preds = [self.classes[i] for i in np.argmax(onlycnn_preds, axis=1)]

    def _combo_frame(self, features, mask, cnn_probs):
        """Join the selected feature rows (minus 'Is Noisy') with the CNN probabilities column-wise."""
        selected = features.loc[mask].reset_index(drop=True).drop(columns='Is Noisy')
        probs = pd.DataFrame(cnn_probs).rename(columns=self.PROB_COLUMNS)
        return pd.concat([selected, probs], axis=1)

    def combo_predictions(self):
        """Fit the CNN and a random forest on the combo classes and predict their test samples.

        Sets ``combo_y_test`` (true labels), ``combo_X_train`` (forest
        training frame), ``combo_preds`` (predicted class names) and
        ``ens_feature_importances_``.
        """
        train_mask = self._label_mask(self.y_train, self.COMBO_CLASSES)
        test_mask = self._label_mask(self.y_test, self.COMBO_CLASSES)

        combocnn_X_train = np.asarray(self.X_Train_Data)[train_mask]
        combocnn_y_train = np.asarray(self.ytr)[train_mask]

        combocnn_X_test = np.asarray(self.X_Test_Data)[test_mask]
        self.combo_y_test = np.asarray(self.y_test)[test_mask]

        self.cnn_model.fit(combocnn_X_train, combocnn_y_train)
        combocnn_train_preds = self.cnn_model.predict(combocnn_X_train)
        combocnn_test_preds = self.cnn_model.predict(combocnn_X_test)

        # The forest sees the engineered features next to the CNN's class
        # probabilities for the same samples.
        self.combo_X_train = self._combo_frame(self.X_Train_Features, train_mask, combocnn_train_preds)
        combo_X_test = self._combo_frame(self.X_Test_Features, test_mask, combocnn_test_preds)

        combo_y_train = np.asarray(self.y_train)[train_mask]

        rf = RandomForestClassifier(criterion='entropy', max_depth=11, max_features=5,
                                    n_estimators=100, random_state=self.random_state)
        rf.fit(self.combo_X_train, combo_y_train)
        self.combo_preds = rf.predict(combo_X_test)
        self.ens_feature_importances_ = rf.feature_importances_

    def build_ensemble(self):
        """Run both stages and score the combined predictions.

        Sets ``final_preds``, ``y_actual`` (CNN-only samples first, then combo
        samples) and ``accuracy``.
        """
        # Order matters: the CNN is fitted on the CNN-only subset first and
        # then fitted further on the combo subset.
        self.cnn_only_predictions()
        self.combo_predictions()

        self.final_preds = np.concatenate([self.pure_cnn_preds, self.combo_preds])
        self.y_actual = np.concatenate([self.onlycnn_y_test, self.combo_y_test])
        self.accuracy = accuracy_score(self.y_actual, self.final_preds)

        
        

In [6]:
# Load the training split, then derive its binary (0/1) images.
train = ImageData(filepath=train_path)
X_tr, y_tr = train.split_and_resize()
X_tr_bin = train.create_binary_image()

# Keep the class-name list: `train` itself is deleted in a later cell.
classes = train.classes

In [7]:
# valid= ImageData(valid_path)
# X_val, y_val= valid.split_and_resize()
# X_val_bin= valid.create_binary_image()

In [8]:
# Load the test split, then derive its binary (0/1) images.
test = ImageData(filepath=test_path)
X_te, y_te = test.split_and_resize()
X_te_bin = test.create_binary_image()

In [9]:
# Engineered features for the training images (gap threshold of 10).
fe_train = FeatureEngineering()
TRAIN_DF = fe_train.assemble(binary_array=X_tr_bin, thresh=10)

In [None]:
# fe_valid = FeatureEngineering()
# VAL_DF= fe_valid.assemble(X_val_bin, thresh=10)

In [10]:
# Engineered features for the test images, with the same threshold as for training.
fe_test = FeatureEngineering()
TEST_DF = fe_test.assemble(binary_array=X_te_bin, thresh=10)

In [11]:
# Drop the large intermediates that are no longer needed before the CNN is loaded.
# The FeatureEngineering class itself is deliberately NOT deleted: removing it
# frees no meaningful memory and makes the feature cells fail with NameError
# when they are re-run.
del X_tr_bin
del X_te_bin
del train
del test
del fe_train
del fe_test


In [12]:
# Force a garbage-collection pass now that the names above have been deleted.
gc.collect()

In [24]:
# Saved CNN used by the ensemble.
cnn_path = '../input/88-model/VGG16_model88acc.h5'

# Build the ensemble from the images, labels and engineered features, then run it.
e = EnsembleModel(
    X_train=X_tr,
    y_train=y_tr,
    train_features=TRAIN_DF,
    X_test=X_te,
    y_test=y_te,
    test_features=TEST_DF,
    cnn_path=cnn_path,
)
e.build_ensemble()

In [25]:
# Accuracy of the combined predictions, as computed by build_ensemble().
e.accuracy

In [26]:
# Feature importances of the random forest fitted in combo_predictions().
e.ens_feature_importances_

In [29]:
# Horizontal bar chart of the random-forest feature importances, smallest first.
plt.figure(figsize=(5,5))
sns.set_style('darkgrid')
sorted_idx = e.ens_feature_importances_.argsort()
plt.barh(e.combo_X_train.columns[sorted_idx], e.ens_feature_importances_[sorted_idx])
plt.xlabel('Feature Importance')
# The bars are the forest's input columns, not image types.
plt.ylabel("Feature")
plt.title("Random Forest Feature Importances")
# Call show(): the bare attribute `plt.show` only displayed the function's repr.
plt.show()

In [30]:
# Short tick labels for the plots below.
# NOTE(review): presumably in the same order as `classes` (sorted class names) -- confirm.
classes_abv= ['BP','NB','NBDRD','N','SPNB','S','SSPNB']

In [31]:

# Pass the label order explicitly so rows/columns line up with the tick labels
# set below, and every class gets a row even if it never occurs.
cm = confusion_matrix(e.y_actual, e.final_preds, labels=list(classes))
cm_df = pd.DataFrame(cm, columns=list(classes), index=list(classes))

# fmt='d' prints integer counts; the default '.2g' shows 100 as '1e+02'.
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Purples')

ax.set_title('Ensemble Model Confusion Matrix')
ax.set_xlabel('Predicted Image Type')
ax.set_ylabel('Actual Image Type')
ax.xaxis.set_ticklabels(list(classes_abv), rotation=270)
ax.yaxis.set_ticklabels(list(classes_abv), rotation=0)

plt.show()

In [32]:
# Classification report as a table, transposed so each report entry is a row.
cls_report = pd.DataFrame(
    classification_report(e.y_actual, e.final_preds, output_dict=True)
).transpose()
cls_report