In [1]:
import os
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
from keras import layers
from keras.preprocessing import image
from keras import Sequential
from keras.layers import Dense,Activation, Conv2D, Flatten, Dropout, MaxPooling2D, GlobalAveragePooling2D, BatchNormalization
from keras.models import Model
from keras.metrics import categorical_crossentropy
from keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,max_error, explained_variance_score,median_absolute_error, accuracy_score,classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import random
import seaborn as sns
import matplotlib.pyplot as plt
from skimage import *
import cv2
import math
import joblib
from scipy import stats
import gc


# Target image dimensions in pixels (width, height).
img_w, img_h = 256, 256

In [2]:
# Dataset split directories, relative to the notebook's working directory.
# The validation split is currently disabled.
(train_path, test_path) = (
    "../input/seti-data/primary_small/train/",
    "../input/seti-data/primary_small/test/",
)
# valid_path = "../input/seti-data/primary_small/valid/"

In [3]:
class ImageData:
    """Load a folder-per-class image dataset and derive binary images and labels.

    ``filepath`` must contain one sub-directory per class; the sub-directory
    names become the class labels (``self.classes``, sorted alphabetically).

    Parameters
    ----------
    filepath : str
        Directory holding one sub-directory of image files per class.
    img_size : tuple of int, optional
        ``(width, height)`` handed to ``cv2.resize``. Defaults to ``(256, 256)``.

    Attributes
    ----------
    X_data, y_data : list
        Resized images and the class name of each image (same order).
    X_gray, X_binary : list
        Grayscale and thresholded 0/1 versions of ``X_data``.
    classes : list of str
        Sorted class-directory names.
    y_categorical : array-like
        One-hot labels produced by ``create_categorical_label``.
    skipped_files : list of str
        Paths that ``cv2.imread`` could not decode during the last load.
    """

    def __init__(self, filepath, img_size=(256, 256)):
        self.filepath = filepath
        self.img_size = tuple(img_size)
        self.X_data = []
        self.y_data = []
        self.X_gray = []
        self.X_binary = []
        self.classes = []
        self.y_categorical = []
        self.skipped_files = []

    def split_and_resize(self):
        """Read every image under ``filepath``, resize it and record its class.

        Safe to call repeatedly: previously loaded images are discarded first.

        Returns
        -------
        tuple of numpy.ndarray
            ``(images, labels)`` where ``labels[i]`` is the class-directory
            name of ``images[i]``.
        """
        # Only directories are classes; stray files in the root (for example
        # OS metadata files) would otherwise crash the inner os.listdir.
        self.classes = sorted(
            entry for entry in os.listdir(self.filepath)
            if os.path.isdir(os.path.join(self.filepath, entry))
        )

        # Start from scratch so re-running the cell does not duplicate data.
        self.X_data = []
        self.y_data = []
        self.skipped_files = []

        for signal_type in self.classes:
            class_dir = os.path.join(self.filepath, signal_type)

            # os.listdir order is arbitrary; sorting keeps sample indices
            # reproducible between runs and machines.
            for file_name in sorted(os.listdir(class_dir)):
                img_path = os.path.join(class_dir, file_name)
                img = cv2.imread(img_path)
                if img is None:
                    # cv2.imread signals an unreadable file by returning None
                    # instead of raising; cv2.resize would fail on it.
                    self.skipped_files.append(img_path)
                    continue

                self.X_data.append(cv2.resize(img, self.img_size))
                self.y_data.append(signal_type)

        return np.array(self.X_data), np.array(self.y_data)

    def grayscale(self):
        """Rebuild ``X_gray`` as the grayscale version of every image in ``X_data``."""
        # Rebuilt rather than appended to, so repeated calls stay aligned
        # one-to-one with X_data.
        self.X_gray = [cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in self.X_data]

    def create_binary_image(self, threshold=100):
        """Threshold the grayscale images into 0/1 arrays.

        Parameters
        ----------
        threshold : int, optional
            Pixels strictly brighter than this become 1, all others 0.
            Defaults to 100.

        Returns
        -------
        numpy.ndarray
            Stack of binary images, one per entry of ``X_data``.
        """
        self.grayscale()
        self.X_binary = [np.where(gray > threshold, 1, 0) for gray in self.X_gray]
        return np.array(self.X_binary)

    def create_categorical_label(self):
        """One-hot encode ``y_data`` using the position of each label in ``classes``.

        Returns
        -------
        array-like
            One row per sample, one column per class in ``classes``.
        """
        label_ids = [self.classes.index(label) for label in self.y_data]
        # Fix the width to the number of known classes so the encoding has the
        # same shape even when the last class has no samples in this split.
        self.y_categorical = to_categorical(label_ids, num_classes=len(self.classes))
        return self.y_categorical
    

In [4]:
class FeatureEngineering:
    """Compute hand-crafted shape features for a batch of binary (0/1) images.

    For every image the bright pixels (value 1) are collected as (row, column)
    coordinates, a straight line ``row ~ column`` is fitted through them, and
    several gap / spread / count statistics are derived. ``assemble`` runs the
    whole pipeline and returns one feature row per image.

    Parameters
    ----------
    noisy_pixel_count : int, optional
        An image with more bright pixels than this is flagged as noisy.
        Defaults to 150.
    """

    _FEATURE_COLUMNS = ['RMSE', 'Max Error', 'Spread', 'Slope', 'Is Noisy', 'Mean Squiggliness',
                        'Num Bright Pix', 'Pct Bright Lines', 'Num Gaps', 'Biggest Gap']

    def __init__(self, noisy_pixel_count=150):
        self.noisy_pixel_count = noisy_pixel_count
        self._reset_accumulators()
        self.Features_DF = pd.DataFrame()

    def _reset_accumulators(self):
        """Empty every per-image result list."""
        self.lr_rmse = []
        self.lr_max_error = []
        self.lr_spread = []
        self.lr_av_slope = []
        self.lr_slope = []
        self.y_int = []

        self.num_gaps = []
        self.mean_squiggly = []
        self.num_bright_pix = []
        self.pct_bright_lines = []
        self.noisy = []
        self.biggest_gap = []

    def make_df(self):
        """Collect the coordinates of the bright pixels of ``self.temp``.

        Also appends the bright-pixel count to ``num_bright_pix``, so it
        should be called once per image.

        Returns
        -------
        pandas.DataFrame
            Columns ``Y`` (row index) and ``X`` (column index), one row per
            pixel equal to 1, ordered row by row.
        """
        # np.nonzero reports matches in row-major order, which keeps the
        # row-by-row ordering the diff-based gap features depend on, and works
        # for any image size.
        rows, cols = np.nonzero(np.asarray(self.temp) == 1)
        self.brightpix_df = pd.DataFrame({'Y': rows, 'X': cols})
        self.num_bright_pix.append(len(self.brightpix_df))
        return self.brightpix_df

    def lin_reg(self, binary_array):
        """Fit ``Y ~ X`` through the bright pixels of ``binary_array``.

        Appends RMSE, max error, |slope|, slope, intercept and the X spread
        (max X - min X) to their accumulators. When the image has no bright
        pixel there is nothing to fit, so NaN is recorded for each of them
        (``assemble`` later turns NaN into 0 in the feature table).

        Parameters
        ----------
        binary_array : array-like
            A single 2-D binary image.
        """
        self.temp = binary_array
        binary_df = self.make_df()

        if binary_df.empty:
            # LinearRegression.fit raises on zero samples.
            self.y_pred = np.array([])
            for accumulator in (self.lr_rmse, self.lr_max_error, self.lr_av_slope,
                                self.lr_slope, self.y_int, self.lr_spread):
                accumulator.append(np.nan)
            return

        Y = binary_df['Y']
        X = binary_df[['X']]

        LinReg = LinearRegression()
        LinReg.fit(X, Y)
        self.y_pred = LinReg.predict(X)

        # Stored as plain floats so every accumulator is a flat list.
        slope = float(LinReg.coef_[0])

        self.lr_rmse.append(math.sqrt(mean_squared_error(Y, self.y_pred)))
        self.lr_max_error.append(max_error(Y, self.y_pred))
        self.lr_av_slope.append(abs(slope))
        self.lr_slope.append(slope)
        self.y_int.append(float(LinReg.intercept_))
        self.lr_spread.append(float(binary_df['X'].max() - binary_df['X'].min()))

    def gap_detector(self):
        """Count jumps in row index between consecutive bright pixels larger than ``thresh``."""
        row_steps = self.brightpix_df['Y'].diff()
        self.num_gaps.append(int((row_steps > self.thresh).sum()))

    def mean_squiggliness(self):
        """Record the mean absolute column step between consecutive bright pixels.

        Records 0 when the mean is undefined (fewer than two bright pixels).
        """
        mean_step = self.brightpix_df['X'].diff().abs().mean()
        # NaN never compares equal to anything, itself included, so it has to
        # be detected with isna rather than ``==``.
        if pd.isna(mean_step):
            self.mean_squiggly.append(0)
        else:
            self.mean_squiggly.append(mean_step)

    def big_and_most_gap(self):
        """Record the largest row gap between consecutive bright pixels.

        Records 0 when the largest gap does not exceed ``thresh`` or when
        there are fewer than two bright pixels.
        """
        largest_gap = self.brightpix_df['Y'].diff().max()
        # largest_gap is NaN with fewer than two bright pixels; NaN > thresh is
        # False, so that case falls through to 0.
        if largest_gap > self.thresh:
            self.biggest_gap.append(largest_gap)
        else:
            self.biggest_gap.append(0)

    def find_bright_lines(self):
        """Record the fraction of image rows containing at least one positive pixel."""
        image = np.asarray(self.temp)
        n_rows = image.shape[0]
        if n_rows == 0 or image.size == 0:
            self.pct_bright_lines.append(0.0)
            return
        bright_rows = int(np.count_nonzero(image.max(axis=1) > 0))
        self.pct_bright_lines.append(bright_rows / n_rows)

    def detect_noise(self):
        """Flag the image as noisy (1) when it has more than ``noisy_pixel_count`` bright pixels."""
        if len(self.brightpix_df) > self.noisy_pixel_count:
            self.noisy.append(1)
        else:
            self.noisy.append(0)

    def assemble(self, binary_array, thresh=20):
        """Compute every feature for every image and return them as a table.

        Parameters
        ----------
        binary_array : sequence of 2-D arrays
            Binary (0/1) images.
        thresh : int, optional
            Minimum row jump between consecutive bright pixels that counts as
            a gap. Defaults to 20.

        Returns
        -------
        pandas.DataFrame
            One row per image with columns RMSE, Max Error, Spread, Slope
            (absolute value of the fitted slope), Is Noisy, Mean Squiggliness,
            Num Bright Pix, Pct Bright Lines, Num Gaps and Biggest Gap.
            Undefined values are replaced by 0.
        """
        self.binary_array = binary_array
        self.thresh = thresh
        # Without this a second call would stack its rows on top of the first.
        self._reset_accumulators()

        for image in self.binary_array:
            self.temp = image

            # lin_reg must run first: it builds brightpix_df for the others.
            self.lin_reg(image)
            self.big_and_most_gap()
            self.gap_detector()
            self.mean_squiggliness()
            self.find_bright_lines()
            self.detect_noise()

        feature_matrix = np.column_stack([
            self.lr_rmse, self.lr_max_error, self.lr_spread, self.lr_av_slope,
            self.noisy, self.mean_squiggly, self.num_bright_pix, self.pct_bright_lines,
            self.num_gaps, self.biggest_gap,
        ])
        self.Features_DF = pd.DataFrame(feature_matrix, columns=self._FEATURE_COLUMNS).fillna(0)
        return self.Features_DF
            
        

In [5]:
# Load the training split: resized images with their class names, plus the
# thresholded 0/1 version of every image.
train = ImageData(train_path)
(X_tr, y_tr) = train.split_and_resize()
X_tr_bin = train.create_binary_image()

# Class names as recorded by the loader (sorted directory names).
classes = train.classes

In [None]:
# valid= ImageData(valid_path)
# X_val, y_val= valid.split_and_resize()
# X_val_bin= valid.create_binary_image()

In [None]:
# Load the test split the same way as the training split.
test = ImageData(test_path)
(X_te, y_te) = test.split_and_resize()
X_te_bin = test.create_binary_image()

In [6]:
# Feature table for the training images; row jumps above 10 count as gaps.
fe_train = FeatureEngineering()
TRAIN_DF = fe_train.assemble(binary_array=X_tr_bin, thresh=10)

In [None]:
# fe_valid = FeatureEngineering()
# VAL_DF= fe_valid.assemble(X_val_bin, thresh=10)

In [None]:
# Feature table for the test images, using the same gap threshold as training.
fe_test = FeatureEngineering()
TEST_DF = fe_test.assemble(binary_array=X_te_bin, thresh=10)

In [None]:
#######################################################################################################

In [17]:
def create_combo_images(arr, img_index, feature, color, labels=None, features_df=None):
    """Show one binary image with a fitted line and a feature value in the title.

    A straight line predicting column (X) from row (Y) is fitted through the
    pixels equal to 1 and drawn dashed in red on top of the image. When the
    image has no such pixel the line is skipped and only the image is shown.

    Parameters
    ----------
    arr : sequence of 2-D arrays
        Binary (0/1) images.
    img_index : int
        Position of the image to display; also used to look up the label
        and the feature value.
    feature : str
        Column of ``features_df`` whose value (rounded to 2 decimals) is
        shown in the title.
    color : str
        Matplotlib colormap name passed to ``imshow``.
    labels : sequence, optional
        Image labels aligned with ``arr``. Defaults to the notebook-level
        ``y_tr``.
    features_df : pandas.DataFrame, optional
        Feature table aligned with ``arr``. Defaults to the notebook-level
        ``TRAIN_DF``.
    """
    # Fall back to the training-set globals so existing four-argument calls
    # keep working; pass both explicitly when plotting any other split.
    if labels is None:
        labels = y_tr
    if features_df is None:
        features_df = TRAIN_DF

    image = np.asarray(arr[img_index])
    # Row/column coordinates of the bright pixels, in row-major order.
    rows, cols = np.nonzero(image == 1)

    img_name = labels[img_index]
    feat = round(features_df.loc[img_index, feature], 2)

    plt.figure(figsize=(7, 7))
    if rows.size:
        # Regress column on row (X ~ Y).
        Y = rows.reshape(-1, 1)
        LinReg = LinearRegression()
        LinReg.fit(Y, cols)
        x_pred = LinReg.predict(Y)
        plt.plot(x_pred, rows, color='r', linestyle='--', alpha=0.3)
    plt.imshow(image, origin='lower', cmap=color)
    plt.title('{}: {} {}'.format(img_name, feat, feature), color='black')
    plt.show()




In [26]:
# Show one training image together with its slope feature.
img_index = 4920
arr = X_tr_bin
create_combo_images(arr=arr, img_index=img_index, feature='Slope', color='viridis')

In [None]:
#######################################################################################################

In [None]:

# NOTE(review): `y_test` and `y_pred` are not defined in any cell visible in
# this notebook (elsewhere the test labels are `y_te` and the forest output is
# `pred`). Presumably they come from the CNN cells - confirm this cell survives
# Restart & Run All.
cm = confusion_matrix(y_test, y_pred)

# Labelled copy of the matrix. Assumes the label order used by
# confusion_matrix matches `classes` - TODO confirm.
cm_df = pd.DataFrame(cm, columns=list(classes), index=list(classes))

# fmt='d' prints the counts as plain integers; the default format switches to
# scientific notation for counts of 100 or more.
ax = sns.heatmap(cm_df, annot=True, fmt='d', cmap='Purples')

ax.set_title('CNN Model Confusion Matrix')
ax.set_xlabel('Predicted Image Type')
ax.set_ylabel('Actual Image Type')
ax.xaxis.set_ticklabels(list(classes), rotation= 270)
ax.yaxis.set_ticklabels(list(classes), rotation= 360)

plt.show()

In [None]:
#####################################################################################################

In [None]:
# NOTE(review): `ens_train` / `ens_test` are not defined in any cell visible in
# this notebook - confirm they exist on a fresh kernel.
# random_state pins the bootstrap and feature sampling so the accuracy and the
# importance ranking are reproducible between runs.
rfc = RandomForestClassifier(criterion='entropy', n_estimators=105, max_features=9, max_depth=10,
                             random_state=42)
rfc.fit(ens_train, y_tr)

pred = rfc.predict(ens_test)
# Keep and print the score: a bare expression in the middle of a cell is
# evaluated and then thrown away.
rfc_accuracy = accuracy_score(y_te, pred)
print('Random forest test accuracy: {:.4f}'.format(rfc_accuracy))

# Label the bars with the columns the model was actually fitted on when they
# are available; otherwise fall back to the engineered-feature names.
feature_names = np.asarray(getattr(ens_train, 'columns', TRAIN_DF.columns))

plt.figure(figsize= (7,7))
sns.set_style('darkgrid')
sorted_idx = rfc.feature_importances_.argsort()
plt.barh(feature_names[sorted_idx], rfc.feature_importances_[sorted_idx])
plt.xlabel('Random Forest Feature Importance')
plt.show()

In [None]:
#######################################################################################################

In [None]:
# Copy of the training features with the class name of each image attached.
Train_combo = TRAIN_DF.assign(**{'Img Type': y_tr})

In [None]:
sns.set(rc={"figure.figsize":(12,8)})
sns.set_style('darkgrid')

# Image type -> box colour. Keeping both in one mapping guarantees the plotting
# order and the colours cannot drift apart; insertion order is the row order.
slope_box_colors = {
    'narrowband': '#1f77b4',
    'squarepulsednarrowband': '#ff7f0e',
    'narrowbanddrd': '#2ca02c',
    'squiggle': 'yellow',
    'squigglesquarepulsednarrowband': '#8c564b',
    'brightpixel': '#d62728',
    'noise': '#9467bd',
}

sns.boxplot(x='Slope', y='Img Type', data=Train_combo,
            order=list(slope_box_colors),
            showmeans=True, palette=list(slope_box_colors.values()),
            meanprops={'markerfacecolor':'white', 'markeredgecolor':'gray'},
            medianprops=dict(color="white", linewidth=1.5))
plt.title('Slope Magnitude for Each Image Type')
plt.ylabel('Image Types')
plt.xlabel('Slope')

plt.show()

In [None]:
#########################################################################################################

In [27]:
# Split the slope feature into two groups by row position.
# Positional, half-open slices are used on purpose: label slices with .loc
# include BOTH endpoints, which would put rows 800, 1600, 3200 and 4000 into
# two slices each, so some rows are counted twice and some land in both groups.
# NOTE(review): the boundaries presumably correspond to 800 images per class in
# sorted class order (nb = narrowband, spnb = squarepulsednarrowband) - verify
# against the loaded data; selecting rows by their label in `y_tr` would not
# depend on that assumption.
nb = TRAIN_DF['Slope'].iloc[800:1600]
spnb = TRAIN_DF['Slope'].iloc[3200:4000]

a = TRAIN_DF['Slope'].iloc[0:800]
b = TRAIN_DF['Slope'].iloc[1600:3200]
c = TRAIN_DF['Slope'].iloc[4000:]

# Group 1: the two selected ranges; group 2: every remaining row.
data1 = pd.concat([nb, spnb], axis=0).reset_index(drop=True)
data2 = pd.concat([a, b, c], axis=0).reset_index(drop=True)

In [28]:
import scipy.stats

In [29]:
# Independent two-sample t-test comparing the slope values of the two groups.
# NOTE(review): the default assumes equal variances in both groups - confirm
# that is intended for groups of different size.
t = scipy.stats.ttest_ind(a=data1, b=data2)
t