#### Copyright (C) 2022 Sobhan Moradian Daghigh
#### Date: 2/2/2022

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import os
from scipy.stats import norm
from sklearn.feature_selection import chi2

In [2]:
# Display labels for the datasets, used as the default row labels in the
# "(Main, Selected) Features size" summaries printed by the selectors below.
# The shorter names carry trailing spaces so every label has the same width.
datasets_name = ['Amazon', 'IMDB  ', 'Yelp  ']

### Filtering the best features

In [3]:
def filter_features(features, thresh, absolute=False):
    """Return the indices of the features whose score reaches ``thresh``.

    Args:
        features: One-dimensional sequence of numeric feature scores.
        thresh: Minimum score a feature needs in order to be kept.
        absolute: When true, the magnitude ``abs(score)`` is compared (and
            ranked) instead of the signed score.

    Returns:
        A NumPy integer array with the positions of the kept features,
        ordered from the highest score to the lowest. It is empty when no
        feature qualifies.
    """
    scores = np.asarray(features, dtype=float)
    if absolute:
        scores = np.abs(scores)

    # Explicit mask instead of overwriting rejected scores with 0 and
    # counting zeros: the old slice `[:~count]` equals `[:-(count + 1)]`
    # and therefore always discarded one feature that had passed.
    keep = scores >= thresh

    # Rank all positions by score (descending), then retain only the
    # positions that passed the threshold, preserving that ranking.
    ranked = np.flip(np.argsort(scores))
    return ranked[keep[ranked]]

### Odds Ratio FS

In [18]:
def odd_ratio(clfs, datasets_name=datasets_name):
    """Select features by the exponentiated coefficients of each classifier.

    For every object in ``clfs`` the first row of ``exp(clf.coef_)`` is
    passed to ``filter_features`` with a threshold of 1, and a summary line
    with the total and selected feature counts is printed.

    Args:
        clfs: Iterable of objects exposing a ``coef_`` attribute
            (presumably fitted linear classifiers -- confirm with caller).
        datasets_name: Labels printed for each entry, indexed by position.

    Returns:
        A list holding, per classifier, the result of ``filter_features``.
    """
    row_template = ' |_ {}: ({}, {})'
    selected = []
    print('(Main, Selected) Features size:')
    for position, classifier in enumerate(clfs):
        exponentiated = np.exp(classifier.coef_)
        # Only the first row of the coefficient matrix is considered.
        first_row = exponentiated.tolist()[0]
        ratios = np.array(first_row)
        best = filter_features(features=ratios, thresh=1)
        selected.append(best)
        print(row_template.format(datasets_name[position], len(ratios), len(best)))
    return selected

### Chi-Square FS

In [21]:
def chi_square(x_datasets, y_datasets, datasets_name=datasets_name):
    """Select features by their chi-square statistic.

    Each ``(x, y)`` pair is handed to ``chi2``; the first element of its
    result is filtered through ``filter_features`` with a threshold of 0.45,
    and a summary line with the total and selected counts is printed.

    Args:
        x_datasets: Iterable of feature matrices, one per dataset.
        y_datasets: Iterable of label vectors, aligned with ``x_datasets``.
        datasets_name: Labels printed for each entry, indexed by position.

    Returns:
        A list holding, per dataset, the result of ``filter_features``.
    """
    row_template = ' |_ {}: ({}, {})'
    selected = []
    print('(Main, Selected) Features size:')
    for position, pair in enumerate(zip(x_datasets, y_datasets)):
        samples, labels = pair
        # chi2 returns a tuple; only its first element is used here.
        statistics = chi2(samples, labels)[0]
        best = filter_features(features=statistics, thresh=0.45)
        selected.append(best)
        print(row_template.format(datasets_name[position], len(statistics), len(best)))
    return selected

### Count Difference FS

In [16]:
def count_diff(x_datasets, y_datasets, datasets_name=datasets_name):
    """Select features by the difference of their totals between two classes.

    For every dataset, each feature column is summed over the rows labelled
    1 and the sum over the rows labelled 0 is subtracted from it. Rows with
    any other label are ignored. Features whose absolute difference is at
    least 0.5 are kept via ``filter_features``, and a summary line with the
    total and selected counts is printed.

    Args:
        x_datasets: Iterable of matrices (rows = samples) supporting
            ``.T.dot(vector)`` and ``.shape`` -- assumed to be SciPy sparse
            matrices, since the previous implementation called ``.todense()``.
        y_datasets: Iterable of label sequences, one label per matrix row.
        datasets_name: Labels printed for each entry, indexed by position.

    Returns:
        A list holding, per dataset, the result of ``filter_features``.

    Raises:
        ValueError: If the number of labels differs from the number of rows
            (the row-by-row version silently truncated to the shorter one).
    """
    tops = []
    print('(Main, Selected) Features size:')
    for i, (x_dset, y_dset) in enumerate(zip(x_datasets, y_datasets)):
        labels = np.asarray(y_dset).ravel()
        # +1 for label 1, -1 for label 0, 0 for anything else.
        weights = (labels == 1).astype(int) - (labels == 0).astype(int)

        # One matrix-vector product replaces densifying the whole matrix
        # and adding/subtracting it row by row. It also yields an all-zero
        # vector (instead of crashing) when no row is labelled 0 or 1.
        features_sum = np.asarray(x_dset.T.dot(weights)).ravel()

        top_cd = filter_features(features=features_sum.tolist(), thresh=0.5, absolute=True)
        tops.append(top_cd)
        print(' |_ {}: ({}, {})'.format(datasets_name[i], x_dset.shape[1], len(top_cd)))
    return tops

### GSS Coefficient FS

In [7]:
def gss_cal(x_dset, vocab, count, index):
    """Compute the GSS-style score of ``vocab`` for the entry at ``index``.

    Args:
        x_dset: Indexable pair of objects, each exposing a ``vocabulary_``
            mapping from term to a number.
        vocab: Term to score.
        count: Unused; accepted for signature compatibility.
        index: Position (0 or 1) of the entry being scored; the other
            entry is taken from position ``int(not index)``.

    Returns:
        ``own * rest_other - other * rest_own``, where ``own``/``other`` are
        the values stored for ``vocab`` in the two mappings (0 if absent)
        and ``rest_*`` is the sum of the mapping's values minus that value.
    """
    own_vocabulary = x_dset[index].vocabulary_
    other_vocabulary = x_dset[int(not(index))].vocabulary_

    # A term missing from a mapping contributes a value of 0.
    own_value = own_vocabulary.get(vocab, 0)
    other_value = other_vocabulary.get(vocab, 0)

    own_rest = sum(own_vocabulary.values()) - own_value
    other_rest = sum(other_vocabulary.values()) - other_value

    return (own_value * other_rest) - (other_value * own_rest)

In [8]:
def normalizer(features, thresh):
    """Rescale ``features`` in place to the range ``[0, thresh]``.

    The smallest value maps to 0 and the largest to ``thresh``. The input
    sequence itself is overwritten and returned.

    Args:
        features: Mutable sequence of numbers (modified in place).
        thresh: Upper bound of the target range.

    Returns:
        The same ``features`` object, now holding the rescaled values.
        An empty sequence is returned unchanged; when every value is
        identical the range is degenerate and all entries become 0.0.
    """
    if len(features) == 0:
        return features

    amin, amax = min(features), max(features)
    span = amax - amin
    if span == 0:
        # All values are equal: avoid dividing by zero.
        features[:] = [0.0] * len(features)
        return features

    # Normalizing the values into 0-thresh
    features[:] = [((val - amin) / span) * thresh for val in features]
    return features

In [9]:
def gss(vectorizes, x_datasets, y_datasets, datasets_name=datasets_name):
    """Select features by their GSS score.

    For every term in a vectorizer's ``vocabulary_`` the score is the
    larger of ``gss_cal(..., index=0)`` and ``gss_cal(..., index=1)``. The
    scores are rescaled to 0-4 with ``normalizer`` and those reaching 1 are
    kept via ``filter_features``. A summary line is printed per dataset.

    Args:
        vectorizes: Iterable of objects exposing a ``vocabulary_`` mapping.
        x_datasets: Iterable of per-dataset inputs forwarded to ``gss_cal``.
        y_datasets: Iterated in lockstep with the others; its items are
            not otherwise used.
        datasets_name: Labels printed for each entry, indexed by position.

    Returns:
        A list holding, per dataset, the result of ``filter_features``.
    """
    row_template = ' |_ {}: ({}, {})'
    selected = []
    print('(Main, Selected) Features size:')
    for position, (vectorizer, class_pair, _labels) in enumerate(
            zip(vectorizes, x_datasets, y_datasets)):
        # Score each term against both entries and keep the larger score.
        scores = [
            max(gss_cal(class_pair, term, value, index=0),
                gss_cal(class_pair, term, value, index=1))
            for term, value in vectorizer.vocabulary_.items()
        ]

        scaled = normalizer(scores, thresh=4)
        best = filter_features(features=scaled, thresh=1)
        selected.append(best)
        print(row_template.format(datasets_name[position], len(vectorizer.vocabulary_), len(best)))
    return selected

### Bi-Normal Separation

In [10]:
def bns_cal(x_dset, vocab, count, y_dset, index):
    """Compute the bi-normal separation of ``vocab`` for the entry at ``index``.

    Args:
        x_dset: Indexable pair of objects, each exposing a ``vocabulary_``
            mapping from term to a number.
        vocab: Term to score.
        count: Unused; accepted for signature compatibility.
        y_dset: Unused; accepted for signature compatibility.
        index: Position (0 or 1) of the entry being scored; the other
            entry is taken from position ``int(not index)``.

    Returns:
        A tuple ``(BNS, nck)`` where ``BNS`` is
        ``norm.ppf(own / own_total) - norm.ppf(other / other_total)`` and
        ``nck`` is the sum of the values of the mapping at ``index``.
    """
    # Stand-in value for a term that is absent from a mapping.
    missing = 10 ** -10

    own_vocabulary = x_dset[index].vocabulary_
    other_vocabulary = x_dset[int(not(index))].vocabulary_

    own_value = own_vocabulary.get(vocab, missing)
    own_total = sum(own_vocabulary.values())

    other_value = other_vocabulary.get(vocab, missing)
    other_total = sum(other_vocabulary.values())

    own_quantile = norm.ppf(np.divide(own_value, own_total))
    other_quantile = norm.ppf(np.divide(other_value, other_total))
    return np.subtract(own_quantile, other_quantile), own_total

In [11]:
def bi_normal_sep(vectorizes, x_datasets, y_datasets, datasets_name=datasets_name,
                  thresholds=(400, 590, 320)):
    """Select features by a count-weighted bi-normal separation score.

    For every term in a vectorizer's ``vocabulary_`` the BNS score is
    computed with ``bns_cal`` for both entries (index 0 and index 1).
    Infinite scores are replaced by the largest finite index-0 score seen
    for that dataset (``+maximum`` for index 0, ``-maximum`` for index 1).
    The two scores are then combined, weighted by the totals returned by
    ``bns_cal`` and divided by the vocabulary size. Features whose absolute
    combined score reaches the dataset's threshold are kept via
    ``filter_features``, and a summary line is printed per dataset.

    Args:
        vectorizes: Iterable of objects exposing a ``vocabulary_`` mapping.
        x_datasets: Iterable of per-dataset inputs forwarded to ``bns_cal``.
        y_datasets: Iterable of per-dataset labels forwarded to ``bns_cal``.
        datasets_name: Labels printed for each entry, indexed by position.
        thresholds: Per-dataset selection thresholds, indexed by position.
            Defaults to the values that were previously hard-coded.

    Returns:
        A list holding, per dataset, the result of ``filter_features``.
    """
    tops = []
    print('(Main, Selected) Features size:')
    for i, (vector, x_dset, y_dset) in enumerate(zip(vectorizes, x_datasets, y_datasets)):
        maximum = 0
        scored = []
        for vocab, count in vector.vocabulary_.items():
            BNSck, counts_ck = bns_cal(x_dset, vocab, count, y_dset, index=0)
            BNSck_, counts_ck_ = bns_cal(x_dset, vocab, count, y_dset, index=1)

            # Track the largest finite score; it later stands in for inf and
            # -inf. (The former list expression only compared the values with
            # `==` and discarded the result, so `maximum` never left 0.)
            if np.isfinite(BNSck) and BNSck > maximum:
                maximum = BNSck

            scored.append((BNSck, BNSck_, counts_ck, counts_ck_))

        vocab_size = len(vector.vocabulary_)
        features = []
        for BNSck, BNSck_, counts_ck, counts_ck_ in scored:
            if np.isinf(BNSck):
                BNSck = maximum
            if np.isinf(BNSck_):
                BNSck_ = -maximum

            weighted = np.add(np.multiply(counts_ck, BNSck), np.multiply(counts_ck_, BNSck_))
            features.append(np.divide(weighted, vocab_size))

        top_bns = filter_features(features=features, thresh=thresholds[i], absolute=True)
        tops.append(top_bns)
        print(' |_ {}: ({}, {})'.format(datasets_name[i], len(vector.vocabulary_), len(top_bns)))
    return tops