#### Copyright (C) 2022 Sobhan Moradian Daghigh
#### Date: 2/2/2022

### Import Libraries

In [2]:
import numpy as np
import pandas as pd
import os

from sklearn.feature_selection import chi2

In [3]:
# Display labels for the datasets, used by the feature-selection reports below.
# Each label is six characters long (shorter names carry trailing spaces),
# presumably so the printed report columns line up.
datasets_name = ['Amazon', 'IMDB  ', 'Yelp  ']

### Filtering the best features

In [4]:
def filter_features(features, thresh, absolute=False):
    """Return the indices of the features whose score passes a threshold.

    Parameters
    ----------
    features : sequence of numbers
        One score per feature.
    thresh : number
        Minimum score (inclusive) a feature needs in order to be kept.
    absolute : bool, default False
        When True, both the threshold test and the ranking use
        ``abs(score)`` instead of the raw score.

    Returns
    -------
    numpy.ndarray
        Integer indices into ``features`` of the kept features, ordered
        from the highest score to the lowest. Empty when nothing passes.
    """
    scores = np.asarray(features, dtype=float)
    if absolute:
        scores = np.abs(scores)

    keep = scores >= thresh

    # Rank every feature best-first, then drop the ones below the threshold.
    # Selecting through a mask (rather than slicing off a count of zeroed
    # entries with ``[:~n_zeros]``, which removes ``n_zeros + 1`` items)
    # guarantees that exactly the passing features are returned.
    ranking = np.argsort(-scores, kind='stable')
    return ranking[keep[ranking]]

### Odds Ratio FS

In [5]:
def odd_ratio(clfs, datasets_name=datasets_name):
    """Select features by odds ratio for each fitted classifier.

    For every classifier the first row of ``coef_`` is exponentiated and
    the resulting values are passed to ``filter_features`` with a
    threshold of 1. A one-line summary per classifier is printed.

    Parameters
    ----------
    clfs : iterable
        Fitted models exposing a ``coef_`` attribute
        (presumably linear classifiers -- confirm against the caller).
    datasets_name : list of str
        Labels used in the printed report, indexed in step with ``clfs``.

    Returns
    -------
    list
        One array of selected feature indices per classifier.
    """
    report_line = ' |_ {}: ({}, {})'
    print('(Main, Selected) Features size:')

    selected = []
    for position, model in enumerate(clfs):
        # exp(coefficient) turns each weight into a ratio; only the first
        # row of the coefficient matrix is used.
        exponentiated = np.exp(model.coef_)
        ratios = np.array(exponentiated.tolist()[0])

        kept = filter_features(features=ratios, thresh=1)
        selected.append(kept)

        print(report_line.format(datasets_name[position], len(ratios), len(kept)))
    return selected

### Chi-Square FS

In [6]:
def chi_square(x_datasets, y_datasets, datasets_name=datasets_name):
    """Select features by chi-square statistic for each dataset.

    ``chi2`` is run on every (features, labels) pair; the first element
    of its result (the per-feature statistics) is passed to
    ``filter_features`` with a threshold of 0.8. A one-line summary per
    dataset is printed.

    Parameters
    ----------
    x_datasets : iterable
        Feature matrices, one per dataset.
    y_datasets : iterable
        Label vectors, paired with ``x_datasets`` by position.
    datasets_name : list of str
        Labels used in the printed report.

    Returns
    -------
    list
        One array of selected feature indices per dataset.
    """
    report_line = ' |_ {}: ({}, {})'
    print('(Main, Selected) Features size:')

    selected = []
    for position, pair in enumerate(zip(x_datasets, y_datasets)):
        samples, labels = pair
        statistics = chi2(samples, labels)[0]

        kept = filter_features(features=statistics, thresh=0.8)
        selected.append(kept)

        print(report_line.format(datasets_name[position], len(statistics), len(kept)))
    return selected

### Count Difference FS

In [7]:
def count_diff(x_datasets, y_datasets, datasets_name=datasets_name):
    """Select features by the difference of their counts between classes.

    For every dataset each feature's values are summed over the rows
    labelled 1 and the values of the rows labelled 0 are subtracted.
    Rows with any other label are ignored. The absolute differences are
    passed to ``filter_features`` with a threshold of 0.8, and a one-line
    summary per dataset is printed.

    Parameters
    ----------
    x_datasets : iterable
        Document-by-feature matrices, one per dataset (sparse matrices are
        expected; they are never converted to dense form).
    y_datasets : iterable
        Label vectors, paired with ``x_datasets`` by position; each must
        have one label per matrix row.
    datasets_name : list of str
        Labels used in the printed report.

    Returns
    -------
    list
        One array of selected feature indices per dataset.
    """
    tops = []
    print('(Main, Selected) Features size:')
    for i, (x_dset, y_dset) in enumerate(zip(x_datasets, y_datasets)):
        labels = np.asarray(y_dset).ravel()

        # +1 for rows labelled 1, -1 for rows labelled 0, 0 for anything else.
        signs = (labels == 1).astype(int) - (labels == 0).astype(int)

        # One matrix-vector product replaces the row-by-row accumulation over
        # a dense copy of the matrix. It also yields a well-defined all-zero
        # result when no row carries a 0/1 label, instead of failing later.
        features_sum = np.asarray(x_dset.T.dot(signs)).ravel()

        top_cd = filter_features(features=features_sum.tolist(), thresh=0.8, absolute=True)
        tops.append(top_cd)
        print(' |_ {}: ({}, {})'.format(datasets_name[i], x_dset.shape[1], len(top_cd)))
    return tops

### GSS Coefficient FS

In [112]:
def gss_cal(x_dset, vocab, count, index):
    """Compute the GSS score of one term for one side of a two-way split.

    Parameters
    ----------
    x_dset : sequence
        Two objects exposing a ``vocabulary_`` mapping; ``x_dset[index]``
        is the side being scored and ``x_dset[int(not index)]`` the other.
    vocab : hashable
        The term to score.
    count :
        Unused; kept so existing callers keep working.
    index : int
        Which entry of ``x_dset`` to treat as the target side.

    Returns
    -------
    number
        ``(nfck * nf_ck_) - (nfck_ * nf_ck)`` where ``nfck`` / ``nfck_``
        are the term's values on the target / other side and ``nf_ck`` /
        ``nf_ck_`` are the remaining totals of each side.

    NOTE(review): the values stored in ``vocabulary_`` are treated as
    term frequencies here. If these objects are scikit-learn vectorizers,
    ``vocabulary_`` maps terms to column indices instead -- confirm.
    """
    target_vocab = x_dset[index].vocabulary_
    other_vocab = x_dset[int(not(index))].vocabulary_

    # Value of the term on each side (0 when the side never saw the term).
    nfck = target_vocab[vocab] if vocab in target_vocab else 0
    nfck_ = other_vocab[vocab] if vocab in other_vocab else 0

    # Everything else on each side: the side's total minus the term itself.
    nf_ck = sum(target_vocab.values()) - nfck
    nf_ck_ = sum(other_vocab.values()) - nfck_

    return (nfck * nf_ck_) - (nfck_ * nf_ck)

In [115]:
def normalizer(features):
    """Min-max scale ``features`` in place onto the range 0-4.

    Parameters
    ----------
    features : mutable sequence of numbers
        Values to rescale. The sequence is modified in place.

    Returns
    -------
    The same ``features`` object, with the smallest value mapped to 0 and
    the largest to 4. An empty sequence is returned unchanged, and a
    sequence whose values are all equal is mapped to all zeros.
    """
    # min()/max() raise on an empty sequence; there is nothing to scale.
    if len(features) == 0:
        return features

    amin, amax = min(features), max(features)
    span = amax - amin
    for i, val in enumerate(features):
        if span:
            features[i] = ((val - amin) / span) * 4
        else:
            # Constant input has no spread: pin it to the bottom of the
            # range instead of dividing by zero.
            features[i] = 0.0
    return features

In [105]:
def gss(vectorizes, x_datasets, y_datasets, datasets_name=datasets_name):
    """Select features by GSS coefficient for each dataset.

    For every term in a vectorizer's ``vocabulary_`` the GSS score is
    computed for both sides (``index=0`` and ``index=1``) via ``gss_cal``
    and the larger one is kept. The scores are rescaled with
    ``normalizer`` and passed to ``filter_features`` with a threshold of
    1. A one-line summary per dataset is printed.

    Parameters
    ----------
    vectorizes : iterable
        Objects exposing a ``vocabulary_`` mapping, one per dataset.
    x_datasets : iterable
        Per-dataset inputs forwarded to ``gss_cal``.
    y_datasets : iterable
        Paired with the other inputs by position; not otherwise used here.
    datasets_name : list of str
        Labels used in the printed report.

    Returns
    -------
    list
        One array of selected indices per dataset.

    NOTE(review): scores are collected in ``vocabulary_`` iteration order,
    so the returned indices are positions in that order -- confirm this is
    what callers expect.
    """
    report_line = ' |_ {}: ({}, {})'
    print('(Main, Selected) Features size:')

    selected = []
    triples = zip(vectorizes, x_datasets, y_datasets)
    for position, (vector, x_dset, y_dset) in enumerate(triples):
        # Best GSS score of each term across the two sides.
        scores = [
            max(
                gss_cal(x_dset, vocab, count, index=0),
                gss_cal(x_dset, vocab, count, index=1),
            )
            for vocab, count in vector.vocabulary_.items()
        ]

        scaled = normalizer(scores)
        kept = filter_features(features=scaled, thresh=1)
        selected.append(kept)

        print(report_line.format(datasets_name[position], len(vector.vocabulary_), len(kept)))
    return selected