<a href="https://colab.research.google.com/github/luixmartins/natural-language-processing/blob/main/features_selection_for_tdbert/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keybert -q
!pip install git+https://github.com/ivanfilhoreis/tf_bert.git -q
!pip install yake -q

In [None]:
import pandas as pd 
import numpy as np 
from keybert import KeyBERT 
from bertVectorizer import bertVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from scipy.spatial.distance import cosine
import yake 

In [None]:
class Classifiers:
    """Cross-validate a fixed panel of scikit-learn classifiers on one feature matrix.

    The panel is MLPClassifier, KNeighborsClassifier, SVC and GaussianNB, each
    built with library defaults (optionally seeding the MLP).

    Parameters
    ----------
    X_values :
        Feature matrix handed unchanged to ``cross_val_score``.
    y_values :
        Target labels handed unchanged to ``cross_val_score``.
    representation :
        Free-form label for the feature representation. It is stored on the
        instance but not read by any method of this class.
    kfolds : int, default 10
        Value forwarded as ``cv`` to ``cross_val_score``.
    random_state : int or None, default None
        Seed forwarded to ``MLPClassifier`` so repeated runs give the same
        scores. ``None`` keeps the library's unseeded behaviour.
    """

    def __init__(self, X_values, y_values, representation, kfolds=10, random_state=None) -> None:
        self.X = X_values
        self.y = y_values
        self.representation = representation
        self.kfold = kfolds
        self.random_state = random_state

    def cossine_distance(self, x, y):
        """Return the cosine distance between ``x`` and ``y``.

        Cosine distance is undefined when either vector is all zeros (the
        norm in the denominator is 0). Those cases, and any other NaN result,
        are reported as ``1.0``.
        """
        # Guard before calling scipy: dividing by a zero norm would otherwise
        # emit a RuntimeWarning (or raise under np.errstate(invalid='raise')).
        if not np.any(x) or not np.any(y):
            return 1.0

        distance = cosine(x, y)

        # Still possible when the inputs themselves contain NaN.
        if np.isnan(distance):
            return 1.0

        return distance

    def classifier_models(self):
        """Return new, unfitted instances of every classifier to evaluate."""
        return [
            MLPClassifier(random_state=self.random_state),
            KNeighborsClassifier(),
            SVC(),
            GaussianNB(),
        ]

    def make_prediction(self):
        """Cross-validate every classifier and summarise its accuracy.

        Returns
        -------
        pandas.DataFrame
            Indexed by ``model_name`` with the columns ``'Mean Accuracy'`` and
            ``'Standard Deviation'`` computed over the per-fold accuracies.
        """
        entries = []

        for model in self.classifier_models():
            name = model.__class__.__name__
            accuracies = cross_val_score(model, self.X, self.y, scoring='accuracy', cv=self.kfold)
            entries.extend(
                (name, fold_index, accuracy)
                for fold_index, accuracy in enumerate(accuracies)
            )

        df_validation = pd.DataFrame(entries, columns=['model_name', 'fold_index', 'accuracy'])

        # One grouped aggregation yields both statistics, already aligned on model_name.
        df = df_validation.groupby('model_name')['accuracy'].agg(['mean', 'std'])
        df.columns = ['Mean Accuracy', 'Standard Deviation']

        return df

In [None]:
# Source CSV lives under the Colab Google Drive mount point.
# NOTE(review): no cell shown here mounts Drive - confirm it is mounted before running.
path = "/content/drive/MyDrive/datasets/sentiment_analyze_data.csv"

# Only the review text and its label column are read.
dataset = pd.read_csv(
    path,
    usecols=["review", "positive"],
)

# One KeyBERT instance (library-default model) shared by the extraction cells below.
keybert = KeyBERT()

## BIGRAMAS

In [None]:
# Candidate features: the top-ranked KeyBERT bigram of every review (duplicates collapse in the set).
features = set()

for text in dataset['review'].values:
  keywords = keybert.extract_keywords(text, keyphrase_ngram_range=(2, 2), stop_words="english")

  # extract_keywords may return an empty list (e.g. a review too short to form
  # a bigram once stop words are removed); skip it instead of failing on [0][0].
  if keywords:
    features.add(keywords[0][0])

In [None]:
# Build the vectorizer with the KeyBERT bigrams as its candidate terms,
# then fit it on the reviews and transform them in one step.
td_bertbase = bertVectorizer(
    candidates=list(features),
    bert_model='bert-base-multilingual-cased',
)
X_tdbert = td_bertbase.fit_transform(dataset['review'])

In [None]:
# Cross-validate the classifier panel on the KeyBERT-bigram representation.
td_clf = Classifiers(
    X_tdbert,
    dataset['positive'].values,
    'TD-Bert Bert Base',
    kfolds=10,
)
results_tdbert = td_clf.make_prediction()

# Bare last expression so the notebook renders the summary table.
results_tdbert

Unnamed: 0_level_0,Mean Accuracy,Standard Deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
GaussianNB,0.592,0.049844
KNeighborsClassifier,0.665,0.038658
MLPClassifier,0.706,0.062752
SVC,0.757,0.045228


In [None]:
# YAKE configuration for bigram extraction.
language = "en"
max_ngram_size = 2
deduplication_thresold = 0.9
deduplication_algo = 'seqm'
windowSize = 1
numOfKeywords = 10

custom_kw_extractor = yake.KeywordExtractor(
  lan=language,
  n=max_ngram_size,
  dedupLim=deduplication_thresold,
  dedupFunc=deduplication_algo,
  windowsSize=windowSize,
  top=numOfKeywords,
  features=None,
)
features_yake = set()

for text in dataset['review'].values:
  keywords = custom_kw_extractor.extract_keywords(text)

  # YAKE scores are "lower is more relevant", so rank ascending. The previous
  # descending sort selected the least relevant multi-word keyword instead.
  sorted_list = sorted(keywords, key=lambda t: t[1])

  # Keep only the best keyword that is an actual n-gram (more than one token);
  # reviews with no such keyword contribute nothing.
  for item in sorted_list:
    if len(item[0].split()) > 1:
      features_yake.add(item[0])
      break

In [None]:
# Same vectorizer setup as before, this time with the YAKE bigrams as candidate terms.
td_bert = bertVectorizer(
    candidates=list(features_yake),
    bert_model='bert-base-multilingual-cased',
)
X = td_bert.fit_transform(dataset['review'])

In [None]:
# Cross-validate the classifier panel on the YAKE-bigram representation.
clf = Classifiers(
    X,
    dataset['positive'].values,
    'TD-Bert Bert Base',
    kfolds=10,
)
results = clf.make_prediction()

# Bare last expression so the notebook renders the summary table.
results

Unnamed: 0_level_0,Mean Accuracy,Standard Deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
GaussianNB,0.627,0.041913
KNeighborsClassifier,0.643,0.073341
MLPClassifier,0.716,0.077632
SVC,0.739,0.050431


## TRIGRAMAS 

In [None]:
# Candidate features: the top-ranked KeyBERT trigram of every review (duplicates collapse in the set).
features = set()

for text in dataset['review'].values:
  keywords = keybert.extract_keywords(text, keyphrase_ngram_range=(3, 3), stop_words="english")

  # extract_keywords may return an empty list (e.g. a review too short to form
  # a trigram once stop words are removed); skip it instead of failing on [0][0].
  if keywords:
    features.add(keywords[0][0])

In [None]:
# Vectorize the reviews using the KeyBERT trigrams as candidate terms.
td_bertbase = bertVectorizer(
    candidates=list(features),
    bert_model='bert-base-multilingual-cased',
)
X_tdbert = td_bertbase.fit_transform(dataset['review'])

# Cross-validate the classifier panel on that representation.
td_clf = Classifiers(
    X_tdbert,
    dataset['positive'].values,
    'TD-Bert Bert Base',
    kfolds=10,
)
results_tdbert = td_clf.make_prediction()

# Bare last expression so the notebook renders the summary table.
results_tdbert

Unnamed: 0_level_0,Mean Accuracy,Standard Deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
GaussianNB,0.601,0.054047
KNeighborsClassifier,0.668,0.033599
MLPClassifier,0.695,0.050607
SVC,0.761,0.040675


In [None]:
# YAKE configuration for trigram extraction (remaining options use the library defaults).
language = "en"
max_ngram_size = 3
numOfKeywords = 20

custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, top=numOfKeywords)
features_yake = set()

for text in dataset['review'].values:
  keywords = custom_kw_extractor.extract_keywords(text)

  # YAKE scores are "lower is more relevant", so rank ascending. The previous
  # descending sort selected the least relevant three-word keyword instead.
  sorted_list = sorted(keywords, key=lambda t: t[1])

  # Keep only the best keyword with more than two tokens;
  # reviews with no such keyword contribute nothing.
  for item in sorted_list:
    if len(item[0].split()) > 2:
      features_yake.add(item[0])
      break

In [None]:
# Vectorize the reviews using the YAKE trigrams as candidate terms.
td_bert = bertVectorizer(
    candidates=list(features_yake),
    bert_model='bert-base-multilingual-cased',
)
X = td_bert.fit_transform(dataset['review'])

# Cross-validate the classifier panel on that representation.
clf = Classifiers(
    X,
    dataset['positive'].values,
    'TD-Bert Bert Base',
    kfolds=10,
)
results = clf.make_prediction()

# Bare last expression so the notebook renders the summary table.
results

Unnamed: 0_level_0,Mean Accuracy,Standard Deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
GaussianNB,0.644,0.03134
KNeighborsClassifier,0.68,0.053955
MLPClassifier,0.691,0.049542
SVC,0.749,0.051737
