In [1]:
from azure.cognitiveservices.language.textanalytics import TextAnalyticsClient
from msrest.authentication import CognitiveServicesCredentials
import sys
sys.path.append("../../")
from utils_nlp.dataset.bert_sentiment import download_and_load_datasets

import time
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

The set up process below can be found on [here](https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/quickstarts/python-sdk).

## Authenticate the credential

In [2]:
subscription_key = "19a480c69de84e79abb76a9f2fab0f96"
credentials = CognitiveServicesCredentials(subscription_key)

## Create a Text Analytic client

In [3]:
text_analytics_url = "https://eastus.api.cognitive.microsoft.com/"
text_analytics = TextAnalyticsClient(endpoint=text_analytics_url, credentials=credentials)

## Sample document to check connection

In [59]:
documents = [
    {
        "id": "1",
        "language": "en",
        "text": 'Hello it\'s me'
    },
    {
        "id": "2",
        "language": "en",
        "text": "This was a waste of my time. The speaker put me to sleep."
    },
    {
        "id": "3",
        "language": "es",
        "text": 'This is interesting \' right'
    },
    {
        "id": "4",
        "language": "it",
        "text": "a"*5500
    }
]

## Making call to the TA client

In [21]:
response = text_analytics.sentiment(documents=documents)
for document in response.documents:
    print("Document Id: ", document.id, ", Sentiment Score: ",
          "{:.2f}".format(document.score))

Document Id:  1 , Sentiment Score:  0.97
Document Id:  2 , Sentiment Score:  0.11
Document Id:  3 , Sentiment Score:  0.52


## Prepare the test data

In [7]:
_, test_df = download_and_load_datasets()

=====> Begin downloading
=====> Done downloading
=====> Finish extracting
**** Dataset path: C:\Users\ducl\Documents\GitHub\nlp-2\scenarios\text_classification\data\aclImdb
===> Directory: C:\Users\ducl\Documents\GitHub\nlp-2\scenarios\text_classification\data\aclImdb\train


HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))


===> Complete train df
===> Directory: C:\Users\ducl\Documents\GitHub\nlp-2\scenarios\text_classification\data\aclImdb\test


HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))


===> Complete test df


In [46]:
def create_document_from_df_row(row):
    """
    Creating a document from a single row in pandas DataFrame.
    
    Args:
        row (Dataframe row): A single row from pandas dataframe, in the form of tupple (index, data)
    Return:
        A dictionary representation of the row which can be used to make call to the API
    """
    index = str(row[0])
    sentence = row[1]["sentence"]
    
    # Truncate the sentence if it exceed the amount of character allow by TA client
    if len(sentence) > 5120:
        sentence = sentence[:5120]
    
    return {
            "id": index, 
            "language":"en", 
            "text":str(sentence)
    }
    
def get_sentiment_values(rows):
    """
    Method to get the sentiment value for all documents from Text Analytic client
    
    Args:
        rows (DataFrame): A DataFrame of documents
        
    Return:
        results (array): An array with value of 0 (negative) and 1 (positive) relavant to each document in the same order
    """
    
    documents = []
    
    for row in rows.iterrows():
        documents.append(create_document_from_df_row(row))
    
    # Get the sentiment values for all documents        
    response = text_analytics.sentiment(documents=documents)
    
    results = []
    
    for document in response.documents:
        # A score > 0.5 means positive
        if document.score > 0.5:
            results.append(1)
        else:
            results.append(0)
    
    return results

In [47]:
# The amount of documents in each request. There is a limit for number of requests and its size
# https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/overview#data-limits
MAX_PER_REQUEST = 100

documents_size = len(test_df)
results = []

for i in tqdm(range(0, documents_size, MAX_PER_REQUEST)):
    results += get_sentiment_values(test_df[i: i+MAX_PER_REQUEST])
    
    # Pause to make the notebook not overloaded
    time.sleep(1)

print(results)


HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

[0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 




## Write the result to the file

There is a limit of how many requests can we submit, and the process takes a substantial amount of time to run. So we saved the result to a local file for reusable purpose.

In [48]:
with open("benchmark_result.txt", "w") as f:
    f.write(str(results))
    f.close()

In [56]:
with open("benchmark_result_final.txt", "r") as f:
    data = f.read()
    results_TA = eval(data)

## Making the report

In [58]:
LABEL_COL = "polarity"

label_encoder = LabelEncoder()

labels_test = label_encoder.fit_transform(test_df[LABEL_COL])

print(classification_report(labels_test, results_TA, target_names=["negative", "positive"]))

              precision    recall  f1-score   support

    negative       0.74      0.69      0.71     12500
    positive       0.71      0.75      0.73     12500

   micro avg       0.72      0.72      0.72     25000
   macro avg       0.72      0.72      0.72     25000
weighted avg       0.72      0.72      0.72     25000

