Import Libraries

In [2]:
import pandas as pd
import numpy as np
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelWithLMHead
import torch

NLP Functions

In [7]:
# Simple Sentiment Analysis
def nlp_sentiment_pd (sentences):
    """Run the "sentiment-analysis" pipeline and tabulate the results.

    Args:
        sentences: the input handed unchanged to the pipeline; it is also
            stored in the leading ``sentences`` column of the result.

    Returns:
        A DataFrame built from the pipeline output with a ``sentences``
        column inserted first, ordered by ``score`` from highest to lowest.
        The original row index is kept, so it maps back to the input order.
    """
    classifier = pipeline("sentiment-analysis")
    scored = pd.DataFrame(classifier(sentences))
    # Put the analysed text in front of the columns produced by the pipeline.
    scored.insert(0, 'sentences', sentences)
    return scored.sort_values(by='score', ascending=False)


# Simple Paraphrase Analysis
def nlp_paraphrase_pd(sequence_list, threshold=0.9):
    """Score every pair of sequences for paraphrase and label the outcome.

    All pairs produced by ``pair_list`` are run through the
    "bert-base-cased-finetuned-mrpc" sequence-classification checkpoint. The
    softmax over the two logits is stored as ``not_paraphrase`` (first value)
    and ``is_paraphrase`` (second value).

    Args:
        sequence_list: a list containing at least two sequences.
        threshold: probability a class has to reach before it is used as the
            label; when neither class reaches it the label is 'NOT SURE'.

    Returns:
        A DataFrame with the columns ``sequence_1``, ``sequence_2``,
        ``label``, ``not_paraphrase`` and ``is_paraphrase`` (one row per
        pair), or None when ``sequence_list`` is not a list or holds fewer
        than two items.
    """
    # Check the type before calling len() so that unsized inputs such as None
    # take the documented "return None" path instead of raising TypeError.
    if type(sequence_list) is not list or len(sequence_list) < 2:
        return None

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

    pairs_list = pair_list(sequence_list)
    result_list = []

    # Inference only: disable autograd so no gradient graph is built per pair.
    with torch.no_grad():
        for first, second in pairs_list:
            token = tokenizer(first, second, return_tensors="pt")
            classification_logits = model(**token).logits
            result_list.append(torch.softmax(classification_logits, dim=1).tolist()[0])

    pairs_pd = pd.DataFrame(pairs_list, columns=['sequence_1', 'sequence_2'])
    results_pd = pd.DataFrame(result_list, columns=['not_paraphrase', 'is_paraphrase'])

    # Start from the undecided label, then overwrite rows where a class is
    # confident enough. 'NOT PARAPHRASE' is applied last, so it wins when both
    # classes reach a (low) threshold.
    results_pd['label'] = 'NOT SURE'
    results_pd.loc[results_pd['is_paraphrase'] >= threshold, 'label'] = 'PARAPHRASE'
    results_pd.loc[results_pd['not_paraphrase'] >= threshold, 'label'] = 'NOT PARAPHRASE'

    paraphrases_pd = pd.concat([pairs_pd, results_pd], ignore_index=False, axis=1)
    # Show the verdict right next to the two sequences it refers to.
    return movecol(paraphrases_pd, cols_to_move=['label'], ref_col='sequence_2', place='After')

# Summarize text
def nlp_summarizer_pd(list_text, max_length=200, min_length=100, do_sample=False):
    """Summarize every text in ``list_text`` with the "summarization" pipeline.

    Args:
        list_text: iterable of texts; each one is summarized separately.
        max_length: forwarded to the summarizer as ``max_length``.
        min_length: forwarded to the summarizer as ``min_length``.
        do_sample: forwarded to the summarizer as ``do_sample``.

    Returns:
        A flat list with the summarizer results for all texts, in input
        order. For a single text this is exactly what the summarizer returns
        for it; an empty ``list_text`` gives an empty list.
    """
    summarizer = pipeline("summarization")
    # Collect the results of every text; plain assignment inside the loop
    # would keep only the last one and fail on empty input.
    summary_list = []
    for text in list_text:
        summary_list.extend(
            summarizer(text, max_length=max_length, min_length=min_length, do_sample=do_sample)
        )
    return summary_list
    
# Make sentence pair combinations
def pair_list(sentences):
    """Build every unordered pair of items from ``sentences``.

    Each item is paired with every item that follows it, so input order is
    preserved inside a pair and across the result.

    Args:
        sentences: a list with at least two items.

    Returns:
        A list of two-element lists ``[sentences[i], sentences[j]]`` for all
        ``i < j``, or None when ``sentences`` is not a list or has fewer than
        two items.
    """
    # Check the type before len() so unsized inputs such as None return None
    # rather than raising TypeError.
    if type(sentences) is not list or len(sentences) < 2:
        return None

    count = len(sentences)
    # Start the inner index after the outer one instead of scanning every
    # index and filtering.
    return [
        [sentences[first], sentences[second]]
        for first in range(count)
        for second in range(first + 1, count)
    ]

# Move columns in pd
def movecol(df, cols_to_move=None, ref_col='', place='After'):
    """Return ``df`` with some columns repositioned next to a reference column.

    Args:
        df: DataFrame whose columns are reordered; it is not modified.
        cols_to_move: iterable of column names to move (default: none).
        ref_col: name of the column the moved columns are placed next to.
        place: 'After' puts the moved columns directly after ``ref_col``,
            'Before' puts them directly before it.

    Returns:
        ``df`` indexed with the new column order.

    Raises:
        ValueError: if ``place`` is neither 'After' nor 'Before', or if
            ``ref_col`` is not a column of ``df``.
    """
    if place not in ('After', 'Before'):
        # Without this check the segments below would never be assigned.
        raise ValueError("place must be 'After' or 'Before', got %r" % (place,))

    # Copy into a list: avoids a shared mutable default and lets callers pass
    # any iterable of names.
    moved = list(cols_to_move) if cols_to_move is not None else []

    cols = df.columns.tolist()
    ref_pos = cols.index(ref_col)

    if place == 'After':
        seg1 = cols[:ref_pos + 1]
        seg2 = moved
    else:
        seg1 = cols[:ref_pos]
        seg2 = moved + [ref_col]

    # Drop moved columns from the leading segment so none appears twice.
    seg1 = [name for name in seg1 if name not in seg2]
    seg3 = [name for name in cols if name not in seg1 + seg2]

    return df[seg1 + seg2 + seg3]

In [8]:
# Sample inputs for the sentiment helper; brackets already allow the literal
# to span lines, so no backslash continuations are needed.
sentences = [
    "I hate you",
    "I love you",
    "Chenlei is very bad",
]

print(nlp_sentiment_pd(sentences))

             sentences     label     score
1           I love you  POSITIVE  0.999866
2  Chenlei is very bad  NEGATIVE  0.999804
0           I hate you  NEGATIVE  0.999113


In [None]:
# Sample inputs for the paraphrase helper; every pair of these is compared.
sequence_list = [
    "Helmut is Anna's brother",
    "Anna is Helmut's sister",
    "Helmut is also Anna's uncle",
]
print(nlp_paraphrase_pd(sequence_list))

In [11]:
# Read the text inside a context manager so the file handle is closed as soon
# as the content has been loaded.
with open("./text.txt", "r") as text_file:
    list_text = [text_file.read()]

print (nlp_summarizer_pd(list_text))



[{'summary_text': " The approval of a new drug to treat Alzheimer's is premature, risky and wrong . The FDA admits that it is not proven that the new drug, a monoclonal antibody to be sold under the name Aduhelm, actually works ."}]
