In [1]:
WHALE = '/kaggle/input/moby-dicktext/whale2.txt'

# Read the whole file and squeeze every run of whitespace (newlines
# included) down to a single space, leaving one long line of text.
with open(WHALE, encoding='utf-8') as input_fp:
    text = ' '.join(input_fp.read().split())

print(len(text))

1211194


Our text is now one long string. Let's break it into sentences.

In [2]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from plotly.express import histogram
# Split the flattened text into sentences with NLTK's English tokenizer and
# keep them in a one-column frame.
sentences = sent_tokenize(text=text, language='english')
sentences_df = pd.DataFrame({'text': sentences})

# "token count" here is the number of whitespace-separated words per sentence.
word_lists = sentences_df['text'].str.split()
sentences_df['token count'] = word_lists.str.len()

# Distribution of sentence lengths, drawn with a log-scaled y axis.
histogram(data_frame=sentences_df, x='token count', log_y=True)

In this case we have a few sentences that are too long for one of the models, so we need to do something about them. Since they are only a few sentences out of several hundred, we can either trim them and hope for the best or drop them; neither choice will make much difference to the final result.

In [3]:
# Outlier clean-up to keep the sentiment pipeline below happy: a sentence of
# 120 or more words is cut down to its first 120 words (re-joined with single
# spaces); anything shorter is kept exactly as it is.
def _clip_sentence(sentence):
    words = sentence.split()
    if len(words) < 120:
        return sentence
    return ' '.join(words[:120])


sentences_df['clean'] = sentences_df['text'].map(_clip_sentence)

Now we are ready to do sentiment analysis.

In [4]:
import pandas as pd
from arrow import now
from plotly.express import line_polar
from plotly.graph_objects import Figure
from transformers import pipeline

# expect this to take several minutes per model
def plot(input_df: pd.DataFrame, column: str, model: str) -> Figure:
    """Classify every row of a text column and chart the summed scores per label.

    Builds a ``sentiment-analysis`` pipeline for ``model``, runs it over all
    values of ``input_df[column]``, adds up the returned scores for each
    predicted label, and draws the totals on a log-scaled polar line chart.
    The elapsed time (pipeline construction included) is printed.

    Args:
        input_df: Frame holding the texts to classify.
        column: Name of the column in ``input_df`` that contains the texts.
        model: Model identifier handed to ``transformers.pipeline``.

    Returns:
        The polar line figure; the caller decides whether to ``.show()`` it.
    """
    time_start = now()
    pipe = pipeline(task='sentiment-analysis', model=model)
    texts = input_df[column].tolist()
    # Ask the tokenizer to truncate over-long inputs instead of relying only
    # on the caller's word-count trimming: a word count is not a model token
    # count, and an input past the model's maximum length can make the
    # forward pass fail.
    predictions = pipe(texts, truncation=True)
    # One prediction per text; total the scores of each predicted label.
    result_df = pd.DataFrame(predictions).groupby(by='label').sum().reset_index()
    print(f'sentiment analysis with {model} done in {now() - time_start}')
    return line_polar(
        data_frame=result_df,
        r='score',
        theta='label',
        line_close=True,
        # Name the model in the title so figures from different models can
        # be told apart.
        title=f'Moby Dick sentiment: {model}',
        log_r=True,
    )


# Hugging Face model identifiers to compare; each one is run over the
# trimmed sentences and its chart is displayed in turn.
MODELS = (
    'bhadresh-savani/distilbert-base-uncased-emotion',
    'cardiffnlp/twitter-roberta-base-sentiment',
    'nlptown/bert-base-multilingual-uncased-sentiment',
)

for model in MODELS:
    figure = plot(input_df=sentences_df, column='clean', model=model)
    figure.show()


config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

sentiment analysis with bhadresh-savani/distilbert-base-uncased-emotion done in 0:07:50.660599


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

sentiment analysis with cardiffnlp/twitter-roberta-base-sentiment done in 0:15:08.504530


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

sentiment analysis with nlptown/bert-base-multilingual-uncased-sentiment done in 0:15:23.577027
