# 04_Sentiment Analysis - Cimatebert

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import pipeline

### Define File Paths

In [2]:
fname_sentences  = '../Data/Output/CorpRepSentences.csv'
fname_similarity = '../Data/Output/ResultSimilarity.csv'
fname_result     = '../Data/Output/ResultBoth_Climatebert.csv'

### Read Similarity Score File

In [3]:
df_s = pd.read_csv(fname_similarity)
df_s

Unnamed: 0,doc_id,fname,sentence,goal01,goal02,goal03,goal04,goal05,goal06,goal07,goal08,goal09,goal10,goal11,goal12,goal13,goal14,goal15,goal16,goal17
0,1,Asda_2020.pdf,Our action on sustainability supports the broa...,0.220957,0.249609,0.209004,0.140543,0.302925,0.211519,0.161349,0.215691,0.296752,0.221673,0.320755,0.243053,0.316457,0.310363,0.266566,0.314579,0.301656
1,1,Asda_2020.pdf,"In particular, our efforts are contributing to...",0.247329,0.282171,0.244253,0.187107,0.321928,0.249514,0.175626,0.230185,0.322435,0.231982,0.318060,0.236173,0.324246,0.298038,0.283017,0.303132,0.284681
2,1,Asda_2020.pdf,"For example, our work to tackle food poverty i...",0.239850,0.288944,0.202165,0.158243,0.244996,0.173143,0.131375,0.201546,0.242778,0.165640,0.270407,0.211620,0.294024,0.282431,0.283044,0.311684,0.272534
3,1,Asda_2020.pdf,Our CCFB strategy covers every aspect of our b...,0.157094,0.224665,0.140007,0.108842,0.212423,0.155067,0.143242,0.163626,0.156430,0.154378,0.153755,0.101164,0.180990,0.223748,0.151392,0.173541,0.154607
4,1,Asda_2020.pdf,It also covers International Procurement and L...,0.163866,0.202955,0.155638,0.131375,0.238785,0.139401,0.129099,0.191975,0.146139,0.171683,0.183722,0.090505,0.175276,0.187808,0.142089,0.164518,0.144709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168272,84,Toyota_2023.pdf,Environmental Data [O] Remanufactured and Used...,0.113916,0.153440,0.114095,0.094321,0.159580,0.110153,0.082341,0.105566,0.225363,0.113817,0.245668,0.303303,0.320671,0.328372,0.292591,0.263292,0.249364
168273,84,Toyota_2023.pdf,306-2 Management of significant waste-related ...,0.155386,0.216601,0.160928,0.133781,0.191180,0.150989,0.134371,0.154997,0.261323,0.182886,0.265438,0.285949,0.295571,0.356245,0.336346,0.338445,0.354660
168274,84,Toyota_2023.pdf,407-1 Operations and suppliers in which the ri...,0.241352,0.238156,0.235872,0.244054,0.231353,0.268405,0.244760,0.254164,0.226894,0.233353,0.233371,0.185475,0.189659,0.195256,0.186064,0.204810,0.209217
168275,84,Toyota_2023.pdf,416-1 Assessment of the health and safety impa...,0.180509,0.204682,0.216033,0.113025,0.187275,0.158147,0.194934,0.184066,0.246529,0.253336,0.205350,0.134587,0.155782,0.211357,0.174801,0.196577,0.181328


### Sentiment Analysis (DistilBERT)

In [4]:
# The score is calculated during the sentiment analysis process using the transformer distilbert function
# used to classify the sentiment of sentences.

classifier = pipeline('sentiment-analysis', model='climatebert/environmental-claims')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [5]:
sent_list = df_s['sentence'].tolist()

In [6]:
%%time

CNT = 100
LEN = len(sent_list)

# Ensure that sent_list contains only valid text inputs
valid_types = (str, list)
sent_list = [item for item in sent_list if isinstance(item, valid_types)]


result = []
for i in range(0, LEN, CNT):
    off_e = (i+CNT) if (i+CNT) < LEN else LEN
    
    print(f'sent_list[{i}:{off_e}]')
    # Max length of sentence: 512
    res = classifier(sent_list[i:off_e], truncation=True)
    
    result.extend(res)
        
print('==== End of jobs ====')

sent_list[0:100]
sent_list[100:200]
sent_list[200:300]
sent_list[300:400]
sent_list[400:500]
sent_list[500:600]
sent_list[600:700]
sent_list[700:800]
sent_list[800:900]
sent_list[900:1000]
sent_list[1000:1100]
sent_list[1100:1200]
sent_list[1200:1300]
sent_list[1300:1400]
sent_list[1400:1500]
sent_list[1500:1600]
sent_list[1600:1700]
sent_list[1700:1800]
sent_list[1800:1900]
sent_list[1900:2000]
sent_list[2000:2100]
sent_list[2100:2200]
sent_list[2200:2300]
sent_list[2300:2400]
sent_list[2400:2500]
sent_list[2500:2600]
sent_list[2600:2700]
sent_list[2700:2800]
sent_list[2800:2900]
sent_list[2900:3000]
sent_list[3000:3100]
sent_list[3100:3200]
sent_list[3200:3300]
sent_list[3300:3400]
sent_list[3400:3500]
sent_list[3500:3600]
sent_list[3600:3700]
sent_list[3700:3800]
sent_list[3800:3900]
sent_list[3900:4000]
sent_list[4000:4100]
sent_list[4100:4200]
sent_list[4200:4300]
sent_list[4300:4400]
sent_list[4400:4500]
sent_list[4500:4600]
sent_list[4600:4700]
sent_list[4700:4800]
sent_list[480

In [7]:
result

[{'label': 'yes', 'score': 0.8200068473815918},
 {'label': 'yes', 'score': 0.8701655268669128},
 {'label': 'yes', 'score': 0.9938177466392517},
 {'label': 'no', 'score': 0.9989000558853149},
 {'label': 'no', 'score': 0.9988246560096741},
 {'label': 'no', 'score': 0.9958837628364563},
 {'label': 'no', 'score': 0.7991346716880798},
 {'label': 'yes', 'score': 0.5235083699226379},
 {'label': 'no', 'score': 0.7240632176399231},
 {'label': 'no', 'score': 0.90181565284729},
 {'label': 'no', 'score': 0.8604944944381714},
 {'label': 'no', 'score': 0.9561681151390076},
 {'label': 'no', 'score': 0.9951871633529663},
 {'label': 'no', 'score': 0.8900843262672424},
 {'label': 'yes', 'score': 0.8906427621841431},
 {'label': 'no', 'score': 0.9970049262046814},
 {'label': 'no', 'score': 0.9987640380859375},
 {'label': 'no', 'score': 0.9986807703971863},
 {'label': 'no', 'score': 0.9987189769744873},
 {'label': 'no', 'score': 0.9988440275192261},
 {'label': 'no', 'score': 0.9987934827804565},
 {'label':

### Reverse Negative Scores

The sentences are analyzed in batches (up to 100 sentences at a time) to manage memory and computational efficiency. The model processes each batch and returns a list of dictionaries, where each dictionary contains:

A label indicating whether the sentiment is "POSITIVE" or "NEGATIVE".
A score indicating the confidence of the sentiment classification.

For sentences labeled as "NEGATIVE", the score is adjusted by taking 1 - score to reflect a positive sentiment score scale. This adjustment ensures that higher scores consistently indicate stronger positive sentiment, regardless of the original label.

In [9]:
label = []
score = []
for dic in result:
    if dic['label'] == 'NEGATIVE':
        dic['score'] = 1 - dic['score']
    #else:
        #label.append(dic['label'])
    label.append(dic['label'])
    score.append(dic['score'])

In [10]:
len(label)

168276

In [23]:
label

['yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 '

In [11]:
len(score)

168276

In [12]:
len(df_s)

168277

In [13]:
# Remove the last row from df_s using iloc
df_new = df_s.iloc[:-1]

In [14]:
df_s = df_new

In [15]:
df_s

Unnamed: 0,doc_id,fname,sentence,goal01,goal02,goal03,goal04,goal05,goal06,goal07,goal08,goal09,goal10,goal11,goal12,goal13,goal14,goal15,goal16,goal17
0,1,Asda_2020.pdf,Our action on sustainability supports the broa...,0.220957,0.249609,0.209004,0.140543,0.302925,0.211519,0.161349,0.215691,0.296752,0.221673,0.320755,0.243053,0.316457,0.310363,0.266566,0.314579,0.301656
1,1,Asda_2020.pdf,"In particular, our efforts are contributing to...",0.247329,0.282171,0.244253,0.187107,0.321928,0.249514,0.175626,0.230185,0.322435,0.231982,0.318060,0.236173,0.324246,0.298038,0.283017,0.303132,0.284681
2,1,Asda_2020.pdf,"For example, our work to tackle food poverty i...",0.239850,0.288944,0.202165,0.158243,0.244996,0.173143,0.131375,0.201546,0.242778,0.165640,0.270407,0.211620,0.294024,0.282431,0.283044,0.311684,0.272534
3,1,Asda_2020.pdf,Our CCFB strategy covers every aspect of our b...,0.157094,0.224665,0.140007,0.108842,0.212423,0.155067,0.143242,0.163626,0.156430,0.154378,0.153755,0.101164,0.180990,0.223748,0.151392,0.173541,0.154607
4,1,Asda_2020.pdf,It also covers International Procurement and L...,0.163866,0.202955,0.155638,0.131375,0.238785,0.139401,0.129099,0.191975,0.146139,0.171683,0.183722,0.090505,0.175276,0.187808,0.142089,0.164518,0.144709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168271,84,Toyota_2023.pdf,Policy and Environmental Management > Environm...,0.214285,0.240569,0.191189,0.185037,0.260155,0.234097,0.201038,0.221959,0.313757,0.252605,0.323585,0.276730,0.318597,0.345756,0.309611,0.333787,0.324321
168272,84,Toyota_2023.pdf,Environmental Data [O] Remanufactured and Used...,0.113916,0.153440,0.114095,0.094321,0.159580,0.110153,0.082341,0.105566,0.225363,0.113817,0.245668,0.303303,0.320671,0.328372,0.292591,0.263292,0.249364
168273,84,Toyota_2023.pdf,306-2 Management of significant waste-related ...,0.155386,0.216601,0.160928,0.133781,0.191180,0.150989,0.134371,0.154997,0.261323,0.182886,0.265438,0.285949,0.295571,0.356245,0.336346,0.338445,0.354660
168274,84,Toyota_2023.pdf,407-1 Operations and suppliers in which the ri...,0.241352,0.238156,0.235872,0.244054,0.231353,0.268405,0.244760,0.254164,0.226894,0.233353,0.233371,0.185475,0.189659,0.195256,0.186064,0.204810,0.209217


In [16]:
# NEW Ensure the lengths of the lists match the original DataFrame length
if len(label) != len(df_s) or len(score) != len(df_s):
    raise ValueError(f"Length of results ({len(label)}) does not match length of index ({len(df_s)})")

In [17]:
# Add columns 'label' and 'score'

df_s['label'] = label
df_s['score'] = score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s['label'] = label
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s['score'] = score


In [18]:
df_s['label'] = df_s['label'].str.title()
df_s['score'] = df_s['score'].round(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s['label'] = df_s['label'].str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s['score'] = df_s['score'].round(2)


In [19]:
df_s

Unnamed: 0,doc_id,fname,sentence,goal01,goal02,goal03,goal04,goal05,goal06,goal07,...,goal10,goal11,goal12,goal13,goal14,goal15,goal16,goal17,label,score
0,1,Asda_2020.pdf,Our action on sustainability supports the broa...,0.220957,0.249609,0.209004,0.140543,0.302925,0.211519,0.161349,...,0.221673,0.320755,0.243053,0.316457,0.310363,0.266566,0.314579,0.301656,Yes,0.82
1,1,Asda_2020.pdf,"In particular, our efforts are contributing to...",0.247329,0.282171,0.244253,0.187107,0.321928,0.249514,0.175626,...,0.231982,0.318060,0.236173,0.324246,0.298038,0.283017,0.303132,0.284681,Yes,0.87
2,1,Asda_2020.pdf,"For example, our work to tackle food poverty i...",0.239850,0.288944,0.202165,0.158243,0.244996,0.173143,0.131375,...,0.165640,0.270407,0.211620,0.294024,0.282431,0.283044,0.311684,0.272534,Yes,0.99
3,1,Asda_2020.pdf,Our CCFB strategy covers every aspect of our b...,0.157094,0.224665,0.140007,0.108842,0.212423,0.155067,0.143242,...,0.154378,0.153755,0.101164,0.180990,0.223748,0.151392,0.173541,0.154607,No,1.00
4,1,Asda_2020.pdf,It also covers International Procurement and L...,0.163866,0.202955,0.155638,0.131375,0.238785,0.139401,0.129099,...,0.171683,0.183722,0.090505,0.175276,0.187808,0.142089,0.164518,0.144709,No,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168271,84,Toyota_2023.pdf,Policy and Environmental Management > Environm...,0.214285,0.240569,0.191189,0.185037,0.260155,0.234097,0.201038,...,0.252605,0.323585,0.276730,0.318597,0.345756,0.309611,0.333787,0.324321,No,0.99
168272,84,Toyota_2023.pdf,Environmental Data [O] Remanufactured and Used...,0.113916,0.153440,0.114095,0.094321,0.159580,0.110153,0.082341,...,0.113817,0.245668,0.303303,0.320671,0.328372,0.292591,0.263292,0.249364,No,1.00
168273,84,Toyota_2023.pdf,306-2 Management of significant waste-related ...,0.155386,0.216601,0.160928,0.133781,0.191180,0.150989,0.134371,...,0.182886,0.265438,0.285949,0.295571,0.356245,0.336346,0.338445,0.354660,No,1.00
168274,84,Toyota_2023.pdf,407-1 Operations and suppliers in which the ri...,0.241352,0.238156,0.235872,0.244054,0.231353,0.268405,0.244760,...,0.233353,0.233371,0.185475,0.189659,0.195256,0.186064,0.204810,0.209217,No,1.00


### Save the Result

In [20]:
df_s.to_csv(fname_result, index=False)

In [None]:
# df_s.to_pickle(fname_result)

---

In [None]:
# End of file