# 03_Sentence Similarity

### Import Libraries

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openpyxl
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

### Define File Paths

In [14]:
fname_sdg           = '../Data/Output/sdg.xlsx'
fname_sentences     = '../Data/Output/CorpRepSentences.csv'
fname_similarity    = '../Data/Output/ResultSimilarity.csv'

### Read SDG File & Clean Data

In [15]:
df_sdg = pd.read_excel(fname_sdg)

In [16]:
# Replace characters that are not on the keyboard with a space.
reg_str = r'[^!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~\\\\0-9a-zA-Z]'
df_sdg['sentence'] = df_sdg['sentence'].str.replace(reg_str,' ', regex=True)

In [17]:
df_sdg

Unnamed: 0,gpnum,gpname,goalnum,sentence
0,gp01,Life,goal01,End poverty in all its forms everywhere
1,gp01,Life,goal01,"Despite progress under the MDGs, approximately..."
2,gp01,Life,goal01,"Over the past decade, markets in developing co..."
3,gp01,Life,goal01,Certain groups are disproportionately represen...
4,gp01,Life,goal01,"These include women, persons with disabilities..."
...,...,...,...,...
636,gp06,Environments,goal15,15.7 Take urgent action to end poaching and tr...
637,gp06,Environments,goal15,"15.8 By 2020, introduce measures to prevent th..."
638,gp06,Environments,goal15,"15.9 By 2020, integrate ecosystems and biodive..."
639,gp06,Environments,goal15,15.a Mobilize and significantly increase from ...


### Read Report Sentences & Clean Data

In [18]:
df_report = pd.read_csv(fname_sentences)

In [19]:
# Replace characters that are not on the keyboard with a space.
reg_str = r'[^!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~\\\\0-9a-zA-Z]'
df_report['sentence'] = df_report['sentence'].str.replace(reg_str,' ', regex=True)

In [20]:
df_report

Unnamed: 0,doc_id,fname,sentence
0,1,Asda_2020.pdf,Our action on sustainability supports the broa...
1,1,Asda_2020.pdf,"In particular, our efforts are contributing to..."
2,1,Asda_2020.pdf,"For example, our work to tackle food poverty i..."
3,1,Asda_2020.pdf,Our CCFB strategy covers every aspect of our b...
4,1,Asda_2020.pdf,It also covers International Procurement and L...
...,...,...,...
168272,84,Toyota_2023.pdf,Environmental Data [O] Remanufactured and Used...
168273,84,Toyota_2023.pdf,306-2 Management of significant waste-related ...
168274,84,Toyota_2023.pdf,407-1 Operations and suppliers in which the ri...
168275,84,Toyota_2023.pdf,416-1 Assessment of the health and safety impa...


### Calculate Sentence Similarity Scores

In [21]:
%%time

# Tokenise sentences
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create lists of sentences
sentences1 = df_sdg['sentence'].tolist()
sentences2 = df_report['sentence'].tolist()

#Compute embeddings for both lists
embedding1 = model.encode(sentences1, convert_to_tensor=True)
embedding2 = model.encode(sentences2, convert_to_tensor=True)

#Compute cosine-similarity
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

CPU times: user 14min 16s, sys: 17.8 s, total: 14min 34s
Wall time: 16min 14s


In [22]:
cosine_scores

tensor([[0.3225, 0.3395, 0.3843,  ..., 0.0955, 0.0643, 0.1310],
        [0.1070, 0.2133, 0.2176,  ..., 0.0906, 0.0382, 0.0502],
        [0.3374, 0.3510, 0.2929,  ..., 0.1191, 0.0249, 0.0884],
        ...,
        [0.4042, 0.4519, 0.4156,  ..., 0.2157, 0.1781, 0.3476],
        [0.4758, 0.4257, 0.3900,  ..., 0.2258, 0.2201, 0.2563],
        [0.3984, 0.3435, 0.3967,  ..., 0.3061, 0.2030, 0.2432]],
       device='mps:0')

In [23]:
sim_array = np.array(cosine_scores.cpu()).transpose()

In [24]:
sim_array.shape

(168277, 641)

### Similarity Score for Each Sentence

In [25]:
sr_sdg = df_sdg['goalnum'].value_counts().sort_index()
sr_sdg

goalnum
goal01    40
goal02    38
goal03    43
goal04    37
goal05    43
goal06    45
goal07    29
goal08    42
goal09    29
goal10    44
goal11    35
goal12    41
goal13    30
goal14    40
goal15    44
goal16    40
goal17    21
Name: count, dtype: int64

In [26]:
%%time

sr_sdg = df_sdg['goalnum'].value_counts().sort_index()

# Initialize
for key in sr_sdg.index:
    df_report[key] = 0.0

for idx in df_report.index:
    off_b = 0
    off_e = 0
    
    for key in sr_sdg.index:
        off_b = off_e
        off_e += sr_sdg[key]
        df_report.loc[idx,key] = sim_array[idx][off_b:off_e].mean()
        
    if idx % 1000 == 0:
        print('<cnt:{}, idx:{}> ======================'.format(idx//1000,idx))
        
print('==== End of job ======================')

CPU times: user 2min 39s, sys: 377 ms, total: 2min 39s
Wall time: 2min 40s


In [27]:
df_report

Unnamed: 0,doc_id,fname,sentence,goal01,goal02,goal03,goal04,goal05,goal06,goal07,goal08,goal09,goal10,goal11,goal12,goal13,goal14,goal15,goal16,goal17
0,1,Asda_2020.pdf,Our action on sustainability supports the broa...,0.220957,0.249609,0.209004,0.140543,0.302925,0.211519,0.161349,0.215691,0.296752,0.221673,0.320755,0.243053,0.316457,0.310363,0.266566,0.314579,0.301656
1,1,Asda_2020.pdf,"In particular, our efforts are contributing to...",0.247329,0.282171,0.244253,0.187107,0.321928,0.249514,0.175626,0.230185,0.322435,0.231982,0.318060,0.236173,0.324246,0.298038,0.283017,0.303132,0.284681
2,1,Asda_2020.pdf,"For example, our work to tackle food poverty i...",0.239850,0.288944,0.202165,0.158243,0.244996,0.173143,0.131375,0.201546,0.242778,0.165640,0.270407,0.211620,0.294024,0.282431,0.283044,0.311684,0.272534
3,1,Asda_2020.pdf,Our CCFB strategy covers every aspect of our b...,0.157094,0.224665,0.140007,0.108842,0.212423,0.155067,0.143242,0.163626,0.156430,0.154378,0.153755,0.101164,0.180990,0.223748,0.151392,0.173541,0.154607
4,1,Asda_2020.pdf,It also covers International Procurement and L...,0.163866,0.202955,0.155638,0.131375,0.238785,0.139401,0.129099,0.191975,0.146139,0.171683,0.183722,0.090505,0.175276,0.187808,0.142089,0.164518,0.144709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168272,84,Toyota_2023.pdf,Environmental Data [O] Remanufactured and Used...,0.113916,0.153440,0.114095,0.094321,0.159580,0.110153,0.082341,0.105566,0.225363,0.113817,0.245668,0.303303,0.320671,0.328372,0.292591,0.263292,0.249364
168273,84,Toyota_2023.pdf,306-2 Management of significant waste-related ...,0.155386,0.216601,0.160928,0.133781,0.191180,0.150989,0.134371,0.154997,0.261323,0.182886,0.265438,0.285949,0.295571,0.356245,0.336346,0.338445,0.354660
168274,84,Toyota_2023.pdf,407-1 Operations and suppliers in which the ri...,0.241352,0.238156,0.235872,0.244054,0.231353,0.268405,0.244760,0.254164,0.226894,0.233353,0.233371,0.185475,0.189659,0.195256,0.186064,0.204810,0.209217
168275,84,Toyota_2023.pdf,416-1 Assessment of the health and safety impa...,0.180509,0.204682,0.216033,0.113025,0.187275,0.158147,0.194934,0.184066,0.246529,0.253336,0.205350,0.134587,0.155782,0.211357,0.174801,0.196577,0.181328


### Saving the Result

In [28]:
# Let's save the result into csv format

df_report.to_csv(fname_similarity, index=False)

In [29]:
# Let's save the result into pickle format (a binary format for serialising Python objects):
fname_similarity_pkl    = '../Data/Output/ResultSimilarity.pkl'
df_report.to_pickle(fname_similarity_pkl)

---

In [None]:
# End of file