# 02_Sentence Similarity

### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sentence_transformers import SentenceTransformer, util

### Define File Paths

In [None]:
# Input: Excel workbook loaded into `df_sdg`.
fname_sdg = 'data/sdg.xlsx'
# Input: CSV of sentences loaded into `df_report`.
fname_sentences = 'data/sentences.csv'
# Pickle path for the similarity results (presumably the notebook's output).
fname_similarity = 'data/result_similarity.pkl'

### Read SDG File & Clean Data

In [None]:
# Load the SDG workbook; later cells read its `sentence` and `goalnum` columns.
df_sdg = pd.read_excel(io=fname_sdg)

In [None]:
# Keep only ASCII punctuation, digits and Latin letters in the SDG sentences.
# Every other character (accents, symbols, tabs, newlines, ...) is swapped for
# a single space, one-for-one, so a run of N such characters yields N spaces.
# The space character itself is not in the keep-list, but replacing a space
# with a space is a no-op.
reg_str = r'[^!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~\\\\0-9a-zA-Z]'
df_sdg['sentence'] = df_sdg['sentence'].str.replace(
    pat=reg_str, repl=' ', regex=True
)

In [None]:
# Display the cleaned SDG table for a visual sanity check.
df_sdg

### Read Report Sentences & Clean Data

In [None]:
# Load the report sentences; later cells read and extend this frame in place.
df_report = pd.read_csv(filepath_or_buffer=fname_sentences)

In [None]:
# Keep only ASCII punctuation, digits and Latin letters in the report
# sentences. Every other character (accents, symbols, tabs, newlines, ...) is
# swapped for a single space, one-for-one. The space character itself is not in
# the keep-list, but replacing a space with a space is a no-op.
reg_str = r'[^!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~\\\\0-9a-zA-Z]'
df_report['sentence'] = df_report['sentence'].str.replace(
    pat=reg_str, repl=' ', regex=True
)

In [None]:
# Display the cleaned report-sentence table for a visual sanity check.
df_report

### Calculate Sentence Similarity Scores

In [None]:
%%time

# Plain Python lists of the sentences to embed:
#   sentences1 -> SDG sentences, sentences2 -> report sentences.
sentences1 = df_sdg['sentence'].tolist()
sentences2 = df_report['sentence'].tolist()

# Sentence-embedding model used for both lists.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode both lists with identical settings; tensors are requested so the
# similarity below can be computed directly on them.
embedding1, embedding2 = (
    model.encode(batch, convert_to_tensor=True)
    for batch in (sentences1, sentences2)
)

# Pairwise cosine similarity between every SDG sentence (first argument) and
# every report sentence (second argument).
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

In [None]:
# Move the score tensor to host memory, copy it into a NumPy array and swap the
# axes so the first axis follows `embedding2` (report sentences) and the second
# follows `embedding1` (SDG sentences).
sim_array = np.transpose(np.array(cosine_scores.cpu()))

In [None]:
# Show the dimensions of the transposed similarity matrix.
sim_array.shape

### Similarity Score for Each Sentence

In [None]:
# Number of SDG sentences per goal number, ordered by goal number.
sr_sdg = (
    df_sdg['goalnum']
    .value_counts()
    .sort_index()
)
sr_sdg

In [None]:
%%time

# Add one column per SDG goal to `df_report`, holding the mean cosine
# similarity between each report sentence and all SDG sentences of that goal.
#
# Assumes `sim_array` has one row per `df_report` row and one column per
# `df_sdg` row, both in DataFrame row order (as produced by the similarity
# cells of this notebook).
#
# Differences from the previous row-by-row implementation:
#   * Columns are selected by comparing each SDG sentence's `goalnum` with the
#     goal key, instead of by running offsets built from the per-goal counts.
#     The offsets were only correct when `df_sdg` was sorted by `goalnum` and
#     had no missing goal numbers; otherwise the wrong columns were averaged.
#   * Rows are handled positionally, so `df_report` no longer needs a default
#     0..n-1 index for `sim_array[idx]` to line up.
#   * The work is one vectorised mean per goal rather than one `.loc` write per
#     (row, goal) pair, so the per-1000-row progress messages are dropped.

# Distinct goal numbers in ascending order (missing values are excluded).
sr_sdg = df_sdg['goalnum'].value_counts().sort_index()

# Goal number of each SDG sentence, aligned with the columns of `sim_array`.
goal_of_column = df_sdg['goalnum'].to_numpy()

for key in sr_sdg.index:
    # Boolean mask picking the similarity columns that belong to this goal.
    in_goal = goal_of_column == key
    # Row-wise mean over those columns; stored as float64 like the original
    # 0.0-initialised columns.
    df_report[key] = sim_array[:, in_goal].mean(axis=1).astype(np.float64)

print('==== End of job ======================')

In [None]:
# Display the report sentences together with the per-goal similarity columns.
df_report

### Pickle the Result

In [None]:
# Persist `df_report` (sentences plus per-goal similarity columns) as a pickle.
# The destination is `fname_similarity` from the "Define File Paths" cell; the
# name `fname_result` used here before is not defined anywhere in the visible
# notebook and raised a NameError.
df_report.to_pickle(fname_similarity)

---

In [None]:
# End of file