# 01_Document Similarity Analysis

### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
from sentence_transformers import SentenceTransformer, util

### Define File Paths

In [None]:
# Input workbooks and output pickle locations (paths relative to the working directory).
fname_abcd      = 'data/abcd_keywords.xlsx'   # keyword workbook, read with a two-row header
fname_news      = 'data/news.xlsx'            # news workbook; sheets 'South Korea' and 'India' are read
fname_result_ko = 'data/result_ko.pkl'        # scored South Korea sentences are pickled here
fname_result_in = 'data/result_in.pkl'        # scored India sentences are pickled here

### Read ABCD Keywords

In [None]:
# header=[0, 1] turns the first two spreadsheet rows into a two-level column index.
df_abcd = pd.read_excel(fname_abcd, header=[0, 1])

In [None]:
df_abcd

In [None]:
# melt() with no arguments unpivots every column into long form. The later cells
# rely on two of the resulting columns: 'variable_1' (second header level, used
# as the category label) and 'value' (the cell contents, used as keyword text).
df_abcd_m = df_abcd.melt()
df_abcd_m

### Read Articles & Clean Data

In [None]:
# Load the articles stored on the 'South Korea' sheet of the news workbook.
df_news_ko = pd.read_excel(fname_news, sheet_name='South Korea')

In [None]:
# Replace every character that is not one of the ASCII letters, digits or
# punctuation marks listed in the negated class below with a single space.
# Whitespace (newlines, tabs) and non-ASCII characters are outside the class,
# so they are all normalised to a plain space as well.
reg_str = r'[^!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~\\\\0-9a-zA-Z]'
df_news_ko['Contents'] = df_news_ko['Contents'].str.replace(reg_str,' ', regex=True)

In [None]:
# Load the articles stored on the 'India' sheet of the news workbook.
df_news_in = pd.read_excel(fname_news, sheet_name='India')

In [None]:
# Replace every character that is not one of the ASCII letters, digits or
# punctuation marks listed in the negated class below with a single space.
# Whitespace (newlines, tabs) and non-ASCII characters are outside the class,
# so they are all normalised to a plain space as well.
reg_str = r'[^!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~\\\\0-9a-zA-Z]'
df_news_in['Contents'] = df_news_in['Contents'].str.replace(reg_str,' ', regex=True)

### Define Function: Generate Document

In [None]:
def gen_document(doc_id, Title, URL, Date_Created, Summary, sent_lst):
    """Build the per-sentence DataFrame for one article.

    ``sent_lst`` supplies the ``Contents`` column; the remaining arguments
    fill the ``doc_id``, ``Title``, ``URL``, ``Date Created`` and ``Summary``
    columns (scalar values are repeated on every row by pandas).
    """
    # Column labels and their values, kept in the output column order.
    labels = ['doc_id', 'Title', 'URL', 'Date Created', 'Summary', 'Contents']
    values = [doc_id, Title, URL, Date_Created, Summary, sent_lst]

    return pd.DataFrame(dict(zip(labels, values)))

In [None]:
def gen_DataFrame(df):
    """Expand a table of articles into one row per sentence.

    Each article's ``Contents`` is split with ``sent_tokenize`` and expanded
    by ``gen_document``; the article's 0-based position in ``df`` becomes its
    ``doc_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Article table exposing ``Title``, ``URL``, ``Summary`` and
        ``Contents`` columns. The creation date is read from ``row._3``,
        the positional name ``itertuples`` gives the third column when its
        label is not a valid identifier (presumably 'Date Created' --
        TODO confirm against the workbook).

    Returns
    -------
    pandas.DataFrame
        All per-sentence frames stacked under a fresh 0..n-1 index, or an
        empty DataFrame when ``df`` has no rows.
    """
    # Collect the per-article frames first and concatenate once: calling
    # pd.concat inside the loop re-copies the accumulated result each time.
    frames = [
        gen_document(doc_id, row.Title, row.URL, row._3, row.Summary,
                     sent_tokenize(row.Contents))
        for doc_id, row in enumerate(df.itertuples())
    ]

    if not frames:
        # pd.concat raises on an empty list, so handle "no articles" explicitly.
        return pd.DataFrame()

    # ignore_index=True replaces the repeated per-article indexes with 0..n-1.
    return pd.concat(frames, ignore_index=True)

In [None]:
# One row per sentence of the South Korea articles; doc_id is the article's
# position in df_news_ko.
df_ko = gen_DataFrame(df_news_ko)
df_ko

In [None]:
# One row per sentence of the India articles; doc_id is the article's
# position in df_news_in.
df_in = gen_DataFrame(df_news_in)
df_in

### Define Function: Document Similarity Calculation

In [None]:
def get_similarity(df_con):
    """Score every sentence in ``df_con`` against every ABCD keyword.

    Keywords come from the module-level ``df_abcd_m['value']`` and sentences
    from ``df_con['Contents']``. Both lists are embedded with the
    'all-MiniLM-L6-v2' sentence-transformer model.

    Returns a NumPy array of cosine similarities with one row per sentence
    and one column per keyword.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')

    # Texts to compare: keyword phrases versus article sentences
    keywords  = df_abcd_m['value'].tolist()
    sentences = df_con['Contents'].tolist()

    # Embed both lists as tensors
    keyword_emb  = encoder.encode(keywords, convert_to_tensor=True)
    sentence_emb = encoder.encode(sentences, convert_to_tensor=True)

    # Pairwise cosine similarities, laid out keywords x sentences
    scores = util.pytorch_cos_sim(keyword_emb, sentence_emb)

    # Bring the tensor to the CPU and flip it to sentences x keywords
    return np.array(scores.cpu()).T

In [None]:
%%time
# Similarity matrix for the South Korea sentences (rows) against the ABCD keywords (columns).
sim_array_ko = get_similarity(df_ko)

In [None]:
%%time
# Similarity matrix for the India sentences (rows) against the ABCD keywords (columns).
sim_array_in = get_similarity(df_in)

### Document Similarity Score

In [None]:
def set_score(sim_array, df_con):
    """Add one mean-similarity column per ABCD category to ``df_con``.

    For each category, the columns of ``sim_array`` that belong to it are
    averaged row by row. Column ``j`` of ``sim_array`` is matched to row
    ``j`` of the module-level ``df_abcd_m``, whose ``variable_1`` value is
    the category label.

    Parameters
    ----------
    sim_array : numpy.ndarray
        2-D similarity matrix as returned by ``get_similarity``: one row per
        row of ``df_con`` (matched by position) and one column per row of
        ``df_abcd_m``.
    df_con : pandas.DataFrame
        Sentence table. It is modified in place (eight float columns are
        added or overwritten) and also returned.

    Returns
    -------
    pandas.DataFrame
        ``df_con`` itself, with the score columns attached.

    Raises
    ------
    ValueError
        If one of the expected categories has no keyword in ``df_abcd_m``.
    """
    categories = ['Speed', 'Precision',
                  'Learning', 'Best Practice',
                  'Mix', 'Synergy',
                  'Diligence', 'Goal']

    scores = np.asarray(sim_array)
    labels = df_abcd_m['variable_1'].to_numpy()

    for key in categories:
        # Select this category's keyword columns by label rather than by
        # running offsets, so the result does not depend on the order in
        # which the categories appear in df_abcd_m.
        in_category = labels == key
        if not in_category.any():
            raise ValueError(
                "category {!r} has no keywords in df_abcd_m".format(key))

        # One vectorised mean per category instead of a scalar .loc write per
        # row; assignment is positional, so df_con's index labels are never
        # used to address sim_array. Cast keeps the columns float64.
        df_con[key] = scores[:, in_category].mean(axis=1).astype(float)

    print('==== End of job ======================')

    return df_con

In [None]:
%%time
# Attach the eight per-category mean-similarity columns; set_score modifies
# the frame it receives and returns that same frame.
df_ko = set_score(sim_array_ko, df_ko)

In [None]:
%%time
# Attach the eight per-category mean-similarity columns; set_score modifies
# the frame it receives and returns that same frame.
df_in = set_score(sim_array_in, df_in)

### Pickle the Result

In [None]:
# Persist the scored sentence tables to the pickle paths defined at the top of the notebook.
df_ko.to_pickle(fname_result_ko)
df_in.to_pickle(fname_result_in)

---

In [None]:
# End of file