In [1]:
# https://lvngd.com/blog/named-entity-recognition-in-python-with-stanford-ner-and-spacy/
# download models to : C:\Users\effbl\stanza_corenlp\stanford-ner-2020-11-17
# model from: https://nlp.stanford.edu/software/CRF-NER.shtml#Download

import nltk
from nltk.tag.stanford import StanfordNERTagger
import os

java_path = r"C:\Program Files\AdoptOpenJDK\jdk-11.0.7.10-hotspot\bin\java.exe"
PATH_TO_JAR= r"C:\Users\effbl\stanza_corenlp\stanford-ner-2020-11-17\stanford-ner.jar"
PATH_TO_MODEL = r"C:\Users\effbl\stanza_corenlp\stanford-ner-2020-11-17\classifiers\english.muc.7class.distsim.crf.ser.gz"

os.environ['JAVAHOME'] = java_path
tagger = StanfordNERTagger(model_filename=PATH_TO_MODEL,path_to_jar=PATH_TO_JAR, encoding='utf-8')

In [2]:
import os
import re
import pandas as pd
from tqdm import tqdm
from copy import copy

graph_folder = r"D:\66 CausalMap\SciLit_CausalMap\visualization\mir"
csv_file_path = os.path.join(graph_folder,'graph_spans.csv')
data = pd.read_csv(csv_file_path)
data

Unnamed: 0,effect,source,support,evidence,cause
0,""" A U . S . facility would need a better U . S...",f-476,1,"""A U.S. facility would need a better U.S. supp...",to get similar to the low cost available in Ch...
1,""" As a result of great uncertainties , Finns s...",f-4849,2,"""As a result of great uncertainties, Finns sho...",a possible power shortage in the coming winter
2,""" Being profitable in India is possible",f-29,1,"""Being profitable in India is possible if you ...",you do things the India way
3,""" But real challenge is bringing in maximum sa...",f-123,1,"""But real challenge is bringing in maximum saf...",cost of vehicles does not go up steeply .
4,""" China ' s high - speed lines are ruthlessly ...",f-5198,1,"""China's high-speed lines are ruthlessly effic...","once booked , a swipe of your ID card or passp..."
...,...,...,...,...,...
6432,” SOURCE : ACEA,f-1804,1,Measures will also need to be taken to stimula...,Measures will also need to be taken to stimula...
6433,” Stopping work at the factory would harm Stel...,f-808,1,We have people in Russia and we love them also...,We have people in Russia and we love them also .
6434,” The slowdown in China is likely to have knoc...,f-4742,1,“This will lock China substantially out of the...,“ This will lock China substantially out of th...
6435,” don ’ t expect to see it on Australian stree...,f-4473,1,While the revealed car is “production ready” d...,VW prioritises markets with stricter emissions...


In [3]:
# classes = ['LOCATION','PERSON','ORGANIZATION','MONEY','PERCENT','DATE','TIME']
DEMO_SIZE = None
REDUCE_SIZE = None #REDUCE_SIZE = int(N_TOPICS/2)
TOP_N_WORDS = 15

In [4]:
if DEMO_SIZE is None:
    N_TOPICS = int(len(data)*2*0.3)
else:
    N_TOPICS = int(DEMO_SIZE*0.3)

if DEMO_SIZE is not None:
    data = data[:int(DEMO_SIZE/2)]

In [5]:
template = {
    'ORGANIZATION': [],
    'LOCATION': [],
    'DATE': [],
    'TIME': [],
    'ACTION': [],
    'OBJECT': []
}

In [6]:
batch_size = 200
n_batches = (len(data)//batch_size)+1
print(n_batches)

33


In [33]:
def get_node_infos(data):
    
    cause_action = []
    effect_action = []
    cause_action_rem = []
    effect_action_rem = []
    evidence_ner = []
    cause_store = []
    effect_store = []

    whitespace = '\s*'
    specials = ['*','+','(',')'] # need to escape special regex chars
    
    for i,row in tqdm(data.iterrows(), total=data.shape[0]):
        cause = row.cause
        effect = row.effect
        cause_rem = cause
        effect_rem = effect
        sentence = row.evidence    
        words = nltk.word_tokenize(sentence) 
        tagged = tagger.tag(words)
        tagged2 = []
        curr_word = ''
        curr_ner = None
        for (word,ner) in tagged:
            if ner==curr_ner: # same, continue
                curr_word+=str(word)
            else: # different
                if curr_ner is not None:
                    tagged2.append((curr_word,curr_ner))
                # reset
                if ner=='O':
                    curr_ner=None
                    curr_word=''
                else:
                    curr_ner=ner
                    curr_word=word

        cause_template = copy(template)
        effect_template = copy(template)
        for (word,ner) in tagged2:
            if ner!='O':
#                 pattern = ''.join([str(c)+whitespace for c in word])
#                 pattern = re.sub('\(','\(',re.sub('\)','\)', pattern)) # escape special parenthesis
                pattern = ''.join(["\\"+str(c)+whitespace if c in specials else str(c)+whitespace for c in word])
                pattern = pattern[:-3]

                match = re.findall(pattern, cause)
                if match is not None:
                    cause_template[ner] =  list(match)
                    cause = re.sub(pattern, f'[{ner}]', cause)
                    cause_rem = re.sub(pattern, '[MASK]', cause_rem)

                match = re.findall(pattern, effect)
                if match is not None:
                    effect_template[ner] =  list(match)
                    effect = re.sub(pattern, f'[{ner}]', effect)
                    effect_rem = re.sub(pattern, '[MASK]', effect_rem)

        pos_tag = nltk.pos_tag(nltk.word_tokenize(re.sub('\[MASK\]','',cause_rem)))
        cause_template['ACTION'] = ' '.join([str(word) for word,tag in pos_tag if tag[:2] in ['VB']])
        cause_template['OBJECT'] = ' '.join([str(word) for word,tag in pos_tag if tag[:2] in ['NN']])

        pos_tag = nltk.pos_tag(nltk.word_tokenize(re.sub('\[MASK\]','',effect_rem)))
        effect_template['ACTION'] = ' '.join([str(word) for word,tag in pos_tag if tag[:2] in ['VB']])
        effect_template['OBJECT'] = ' '.join([str(word) for word,tag in pos_tag if tag[:2] in ['NN']])

        cause_action.append(cause)
        effect_action.append(effect)
        cause_action_rem.append(cause_rem)
        effect_action_rem.append(effect_rem)
        evidence_ner.append(tagged)
        cause_store.append(cause_template)
        effect_store.append(effect_template)

    data['cause_action'] = cause_action
    data['effect_action'] = effect_action
    data['cause_action_rem'] = cause_action_rem
    data['effect_action_rem'] = effect_action_rem
    data['evidence_ner'] = evidence_ner
    data['cause_store'] = cause_store
    data['effect_store'] = effect_store
    
    return data


continue_from = 21
if continue_from is None:
    final = pd.DataFrame()
    loop = range(n_batches)
else:
    loop = range(continue_from,n_batches)
    
for batch_id in loop:
    print(f'batch: {batch_id}')
    subset = get_node_infos(data[batch_id*batch_size:(batch_id+1)*batch_size])
    final = pd.concat([final,subset],axis=0)
    final.to_pickle(f"nodes_extracted_infos.pkl") # backup

batch: 21


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [07:12<00:00,  2.16s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

batch: 22


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [08:17<00:00,  2.49s/it]


batch: 23


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [08:23<00:00,  2.52s/it]


batch: 24


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [08:04<00:00,  2.42s/it]


batch: 25


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [08:21<00:00,  2.51s/it]


batch: 26


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [08:01<00:00,  2.41s/it]


batch: 27


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [07:45<00:00,  2.33s/it]


batch: 28


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [07:20<00:00,  2.20s/it]


batch: 29


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [08:02<00:00,  2.41s/it]


batch: 30


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [07:43<00:00,  2.32s/it]


batch: 31


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [07:54<00:00,  2.37s/it]


batch: 32


100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [01:22<00:00,  2.24s/it]


In [34]:
# ##### For Debugging

# row = data.iloc[batch_id*batch_size+75]

# whitespace = '\s*'
# specials = ['*','+','(',')'] # need to escape special regex chars

# cause = row.cause
# effect = row.effect
# cause_rem = cause
# effect_rem = effect
# sentence = row.evidence

# words = nltk.word_tokenize(sentence) 
# tagged = tagger.tag(words)
# tagged2 = []
# curr_word = ''
# curr_ner = None
# for (word,ner) in tagged:
#     if ner==curr_ner: # same, continue
#         curr_word+=str(word)
#     else: # different
#         if curr_ner is not None:
#             tagged2.append((curr_word,curr_ner))
#         # reset
#         if ner=='O':
#             curr_ner=None
#             curr_word=''
#         else:
#             curr_ner=ner
#             curr_word=word
# print(tagged2)

# cause_template = copy(template)
# effect_template = copy(template)
# for (word,ner) in tagged2:
#     if ner!='O':
#         pattern = ''.join(["\\"+str(c)+whitespace if c in specials else str(c)+whitespace for c in word])
#         pattern = pattern[:-3]

#         match = re.findall(pattern, cause)
#         if match is not None:
#             cause_template[ner] =  list(match)
#             cause = re.sub(pattern, f'[{ner}]', cause)
#             cause_rem = re.sub(pattern, '[MASK]', cause_rem)

#         match = re.findall(pattern, effect)
#         if match is not None:
#             effect_template[ner] =  list(match)
#             effect = re.sub(pattern, f'[{ner}]', effect)
#             effect_rem = re.sub(pattern, '[MASK]', effect_rem)

# pos_tag = nltk.pos_tag(nltk.word_tokenize(re.sub('\[MASK\]','',cause_rem)))
# cause_template['ACTION'] = ' '.join([str(word) for word,tag in pos_tag if tag[:2] in ['VB']])
# cause_template['OBJECT'] = ' '.join([str(word) for word,tag in pos_tag if tag[:2] in ['NN']])

# pos_tag = nltk.pos_tag(nltk.word_tokenize(re.sub('\[MASK\]','',effect_rem)))
# effect_template['ACTION'] = ' '.join([str(word) for word,tag in pos_tag if tag[:2] in ['VB']])
# effect_template['OBJECT'] = ' '.join([str(word) for word,tag in pos_tag if tag[:2] in ['NN']])

In [35]:
data = final
del(final)
# data = pd.read_pickle("nodes_extracted_infos.pkl")

In [36]:
# from collections import Counter

# objects = {}
# for i,d in enumerate(cause_store):
#     obj = d['OBJECT'].lower().split(' ')
#     for o in obj:
#         if o not in objects.keys():
#             objects[o] = [f'c{i}']
#         else:
#             objects[o].append(f'c{i}')
# for i,d in enumerate(effect_store):
#     obj = d['OBJECT'].lower().split(' ')
#     for o in obj:
#         if o not in objects.keys():
#             objects[o] = [f'e{i}']
#         else:
#             objects[o].append(f'e{i}')

# top_objects = []
# cut_off = 2
# indexes_to_keep = []
# for k,v in objects.items():
#     if len(v)>=cut_off and len(k)>1:
#         top_objects.append(k)
#         indexes_to_keep.extend(v)
# cause_indexes_to_keep = [int(i[1:]) for i in indexes_to_keep if i[0]=='c']
# effect_indexes_to_keep = [int(i[1:]) for i in indexes_to_keep if i[0]=='e']
# indexes_to_keep = list(set(cause_indexes_to_keep) & set(effect_indexes_to_keep))
# top_objects

In [37]:
# tmp = data.loc[indexes_to_keep].reset_index(drop=True)
# tmp

In [38]:
# graph_folder = r"D:\66 CausalMap\SciLit_CausalMap\visualization\mir_clust"
# headers = ['cause','effect','source','support','evidence']

# rows = []
# for i,row in tmp.iterrows():
#     if row['cause'][:2].lower()=='to':
#         # flip cause and effect
#         rows.append([
#             str(row.effect_store['ACTION']+' '+row.effect_store['OBJECT']),
#             str(row.cause_store['ACTION']+' '+row.cause_store['OBJECT']),
#             str(row.source),
#             int(row['support']),
#             row['cause']+' --> '+row['effect'] + '/// ' + str(row['cause_store'])+' --> '+str(row['effect_store'])
#         ])
#     else:
#         rows.append([
#             str(row.cause_store['ACTION']+' '+row.cause_store['OBJECT']),
#             str(row.effect_store['ACTION']+' '+row.effect_store['OBJECT']),
#             str(row.source),
#             int(row['support']),
#             row['cause']+' --> '+row['effect'] + '/// ' + str(row['cause_store'])+' --> '+str(row['effect_store'])
#         ])
# graph_df = pd.DataFrame(rows, columns=headers)
# graph_df

In [39]:
# node_df = pd.concat([
#     graph_df[['cause','source']].rename(columns={'cause':'node','source':'sources'}),
#     graph_df[['effect','source']].rename(columns={'effect':'node','source':'sources'})
# ],axis=0).drop_duplicates().reset_index(drop=True)
# node_df

In [40]:
# graph_df.to_csv(os.path.join(graph_folder,'graph.csv'), index=False, encoding='utf-8-sig')
# node_df.to_csv(os.path.join(graph_folder,'node_df.csv'), index=False, encoding='utf-8-sig')

In [41]:
from simcse import SimCSE
from sklearn.cluster import KMeans

model = SimCSE("princeton-nlp/sup-simcse-bert-base-uncased")
sentences = list(data['cause_action_rem'])+list(data['effect_action_rem'])
sentences = [re.sub('[MASK]','',i) for i in sentences]
embeddings = model.encode(sentences)
print(embeddings.shape)

kmeans = KMeans(N_TOPICS)
kmeans_output = kmeans.fit(embeddings)

100%|████████████████████████████████████████████████████████████████████████████████| 202/202 [14:10<00:00,  4.21s/it]


torch.Size([12874, 768])


In [42]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

### 4. Select Topic Words from Clusters
def _preprocess_text(documents):
    """ Basic preprocessing of text
    Steps:
        * Lower text
        * Replace \n and \t with whitespace
        * Only keep alpha-numerical characters
    """
    cleaned_documents = [doc.lower() for doc in documents]
    cleaned_documents = [doc.replace("\n", " ") for doc in cleaned_documents]
    cleaned_documents = [doc.replace("\t", " ") for doc in cleaned_documents]

    return cleaned_documents
    
# _update_topic_size
documents = pd.DataFrame({
    'Topic': list(kmeans_output.labels_),
    'Document': sentences
})
sizes = documents.groupby(['Topic']).count().sort_values("Document", ascending=False).reset_index()
topic_sizes = dict(zip(sizes.Topic, sizes.Document))

# _extract_topics
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})

# _weighting_words
concatenated_documents = _preprocess_text(documents_per_topic.Document.values)
origin_documents = _preprocess_text(documents.Document.values)

# count the words in a cluster
vectorizer_model = CountVectorizer()
vectorizer_model.fit(concatenated_documents)
words = vectorizer_model.get_feature_names()
# k * vocab
X_per_cluster = vectorizer_model.transform(concatenated_documents)
# D * vocab
X_origin = vectorizer_model.transform(origin_documents)



In [43]:
# https://github.com/hyintell/topicx/blob/56b03e5e3bfcdae9ea47d65f82c2a15be0f649a8/baselines/cetopic/tfidf_idfi.py#L8
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd
import scipy.sparse as sp


class TFIDF_IDFi(TfidfTransformer):

    def __init__(self, X_per_cluster, X_origin, all_documents, *args, **kwargs):
        print('====== Using TFIDF_IDFi ======')
        super().__init__(*args, **kwargs)
        self.X_per_cluster = X_per_cluster
        self.X_origin = X_origin
        self.all_documents = all_documents
        
    
    def score(self):
        
        self._global_tfidf = self.fit_transform(self.X_origin)
        
        global_df = pd.DataFrame(self._global_tfidf.toarray())
        global_df['Topic'] = self.all_documents.Topic
        
        avg_global_df = global_df.groupby(['Topic'], as_index=False).mean()
        avg_global_df = avg_global_df.drop('Topic', 1)
        self._avg_global_tfidf = avg_global_df.values
        
        local_tfidf_transformer = TfidfTransformer()
        local_tfidf_transformer.fit_transform(self.X_per_cluster)
        self._idfi = local_tfidf_transformer.idf_
        
        scores = self._avg_global_tfidf * self._idfi
        scores = normalize(scores, axis=1, norm='l1', copy=False)
        scores = sp.csr_matrix(scores)

        return scores 
    

def _top_n_idx_sparse(matrix, n):
    """ Return indices of top n values in each row of a sparse matrix
    Retrieved from:
        https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
    Args:
        matrix: The sparse matrix from which to get the top n indices per row
        n: The number of highest values to extract from each row
    Returns:
        indices: The top n indices per row
    """
    indices = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        values = matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]]
        values = [values[index] if len(values) >= index + 1 else None for index in range(n)]
        indices.append(values)
    return np.array(indices)


def _top_n_values_sparse(matrix, indices):
    """ Return the top n values for each row in a sparse matrix
    Args:
        matrix: The sparse matrix from which to get the top n indices per row
        indices: The top n indices per row
    Returns:
        top_values: The top n scores per row
    """
    top_values = []
    for row, values in enumerate(indices):
        scores = np.array([matrix[row, value] if value is not None else 0 for value in values])
        top_values.append(scores)
    return np.array(top_values)


# self.word_select_method == 'tfidf_idfi'
scores = TFIDF_IDFi(X_per_cluster, X_origin, documents).score()
# _extract_words_per_topic
labels = sorted(list(topic_sizes.keys()))
indices = _top_n_idx_sparse(scores, 30)
scores = _top_n_values_sparse(scores, indices)
sorted_indices = np.argsort(scores, 1)
indices = np.take_along_axis(indices, sorted_indices, axis=1)
scores = np.take_along_axis(scores, sorted_indices, axis=1)
topics = {label: [(words[word_index], score)
                  if word_index and score > 0
                  else ("", 0.00001)
                  for word_index, score in zip(indices[index][::-1], scores[index][::-1])
                  ]
          for index, label in enumerate(labels)}
topics = {label: values[:TOP_N_WORDS] for label, values in topics.items()}

TOP_N_KEYWORDS = 3
topic2keywords = {}

for k,v in topics.items():
    print('\n', k)
    print(v)
    if topic_sizes[k]==1:
        topic2keywords[k] = documents[documents['Topic']==k]['Document'].iloc[0]
    else:
        topic2keywords[k] = '_'.join([i[0] for i in v[:TOP_N_KEYWORDS]])

topic2keywords






 0
[('assets', 0.27795146534190174), ('leveraged', 0.1712575517428647), ('injecting', 0.15509192313045295), ('inject', 0.11766217957318832), ('what', 0.06274442577756019), ('yet', 0.0604710068381538), ('capital', 0.056264913755370405), ('another', 0.05220860051517357), ('into', 0.03442346779004136), ('of', 0.00702269094998379), ('to', 0.004901774585309337), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]

 1
[('827', 0.14094864034454707), ('30', 0.08335332163332057), ('units', 0.0824124170717943), ('drove', 0.08114218964841453), ('77', 0.06875417272593642), ('29', 0.06613785537457002), ('increased', 0.059285881505200116), ('sales', 0.052785131906086004), ('once', 0.05205932764142577), ('again', 0.0482819395263377), ('uvs', 0.0431224837628996), ('volume', 0.042381952905026506), ('sold', 0.03664848698937182), ('growth', 0.02700892513992014), ('million', 0.024111941410944433)]

 2
[('', 1e-05), ('cut', 0.07069405793322787), ('plan', 0.06338584593453145), ('50', 0.03931307962612583), 


 1794
[('ones', 0.2935678903465689), ('leaving', 0.2733517093598413), ('current', 0.18514393492112596), ('as', 0.07762456657463769), ('it', 0.05656004519197307), ('is', 0.05398245266403645), ('the', 0.031002640770758604), ('and', 0.028766760171057986), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]

 1795
[('evokes', 0.1684263848367893), ('gurgaon', 0.1684263848367893), ('nostalgia', 0.1684263848367893), ('surprising', 0.14381520260098113), ('hardly', 0.1253561503818045), ('based', 0.06383996185419617), ('much', 0.06344998218681625), ('plant', 0.05317871113281099), ('it', 0.02008334389735085), ('that', 0.01949328375716271), ('the', 0.005504209678509409), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]

 1796
[('murky', 0.11691310270313092), ('precarious', 0.11220410105226213), ('finances', 0.0998292611235068), ('presents', 0.05708955340688295), ('how', 0.0504066628400085), ('perspective', 0.04791068468837668), ('own', 0.04708558462450300


 2269
[('occasions', 0.1008294798801296), ('replied', 0.1008294798801296), ('curious', 0.1008294798801296), ('ns', 0.1008294798801296), ('necessarily', 0.1008294798801296), ('queries', 0.1008294798801296), ('buyers', 0.04864047232209691), ('least', 0.04862638636851679), ('different', 0.046904746496424315), ('four', 0.04646150626346886), ('potential', 0.043255785289109214), ('such', 0.030275171787245543), ('not', 0.02296736336068436), ('car', 0.021113518422317963), ('electric', 0.019782807671258056)]

 2270
[('shelved', 0.28326080049376473), ('plans', 0.07420647084515224), ('away', 0.053116128655846064), ('crore', 0.05096413229087701), ('completely', 0.050605593622077276), ('rs', 0.04770937463868866), ('worth', 0.04481801277595875), ('projects', 0.044547768664556106), ('combustion', 0.04249385533625614), ('do', 0.03457425042769521), ('has', 0.03407005113558289), ('engine', 0.033306426132792596), ('also', 0.030137239920223432), ('three', 0.030034935975968947), ('now', 0.0262924030287817


 2680
[('left', 0.1508003656393553), ('area', 0.14764678069846976), ('expensive', 0.14165680648814813), ('almost', 0.13014788578110567), ('state', 0.11112163828999774), ('as', 0.0842488074780304), ('such', 0.07728896956492787), ('they', 0.06370584687410712), ('an', 0.04463574548490881), ('for', 0.024971755410840856), ('of', 0.015363320680970386), ('the', 0.008412077609137907), ('', 1e-05), ('', 1e-05), ('', 1e-05)]

 2681
[('clinical', 0.12631779116471523), ('deferral', 0.12631779116471523), ('elective', 0.12631779116471523), ('neurology', 0.12631779116471523), ('surgeries', 0.12631779116471523), ('cardiology', 0.11534443808523602), ('areas', 0.0660453940473384), ('various', 0.061869174082762705), ('across', 0.04608941907029531), ('such', 0.03792832048567189), ('as', 0.020671874580227432), ('and', 0.007660755925301192), ('of', 0.007539302875845497), ('to', 0.005262365023745543), ('', 1e-05)]

 2682
[('blindfolded', 0.17491952558396795), ('tied', 0.15972409114017722), ('thing', 0.12580


 3030
[('automation', 0.04137336098643669), ('digitalization', 0.03416347849296076), ('automated', 0.028030864914689275), ('robotics', 0.0268592514182841), ('increasing', 0.02546117446693331), ('platforms', 0.023475065117408836), ('assistants', 0.02214413648971991), ('virtual', 0.01890834074945497), ('realms', 0.017977121108023412), ('algorithm', 0.01780378094828705), ('forefront', 0.017147044318712508), ('sophistication', 0.016472688548205702), ('highly', 0.016053606435098212), ('drones', 0.01581968253046011), ('robots', 0.01566144747177721)]

 3031
[('printed', 0.30061366723296873), ('dealerships', 0.20700401187341455), ('running', 0.1860516062665269), ('material', 0.16117996576888938), ('out', 0.08122515001742384), ('are', 0.04598341140232921), ('of', 0.01794218743844745), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]

 3032
[('wing', 0.41313411500218616), ('avoid', 0.2124866589229493), ('big', 0.1750890539092701), ('need', 


 3335
[('cheered', 0.30923825651829395), ('narrative', 0.30923825651829395), ('online', 0.16781186209131502), ('any', 0.11599530999952791), ('chinese', 0.0876103436131209), ('the', 0.010105971259448282), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]

 3336
[('equipment', 0.12351816739939155), ('tooling', 0.10620362572066731), ('obtain', 0.10291274776857419), ('welding', 0.09397261425902464), ('upgrades', 0.08998946736042561), ('installation', 0.08656507876609545), ('training', 0.07962577027152015), ('new', 0.06292616238171432), ('due', 0.053169227138843284), ('process', 0.05063432511378864), ('development', 0.04168220704756886), ('support', 0.04123567148480348), ('or', 0.025931666583536427), ('and', 0.014107313166272421), ('to', 0.013419967776658494)]

 3337
[('cjpt', 0.6725334267328865), ('need', 0.22908057847296517), ('will', 0.09838599479414836), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), (


 3654
[('consequence', 0.19009263198064735), ('twice', 0.17357909458185675), ('as', 0.09332583345615686), ('countries', 0.07465659079864856), ('much', 0.0716121415578976), ('cost', 0.06464688218769828), ('many', 0.06164712080358252), ('like', 0.06005996527432701), ('years', 0.059300178287447276), ('models', 0.057508043309465205), ('other', 0.047512355390749564), ('in', 0.021405408537573976), ('for', 0.018441486642112575), ('the', 0.00621226719183645), ('', 1e-05)]

 3655
[('coveted', 0.21605808745456043), ('equoia', 0.19728890492330228), ('cruiser', 0.158286271808759), ('name', 0.1311592010871838), ('land', 0.11668472486289395), ('won', 0.11311391386101419), ('have', 0.03534200441577573), ('that', 0.025006067848947072), ('the', 0.0070608237375635995), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]

 3656
[('cooby', 0.1920343136110346), ('zhng', 0.1920343136110346), ('wrx', 0.16996515709993648), ('ost', 0.14292702606434665), ('owners', 0.13811746763108154

{0: 'assets_leveraged_injecting',
 1: '827_30_units',
 2: '_cut_plan',
 3: 'affordable_displays_superbike',
 4: 'acquire_decarbonised_transaction',
 5: 'challenges_unavoidable_ding',
 6: 'products_namely_array',
 7: 'speeding_shift_zero',
 8: 'hybrid_restraint_cost',
 9: 'being__',
 10: 'chain_supply_disruptions',
 11: 'earlier_fell_oybean',
 12: 'ventures_automakers_chinese',
 13: 'companies are now focussing immensely on talent retention and employee development programmes .',
 14: 'pandemic_measures_comply',
 15: 'obilio_br_sports',
 16: 'growth_market_markets',
 17: 'sales_189_increased',
 18: 'bought_buyers_serve',
 19: 'shortage_caused_by',
 20: 'globally_operate_world',
 21: 'failed_weak_affected',
 22: 'bev_goals_determination',
 23: 'fire_exacerbated_alerted',
 24: 'idle_prompting_joint',
 25: 'investment_sake_addressing',
 26: 'tight_import_hassle',
 27: 'output_gradually_substantially',
 28: 'dvances_architectures_electrical',
 29: 'clients_communications_multipronged',
 30:

In [44]:
topic_id = 10
print(topic2keywords[topic_id])
documents[documents['Topic']==topic_id]

chain_supply_disruptions


Unnamed: 0,Topic,Document
2352,10,potential supply chain disruptions as a result...
3111,10,a few ( unrelated ) supply - chain disruptions .
5502,10,had it not been for lingering supply chain woes .
6058,10,": [] lockdown may disrupt supply chain ,"
6785,10,", and supply chains look unlikely to return to..."
7420,10,", there may be disruptions in the supply chain..."
7687,10,; reports suggest there could be a shortage of...
12622,10,which can lead to supply shortages in case of ...


In [63]:
documents['Keywords'] = documents['Topic'].apply(lambda x: topic2keywords[int(x)])
documents

Unnamed: 0,Topic,Document,Keywords
0,693,to get similar to the low cost available in [] .,cheap_mass_inexpensive
1,1246,a possible power shortage in the coming winter,winter_severely_finns
2,3114,you do things the [] way,sort_things_you
3,832,cost of vehicles does not go up steeply .,steeply_waits_expensive
4,766,"once booked , a swipe of your ID card or passp...","once booked , a swipe of your ID card or passp..."
...,...,...,...
12869,123,” OURCE : CE,ource_arcus_ce
12870,2489,” topping work at the factory would harm tella...,harm_worker_violate
12871,1467,” The slowdown in [] is likely to have knock -...,sharply_slowed_second
12872,3273,” don ’ t expect to see it on ustralian street...,” don ’ t expect to see it on ustralian street...


In [66]:
documents.to_csv(os.path.join(graph_folder, 'node_topics.csv'), index=False, encoding='utf-8-sig')

In [55]:
tmp = copy(data)
tmp['cause_topic'] = list(documents['Topic'])[:int(len(documents)/2)]
tmp['effect_topic'] = list(documents['Topic'])[int(len(documents)/2):]
tmp

Unnamed: 0,effect,source,support,evidence,cause,cause_action,effect_action,cause_action_rem,effect_action_rem,evidence_ner,cause_store,effect_store,cause_topic,effect_topic
0,""" A U . S . facility would need a better U . S...",f-476,1,"""A U.S. facility would need a better U.S. supp...",to get similar to the low cost available in Ch...,to get similar to the low cost available in [L...,""" A [LOCATION] facility would need a better [L...",to get similar to the low cost available in [M...,""" A [MASK] facility would need a better [MASK]...","[(``, O), (A, O), (U.S., LOCATION), (facility,...","{'ORGANIZATION': [], 'LOCATION': ['China'], 'D...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...",693,216
1,""" As a result of great uncertainties , Finns s...",f-4849,2,"""As a result of great uncertainties, Finns sho...",a possible power shortage in the coming winter,a possible power shortage in the coming winter,""" As a result of great uncertainties , Finns s...",a possible power shortage in the coming winter,""" As a result of great uncertainties , Finns s...","[(``, O), (As, O), (a, O), (result, O), (of, O...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...",1246,1246
2,""" Being profitable in India is possible",f-29,1,"""Being profitable in India is possible if you ...",you do things the India way,you do things the [LOCATION] way,""" Being profitable in [LOCATION] is possible",you do things the [MASK] way,""" Being profitable in [MASK] is possible","[(``, O), (Being, O), (profitable, O), (in, O)...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...",3114,2090
3,""" But real challenge is bringing in maximum sa...",f-123,1,"""But real challenge is bringing in maximum saf...",cost of vehicles does not go up steeply .,cost of vehicles does not go up steeply .,""" But real challenge is bringing in maximum sa...",cost of vehicles does not go up steeply .,""" But real challenge is bringing in maximum sa...","[(``, O), (But, O), (real, O), (challenge, O),...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...",832,710
4,""" China ' s high - speed lines are ruthlessly ...",f-5198,1,"""China's high-speed lines are ruthlessly effic...","once booked , a swipe of your ID card or passp...","once booked , a swipe of your ID card or passp...",""" [LOCATION] ' s high - speed lines are ruthle...","once booked , a swipe of your ID card or passp...",""" [MASK] ' s high - speed lines are ruthlessly...","[(``, O), (China, LOCATION), ('s, O), (high-sp...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...","{'ORGANIZATION': [], 'LOCATION': ['China'], 'D...",766,2779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6432,” SOURCE : ACEA,f-1804,1,Measures will also need to be taken to stimula...,Measures will also need to be taken to stimula...,Measures will also need to be taken to stimula...,” SOURCE : ACEA,Measures will also need to be taken to stimula...,” SOURCE : ACEA,"[(Measures, O), (will, O), (also, O), (need, O...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...",419,123
6433,” Stopping work at the factory would harm Stel...,f-808,1,We have people in Russia and we love them also...,We have people in Russia and we love them also .,We have people in [LOCATION] and we love them ...,” Stopping work at the factory would harm Stel...,We have people in [MASK] and we love them also .,” Stopping work at the factory would harm Stel...,"[(We, O), (have, O), (people, O), (in, O), (Ru...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...",1438,2489
6434,” The slowdown in China is likely to have knoc...,f-4742,1,“This will lock China substantially out of the...,“ This will lock China substantially out of th...,“ This will lock [LOCATION] substantially out ...,” The slowdown in [LOCATION] is likely to have...,“ This will lock [MASK] substantially out of t...,” The slowdown in [MASK] is likely to have kno...,"[(“, O), (This, O), (will, O), (lock, O), (Chi...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...",3126,1467
6435,” don ’ t expect to see it on Australian stree...,f-4473,1,While the revealed car is “production ready” d...,VW prioritises markets with stricter emissions...,[ORGANIZATION] prioritises markets with strict...,” don ’ t expect to see it on Australian stree...,[MASK] prioritises markets with stricter emiss...,” don ’ t expect to see it on Australian stree...,"[(While, O), (the, O), (revealed, O), (car, O)...","{'ORGANIZATION': ['VW'], 'LOCATION': [], 'DATE...","{'ORGANIZATION': [], 'LOCATION': [], 'DATE': [...",56,3273


In [56]:
graph_folder = r"D:\66 CausalMap\SciLit_CausalMap\visualization\mir_nerclust"
headers = ['cause','effect','source','support','evidence']

rows = []
for i,row in tmp.iterrows():
    rows.append([
        str(int(row.cause_topic))+'>>'+topic2keywords[int(row.cause_topic)],
        str(int(row.effect_topic))+'>>'+topic2keywords[int(row.effect_topic)],
        str(row.source),
        int(row['support']),
        str(row.cause)+' --> '+ str(row.effect) + ';' + str(row.evidence)
    ])
graph_df = pd.DataFrame(rows, columns=headers)

# show only reliable info
graph_df[(graph_df['support']>1) | ((graph_df['cause'].str.contains('>>', na=True)) & (graph_df['effect'].str.contains('>>', na=True)))]

Unnamed: 0,cause,effect,source,support,evidence
0,693>>cheap_mass_inexpensive,216>>chain_supply_infrastructure,f-476,1,to get similar to the low cost available in Ch...
1,1246>>winter_severely_finns,1246>>winter_severely_finns,f-4849,2,a possible power shortage in the coming winter...
2,3114>>sort_things_you,2090>>boost_profitability_profit,f-29,1,"you do things the India way --> "" Being profit..."
3,832>>steeply_waits_expensive,710>>safety_improves_service,f-123,1,cost of vehicles does not go up steeply . --> ...
4,"766>>once booked , a swipe of your ID card or ...",2779>>rail_speed_high,f-5198,1,"once booked , a swipe of your ID card or passp..."
...,...,...,...,...,...
6432,419>>demand_meet_user,123>>ource_arcus_ce,f-1804,1,Measures will also need to be taken to stimula...
6433,1438>>friends_love_we,2489>>harm_worker_violate,f-808,1,We have people in Russia and we love them also...
6434,3126>>2024_break_substantially,1467>>sharply_slowed_second,f-4742,1,“ This will lock China substantially out of th...
6435,56>>regulations_emissions_stringent,3273>>” don ’ t expect to see it on ustralian ...,f-4473,1,VW prioritises markets with stricter emissions...


In [57]:
node_df = pd.concat([
    graph_df[['cause','source']].rename(columns={'cause':'node','source':'sources'}),
    graph_df[['effect','source']].rename(columns={'effect':'node','source':'sources'})
],axis=0).drop_duplicates().reset_index(drop=True)
node_df

Unnamed: 0,node,sources
0,693>>cheap_mass_inexpensive,f-476
1,1246>>winter_severely_finns,f-4849
2,3114>>sort_things_you,f-29
3,832>>steeply_waits_expensive,f-123
4,"766>>once booked , a swipe of your ID card or ...",f-5198
...,...,...
12272,123>>ource_arcus_ce,f-1804
12273,2489>>harm_worker_violate,f-808
12274,1467>>sharply_slowed_second,f-4742
12275,3273>>” don ’ t expect to see it on ustralian ...,f-4473


In [58]:
graph_df.to_csv(os.path.join(graph_folder,'graph.csv'), index=False, encoding='utf-8-sig')
node_df.to_csv(os.path.join(graph_folder,'node_df.csv'), index=False, encoding='utf-8-sig')