# Task1

In [5]:
import pandas as pd
from difflib import SequenceMatcher
import re
import os
from nltk import sent_tokenize

# Load datasets: the annotation workbook, the MIR table and the full report table.
annotated_df = pd.read_excel(
    r"D:\66 CausalMap\Panasonic-IDS\data\MarketIntelligenceReport_Annotation_Tasks.xlsx",
    engine='openpyxl',
    sheet_name='MarketIntelligenceReport_Task1',
)
data = pd.read_csv(r"D:\66 CausalMap\Panasonic-IDS\data\MIR.csv")
df = pd.read_csv(r"D:\66 CausalMap\Panasonic-IDS\data\MarketIntelligenceReport.csv")

# Remember every row's position in the full report before looking for duplicates.
df['row_id'] = df.index
cols = ['Translated']

# m1: True for every repeat of a 'Translated' value except its first occurrence.
m1 = df.duplicated(cols)
# m2: True for every member of a duplicated group, the first occurrence included.
m2 = df.duplicated(cols, keep=False)

# Attach to each row the row_id of the first row carrying the same 'Translated' text.
first_rows = df.loc[~m1, cols+['row_id']]
matched = df[cols].merge(first_rows, how='left')

# For later duplicates (m1 True) keep that first row_id as a string; every other
# row receives its boolean m2 flag instead, so the column mixes strings and booleans.
df['first_occurrence'] = matched['row_id'].astype(str).where(m1, m2)

# Get conversions
# Map every row_id present in `data` to its position in `data`.
rowid2docid = {v:k for k,v in enumerate(data['row_id'])}

def get_doc_id(row_id):
    """Translate a row_id of the full report into a position in `data`.

    When `row_id` is not a key of `rowid2docid`, the row is looked up in `df`
    and replaced by its `first_occurrence` (the row_id of the first row with
    the same 'Translated' text). The conversion is printed.

    Raises:
        KeyError: if `row_id` is unknown to both tables, if the row has no
            earlier duplicate to fall back on, or if the fallback row_id is
            itself missing from `rowid2docid`.
    """
    if row_id not in rowid2docid:
        old = row_id
        matches = df.loc[df['row_id'] == row_id, 'first_occurrence'].values
        if len(matches) == 0:
            raise KeyError(f'row_id {old} not found in the original report.')
        # `first_occurrence` holds a boolean flag (not a row_id) for rows that are
        # not later duplicates; int(False)/int(True) would silently turn into
        # row 0 or 1, so only purely numeric values are accepted.
        first_occurrence = str(matches[0])
        if not first_occurrence.isdigit():
            raise KeyError(f'row_id {old} has no earlier duplicate to convert to.')
        row_id = int(first_occurrence)
        print(f'convert {old}-->{row_id}')
    return rowid2docid[row_id]

In [6]:
# verify annotations row_id tallies with original
def similar(a, b):
    """Return the difflib similarity ratio (0.0 to 1.0) between sequences `a` and `b`."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [7]:
def find_ref_sent_and_sent_id(original, annotated, verbose=False, threshold=0.90):
    """Locate the annotated sentence inside the original document text.

    The first line of `annotated` containing both a <CAUSE> and an <EFFECT>
    tag is taken as the annotated sentence. Its tag-free form is then matched
    twice against `original`: once against the newline-separated lines, and
    once against the NLTK sentences obtained after inserting a space behind
    periods that are glued to the following text.

    Args:
        original: full text of the source document.
        annotated: annotated text, one candidate sentence per line.
        verbose: when True, print the matched line id, line and annotation.
        threshold: similarity ratio (see `similar`) a candidate must exceed.

    Returns:
        (sent_id, ref_sent, reparsed_sent_id, reparsed_ref_sent, annotated_sent)
        where the first pair refers to the newline split and the second pair to
        the NLTK sentence split of `original`.

    Raises:
        ValueError: if no annotated line exists, or if no sufficiently similar
            line / sentence is found in `original`.
    """
    # First line that carries both annotation tags.
    annotated_sent = None
    for line in annotated.split('\n'):
        if ('<CAUSE>' in line) and ('<EFFECT>' in line):
            annotated_sent = line
            break
    if annotated_sent is None:
        raise ValueError('No annotations found.')

    # Strip the four tags to recover the plain sentence.
    sent = annotated_sent
    for tag in ('<CAUSE>', '</CAUSE>', '<EFFECT>', '</EFFECT>'):
        sent = sent.replace(tag, '')

    # Pass 1: match against the newline-separated lines of the original.
    for sent_id, ref_sent in enumerate(original.split('\n')):
        if similar(ref_sent, sent) > threshold:
            break
    else:
        print(original)
        print(annotated_sent)
        raise ValueError('Similar original sentence not found.')

    # Pass 2: match against NLTK sentences. A period not followed by whitespace,
    # a digit or the end of the string gets a space appended first, so that
    # sentences glued together ("end.Next") are split apart.
    glued_period = r'\.(?!\s|\d|$)'
    reparsed_candidates = sent_tokenize(re.sub(glued_period, '. ', sent))
    if not reparsed_candidates:
        # An empty annotated sentence tokenizes to nothing; fail with the same
        # error as an unmatched sentence instead of an IndexError.
        print(original)
        print(annotated_sent)
        raise ValueError('Similar original reparsed sentence not found.')
    reparsed_sent = reparsed_candidates[0]

    reparsed_original = sent_tokenize(re.sub(glued_period, '. ', original))
    for reparsed_sent_id, reparsed_ref_sent in enumerate(reparsed_original):
        if similar(reparsed_ref_sent, reparsed_sent) > threshold:
            break
    else:
        print(original)
        print(annotated_sent)
        raise ValueError('Similar original reparsed sentence not found.')

    if verbose:
        print(sent_id)
        print(ref_sent)
        print(annotated_sent)

    return sent_id, ref_sent, reparsed_sent_id, reparsed_ref_sent, annotated_sent


# Use the first annotated row as a worked example.
i, row = next(annotated_df.iterrows())
original = data.loc[get_doc_id(row.row_id)]['Text']
annotated = row.annotations

find_ref_sent_and_sent_id(original, annotated, verbose=False)

(1,
 'From to Uber and , many foreign companies have had to launch low-price versions of their products in a bid to make inroads in the Indian market.\r',
 1,
 'From to Uber and , many foreign companies have had to launch low-price versions of their products in a bid to make inroads in the Indian market.',
 '<EFFECT> From to Uber and , many foreign companies have had to launch low-price versions of their products </EFFECT> in a bid <CAUSE> to make inroads in the Indian market</CAUSE>.')

In [8]:
# Deletion table for remove_weird_chars: straight quotes, the hyphen, and a set
# of non-ASCII characters (presumably fragments of mis-decoded curly quotes and
# dashes -- TODO confirm against the raw data).
_WEIRD_CHARS_TABLE = str.maketrans('', '', '"\'' + 'â€”-—™’\x9dœ')


def remove_weird_chars(t):
    """Return `t` with every quote, hyphen and listed non-ASCII character removed.

    Note that genuine hyphens are removed as well ("so-called" -> "socalled").
    A single translate pass replaces the former three regex substitutions; the
    set of removed characters is unchanged.
    """
    return t.translate(_WEIRD_CHARS_TABLE)

In [9]:
def find_substring_in_text(text, substring):
    """Find `substring` in `text`, ignoring spaces and "weird" characters.

    Both strings are compared after dropping spaces (' ') and every character
    that `remove_weird_chars` deletes, so an annotation whose spacing or
    quoting differs from the source text is still located.

    Args:
        text: the text to search in.
        substring: the span to look for.

    Returns:
        (start, end) such that text[start:end] covers the match; `end` is
        exclusive. `start` may sit on ignored characters directly in front of
        the first matched character.

    Raises:
        ValueError: if `substring` does not occur in `text`.
    """
    start = 0
    end = len(text)

    needle = remove_weird_chars(re.sub(' ', '', substring))

    # Pass 1: greedy left-to-right scan. Kept unchanged so that every span it
    # already found is reported with exactly the same boundaries.
    current_string = ''
    found = False
    for i, c in enumerate(text):
        c = remove_weird_chars(c)
        if c == ' ':
            continue
        current_string += c

        n = len(current_string)
        if current_string == needle[:n]:
            if n == len(needle):
                found = True
                end = i + 1  # exclusive end: one past the last matched character
                break
        else:
            # Mismatch: restart the candidate at the current character.
            current_string = c
            start = i

    if not found:
        # Pass 2: the greedy scan never backtracks, so it misses a match that
        # begins inside a failed partial match (e.g. "aab" in "aaab"). Search
        # the normalised text exactly and map the hit back to `text` positions.
        kept_chars = []
        kept_positions = []
        for i, c in enumerate(text):
            if c == ' ':
                continue
            c = remove_weird_chars(c)
            if c:
                kept_chars.append(c)
                kept_positions.append(i)

        hit = ''.join(kept_chars).find(needle) if needle else -1
        if hit < 0:
            raise ValueError(f'Substring not found in text: {substring!r}')
        start = kept_positions[hit]
        end = kept_positions[hit + len(needle) - 1] + 1

    return start, end

In [6]:
# reformat annotations to ensure same as 'ref'/'original'
def reformat_annotations(text, annotated):
    """Re-apply the cause/effect annotation of `annotated` onto `text`.

    The <CAUSE> and <EFFECT> spans of `annotated` are located in `text` with
    `find_substring_in_text` and wrapped in <ARG0>...</ARG0> (cause) and
    <ARG1>...</ARG1> (effect).

    All four tags are inserted in a single left-to-right pass over the
    original offsets, so the result is also correct when the spans touch,
    overlap or nest (a fixed post-insertion offset is only right for disjoint
    spans).

    Args:
        text: the reference sentence to tag.
        annotated: the same sentence carrying <CAUSE> and <EFFECT> tags.

    Returns:
        `text` with the ARG0/ARG1 tags inserted.

    Raises:
        ValueError: if `annotated` lacks a cause or an effect span, or if a
            span cannot be found in `text`.
    """
    cause_match = re.search(r'<CAUSE>(.*?)</CAUSE>', annotated)
    effect_match = re.search(r'<EFFECT>(.*?)</EFFECT>', annotated)
    if cause_match is None or effect_match is None:
        raise ValueError(f'Cause/Effect annotation not found: \n{annotated}')

    cause = cause_match.group(1).strip()
    effect = effect_match.group(1).strip()

    cause_start, cause_end = find_substring_in_text(text, cause)
    effect_start, effect_end = find_substring_in_text(text, effect)

    # Sort key: (position, closers before openers, nesting order, span rank).
    # At one position a closing tag precedes an opening tag (adjacent spans);
    # of two openers the longer span opens first, and of two closers the span
    # that started later closes first, so nested spans stay well formed.
    insertions = sorted([
        (cause_start, 1, -cause_end, 0, '<ARG0>'),
        (cause_end, 0, -cause_start, 0, '</ARG0>'),
        (effect_start, 1, -effect_end, 1, '<ARG1>'),
        (effect_end, 0, -effect_start, -1, '</ARG1>'),
    ])

    pieces = []
    cursor = 0
    for position, _, _, _, tag in insertions:
        pieces.append(text[cursor:position])
        pieces.append(tag)
        cursor = position
    pieces.append(text[cursor:])

    return ''.join(pieces)


# Worked example: locate the annotated sentence, then re-tag the reference text.
located = find_ref_sent_and_sent_id(original, annotated, verbose=False)
sent_id, text, reparsed_sent_id, reparsed_text, annotated_sent = located
reformat_annotations(text, annotated_sent)

'<ARG1>From to Uber and , many foreign companies have had to launch low-price versions of their products</ARG1> in a bid <ARG0>to make inroads in the Indian market</ARG0>.\r'

In [7]:
from nltk.tokenize import sent_tokenize

In [8]:
# Build one reference record per annotated row.
cols = ['corpus','doc_id','sent_id','eg_id','index','text','text_w_pairs','seq_label','pair_label','context','num_sents']
ref = []
prev_id = None
eg_id = 0

for i,row in annotated_df.iterrows():

    doc_id = get_doc_id(row.row_id)
    original = data.loc[doc_id]['Text']
    annotated = row.annotations

    # sanity check: the annotated text must closely resemble the source document
    if similar(original, annotated) <= 0.89:
        raise ValueError(original,'\n',annotated)

    located = find_ref_sent_and_sent_id(original, annotated, verbose=False)
    sent_id, text, reparsed_sent_id, reparsed_text, annotated_sent = located
    text_w_pairs = reformat_annotations(text, annotated_sent)

    # Consecutive rows pointing at the same sentence get increasing example ids.
    curr_id = f'MIR_{doc_id}_{reparsed_sent_id}'
    eg_id = eg_id + 1 if curr_id == prev_id else 0
    prev_id = curr_id

    if not ('<ARG0>' in text_w_pairs and '<ARG1>' in text_w_pairs):
        raise ValueError(f'Cause/Effect Arg not found: \n{text_w_pairs}')

    ref.append({
        'corpus': 'MIR',
        'doc_id': doc_id,
        'sent_id': reparsed_sent_id,
        'eg_id': eg_id,
        'index': f'{curr_id}_{eg_id}',
        'text': text,
        'text_w_pairs': text_w_pairs,
        'seq_label': 1,
        'pair_label': 1,
        'context': '',
        'num_sents': 1,
    })

ref = pd.DataFrame(ref, columns=cols)
ref

convert 235-->145
convert 235-->145
convert 235-->145
convert 292-->95


Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents
0,MIR,51,1,0,MIR_51_1_0,"From to Uber and , many foreign companies have...","<ARG1>From to Uber and , many foreign companie...",1,1,,1
1,MIR,51,6,0,MIR_51_6_0,"""Given these conditions, it is understandable ...","<ARG0>""Given these conditions</ARG0>, it is un...",1,1,,1
2,MIR,78,0,0,MIR_78_0_0,Toyota in India has largely pivoted toward hyb...,<ARG1>Toyota in India has largely pivoted towa...,1,1,,1
3,MIR,78,2,0,MIR_78_2_0,The government keeps taxes on cars and motorbi...,<ARG0>The government keeps taxes on cars and m...,1,1,,1
4,MIR,143,1,0,MIR_143_1_0,The Japanese automaker pioneered so-called Jus...,The Japanese automaker pioneered so-called Jus...,1,1,,1
5,MIR,143,3,0,MIR_143_3_0,From fashion to food processing to pharmaceuti...,<ARG0>From fashion to food processing to pharm...,1,1,,1
6,MIR,143,5,0,MIR_143_5_0,As the pandemic has hampered factory operation...,<ARG0>As the pandemic has hampered factory ope...,1,1,,1
7,MIR,94,6,0,MIR_94_6_0,"To kick-start the shift, the report suggests b...","To kick-start the shift, the report suggests <...",1,1,,1
8,MIR,328,0,0,MIR_328_0_0,Russian vehicle sales plunged by 63 percent in...,Russian vehicle sales plunged by 63 percent in...,1,1,,1
9,MIR,328,0,1,MIR_328_0_1,Russian vehicle sales plunged by 63 percent in...,<ARG1>Russian vehicle sales plunged by 63 perc...,1,1,,1


In [9]:
# Persist the reference table (UTF-8 with BOM, no index column).
annotated_csv_path = os.path.join(r'D:\66 CausalMap\Panasonic-IDS\data','MIR_annotated.csv')
ref.to_csv(annotated_csv_path, index=False, encoding='utf-8-sig')

### Group Data

In [10]:
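# ADD MISSING ANNOTATIONS MANUALLY (before running the grouping cell below)
# Remember: for each sentence, eg_id values must be consecutive integers starting at 0 (0, 1, 2, ...).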

In [10]:
identifier_cols = ['corpus','doc_id','sent_id']

ref = pd.read_csv(os.path.join(r'D:\66 CausalMap\Panasonic-IDS\data','MIR_annotated_adjusted_w_scores.csv')) # sorted
data2 = ref.copy()

# One row per sentence, one column per eg_id holding that example's tagged
# text; cells without an example are filled with the sentinel 0.
grouped_causal_data = data2.pivot_table(
    index=identifier_cols+['text'],
    columns='eg_id',
    aggfunc=lambda x: ' '.join(x),
    values='text_w_pairs',
    fill_value=0
).reset_index()

eg_id_cols = [c for c in grouped_causal_data.columns if c not in identifier_cols+['text']]
grouped_causal_data['causal_text_w_pairs'] = grouped_causal_data[eg_id_cols].values.tolist()

# Smallest eg_id present for the sentence. The eg_id is read from the pivot
# column label rather than the column position, so the value stays correct
# (and the later merge on eg_id still matches) when eg_id values have gaps.
grouped_causal_data['eg_id'] = grouped_causal_data['causal_text_w_pairs'].apply(
    lambda x_list: min(eg for eg, x in zip(eg_id_cols, x_list) if x != 0)
)

# Drop the sentinel / empty cells and count the remaining relations.
grouped_causal_data['causal_text_w_pairs'] = grouped_causal_data['causal_text_w_pairs'].apply(
    lambda x_list: [x for x in x_list if x != 0 and x != ""]
)
grouped_causal_data['num_rs'] = grouped_causal_data['causal_text_w_pairs'].apply(lambda x_list: int(len(x_list)))
grouped_causal_data = grouped_causal_data.drop(columns=eg_id_cols+['text'])
grouped_causal_data

eg_id,corpus,doc_id,sent_id,causal_text_w_pairs,eg_id.1,num_rs
0,MIR,51,1,"[<ARG1>From to Uber and , many foreign compani...",0,1
1,MIR,51,3,[<ARG1>Foreign or local electric cars (EVs) wi...,0,1
2,MIR,51,6,"[<ARG0>""Given these conditions</ARG0>, it is u...",0,1
3,MIR,78,0,[<ARG1>Toyota in India has largely pivoted tow...,0,1
4,MIR,78,2,[<ARG0>The government keeps taxes on cars and ...,0,1
5,MIR,78,8,"[Elon Musk, the billionaire founder of Tesla I...",0,1
6,MIR,94,1,[The report’s focus solely on electric vehicle...,0,1
7,MIR,94,6,"[To kick-start the shift, the report suggests ...",0,1
8,MIR,94,7,[It also recommends <ARG1>setting up battery s...,0,1
9,MIR,143,1,[The Japanese automaker pioneered so-called Ju...,0,1


In [11]:
# Keep one reference row per grouped sentence (right join on the sentence keys + eg_id).
merge_keys = identifier_cols+['eg_id']
grouped_ref = ref.merge(grouped_causal_data, how='right', on=merge_keys)
grouped_ref['num_rs'] = grouped_ref['num_rs'].fillna(0).astype(int)
grouped_ref

Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents,token_overlap,Unnamed: 12,Unnamed: 13,Unnamed: 14,causal_text_w_pairs,num_rs
0,MIR,51,1,0,MIR_51_1_0,"From to Uber and , many foreign companies have...","<ARG1>From to Uber and , many foreign companie...",1,1,,1,1,1,True,,"[<ARG1>From to Uber and , many foreign compani...",1
1,MIR,51,3,0,MIR_51_3_0,Foreign or local electric cars (EVs) will grow...,<ARG1>Foreign or local electric cars (EVs) wil...,1,1,,1,1,1,True,,[<ARG1>Foreign or local electric cars (EVs) wi...,1
2,MIR,51,6,0,MIR_51_6_0,"""Given these conditions, it is understandable ...","<ARG0>""Given these conditions</ARG0>, it is un...",1,1,,1,1,1,True,,"[<ARG0>""Given these conditions</ARG0>, it is u...",1
3,MIR,78,0,0,MIR_78_0_0,Toyota in India has largely pivoted toward hyb...,<ARG1>Toyota in India has largely pivoted towa...,1,1,,1,1,1,True,,[<ARG1>Toyota in India has largely pivoted tow...,1
4,MIR,78,2,0,MIR_78_2_0,The government keeps taxes on cars and motorbi...,<ARG0>The government keeps taxes on cars and m...,1,1,,1,1,1,True,,[<ARG0>The government keeps taxes on cars and ...,1
5,MIR,78,8,0,MIR_78_8_0,"Elon Musk, the billionaire founder of Tesla In...","Elon Musk, the billionaire founder of Tesla In...",1,1,,1,1,1,True,,"[Elon Musk, the billionaire founder of Tesla I...",1
6,MIR,94,1,0,MIR_94_1_0,The report’s focus solely on electric vehicles...,The report’s focus solely on electric vehicles...,1,1,,1,1,1,True,,[The report’s focus solely on electric vehicle...,1
7,MIR,94,6,0,MIR_94_6_0,"To kick-start the shift, the report suggests b...","To kick-start the shift, the report suggests <...",1,1,,1,1,1,True,,"[To kick-start the shift, the report suggests ...",1
8,MIR,94,7,0,MIR_94_7_0,It also recommends setting up battery swapping...,It also recommends <ARG1>setting up battery sw...,1,1,,1,1,1,True,,[It also recommends <ARG1>setting up battery s...,1
9,MIR,143,1,0,MIR_143_1_0,The Japanese automaker pioneered so-called Jus...,The Japanese automaker pioneered so-called Jus...,1,1,,1,1,1,True,,[The Japanese automaker pioneered so-called Ju...,1


In [12]:
# Number of grouped sentences and total number of relations across them.
relations_per_sentence = grouped_ref['num_rs']
print('Sentences:', len(relations_per_sentence))
print('Relations:', sum(relations_per_sentence))

Sentences: 43
Relations: 49


In [13]:
# Persist the grouped table (UTF-8 with BOM, no index column).
grouped_csv_path = os.path.join(r'D:\66 CausalMap\Panasonic-IDS\data','MIR_annotated_grouped.csv')
grouped_ref.to_csv(grouped_csv_path, index=False, encoding='utf-8-sig')

# Get arguments only for Clustering

In [14]:
ref = pd.read_csv(os.path.join(r'D:\66 CausalMap\Panasonic-IDS\data','MIR_annotated_adjusted_w_scores.csv'))

# Collect, for every relation, its cause span (ARG0) and effect span (ARG1)
# together with the relation's index.
cols = ['span','ce_label','index']
spans, ce_labels, indexes = [], [], []
for i,row in ref.iterrows():
    text_w_pairs = row['text_w_pairs']
    cause, effect = (
        re.search(pattern, text_w_pairs).group(1).strip()
        for pattern in (r'<ARG0>(.*?)</ARG0>', r'<ARG1>(.*?)</ARG1>')
    )
    spans += [cause, effect]
    ce_labels += ['cause', 'effect']
    indexes += [row['index']] * 2

task2 = pd.DataFrame([spans, ce_labels, indexes]).T
task2.columns = cols
task2

Unnamed: 0,span,ce_label,index
0,to make inroads in the Indian market,cause,MIR_51_1_0
1,"From to Uber and , many foreign companies have...",effect,MIR_51_1_0
2,there's price parity with its internal combust...,cause,MIR_51_3_0
3,Foreign or local electric cars (EVs) will grow...,effect,MIR_51_3_0
4,"""Given these conditions",cause,MIR_51_6_0
...,...,...,...
93,"we have emerged safer, stronger, and more resi...",effect,MIR_5484_2_0
94,ASEAN trade volumes have returned to pre-pande...,cause,MIR_5484_3_0
95,and the bloc’s economy is forecast to grow by ...,effect,MIR_5484_3_0
96,to be better prepared for future shocks,cause,MIR_5484_4_0


In [15]:
# Persist the span table (UTF-8 with BOM, no index column).
# NOTE: the Task2 section reads this same file and expects topic columns that
# this export does not contain; re-running this cell overwrites the file.
spantopics_path = r"D:\66 CausalMap\Panasonic-IDS\data\MIR_spantopics.csv"
task2.to_csv(spantopics_path, index=False,encoding='utf-8-sig')

# Task2

In [4]:
import networkx as nx
import os
import re
import pandas as pd
import numpy as np

In [33]:
# Load the spans with their topic labels.
data = pd.read_csv(r"D:\66 CausalMap\Panasonic-IDS\data\MIR_spantopics.csv")
# Normalise the topic labels: cast to text, trim surrounding whitespace and
# lower-case, so that e.g. ' business hurdles' and 'business hurdles' are
# counted as one topic rather than two distinct graph nodes.
data['topic- Debdeep'] = data['topic- Debdeep'].astype(str).str.strip().str.lower()
data

Unnamed: 0,span,ce_label,index,topic- Jeffery,topic- Debdeep
0,to make inroads in the Indian market,cause,MIR_51_1_0,increase market,increased competition
1,"From to Uber and , many foreign companies have...",effect,MIR_51_1_0,lower price,price reduction
2,there's price parity with its internal combust...,cause,MIR_51_3_0,lower price,cost reduction
3,Foreign or local electric cars (EVs) will grow...,effect,MIR_51_3_0,increase production,increased demand
4,"""Given these conditions",cause,MIR_51_6_0,lower price,competition
...,...,...,...,...,...
93,"we have emerged safer, stronger, and more resi...",effect,MIR_5484_2_0,improve community,resilience
94,ASEAN trade volumes have returned to pre-pande...,cause,MIR_5484_3_0,increase investment,recovery
95,and the bloc’s economy is forecast to grow by ...,effect,MIR_5484_3_0,improve economy,better economy
96,to be better prepared for future shocks,cause,MIR_5484_4_0,better preparation,resilience


In [34]:
from collections import Counter

# Node weight = number of spans labelled with the topic.
topic_counts = Counter(data['topic- Debdeep'])
node_weights = dict(topic_counts)
node_weights

{'increased competition': 1,
 'price reduction': 1,
 'cost reduction': 4,
 'increased demand': 2,
 'competition': 1,
 'supply reduction': 3,
 'hybrid': 2,
 'taxation': 3,
 'uncertainly': 1,
 'incentives': 1,
 'improved infrastructure': 1,
 'low delivery time': 1,
 'risk mitigation': 1,
 'process improvement': 2,
 'innovation': 6,
 'strategy': 1,
 ' business hurdles': 1,
 'risk': 1,
 'transportation disruption': 2,
 'raw material shortage': 1,
 'global politics': 2,
 'long delay': 1,
 'sales decrese': 1,
 'policymaking': 3,
 'adoption': 1,
 'disintegration': 1,
 'low adoption': 1,
 'supply increase': 1,
 'expansion': 2,
 'green energy adoption': 3,
 'cost increase': 1,
 'compliance': 3,
 'infrastructure': 1,
 'improve safety': 2,
 'audit': 3,
 'awareness': 1,
 'sustainable environment': 1,
 'safety violation': 2,
 'accident': 1,
 'new supplier': 1,
 'fraud': 1,
 'supply shortage': 5,
 'production loss': 6,
 'imbalance': 1,
 'unemployment': 1,
 'business strategy': 1,
 'technology improv

In [35]:
# Inspect the exact column names read from the CSV; lookups must match them
# character for character (the recorded output shows 'topic- Jeffery ' with a
# trailing space).
data.columns

Index(['span', 'ce_label', 'index', 'topic- Jeffery ', 'topic- Debdeep'], dtype='object')

In [40]:
scores_path = os.path.join(r'D:\66 CausalMap\Panasonic-IDS\data','MIR_annotated_adjusted_w_scores.csv')
ref = pd.read_csv(scores_path)

# One row per relation index with its cause topic and effect topic side by side
# (several topics under the same index/label are joined with a space).
topic_by_role = data.pivot_table(
    values='topic- Debdeep',
    index=['index'],
    columns=['ce_label'],
    aggfunc=lambda x: ' '.join(x),
)
graph_df = topic_by_role.reset_index()

# Attach the tagged sentence of each relation.
evidence_df = ref[['text_w_pairs','index']]
graph_df = graph_df.merge(evidence_df, how='left', on='index')
graph_df

Unnamed: 0,index,cause,effect,text_w_pairs
0,MIR_143_1_0,low delivery time,risk mitigation,The Japanese automaker pioneered so-called Jus...
1,MIR_143_3_0,process improvement,innovation,<ARG0>From fashion to food processing to pharm...
2,MIR_143_3_1,strategy,process improvement,<ARG1>From fashion to food processing to pharm...
3,MIR_143_4_0,business hurdles,risk,But the tumultuous events of the past year hav...
4,MIR_143_5_0,transportation disruption,raw material shortage,<ARG0>As the pandemic has hampered factory ope...
5,MIR_1785_4_0,technology improvement,innovation,<ARG0>Technogoical advancements such us automa...
6,MIR_1785_4_1,innovation,obsolete components,Technogoical advancements such us automated co...
7,MIR_1785_5_0,green energy adoption,innovation,<ARG1>Implementation of automotive camless pis...
8,MIR_2060_8_0,product improvement,design innovation,The 2022 Maruti Suzuki Brezza will <ARG1>conti...
9,MIR_3225_4_0,talent,technology hub,<ARG1>Entering new locationsWe strategically c...


In [41]:
# Build the directed cause -> effect topic graph.
G = nx.DiGraph()

# Nodes: the last '>>'-separated component of each topic, weighted by its span
# count. Topics sharing the same last component add up instead of overwriting.
for node, weight in node_weights.items():
    leaf = node.split('>>')[-1]
    if G.has_node(leaf):
        G.nodes[leaf]['weight'] += weight
    else:
        G.add_node(leaf, weight=weight)

# Edges: one per distinct (cause, effect) pair. A DiGraph keeps a single edge
# per pair and add_edge would overwrite its attributes, so repeated pairs
# instead increase `weight` and append their sentence to `evidence`
# (newline-separated) rather than silently dropping earlier relations.
for i,row in graph_df.iterrows():
    cause = row.cause.split('>>')[-1]
    effect = row.effect.split('>>')[-1]
    evidence = str(row.text_w_pairs)
    if G.has_edge(cause, effect):
        edge_attrs = G.edges[cause, effect]
        edge_attrs['weight'] += 1
        edge_attrs['evidence'] += '\n' + evidence
    else:
        G.add_edge(cause, effect, weight=1, evidence=evidence)

In [43]:
# Graph size, followed by the first edge and its attributes.
print(len(G.edges))
print(len(G.nodes))

first_edge = list(G.edges)[0]
print(first_edge)
G.edges[first_edge]

43
60
('increased competition', 'price reduction')


{'weight': 1,
 'evidence': '<ARG1>From to Uber and , many foreign companies have had to launch low-price versions of their products</ARG1> in a bid <ARG0>to make inroads in the Indian market</ARG0>.\r'}

In [44]:
# Write the graph in GML format for the visualisation.
graph_folder = r"D:\66 CausalMap\SciLit_CausalMap\visualization\mir_paper"
gml_path = os.path.join(graph_folder, "user_debdeep.gml")
nx.write_gml(G, gml_path)