## Importing necessary libraries and utility functions 

In [None]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
# from data import combine_with_reverb, create_bertified_dataset
from utils import read_data
from copy import copy
import numpy as np 
from collections import Counter

## Creating intermediate data indices

In [None]:
# The top-of-notebook import of these two helpers is commented out, so they
# are imported here to keep this cell runnable on its own.
# NOTE(review): presumably this regenerates the intermediate data files used
# by later cells -- confirm before re-running on existing data.
from data import combine_with_reverb, create_bertified_dataset

combine_with_reverb()
create_bertified_dataset()

## Determine Question word family

In [None]:
# Interrogative words looked for by question_word_id; they are compared
# against lower-cased, whitespace-split tokens.
QUESTION_WORDS = ['what', 'which', 'where', 'when', 'why', 'who', 'how', 'whom']

In [None]:
def question_word_id(string):
    """Return the first question word occurring in *string*.

    The text is stripped, lower-cased and split on whitespace; the first
    token that is a member of ``QUESTION_WORDS`` is returned.  If no token
    matches, the literal string ``'None'`` (not the ``None`` object) is
    returned.
    """
    tokens = string.strip().lower().split()
    return next((token for token in tokens if token in QUESTION_WORDS), 'None')

In [None]:
# Load the train / validation / test question workbooks.
train_df = pd.read_excel('../data/train.xlsx')
valid_df = pd.read_excel('../data/valid.xlsx')
test_df = pd.read_excel('../data/test.xlsx')

In [None]:
# Peek at the first three rows of the test split.
test_df.head(3)

## Counting number of rels and args

In [None]:
# Stack the three splits into one frame; ignore_index is not set, so each
# split keeps its own row labels.
all_questions = pd.concat([train_df, valid_df, test_df])

In [None]:
def get_unique_ent_rel(dataframe):
    """Print how many distinct arguments and relations the triples contain.

    Every entry of the ``'triple'`` column is a string holding a Python
    literal that is indexed as ``(arg1, rel, arg2)``.  Four counts are
    printed: unique arg1, unique arg2, unique rel, and unique arguments
    overall (arg1 and arg2 pooled).  Nothing is returned.
    """
    arg1, rel, arg2 = set(), set(), set()
    for raw_triple in dataframe['triple'].to_list():
        # Parse each row once instead of once per component.
        # NOTE(review): eval() executes whatever the cell contains; this is
        # only acceptable while the workbooks are trusted project data.
        triple = eval(raw_triple)
        arg1.add(triple[0])
        rel.add(triple[1])
        arg2.add(triple[2])
    print(f'Unique arg1 : {len(arg1)}')
    print(f'Unique arg2 : {len(arg2)}')
    print(f'Unique rel : {len(rel)}')
    print(f'Unique args : {len(arg1 | arg2)}')


In [None]:
# Unique argument / relation counts for the training split.
get_unique_ent_rel(train_df)

In [None]:
# Unique argument / relation counts for the validation split.
get_unique_ent_rel(valid_df)

In [None]:
# Unique argument / relation counts for the test split.
get_unique_ent_rel(test_df)

In [None]:
# Unique argument / relation counts over the three splits combined.
get_unique_ent_rel(all_questions)

## Histogram of question word distribution

In [None]:
def get_question_words_distribution(dataframe):
    """Print and plot how many questions fall under each question word.

    Side effect: a ``'question words'`` column is added to (or overwritten
    in) *dataframe*, holding ``question_word_id`` of each ``'Question'``.
    The frame is then grouped by that column and the non-null
    ``'Meaningful'`` values are counted per group; the counts are printed
    and drawn as a bar chart.

    Returns:
        Always ``0``.
    """
    dataframe['question words'] = dataframe['Question'].astype(str).apply(question_word_id)
    # Group once and reuse the result for both the printout and the chart.
    counts = dataframe.groupby(['question words'])['Meaningful'].count()
    print(counts)
    # NOTE(review): 'lab' / 'val' do not name anything in this data; they
    # look like leftovers from a DataFrame plotting example -- confirm
    # before removing.
    counts.plot.bar(x='lab', y='val', rot=0)
    return 0

In [None]:
# Question-word distribution of the training split (also adds a
# 'question words' column to train_df).
get_question_words_distribution(train_df)

In [None]:
# Question-word distribution of the validation split (also adds a
# 'question words' column to valid_df).
get_question_words_distribution(valid_df)

In [None]:
# Question-word distribution of the test split (also adds a
# 'question words' column to test_df).
get_question_words_distribution(test_df)

In [None]:
# Question-word distribution of the questions stored in ../results/null.xlsx.
# NOTE(review): that workbook is written by a later cell of this notebook
# (null_questions.to_excel), so it must already exist from an earlier run.
null = pd.read_excel('../results/null.xlsx')
get_question_words_distribution(null)

## Counting Unique words in Questions

In [None]:
def get_count_of_unique_words(dataframe):
    """Return the number of distinct tokens across all questions.

    Each ``'Question'`` value is converted to ``str``, stripped, lower-cased
    and split on whitespace; the size of the resulting vocabulary is
    returned.
    """
    vocabulary = set()
    for question in dataframe['Question'].astype(str):
        vocabulary.update(question.strip().lower().split())
    return len(vocabulary)

In [None]:
# Vocabulary size of the train, validation and test splits, one per line.
print(
    *map(get_count_of_unique_words, (train_df, valid_df, test_df)),
    sep='\n',
)

## Question length histogram

In [None]:
def get_length(dataframe):
    """Plot a histogram of question lengths and return the lengths.

    Side effect: a ``'length'`` column is added to (or overwritten in)
    *dataframe*, holding the number of whitespace-separated tokens of each
    ``'Question'`` (after ``str`` conversion and stripping).  The histogram
    uses 50 bins and its x axis is limited to the range 1..16.

    Returns:
        The ``'length'`` column of *dataframe*.
    """
    def _token_count(text):
        return len(text.strip().lower().split())

    questions = dataframe['Question'].astype(str)
    dataframe['length'] = questions.apply(_token_count)
    axes = dataframe.hist(column=['length'], bins=50, grid=True)
    axes[0, 0].set_xlim(1, 16)
    return dataframe['length']
     

In [None]:
# Question-length histogram (and length column) for each split.
tr, va, te = (get_length(frame) for frame in (train_df, valid_df, test_df))

## length of Train, Valid, Test

In [None]:
# Load the prepared splits and report how many entries the first component
# of each one holds.
train, valid, test = read_data()
print(*(len(split[0]) for split in (train, valid, test)))

## Determining relation component count

In [None]:
def get_relation_component_count(dataset):
    """Plot and return how many relation components each example has.

    ``dataset[1]`` is iterated; for every entry the first two positions are
    skipped and the remaining labels are scanned for maximal runs of
    non-zero values (the "components").  A bar chart mapping component
    count to number of examples is shown.

    Returns:
        ``(components, occurrences)`` -- parallel lists: the distinct
        component counts as strings in ascending order, and how many
        examples have each count.
    """
    results = []
    for item in dataset[1]:
        labels = item[2:].tolist()
        # A component starts wherever a non-zero label follows a zero (or the
        # start of the sequence).  Comparing values, rather than splitting a
        # joined digit string on '0', keeps labels >= 10 and float labels
        # from being miscounted.
        starts = sum(
            1
            for previous, current in zip([0] + labels, labels)
            if current != 0 and previous == 0
        )
        results.append(starts)
    # One counting pass; sorted keys give a deterministic bar order.
    tally = Counter(results)
    distinct_counts = sorted(tally)
    components = [str(count) for count in distinct_counts]
    occurrences = [tally[count] for count in distinct_counts]
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    ax.bar(components, occurrences)
    plt.show()
    return components, occurrences


In [None]:
# Component-count chart and tallies for each split.
tr, va, te = (
    get_relation_component_count(split) for split in (train, valid, test)
)

## Determining length of relation based on whole question length

In [None]:
def get_relation_length(dataset):
    """Plot and return the distribution of relation-to-question length ratios.

    ``dataset[0]`` and ``dataset[1]`` are walked in lockstep.  For each pair
    the question length is the number of non-zero entries of the first
    item, and the ratio is the sum of the second item from position 2
    onwards divided by that length, rounded to one decimal.  A bar chart
    mapping ratio to number of examples is shown.

    Returns:
        ``(components, occurrences)`` -- parallel lists: the distinct
        rounded ratios as strings in ascending order, and how many examples
        fall on each.
    """
    results = []
    for tok, rel in zip(dataset[0], dataset[1]):
        length = sum((tok != 0).astype(int))
        results.append(round(sum(rel[2:]) / length, 1))
    # Count every ratio in one pass instead of rescanning the list per bucket.
    tally = Counter(results)
    buckets = sorted(tally)
    components = [str(bucket) for bucket in buckets]
    occurrences = [tally[bucket] for bucket in buckets]
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    ax.bar(components, occurrences)
    plt.show()
    return components, occurrences

In [None]:
# Relation-length ratio chart and tallies for each split.
tr = get_relation_length(train)
va = get_relation_length(valid)
te = get_relation_length(test)

In [None]:
refrence = pd.read_excel('../data/intermediate.xlsx')
# Lower-case and strip the questions so they can be matched by exact text.
refrence.Question = refrence.Question.apply(lambda x: str(x).lower().strip())

# Column-wise accumulator for the records parsed from the result log.
error_dataframe = {
    'Question': [],
    'candidates': [],
    'actual': [],
    'node': [],
    'edge': [],
}
with open('../results/Valid_Set_Without.txt', 'r') as res:
    for line in res:
        # Payloads are split off at the FIRST ': ' only, so a payload that
        # itself contains ': ' is no longer truncated.
        # NOTE(review): eval() executes whatever the log contains; this is
        # only acceptable because the file is produced by this project.
        if 'Question' in line:
            tokens = eval(line.split(': ', 1)[1].strip())
            error_dataframe['Question'].append(' '.join(tokens))
        elif 'Sorted candidates' in line:
            error_dataframe['candidates'].append(eval(line.split(': ', 1)[1].strip()))
        elif 'Node: ' in line:
            # "Node: <node>, Edge: <edge>" -> ['Node', '<node>', '<edge>']
            fields = line.replace(', Edge', '').split(': ')
            error_dataframe['node'].append(fields[1])
            error_dataframe['edge'].append(fields[2].strip())
        elif 'Actual line number' in line:
            error_dataframe['actual'].append(eval(line.split(': ', 1)[1].strip()))
error_dataframe['Meaningful'] = [1] * len(error_dataframe['actual'])
# All columns must have the same length here, otherwise the DataFrame
# constructor raises; a mismatch means the log has incomplete records.
error_df = pd.DataFrame(error_dataframe)

In [None]:
# Attach the reference columns to each parsed record; the inner join drops
# records whose question text has no exact match in refrence.
error_analysis = pd.merge(error_df, refrence, how='inner', on='Question')

In [None]:
# Questions for which no candidates were produced.  Selecting with .loc and
# taking an explicit copy makes the column assignment below act on an
# independent frame instead of a slice of error_analysis.
empty_condidates = error_analysis.loc[
    error_analysis.candidates.apply(lambda x: len(x) == 0),
    ['Question', 'node', 'edge', 'triple', 'Reverb_no'],
].copy()
# Parse each stored triple and lower-case its elements.
# NOTE(review): eval() on workbook cells -- only safe for trusted data.
empty_condidates['triple'] = empty_condidates.triple.apply(
    lambda x: [str(item).lower() for item in eval(x)]
)

In [None]:
# Optional export, left disabled:
# empty_condidates.to_excel('empty_candidates.xlsx')
# Preview the first rows.
empty_condidates.head()

In [None]:
from fuzzywuzzy import fuzz

# For every question without candidates, score how closely the extracted
# node / edge text matches its best-matching element of the stored triple,
# then print the mean node score and the mean edge score.
node_precision, edge_precision = [], []
skipped_rows = 0
for index, row in empty_condidates.iterrows():
    try:
        node_score = max(fuzz.ratio(item, row['node']) for item in row['triple'])
        edge_score = max(fuzz.ratio(item, row['edge']) for item in row['triple'])
    except Exception:
        # Best effort: a row that cannot be scored is left out of BOTH
        # lists (so the two means cover the same rows) and is counted.
        skipped_rows += 1
        continue
    node_precision.append(node_score)
    edge_precision.append(edge_score)
if skipped_rows:
    print(f'Skipped {skipped_rows} row(s) that could not be scored')
if node_precision:
    print(sum(node_precision)/len(node_precision))
    print(sum(edge_precision)/len(edge_precision))
else:
    print('No rows could be scored')


In [None]:
def rec_rank(key, dic):
    """Return the share of *dic*'s total count that belongs to *key*.

    Despite the name, the value is the relative frequency
    ``dic[key] / sum(dic.values())``, not ``1 / rank``.

    Returns:
        The relative frequency, or the fallback ``1/1000`` when *key* is
        missing or unhashable, or when the counts sum to zero (e.g. an
        empty mapping).
    """
    try:
        return dic[key] / float(sum(dic.values()))
    except (KeyError, TypeError, ZeroDivisionError):
        # Only the expected lookup / division failures fall back; anything
        # else (including KeyboardInterrupt) now propagates.
        return 1 / 1000
def get_mean_rec_rank(dataframe):
    """Score every row with ``rec_rank`` and print the mean score.

    For each row, the first element of every entry in ``'candidates'`` is
    tallied with a ``Counter`` and the row is scored as
    ``rec_rank(row['actual'], tally)``.  Side effect: the per-row scores are
    stored in a new (or overwritten) ``'rr'`` column.  Nothing is returned.
    """
    def _row_score(row):
        tally = Counter([candidate[0] for candidate in row['candidates']])
        return rec_rank(row['actual'], tally)

    dataframe['rr'] = dataframe.apply(_row_score, axis=1)
    print(dataframe['rr'].mean())

In [None]:
# Mean score over every parsed record (also stores the per-row value in
# error_df['rr']).
get_mean_rec_rank(error_df)

In [None]:
def get_was_born(string):
    """Tell whether both 'was' and 'born' occur as tokens of *string*.

    The text is stripped, lower-cased and split on whitespace; tokens must
    match exactly (``'born?'`` does not count as ``'born'``).
    """
    tokens = set(string.strip().lower().split())
    return {'was', 'born'} <= tokens
# Flag the questions that contain both 'was' and 'born'.
error_df['was born']=error_df['Question'].apply(get_was_born)

In [None]:
def get_null_candidates(candidates):
    """Return True when a question's candidate list is empty."""
    return len(candidates) == 0


# Evaluate the mask once and split error_df into its two complementary
# parts; copies keep later column additions from touching error_df.
_has_no_candidates = error_df['candidates'].apply(get_null_candidates)
null_questions = copy(error_df[_has_no_candidates])
not_null_questions = copy(error_df[~_has_no_candidates])

In [None]:
# Save the questions that received no candidates.
null_questions.to_excel('../results/null.xlsx')

In [None]:
# Question-word distribution of the questions without candidates (also adds
# a 'question words' column to null_questions).
get_question_words_distribution(null_questions)

In [None]:
def get_all_hit1(dataframe):
    """Return one boolean per row: is the top candidate the actual answer?

    A row is a hit when the first element of its first ``'candidates'``
    entry equals its ``'actual'`` value.  Rows are visited in frame order,
    so the result can be used directly as a boolean mask.  A row with no
    candidates raises ``IndexError``.
    """
    return [
        bool(row['candidates'][0][0] == row['actual'])
        for _, row in dataframe.iterrows()
    ]

In [None]:
# Export the questions whose top candidate equals the 'actual' value.
not_null_questions[get_all_hit1(not_null_questions)].to_excel('../results/hit1.xlsx')

In [None]:
# Export the questions that have candidates but whose top candidate differs
# from the 'actual' value.
not_null_questions[[not elem for elem in get_all_hit1(not_null_questions)]].to_excel('../results/not_null_not_hit1.xlsx')

## Counting Unique rels and args in Reverb tuples

In [None]:
# Column names assigned to the header-less, tab-separated tuple dump.
reverb_columns_name = ['ExID', 'arg1', 'rel', 'arg2', 'narg1', 'nrel', 'narg2', 'csents', 'conf', 'urls']

df = pd.read_csv(r'C:\git\reverb_wikipedia_tuples-1.1.txt', sep='\t', header=None)
df.columns = reverb_columns_name
# Discard rows with any missing field, then exact duplicate rows.
df = df.dropna().drop_duplicates()


In [None]:
# Distinct values per column, plus arg1 and arg2 pooled -- first for the raw
# columns, then for the n-prefixed ones.
print(f'Unique arg1 : {df["arg1"].nunique(dropna=False)}')
print(f'Unique arg2 : {df["arg2"].nunique(dropna=False)}')
print(f'Unique rel : {df["rel"].nunique(dropna=False)}')
print(f'Unique args : {len(set(df["arg1"]).union(df["arg2"]))}')
print(f'Unique narg1 : {df["narg1"].nunique(dropna=False)}')
print(f'Unique narg2 : {df["narg2"].nunique(dropna=False)}')
print(f'Unique nrel : {df["nrel"].nunique(dropna=False)}')
print(f'Unique nargs : {len(set(df["narg1"]).union(df["narg2"]))}')



In [None]:
# Whitespace tokens of every distinct narg1, narg2 and nrel value (in that
# order); the printed number is the size of the resulting vocabulary.
normal_vocab = [
    token
    for column in ("narg1", "narg2", "nrel")
    for phrase in df[column].unique().tolist()
    for token in phrase.split()
]
print(len(set(normal_vocab)))

In [None]:
# Whitespace tokens of every distinct arg1, arg2 and rel value (in that
# order); the printed number is the size of the resulting vocabulary.
vocab = [
    token
    for column in ("arg1", "arg2", "rel")
    for phrase in df[column].unique().tolist()
    for token in phrase.split()
]
print(len(set(vocab)))

In [18]:
import pandas as pd

# NOTE(review): both paths are the bare '.xlsx' placeholder -- presumably the
# real result workbooks were redacted; restore them before running.
test = pd.read_excel(r'.xlsx', engine ='openpyxl')
valid = pd.read_excel(r'.xlsx', engine ='openpyxl')


def rr(row):
    """Return the reciprocal rank of the row's 'Reverb_no' in its 'sys' list.

    ``row['sys']`` is a string holding a Python literal: a ranked sequence
    whose entries start with an id.  The result is ``1 / position``
    (1-based) of ``int(row['Reverb_no'])`` among those ids, or the fallback
    ``1/10000`` when the id is absent.
    """
    # Parse the ranking once and use the same integer id for both the
    # membership test and the position lookup.
    # NOTE(review): eval() on workbook cells -- only safe for trusted data.
    ranked_ids = [item[0] for item in eval(row['sys'])]
    gold_id = int(row['Reverb_no'])
    if gold_id not in ranked_ids:
        return 1/10000
    return 1/(1 + ranked_ids.index(gold_id))


test['RR'] = test.apply(rr, axis=1)
valid['RR'] = valid.apply(rr, axis=1)
print(test['RR'].mean(), valid['RR'].mean())

0.8230408554867079 0.8347173823574736
