In [1]:
from prettytable import PrettyTable
import json
import numpy as np
import re
import pandas as pd
from pysyntime import SynTime

## PATHS OF INPUTS FILES

In [2]:
preprocessing_sentences_path = "inputs\\preprocessed_sentences.csv"
processing_sents_uniformDate_extractTML_path = "outputs\\processing_sents_uniformDate_extractTML.csv"

## Common Variables

In [3]:
synTime = SynTime()
reference_date = '1800-01-01'

months_date = "(January|February|March|April|May|June|July|August|September|October|November|December)"
separators_date = "([\\s|\.|-|,|_|\\|/|;]+)"

regex_day_month_year = "([0-9]{1,2})" + separators_date + months_date + separators_date + "([0-9]{4})"
regex_month_day_year = months_date + separators_date + "([0-9]{1,2})" + separators_date + "([0-9]{4})"
regex_year_month_day = "([0-9]{4})" + separators_date + months_date + separators_date + "([0-9]{1,2})"
regex_year_day_month = "([0-9]{4})" + separators_date + "([0-9]{1,2})" + separators_date + months_date
regex_month_year = months_date + separators_date + "([0-9]{4})"
regex_year_month = "([0-9]{4})" + separators_date + months_date
regex_day_month = "([0-9]{1,2})" + separators_date + months_date
regex_month_day = months_date + separators_date + "([0-9]{1,2})"
re
regex_year = "([0-9]{4})"


## Common Functions

In [18]:
def extractionTimeML(sentence, reference_date) :
    return re.findall("<TIMEX3.*?>(.*?)</TIMEX3>", synTime.extractTimexFromText(sentence, reference_date))

def monthStrToNumber(monthStr):
    monthsStr = {'jan': "01", 'feb': "02", 'mar': "03", 'apr': "04", 'may': "05", 'jun': "06", 'jul': "07", 'aug': "08", 'sep': "09", 'oct': "10", 'nov': "11", 'dec': "12" }  
    monthNumber = monthStr.strip()[:3].lower()
    try:
        out = monthsStr[monthNumber]
        return out
    except:
        raise ValueError('Not a month')

def extractionDayMonthYear(sentence, regex, pos_d, pos_m, pos_y) : 
    output_sentence = sentence
    dates = re.findall(regex, output_sentence, re.IGNORECASE)
    for date in dates :
        day = ("0"+date[pos_d].strip() if len(date[pos_d].strip()) == 1 else date[pos_d].strip()) if pos_d >= 0 else "01"
        month =  monthStrToNumber(date[pos_m].strip()) if pos_m >= 0 else "01"
        year = date[pos_y].strip() if pos_y >= 0 else "YYYY"
        output_sentence = re.sub("".join(date), year+"/"+month+"/"+day, output_sentence)
    return output_sentence

def uniformalizeDate(sentence) :
    output_sentence = extractionDayMonthYear(sentence, regex_day_month_year, 0, 2, 4)
    output_sentence = extractionDayMonthYear(output_sentence, regex_month_day_year, 2, 0, 4)
    output_sentence = extractionDayMonthYear(output_sentence, regex_year_month_day, 4, 2, 0)
    output_sentence = extractionDayMonthYear(output_sentence, regex_year_day_month, 2, 4, 0)
    output_sentence = extractionDayMonthYear(output_sentence, regex_month_year, -1, 0, 2)
    output_sentence = extractionDayMonthYear(output_sentence, regex_year_month, -1, 2, 0)
    output_sentence = extractionDayMonthYear(output_sentence, regex_day_month, 0, 2, -1)
    output_sentence = extractionDayMonthYear(output_sentence, regex_month_day, 2, 0, -1)
    return output_sentence

In [14]:
print(('1', ', ', 'October', ' ', '1859').join() )

1


## Processing

In [26]:
df = pd.read_csv(preprocessing_sentences_path)
sentences = df.values.tolist()

#id = "Q49061"
#sentences = (df.loc[df['author'] == id]).values.tolist()

authors = []
sents_id = []
orig_sents = []
preproc_sents_places = [] 
preproc_sents_people = []
uniformalize_date_sents = []
timeMLTexts = []

for sentence in sentences :    
    authors.append(sentence[0])
    sents_id.append(sentence[1])
    orig_sents.append(sentence[2])
    preproc_sents_places.append(sentence[3])
    preproc_sents_people.append(sentence[4])
    uniformalize_date_sents.append(uniformalizeDate(sentence[2]))
    timeMLTexts.append(" \\=> ".join(extractionTimeML(uniformalizeDate(sentence[2]), reference_date)))
    #timeMLTexts.append("hkba")
    #print("ID: ", sentence[0], ", SENTENCE: ", sentence[2], ", TimeMLText: ", extractionTimeML(uniformalizeDate(sentence[2]), reference_date))

data_processed = {
    "author": authors, 
    "sent_id": sents_id, 
    "orig_sent": orig_sents, 
    "preproc_sent_places": preproc_sents_places, 
    "preproc_sents_people": preproc_sents_people, 
    "uniformalize_date_sents": uniformalize_date_sents,
    "timeMLTexts": timeMLTexts
}

columns_data_processed = ["author", "sent_id", "orig_sent", "preproc_sent_places", "preproc_sents_people", "uniformalize_date_sents", "timeMLTexts"]

df_output = pd.DataFrame(data_processed, columns=columns_data_processed)

df_output.to_csv(processing_sents_uniformDate_extractTML_path, sep=',', index=False, header=True)

## Sort the TimeMLText

In [33]:
#def ordering(items) :
sent = 'Clarissa Minnie Thompson Allen between October 1, 1859 - November 23, 1941 was an American educator and author.'
extract = synTime.extractTimexFromText(sent, reference_date)
print(extract)

Clarissa Minnie Thompson Allen between <TIMEX3 tid="t1" type="DATE" value="1800-01-01">October 1, 1859 -</TIMEX3><TIMEX3 tid="t2" type="DATE" value="1800-01-01">- November 23, 1941</TIMEX3> was an American educator and author.


In [6]:
sentence_text = 'Clarissa Minnie Thompson Allen (<TIMEX3 tid="t1" type="DATE" value="1800-01-01">October 1, 1859</TIMEX3> – <TIMEX3 tid="t2" type="DATE" value="1800-01-01">November 23, 1941</TIMEX3>) was an American educator and author.'
print(re.findall("<TIMEX3.*?>(.*?)</TIMEX3>", sentence_text))

['October 1, 1859', 'November 23, 1941']


In [40]:
sent = 'Clarissa Minnie Thompson Allen between 1, October 1859 - November 23, 1941 was an American educator and author.'
print(uniformalizeDate(sent))

([0-9]{1,2})([\s|\.|-|,|_|\|/|;]+)(January|February|March|April|May|June|July|August|September|October|November|December)([\s|\.|-|,|_|\|/|;]+)([0-9]{4})
(January|February|March|April|May|June|July|August|September|October|November|December)([\s|\.|-|,|_|\|/|;]+)([0-9]{1,2})([\s|\.|-|,|_|\|/|;]+)([0-9]{4})
(January|February|March|April|May|June|July|August|September|October|November|December)([\s|\.|-|,|_|\|/|;]+)([0-9]{4})
([0-9]{4})([\s|\.|-|,|_|\|/|;]+)(January|February|March|April|May|June|July|August|September|October|November|December)
([0-9]{1,2})([\s|\.|-|,|_|\|/|;]+)(January|February|March|April|May|June|July|August|September|October|November|December)
(January|February|March|April|May|June|July|August|September|October|November|December)([\s|\.|-|,|_|\|/|;]+)([0-9]{1,2})
Clarissa Minnie Thompson Allen between 10-01-1859 - 11-23-1941 was an American educator and author.
