In [8]:
import re
import spacy
from collections import Counter
from string import punctuation
from os import listdir
import time

current_year = time.strftime("%y", time.localtime())

nlp = spacy.load('en_core_web_lg')
nlp.vocab["CLASSIFIED"].is_stop = True # On ajoute CLASSIFIED au stop words

In [9]:
data_path = './echantillon-cablegate_converted/'
files = [f for f in listdir(data_path) if f[-4:] == '.txt']

In [17]:
res = {}
for i in (0, 4):
    with open(data_path + files[i], encoding="utf8", errors='ignore') as f:
        content = f.read()

        extracted_doc = {}
        
        extracted_doc['doc_name'] = files[i][:-4]
        extracted_doc['date'] = extract_date(content)
        extracted_doc['tags'] = extract_tags(content)
        extracted_doc['from'] = extract_from(content)
        extracted_doc['place_of_document'] = re.findall(r"[A-Z]+", extracted_doc['doc_name'])[-1] # -1 because we take the last one
        extracted_doc['subject'] = extract_subject(content)
        extracted_doc['most_common_words'] = extract_most_common_words(content)
        extracted_doc['keywords'] = extract_keywords(content)

        ner = dict([(str(x), x.label_) for x in nlp(content).ents])
        extracted_doc['people_involved'] = [key for key in ner.keys() if ner[key] == 'PERSON' ]
        extracted_doc['place_involved'] = [key for key in ner.keys() if ner[key] == 'GPE' or ner[key] == 'LOC' ]
        extracted_doc['entity_involved'] = [key for key in ner.keys() if ner[key] == 'ORG' or ner[key] == 'NORP' ]


        res[i] = extracted_doc


VZCZCXYZ0005
RR RUEHWEB
DE RUEHHK #4795/01 3540903
ZNR UUUUU ZZH
R 200903Z DEC 06
FM AMCONSUL HONG KONG
TO RUCPDOC/USDOC WASHDC
INFO RUEHC/SECSTATE WASHDC 9871
RHMFIUU/HQ BICE WASHINGTON DC
UNCLAS HONG KONG 004795 
 
SIPDIS 
 
USDOC FOR 532/OEA/LHINES/DFARROW 
USDOC FOR 3132 FOR FCS/OIO REGIONAL DIRECTOR WILLIAM 
ZARIT 
BICE FOR OFFICE OF STRATEGIC INVESTIGATIONS 
 
SIPDIS 
 
E.O. 12958: N/A 
TAGS: BMGT BEXP HK ETRD ETTC
SUBJECT: EXTRANCHECK: POST SHIPMENT VERIFICATION: 
ADVANCED ENERGY-SHENZHEN C/O BALTRANS LOGISTRIC 
 
REF: A) USDOC 05352 
 
1.Unauthorized disclosure of the information provided 
below is prohibited by Section 12C of the Export 
Administration Act. 
 
2. As per reftel A request and at the direction of the 
Office of Enforcement Analysis (OEA) of the USDOC 
Bureau of Industry and Security (BIS), Export Control 
Officer Philip Ankel (ECO) conducted a post shipment 
verification (PSV) at Advanced Energy-Shenzhen 
(Advanced Energy) C/O Baltrans Logistic (Baltrans), 59 
T

AttributeError: 'NoneType' object has no attribute 'group'

In [14]:
content="VZCZCXYZ0005\nRR RUEHWEB\nDE RUEHHK #4795/01 3540903\nZNR UUUUU ZZH\nR 200903Z DEC 06\nFM AMCONSUL HONG KONG\nTO RUCPDOC/USDOC WASHDC\nINFO RUEHC/SECSTATE WASHDC 9871\nRHMFIUU/HQ BICE WASHINGTON DC\nUNCLAS HONG KONG 004795"
re.search(r"[A-Z]\s[0-9]{6}[A-Z]\s[A-Z]{3}\s[0-9]{2}", content)

<re.Match object; span=(65, 81), match='R 200903Z DEC 06'>

In [16]:
def extract_date(content):
    print(content)
    date_line = re.search(r"[A-Z]\s[0-9]{6}[A-Z]\s[A-Z]{3}\s[0-9]{2}", content)
    date_line = date_line.group()
    date_month = date_line.split(" ")[2]
    date_year = date_line.split(" ")[3]
    
    months = ['JAN', 'FEV', 'MAR', "APR", "MAY", "JUN", "JUl", "AUG", "SEP", "OCT", "NOV", "DEC"]
    
    date = {}
    date['year'] = None
    date['month'] = None
    if date_month in months :
        if int(date_year) > int(current_year):
            year = "19"+date_year
        else:
            year="20"+date_year
        date['year'] = year
        date['month'] = date_month
    else :
        print("ERR : unexpected month =", date_months)
    return date

In [2]:
def extract_tags(content):
    tags_list = [line for line in content.splitlines() if line.startswith(r"TAGS:")]
    print(tags_list)
    return tags_list[0].split()[1:]

In [3]:
def extract_from(content):
    fm_list = [line for line in content.splitlines() if line.startswith("FM ")]
    return fm_list[0][3:-1] # y a toujours un espace à la fin

In [4]:
def extract_subject(content):
    subject_list = [line for line in content.splitlines() if line.startswith("SUBJECT: ")]
    return subject_list[0][9:-1]# y a toujours un espace à la fin

In [5]:
def extract_most_common_words(content):
    doc = nlp(content)
    # all tokens that arent stop words or punctuations
    words = [token.text
         for token in doc
         if not token.is_stop and not token.is_punct]

    # five most common tokens
    word_freq = Counter(words)
    common_words = word_freq.most_common(15)
    return [word for word in common_words if len(word[0])>1]

In [6]:
def get_hotwords(text):
    result = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN'] # 1
    doc = nlp(text.lower()) # 2
    for token in doc:
        # 3
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue
        # 4
        if(token.pos_ in pos_tag):
            result.append(token.text)
                
    return result # 5

In [7]:
def extract_keywords(content):
    output = set(get_hotwords(content))
    return [x[0] for x in Counter(output).most_common(5)]