# ALGORITMOS DE EXTRACCIÓN

En este script se encuentran los cuatro algoritmos construidos para extraer información de forma automática

## 0. Imports

In [None]:
import math
import numpy as np
import json
import pandas as pd
import re

import spacy
from spacy import displacy 
from collections import Counter
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

## 1. Background preparation

### 1.1. Load database

In [None]:
# Path to the JSON export of the database; it is empty here and must be set
# before this cell can run.
data_path = ''

# Parse the JSON document directly from the file handle and tabulate it.
with open(data_path, encoding="utf8") as fp:
    db = json.load(fp)

data = pd.DataFrame(db)

### 1.2. Insert Data_type column

In [None]:
structured_id = ['3', '5', '17', '30', '38', '39', '68', '75', '80', '84']
not_structured_id = ['1', '2', '16', '32', '33', '40', '41', '44', '45', '52', '55', '56', '57', '58', '59', '60', '61', '62', '74', '78', '79', '81', '82', '90', '94', '100', '106']
semi_structured_id = ['4', '6', '9', '10', '13', '15', '21', '27', '31', '35', '36', '37', '42', '63', '66', '76', '77', '83', '87', '88', '91', '92', '103']

df = data.copy()

# Build one lookup from platform id to data-type label. Later updates win,
# which keeps the precedence of the original if-chain
# (structured < not-structured < semi-structured).
data_type_by_id = {}
data_type_by_id.update(dict.fromkeys(structured_id, 'structured'))
data_type_by_id.update(dict.fromkeys(not_structured_id, 'not-structured'))
data_type_by_id.update(dict.fromkeys(semi_structured_id, 'semi-structured'))

# A single vectorised assignment replaces the per-row chained assignment
# (df[col][idx] = ...), which pandas does not guarantee to write through to
# the frame. Ids that are in none of the lists are left as NaN.
df['Data_type'] = df['Plat Id'].map(data_type_by_id)

### 1.3. Select English data

In [None]:
# Keep only the rows whose 'Language' is 'English' and renumber them from 0.
english_data = df.loc[df['Language'] == 'English'].reset_index(drop=True)

### 1.4. Select one random sample and clean text

In [None]:
def select_random_sample(data, random_state=None):
    """Draw one random record and return it as a one-row DataFrame.

    The description of the sampled record is flattened into a single string,
    printed, and stored in the 'Description' column.

    Args:
        data: DataFrame with 'TITLE', 'Data_type', 'Plat country' and
            'DESCRIPTION' columns.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` (the default) keeps it random.

    Returns:
        A one-row DataFrame with columns 'Title', 'Data_type', 'Country' and
        'Description', carrying the index label of the sampled row.
    """
    random_sample = data.sample(1, random_state=random_state)
    description = random_sample['DESCRIPTION'].iloc[0]

    if isinstance(description, str):
        text = description
    elif isinstance(description, (list, tuple)):
        # Join the fragments in one pass; anything that is not text or an
        # integer is skipped instead of crashing the concatenation.
        text = ''.join(
            str(part) for part in description if isinstance(part, (str, int))
        )
    else:
        # A missing description (e.g. NaN) yields an empty text rather than
        # a "float is not iterable" error.
        text = ''
    print(text)

    df = pd.DataFrame({
        'Title': random_sample['TITLE'],
        'Data_type': random_sample['Data_type'],
        'Country': random_sample['Plat country'],
    })
    df['Description'] = text
    return df

In [None]:
def clean(text):
    """Normalise raw description text before NLP processing.

    The input is converted with ``str()`` and a fixed sequence of literal
    replacements is applied in order. All patterns are plain text, so
    ``str.replace`` is used; this also avoids the invalid ``"\\."`` escape
    sequences that non-raw regex literals would need.

    Args:
        text: The value to clean; any object is accepted and stringified.

    Returns:
        The cleaned string.
    """
    text = str(text)
    replacements = (
        ('\n ', ''),     # newline followed by a space: drop both
        ('\n', ' '),     # any remaining newline becomes a space
        ('\\n', ' '),    # literal backslash-n (e.g. left by str() of a list)
        ('-', ' '),      # hyphens become spaces
        ('— ', ''),      # em dash followed by a space is dropped
        ('"', ''),       # double quotation marks are dropped
        ('Mr.', 'Mr'),   # strip the period after the title, keep the title
        ('Mrs.', 'Mrs'),
        ('</a>', ''),    # closing anchor tags
        ('<a', ''),      # start of opening anchor tags (attributes remain)
    )
    # Order matters: '\n ' must be handled before the bare '\n'.
    for old, new in replacements:
        text = text.replace(old, new)
    return text

## 2. ALGORITHMS

### 2.1. Load spaCy language model

In [None]:
#!python -m spacy download en_core_web_lg
# Load the spaCy 'en_core_web_lg' pipeline once; the resulting `nlp` object is
# the one the cells below call. The commented command above presumably needs
# to be run once if the model is not installed yet.
nlp = spacy.load('en_core_web_lg')

### 2.2. OBJECTIVE

In [None]:
def _first_sentence_with_word(sentences, word):
    """Return the first sentence holding *word* as a space-delimited token, else None."""
    for sentence in sentences:
        if word in sentence.split(' '):
            return sentence
    return None


def extract_objective(sample, doc, threshold=0.7):
    """Return the sentence of *doc* most likely to state the project's objective.

    Three strategies are tried in order and the first hit is returned:

    1. A token whose text is one of the objective keywords.
    2. A non-stop, alphabetic token whose text or lemma is a WordNet synonym
       of a keyword.
    3. Any token whose vector similarity to a synonym reaches *threshold*;
       the first sentence containing such a token's text (as a substring)
       is returned.

    Args:
        sample: Not used by this function.
        doc: Parsed spaCy document of the description.
        threshold: Minimum similarity for strategy 3 (default 0.7).

    Returns:
        The text of the matching sentence, or ``None`` when nothing matches.
    """
    sentences = [sent.text for sent in doc.sents]

    # Objective keywords and their WordNet synonyms (order-preserving,
    # de-duplicated; 'use' is excluded explicitly).
    keywords = ['goal', 'purpose', 'objective', 'intention', 'ambition', 'promote', 'dedicate']
    synonyms = list(dict.fromkeys(
        lemma.name()
        for keyword in keywords
        for synset in wordnet.synsets(keyword)
        for lemma in synset.lemmas()
        if lemma.name() != 'use'
    ))
    synonym_set = set(synonyms)

    # Strategy 1: a keyword appears verbatim in the text.
    for token in doc:
        if token.text in keywords:
            match = _first_sentence_with_word(sentences, token.text)
            if match is not None:
                return match

    # Strategy 2: a content word (or its lemma) is a synonym of a keyword.
    for token in doc:
        if token.is_stop or not token.is_alpha:
            continue
        if token.text in synonym_set or token.lemma_ in synonym_set:
            match = _first_sentence_with_word(sentences, token.text)
            if match is not None:
                return match

    # Strategy 3: vector similarity. Each synonym and each distinct token
    # text is run through the pipeline once, instead of once per
    # (token, synonym) pair.
    synonym_docs = [nlp(synonym) for synonym in synonyms]
    checked_texts = set()
    similar_texts = set()
    for token in doc:
        token_text = token.text
        if token_text in checked_texts:
            continue
        checked_texts.add(token_text)
        token_doc = nlp(token_text)
        if any(token_doc.similarity(syn_doc) >= threshold for syn_doc in synonym_docs):
            similar_texts.add(token_text)

    for sentence in sentences:
        if any(token_text in sentence for token_text in similar_texts):
            return sentence

    return None

### 2.3. GEOLOCATION

In [None]:
def extract_location(sample, doc):
    """Return the most frequently mentioned place in *doc*.

    Args:
        sample: Not used by this function.
        doc: Object exposing ``ents``, each entity having ``label_`` and
            ``text``.

    Returns:
        A one-element list with the text of the most common 'LOC'/'GPE'
        entity, or ``None`` when the document has no such entity.
    """
    # Only 'LOC' and 'GPE' entities count as places; demonyms ('NORP') were
    # tried at some point but are not included.
    place_names = [ent.text for ent in doc.ents if ent.label_ in ('LOC', 'GPE')]

    if not place_names:
        return None

    top_name = Counter(place_names).most_common(1)[0][0]
    return [top_name]

### 2.4. ORGANIZATION

In [None]:
def extract_organization(sample, doc):
    """Return the most frequently mentioned organization in *doc*.

    Entities are counted by their text, so repeated mentions of the same
    organization add up and the result is a plain string, as in
    ``extract_location``.

    Args:
        sample: One-row DataFrame with a 'Title' column; entities whose text
            contains the title are discarded because they name the project
            itself rather than an organization.
        doc: Object exposing ``ents``, each entity having ``label_`` and
            ``text``.

    Returns:
        A one-element list with the text of the most common 'ORG' entity,
        or ``[None]`` when no organization is found.
    """
    title = sample['Title'].tolist()[0]
    # Only a non-empty string title can be compared: a missing title (NaN)
    # would raise on `in`, and an empty one would match every entity.
    filter_by_title = isinstance(title, str) and bool(title)

    names = [
        ent.text
        for ent in doc.ents
        if ent.label_ == 'ORG' and not (filter_by_title and title in ent.text)
    ]
    if not names:
        return [None]

    return [Counter(names).most_common(1)[0][0]]

### 2.5. PARTICIPANTS

In [None]:
def extract_participants(sample, doc):
    """Classify who the project is addressed to, based on fixed phrases.

    Every sentence is scanned for a fixed list of participant categories and
    the result is decided in this order:

    1. No category found: ``"Anyone"``.
    2. "group of" found: the phrase ``"group of <next word>"`` from the first
       sentence where it appears as separate words. If no such phrase
       exists, the remaining rules are applied.
    3. "community" found: ``"Area community"``, with the most common
       location in parentheses when one is detected.
    4. "adults" or "18 years" found: ``"Adults"``.
    5. Otherwise the list of matched categories (one entry per sentence and
       category, so it may contain repeats).

    Args:
        sample: One-row DataFrame for the record; forwarded to
            ``extract_location``.
        doc: Object exposing ``sents`` (each with ``text``) and ``ents``.

    Returns:
        A string for rules 1-4, or a list of strings for rule 5.
    """
    sentences = [sent.text for sent in doc.sents]

    categories = ['students', 'University students', 'kids', 'adults', 'community', '18 years', 'group of', 'undergraduate students']

    participants = [
        category
        for sentence in sentences
        for category in categories
        if category in sentence
    ]

    if not participants:  # nothing specified, so anyone can take part
        return "Anyone"

    if 'group of' in participants:
        for sentence in sentences:
            if 'group of' not in sentence:
                continue
            words = sentence.split(' ')
            # Walk over consecutive word triples so a sentence that ends in
            # "group" or "group of" cannot index past the end of the list.
            for first, second, third in zip(words, words[1:], words[2:]):
                if first == 'group' and second == 'of':
                    return first + ' ' + second + ' ' + third
        # No usable "group of X" phrase: fall through to the other rules
        # instead of returning None implicitly.

    if 'community' in participants:
        # The location is only needed here; use the record passed in rather
        # than a global sample.
        loc = extract_location(sample, doc)
        if loc is None:
            return 'Area community'
        return "Area community (" + loc[0] + ')'

    if 'adults' in participants or '18 years' in participants:
        return 'Adults'

    return participants

### 2.6. DATES

In [None]:
def extract_dates(sample, doc):
    """Collect the date and time entities of *doc*.

    Args:
        sample: Not used by this function.
        doc: Object exposing ``ents``, each entity having ``label_``.

    Returns:
        The entities labelled 'DATE' or 'TIME', in document order (the
        entity objects themselves, not their text).
    """
    return [ent for ent in doc.ents if ent.label_ in ('DATE', 'TIME')]

## 3. PROCESSING TEXT

### 3.1. Select one random text

In [None]:
# Draw one random English record and normalise its description.
rs = select_random_sample(english_data)
# 'Description' holds a single string for the sampled row; pass that string
# itself (not a one-item list) so clean() does not stringify the list and
# leak its brackets and quotes into the text.
text = clean(rs['Description'].iloc[0])

### 3.2. Create nlp

In [None]:
# Run the loaded spaCy pipeline over the cleaned text.
doc = nlp(text)

# to visualise the named entities found in the text:
#displacy.render(doc, style='ent')

# to look up what an entity label means:
#spacy.explain("FAC")

In [None]:
#spacy.explain("GPE")

### 3.3. Apply algorithms

In [None]:
# Run the location, organization, date and participant extractors on the
# sampled record and print each result.
print('LOCATION:', extract_location(rs, doc))
print('ORGANIZATION/S:', extract_organization(rs, doc))
print('DATE/S or DURATION:', extract_dates(rs, doc))
print('PARTICIPANTS: ', extract_participants(rs, doc))

In [None]:
# Run the objective extractor on the sampled record and print the result.
print('OBJECTIVE: ', extract_objective(rs, doc))

## 4. CSV CREATION

### 4.1. Preprocessing

In [None]:
def clean_data(data):
    """Build the working table used by the extraction pipeline.

    Args:
        data: DataFrame with 'TITLE', 'Data_type', 'Plat country' and
            'DESCRIPTION' columns.

    Returns:
        A new DataFrame with the renamed columns 'Title', 'Data_type',
        'Country' and 'Description', four empty-string result columns
        ('Objective', 'Geolocalization', 'Organization', 'Participants'),
        and the original index moved into a leading column by
        ``reset_index()``.
    """
    # Source columns, renamed.
    selected = pd.DataFrame({
        'Title': data['TITLE'],
        'Data_type': data['Data_type'],
        'Country': data['Plat country'],
        'Description': data['DESCRIPTION'],
    })

    # Placeholders that the extraction algorithms fill in later.
    with_results = selected.assign(
        Objective='',
        Geolocalization='',
        Organization='',
        Participants='',
    )

    return with_results.reset_index()

In [None]:
def create_temp_register(data, index):
    """Return the row at position *index* of *data* as a one-row DataFrame."""
    # The list inside iloc keeps the result two-dimensional (a DataFrame
    # rather than a Series).
    return data.iloc[[index]]

In [None]:
def extract_description(data, index):
    """Flatten the description stored at label *index* into one string.

    Args:
        data: DataFrame with a 'Description' column.
        index: Index label of the row to read.

    Returns:
        The description fragments concatenated without separator. String
        fragments are kept, integers are converted with ``str()``, and
        anything else is skipped. A description that is already a string is
        returned unchanged; a missing one (e.g. NaN or None) yields ``''``.
    """
    description = data['Description'][index]

    if isinstance(description, str):
        return description
    if not isinstance(description, (list, tuple)):
        # Missing description: nothing to iterate over.
        return ''

    parts = []
    for fragment in description:
        if isinstance(fragment, str):
            parts.append(fragment)
        elif isinstance(fragment, int) and not isinstance(fragment, bool):
            parts.append(str(fragment))
    return ''.join(parts)

In [None]:
# Scratch check: run the participants extractor on the first English record
# and print the type of its result.
d = clean_data(english_data)
text = extract_description(d, 0)
text = clean(text)
#------------
x = create_temp_register(d, 0)
doc = nlp(text)
# NOTE: despite its name, `loc` holds the participants result here.
loc = extract_participants(x, doc)
print(type(loc))

In [None]:
# Scratch check on one hard-coded row (2468): is its description a float?
# Presumably this probes for a missing description stored as NaN; confirm.
if type(d['Description'][2468]) == float:
    print('yes')

### 4.2. Information extraction (all english data)

In [None]:
def db_creation(data):
    """Run the location, organization and participant extractors on every row.

    Args:
        data: DataFrame accepted by ``clean_data``.

    Returns:
        The table produced by ``clean_data`` with 'Geolocalization',
        'Organization' and 'Participants' filled in. Rows whose description
        is a float (a missing value) get ``None`` in all four result
        columns; for the other rows 'Objective' keeps its empty-string
        placeholder, since objectives are extracted separately.
    """
    # Select only the columns that are needed.
    imp_data = clean_data(data)

    geolocations = []
    objectives = []
    organizations = []
    participants = []

    # clean_data() renumbers the rows from 0, so iterate over the working
    # table by position instead of over the labels of the input frame.
    for i in range(len(imp_data)):
        print(i)
        if isinstance(imp_data['Description'].iloc[i], float):
            geolocations.append(None)
            objectives.append(None)
            organizations.append(None)
            participants.append(None)
            continue

        text = clean(extract_description(imp_data, i))
        doc = nlp(text)
        temp_data = create_temp_register(imp_data, i)
        geolocations.append(extract_location(temp_data, doc))
        objectives.append('')
        organizations.append(extract_organization(temp_data, doc))
        participants.append(extract_participants(temp_data, doc))

    # Assign whole columns at once: chained per-cell assignment
    # (frame[col][i] = value) is not guaranteed to write through to the
    # frame. dtype=object keeps list results as single cell values.
    results = (
        ('Geolocalization', geolocations),
        ('Objective', objectives),
        ('Organization', organizations),
        ('Participants', participants),
    )
    for column, values in results:
        imp_data[column] = pd.Series(values, index=imp_data.index, dtype=object)

    return imp_data
    

In [None]:
def objective_creation(data, limit=200):
    """Extract the objective sentence for the first *limit* rows of *data*.

    Args:
        data: DataFrame accepted by ``clean_data``.
        limit: Maximum number of leading rows to process (default 200).
            ``None`` processes every row.

    Returns:
        A list with one entry per row of *data*: the extracted objective for
        processed rows, ``None`` for rows that were skipped (description is
        a float, i.e. missing) or lie beyond *limit*.
    """
    # Select only the columns that are needed.
    imp_data = clean_data(data)
    n_rows = len(imp_data)

    # One slot per row; multiplying the list by the index object itself is
    # not a valid way to size it.
    objective = [None] * n_rows

    # Never run past the end of the table when it has fewer rows than limit.
    n_to_process = n_rows if limit is None else min(limit, n_rows)
    for i in range(n_to_process):
        print(i)
        if isinstance(imp_data['Description'].iloc[i], float):
            continue
        text = clean(extract_description(imp_data, i))
        doc = nlp(text)
        temp_data = create_temp_register(imp_data, i)
        objective[i] = extract_objective(temp_data, doc)

    return objective

In [None]:
# Build the extraction table for all English records.
ie = db_creation(english_data)

In [None]:
# Extract the objective sentences; the bare name on the last line shows the
# list as the cell output.
obj_list = objective_creation(english_data)
obj_list

In [None]:
# Pad the objectives with None so the list has one entry per English record,
# then show the first entry.
none_list = [None]*(len(english_data)-len(obj_list))
obj_list = obj_list + none_list
obj_list[0]

In [None]:
# Overwrite 'Description' with the original 'DESCRIPTION' column of the
# English data and attach the extracted objectives.
ie['Description'] = english_data['DESCRIPTION']
ie['Objective'] = obj_list

### 4.3. CSV

In [None]:
# Write the table as a semicolon-separated UTF-8 CSV, without the row index.
ie.to_csv('InformationExtraction_reduced.csv', index = False, sep=';', encoding='utf-8')

## 5. MOST COMMON WORDS

### 5.1. List to string

In [None]:
# Concatenate the extracted objectives (the first 200 entries at most), each
# followed by a space; entries without an objective are skipped. Slicing
# keeps this safe when the list holds fewer than 200 items.
text = ''.join(obj + ' ' for obj in obj_list[:200] if obj is not None)

### 5.2. Clean text (remove stop words)

In [None]:
from gensim.parsing.preprocessing import STOPWORDS

Add the most typical words in objective sentences

In [None]:
# Objective keywords and every WordNet lemma name of their synsets, in first
# occurrence order and without repeats; 'use' is excluded explicitly.
words = ['goal', 'purpose', 'objective', 'intention', 'ambition', 'promote', 'dedicate', 'aims']
candidate_names = [
    lemma.name()
    for keyword in words
    for synset in wordnet.synsets(keyword)
    for lemma in synset.lemmas()
]
synonyms = list(dict.fromkeys(name for name in candidate_names if name != 'use'))

In [None]:
# Extend the gensim stop-word set with the objective synonyms so they are
# filtered out as well.
all_stopwords_gensim = STOPWORDS.union(set(synonyms))

In [None]:
# Keep every space-separated word that is not a stop word, each followed by
# a single space.
clean_text = ''.join(
    word + ' '
    for word in text.split(' ')
    if word not in all_stopwords_gensim
)

### 5.3. WordCloud

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

In [None]:
# Generate a word cloud (at most 150 words, white background) from the
# filtered text and display it without axes.
wordcloud = WordCloud(max_words=150, background_color="white").generate(clean_text)
plt.figure(figsize = (15,15))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Save the generated word cloud as a PNG image.
wordcloud.to_file("WordCloud.png")