# Tratamiento de datos
Este cuaderno reúne todas las transformaciones que se aplican a los datos, de los cuales se nutrirá el resto de cuadernos, ya sea para el estudio de los datos o para las pruebas de diferentes técnicas: distancia de Levenshtein, ML, DL...

In [None]:
# Mount Google Drive inside the Colab runtime; every path used below lives under /content/drive.
from google.colab import drive
# force_remount = True asks Colab to mount again even if a mount already exists.
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


Importamos librerías necesarias

In [None]:
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import json

Importamos los datos en formato JSON, tal y como salen de la base de datos de Elasticsearch.

In [None]:
# Parse the raw Elasticsearch export into `data`.
# The context manager closes the file handle as soon as parsing finishes
# (the previous bare open() left it open for the whole session); JSON text is
# read explicitly as UTF-8 instead of relying on the platform default.
with open('/content/drive/MyDrive/TFM/notebooks/sin_procesar_27-bueno.json', encoding='utf-8') as f:
    data = json.load(f)

Transformamos los datos en un dataframe

In [None]:
# Build a DataFrame with one row per record of the parsed JSON export.
df = pd.DataFrame.from_records(data)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,_id,_index,_score,_source,_ignored
0,941239bc-aa3f-4bc5-aa42-9a32d10b205e,document.v2,2.381428,"{'180_day_usage_count': 1, 'abstract': 'BACKGR...",
1,93198d25-b626-4748-ace8-f95b0ad60484,document.v2,2.381428,"{'180_day_usage_count': 0, 'abstract': 'BACKGR...",
2,83d53e21-3df6-4143-afd8-7d98cd4e4a3b,document.v2,2.381428,"{'180_day_usage_count': None, 'abstract': 'The...",
3,2d323be9-4499-4511-8bb9-231e3abe613b,document.v2,2.381428,"{'180_day_usage_count': None, 'abstract': 'Hun...",
4,e65015a2-a186-4cb2-9f36-e6fdfe768fdb,document.v2,2.381428,"{'180_day_usage_count': None, 'abstract': 'Hun...",


Obviamos alguna información relativa a la base de datos.

In [None]:
# Keep only the '_source' payload of every record; the remaining top-level
# fields are database bookkeeping.
data_source = [hit['_source'] for hit in data]

df = pd.DataFrame.from_records(data_source)
# Expose the nested source['database'] value as its own flat column.
df['source.database'] = df['source'].map(lambda src: src['database'])
df.head()

Unnamed: 0,180_day_usage_count,abstract,article_number,article_title,authors,concept,dataset,document_type,doi,domain,...,author_keywords,doi_link,funding_orgs,keywords_plus,language,research_areas,unique_wos_id,affiliations,cited_reference_count,source.database
0,1.0,BACKGROUND: Ultrasonound is used to identify a...,,Assistive artificial intelligence for ultrasou...,"[{'affiliations': [], 'email_address': None, '...",algun comentario,"[{'batch': 'un dataset_001', 'name': 'un datas...",Journal Article,10.1016/j.bja.2022.06.031,TECNOLOGIA,...,,,,,,,,,,WOS
1,0.0,BACKGROUND: Ultrasound-guided regional anaesth...,,Evaluation of the impact of assistive artifici...,"[{'affiliations': [], 'email_address': None, '...",algun comentario,"[{'batch': 'un dataset_001', 'name': 'un datas...",Journal Article; Comment,10.1016/j.bja.2022.07.049,TECNOLOGIA,...,,,,,,,,,,WOS
2,,The identification of molecular biomarkers in ...,,Cerebrospinal fluid biomarkers for assessing H...,"[{'affiliations': [], 'email_address': None, '...",algun comentario,"[{'batch': 'un dataset_001', 'name': 'un datas...",Article,10.1093/braincomms/fcac309,SALUD,...,"[huntington disease, biomarkers, csf, neurofil...",http://dx.doi.org/10.1093/braincomms/fcac309,[],"[neurofilament light protein, projection neuro...",English,[],WOS:000897941400003,,,WOS
3,,Huntington disease (HD) is a neurodegenerative...,105652.0,Cerebrospinal fluid mutant huntingtin is a bio...,"[{'affiliations': [], 'email_address': 'ncaron...",algun comentario,"[{'batch': 'un dataset_001', 'name': 'un datas...",Article,10.1016/j.nbd.2022.105652,SALUD,...,"[huntington disease, biomarker, neurodegenerat...",http://dx.doi.org/10.1016/j.nbd.2022.105652,[{'name': 'Canadian Institutes of Health Resea...,"[gene, motor, expression, repeat, bachd, model]",English,[],WOS:000820445700002,,,WOS
4,,Huntington disease (HD) is a neurodegenerative...,,Mutant Huntingtin Is Cleared from the Brain vi...,"[{'affiliations': [], 'email_address': None, '...",algun comentario,"[{'batch': 'un dataset_001', 'name': 'un datas...",Article,10.1523/JNEUROSCI.1865-20.2020,SALUD,...,"[biomarker, cerebrospinal fluid, glymphatic sy...",http://dx.doi.org/10.1523/JNEUROSCI.1865-20.2020,[{'name': 'Canadian Institutes of Health Resea...,"[yac128 mouse model, cerebrospinal-fluid, poly...",English,[],WOS:000613907100006,,,WOS


In [None]:
# Build one record per (article, author) pair. Four parallel lists are filled:
#   characteristics - keywords, year, title, co-authors, institutions and
#                     abstract joined with single spaces
#   domains         - the article's 'domain'
#   sources         - the article's source['database']
#   names           - the author's name ('' when missing or None)
characteristics = []
domains = []
sources = []
names = []
for article in data:
    source = article['_source']
    authors = source.get('authors') or []

    # Article-level fields are identical for every author of the article,
    # so they are computed once per article instead of once per author.
    # Missing keys and explicit None values both collapse to an empty value.
    keywords = ' '.join(source.get('author_keywords') or [])
    institutions = ' '.join(
        inst['name']
        for inst in (source.get('institutions') or [])
        if inst.get('name') is not None
    )
    article_title = source.get('article_title')
    if article_title is None:
        article_title = ''
    abstract = source.get('abstract')
    if abstract is None:
        abstract = ''
    publication_year = source.get('publication_year')
    publication_year = '' if publication_year is None else str(publication_year)

    # .get() keeps authors without a 'name' key from raising KeyError.
    author_names = [a.get('name') for a in authors]

    for raw_name in author_names:
        name = '' if raw_name is None else raw_name
        # Co-authors are all other signed names on the article; None names are
        # skipped so that ' '.join never receives a non-string.
        coauthors = ' '.join(
            other for other in author_names
            if other is not None and other != raw_name
        )
        # Concatenate the characteristics into a single text field.
        c = ' '.join([keywords, publication_year, article_title, coauthors, institutions, abstract])
        characteristics.append(c)
        # Domains
        domains.append(source['domain'])
        # Sources
        sources.append(source['source']['database'])
        # Names
        names.append(name)

In [None]:
# Sanity check: the three parallel lists must report the same length.
print(f'Características: {len(characteristics)}')
print(f'Dominios: {len(domains)}')
print(f'Fuentes: {len(sources)}')

Características: 395
Dominios: 395
Fuentes: 395


Guardamos los datos transformados en un DataFrame, y este en un archivo CSV.
Este archivo será editado a mano para enlazar los autores a través de un mismo ID.

In [None]:
# Assemble one row per collected (article, author) record; 'id' is None for every row.
# NOTE(review): per the notebook text, the id column is filled in by hand afterwards
# to link rows that refer to the same person.
df_characteristics = pd.DataFrame({'id': None, 'characteristics': characteristics,  'domain': domains, 'source': sources, 'name': names})
# Persist the table to Drive as CSV.
df_characteristics.to_csv('/content/drive/MyDrive/TFM/notebooks/characteristics_27_bueno_nombres.csv')

Obtenemos los datos del Excel, los cuales ya vienen con un id que relaciona las líneas que se refieren a la misma persona.
También se han modificado los nombres de los autores, teniendo en cuenta las diferentes formas que suelen usarse para firmar.

In [None]:
# Load the hand-edited spreadsheet of author records from Drive.
df_authors = pd.read_excel('/content/drive/MyDrive/TFM/notebooks/characteristics_30_nombres_modificados.xlsx')
# Show the loaded table as the cell output.
df_authors

Unnamed: 0,temp_id,id,characteristics,domain,source,name
0,286.0,1.0,Modeling and Optimizing the Impact of Process ...,TECNOLOGIA,IEEE,C. I. Lang
1,291.0,1.0,One Class Process Anomaly Detection Using Kern...,TECNOLOGIA,IEEE,Cristopher I. Lang
2,302.0,1.0,Intelligent Optimization of Dosing Uniformity ...,TECNOLOGIA,IEEE,"Lang, CI"
3,290.0,2.0,Modeling and Optimizing the Impact of Process ...,TECNOLOGIA,IEEE,D. S. Boning
4,301.0,2.0,One Class Process Anomaly Detection Using Kern...,TECNOLOGIA,IEEE,Duane S. Boning
...,...,...,...,...,...,...
391,379.0,,Joint Task Offloading and Resource Allocation ...,TECNOLOGIA,IEEE,S. Li
392,380.0,,Joint Task Offloading and Resource Allocation ...,TECNOLOGIA,IEEE,J. Liu
393,385.0,,"DNN Deployment, Task Offloading, and Resource ...",TECNOLOGIA,IEEE,Z. Chen
394,386.0,,"DNN Deployment, Task Offloading, and Resource ...",TECNOLOGIA,IEEE,Z. Hao


Añadimos los nombres a la concatenación de características.

In [None]:
# Prepend each author's name to the characteristics text of the same row.
# The previous loop passed the characteristics *text* as the column label to
# .loc, which created a new column per row instead of updating the
# 'characteristics' column. The update is done column-wise here; missing
# values are treated as empty strings so the concatenation cannot fail on NaN.
df_authors['characteristics'] = (
    df_authors['name'].fillna('').astype(str)
    + ' '
    + df_authors['characteristics'].fillna('').astype(str)
)
df_authors

  df_authors.loc[index, row['characteristics']] = ' '.join([row['name'], row['characteristics']])
  df_authors.loc[index, row['characteristics']] = ' '.join([row['name'], row['characteristics']])
  df_authors.loc[index, row['characteristics']] = ' '.join([row['name'], row['characteristics']])
  df_authors.loc[index, row['characteristics']] = ' '.join([row['name'], row['characteristics']])
  df_authors.loc[index, row['characteristics']] = ' '.join([row['name'], row['characteristics']])
  df_authors.loc[index, row['characteristics']] = ' '.join([row['name'], row['characteristics']])
  df_authors.loc[index, row['characteristics']] = ' '.join([row['name'], row['characteristics']])
  df_authors.loc[index, row['characteristics']] = ' '.join([row['name'], row['characteristics']])
  df_authors.loc[index, row['characteristics']] = ' '.join([row['name'], row['characteristics']])
  df_authors.loc[index, row['characteristics']] = ' '.join([row['name'], row['characteristics']])
  df_authors.loc[ind

TypeError: ignored

Guardamos en un fichero.

In [None]:
# Save the author table to Drive as CSV so later sections can reload it.
df_authors.to_csv('/content/drive/MyDrive/TFM/notebooks/characteristics_30_nombres_modificados_concatenados.csv')

## Dataset confrontado: nombre y características

In [None]:
# Reload the saved author table; index_col = 0 uses the first CSV column as the DataFrame index.
df_authors = pd.read_csv('/content/drive/MyDrive/TFM/notebooks/characteristics_30_nombres_modificados_concatenados.csv', index_col = 0)
# Show the loaded table as the cell output.
df_authors

Unnamed: 0.1,Unnamed: 0,id,characteristics,domain,source,name
0,0,31.0,"Bowness, James Simeon Assistive artificial int...",TECNOLOGIA,WOS,"Bowness, James Simeon"
1,1,32.0,"Burckett-St Laurent, D Assistive artificial in...",TECNOLOGIA,WOS,"Burckett-St Laurent, D"
2,2,,"Hernandez, Nadia Assistive artificial intellig...",TECNOLOGIA,WOS,"Hernandez, Nadia"
3,3,,"Keane, Pearse A Assistive artificial intellige...",TECNOLOGIA,WOS,"Keane, Pearse A"
4,4,,"Lobo, Clara Assistive artificial intelligence ...",TECNOLOGIA,WOS,"Lobo, Clara"
...,...,...,...,...,...,...
390,390,20.0,"Yuan'An Liu DNN Deployment, Task Offloading, a...",TECNOLOGIA,IEEE,Yuan'An Liu
391,391,24.0,Z. Ye Deep Negative Correlation Multisource Do...,TECNOLOGIA,IEEE,Z. Ye
392,392,25.0,Jianbo Yu Deep Negative Correlation Multisourc...,TECNOLOGIA,IEEE,Jianbo Yu
393,393,24.0,Zhuang Ye Multiscale Weighted Morphological Ne...,TECNOLOGIA,IEEE,Zhuang Ye


Tokenizamos

In [None]:
!pip install nltk



In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK 'stopwords' and 'punkt' packages used below.
nltk.download('stopwords')
nltk.download('punkt')
# English stop-word list, held as a set for membership tests during filtering.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Tokenize every characteristics string and drop English stop words.
# The stop-word test is case-insensitive; kept tokens retain their casing.
docs_tokenized = [
    [token for token in word_tokenize(text) if token.lower() not in stop_words]
    for text in df_authors['characteristics']
]
# Author ids in the same row order as the tokenized documents.
ids = df_authors['id'].tolist()

# Re-join the kept tokens into one space-separated string per row.
authors_tokenized = [" ".join(tokens) for tokens in docs_tokenized]

In [None]:
# Inspect the first filtered string as a quick visual check.
authors_tokenized[0]

'Bowness , James Simeon Assistive artificial intelligence ultrasound image interpretation regional anaesthesia : external validation study . BACKGROUND : Ultrasonound used identify anatomical structures regional anaesthesia guide needle insertion injection local anaesthetic . ScanNav Anatomy Peripheral Nerve Block ( Intelligent Ultrasound , Cardiff , UK ) artificial intelligence-based device produces colour overlay real-time B-mode ultrasound highlight anatomical structures interest . evaluated accuracy artificial-intelligence colour overlay perceived influence risk adverse events block failure.METHODS : Ultrasound-guided regional anaesthesia experts acquired 720 videos 40 volunteers ( across nine anatomical regions ) without using device . artificial-intelligence colour overlay subsequently applied . Three experts independently reviewed video ( original unmodified video ) assess accuracy colour overlay relation key anatomical structures ( true positive/negative false positive/negative

In [None]:
# Combine the ids, the stop-word-filtered text and the original author names in one table.
# NOTE(review): 'name' is passed as a Series while the other columns are lists — confirm
# that df_authors' index matches the list positions.
dataset_tokenized = pd.DataFrame({'id': ids, 'characteristics': authors_tokenized, 'name': df_authors['name']})
# Show the first rows as the cell output.
dataset_tokenized.head()

Unnamed: 0,id,characteristics,name
0,31.0,"Bowness , James Simeon Assistive artificial in...","Bowness, James Simeon"
1,32.0,"Burckett-St Laurent , Assistive artificial int...","Burckett-St Laurent, D"
2,,"Hernandez , Nadia Assistive artificial intelli...","Hernandez, Nadia"
3,,"Keane , Pearse Assistive artificial intelligen...","Keane, Pearse A"
4,,"Lobo , Clara Assistive artificial intelligence...","Lobo, Clara"


In [None]:
# Build the all-against-all table: one row per ordered (author, candidate)
# pair with both texts, both names and a 0/1 label.
# label = 1 when both rows share the same id, or when the row is compared
# with itself (needed because NaN ids never compare equal); otherwise 0.
#
# Rows are collected in a plain list and the DataFrame is built once.
# Calling pd.concat for every pair copies the whole frame each time, which
# made this cell too slow to finish.
pair_columns = ["author", "author_name", "candidate", "candidate_name", "label"]
pair_source = list(zip(
    dataset_tokenized.index,
    dataset_tokenized['id'],
    dataset_tokenized['characteristics'],
    dataset_tokenized['name'],
))
pair_rows = []
for index_author, author_id, author_text, author_name in pair_source:
    for index_candidate, candidate_id, candidate_text, candidate_name in pair_source:
        is_same = author_id == candidate_id or index_author == index_candidate
        pair_rows.append((author_text, author_name, candidate_text, candidate_name, int(is_same)))
# The previous implementation prepended every new row, so the final table was
# in reverse generation order; keep that order.
pair_rows.reverse()
df_characteristics = pd.DataFrame(pair_rows, columns = pair_columns)

df_characteristics.head()

KeyboardInterrupt: ignored

## Dataset confrontado: por nombre
Estamos ante un problema de todos contra todos, por lo que, para confeccionar el dataset, necesitamos tener en cada línea la siguiente información:
- Autor que queremos enlazar
- Autor candidato
- 0/1 en función de si es o no la misma persona


  Para la **distancia de Levenshtein** necesitaremos solo los nombres y la etiqueta.

In [None]:
# Reload the saved author table; index_col = 0 uses the first CSV column as the DataFrame index.
df_authors = pd.read_csv('/content/drive/MyDrive/TFM/notebooks/characteristics_30_nombres_modificados_concatenados.csv', index_col = 0)
# Show the loaded table as the cell output.
df_authors

Unnamed: 0.1,Unnamed: 0,id,characteristics,domain,source,name
0,0,31.0,"Bowness, James Simeon Assistive artificial int...",TECNOLOGIA,WOS,"Bowness, James Simeon"
1,1,32.0,"Burckett-St Laurent, D Assistive artificial in...",TECNOLOGIA,WOS,"Burckett-St Laurent, D"
2,2,,"Hernandez, Nadia Assistive artificial intellig...",TECNOLOGIA,WOS,"Hernandez, Nadia"
3,3,,"Keane, Pearse A Assistive artificial intellige...",TECNOLOGIA,WOS,"Keane, Pearse A"
4,4,,"Lobo, Clara Assistive artificial intelligence ...",TECNOLOGIA,WOS,"Lobo, Clara"
...,...,...,...,...,...,...
390,390,20.0,"Yuan'An Liu DNN Deployment, Task Offloading, a...",TECNOLOGIA,IEEE,Yuan'An Liu
391,391,24.0,Z. Ye Deep Negative Correlation Multisource Do...,TECNOLOGIA,IEEE,Z. Ye
392,392,25.0,Jianbo Yu Deep Negative Correlation Multisourc...,TECNOLOGIA,IEEE,Jianbo Yu
393,393,24.0,Zhuang Ye Multiscale Weighted Morphological Ne...,TECNOLOGIA,IEEE,Zhuang Ye


In [None]:
# Build the all-against-all name table: one row per ordered
# (author, candidate) pair of names with a 0/1 label.
# label = 1 when both rows share the same id, or when the row is compared
# with itself (needed because NaN ids never compare equal); otherwise 0.
#
# Rows are collected in a plain list and the DataFrame is built once.
# Calling pd.concat for every pair copies the whole frame each time, which
# made this cell too slow to finish.
lev_source = list(zip(df_authors.index, df_authors['id'], df_authors['name']))
lev_rows = []
for index_author, author_id, author_name in lev_source:
    for index_candidate, candidate_id, candidate_name in lev_source:
        is_same = author_id == candidate_id or index_author == index_candidate
        lev_rows.append((author_name, candidate_name, int(is_same)))
# The previous implementation prepended every new row, so the final table was
# in reverse generation order; keep that order.
lev_rows.reverse()
dataset_lev = pd.DataFrame(lev_rows, columns = ["author", "candidate", "label"])

dataset_lev.head()

KeyboardInterrupt: ignored

In [None]:
# Number of (author, candidate) rows in the name-pair table.
len(dataset_lev)

Guardamos este dataset en un fichero, para que podamos acceder a él desde otros cuadernos sin tener que volver a procesarlo.

In [None]:
# Persist the name-pair table to Drive as CSV.
dataset_lev.to_csv('/content/drive/MyDrive/TFM/notebooks/autores_lev.csv')

### Sin los autores que no se repiten
Para que no haya tanto desbalance entre los que tienen etiqueta 1 y 0 en el dataset de entrenamiento.

In [None]:
# Same name-pair table as before, but only authors that have an id are used
# on the "author" side; every row is still used as a candidate.
#   clean_lev - (author name, candidate name, label) pairs
#   plain_lev - (id, name, domain, source) of every author that has an id
# label = 1 when both rows share the same id or the row is compared with
# itself; otherwise 0.
#
# Rows are collected in lists and each DataFrame is built once, instead of
# calling pd.concat per row, which copies the whole frame every time.
lev_source = list(zip(
    df_authors.index,
    df_authors['id'],
    df_authors['name'],
    df_authors['domain'],
    df_authors['source'],
))
clean_rows = []
plain_rows = []
for index_author, author_id, author_name, author_domain, author_source in lev_source:
    if pd.isna(author_id):
        continue
    plain_rows.append((author_id, author_name, author_domain, author_source))
    for index_candidate, candidate_id, candidate_name, _domain, _source in lev_source:
        is_same = author_id == candidate_id or index_author == index_candidate
        clean_rows.append((author_name, candidate_name, int(is_same)))
# The previous implementation prepended every new row, so both tables were in
# reverse generation order; keep that order.
clean_lev = pd.DataFrame(clean_rows[::-1], columns = ["author", "candidate", "label"])
plain_lev = pd.DataFrame(plain_rows[::-1], columns = ["id", "author", "domain", "source"])
print(clean_lev.head())
print(plain_lev.head())

  author    candidate label
0  J. Yu        J. Yu     1
1  J. Yu    Zhuang Ye     0
2  J. Yu    Jianbo Yu     1
3  J. Yu        Z. Ye     0
4  J. Yu  Yuan'An Liu     0
     id       author      domain source
0  25.0        J. Yu  TECNOLOGIA   IEEE
1  24.0    Zhuang Ye  TECNOLOGIA   IEEE
2  25.0    Jianbo Yu  TECNOLOGIA   IEEE
3  24.0        Z. Ye  TECNOLOGIA   IEEE
4  20.0  Yuan'An Liu  TECNOLOGIA   IEEE


In [None]:
# Persist the reduced name-pair table to Drive as CSV.
clean_lev.to_csv('/content/drive/MyDrive/TFM/notebooks/autores_lev_clean.csv')

In [None]:
# Persist the flat list of authors that have an id to Drive as CSV.
plain_lev.to_csv('/content/drive/MyDrive/TFM/notebooks/autores_lev_plain.csv')

## Dataset tokenizado: por características concatenadas
Para las otras pruebas, queremos tokenizar el string de características concatenadas y deshacernos de las stopwords, antes de proceder a generar los embeddings según las diferentes técnicas.

In [None]:
# Reload the saved author table. index_col = 0 uses the first CSV column as the
# index, consistent with the other reads of this file in the notebook; without
# it the saved index comes back as an extra 'Unnamed: 0.x' data column.
df_authors = pd.read_csv('/content/drive/MyDrive/TFM/notebooks/characteristics_30_nombres_modificados_concatenados.csv', index_col = 0)
df_authors

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,characteristics,domain,source,name
0,0,0,31.0,"Bowness, James Simeon Assistive artificial int...",TECNOLOGIA,WOS,"Bowness, James Simeon"
1,1,1,32.0,"Burckett-St Laurent, D Assistive artificial in...",TECNOLOGIA,WOS,"Burckett-St Laurent, D"
2,2,2,,"Hernandez, Nadia Assistive artificial intellig...",TECNOLOGIA,WOS,"Hernandez, Nadia"
3,3,3,,"Keane, Pearse A Assistive artificial intellige...",TECNOLOGIA,WOS,"Keane, Pearse A"
4,4,4,,"Lobo, Clara Assistive artificial intelligence ...",TECNOLOGIA,WOS,"Lobo, Clara"
...,...,...,...,...,...,...,...
390,390,390,20.0,"Yuan'An Liu DNN Deployment, Task Offloading, a...",TECNOLOGIA,IEEE,Yuan'An Liu
391,391,391,24.0,Z. Ye Deep Negative Correlation Multisource Do...,TECNOLOGIA,IEEE,Z. Ye
392,392,392,25.0,Jianbo Yu Deep Negative Correlation Multisourc...,TECNOLOGIA,IEEE,Jianbo Yu
393,393,393,24.0,Zhuang Ye Multiscale Weighted Morphological Ne...,TECNOLOGIA,IEEE,Zhuang Ye


In [None]:
!pip install nltk



In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK 'stopwords' and 'punkt' packages used below.
nltk.download('stopwords')
nltk.download('punkt')
# English stop-word list, held as a set for membership tests during filtering.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Tokenize every characteristics string and drop English stop words.
# The stop-word test is case-insensitive; kept tokens retain their casing.
docs_tokenized = [
    [token for token in word_tokenize(text) if token.lower() not in stop_words]
    for text in df_authors['characteristics']
]
# Author ids in the same row order as the tokenized documents.
ids = df_authors['id'].tolist()

# Re-join the kept tokens into one space-separated string per row.
authors_tokenized = [" ".join(tokens) for tokens in docs_tokenized]

In [None]:
# Inspect the first filtered string as a quick visual check.
authors_tokenized[0]

'Bowness , James Simeon Assistive artificial intelligence ultrasound image interpretation regional anaesthesia : external validation study . BACKGROUND : Ultrasonound used identify anatomical structures regional anaesthesia guide needle insertion injection local anaesthetic . ScanNav Anatomy Peripheral Nerve Block ( Intelligent Ultrasound , Cardiff , UK ) artificial intelligence-based device produces colour overlay real-time B-mode ultrasound highlight anatomical structures interest . evaluated accuracy artificial-intelligence colour overlay perceived influence risk adverse events block failure.METHODS : Ultrasound-guided regional anaesthesia experts acquired 720 videos 40 volunteers ( across nine anatomical regions ) without using device . artificial-intelligence colour overlay subsequently applied . Three experts independently reviewed video ( original unmodified video ) assess accuracy colour overlay relation key anatomical structures ( true positive/negative false positive/negative

In [None]:
# Pair every author id with its stop-word-filtered characteristics text.
dataset_tokenized = pd.DataFrame({'id': ids, 'characteristics': authors_tokenized})
# Show the first rows as the cell output.
dataset_tokenized.head()

Unnamed: 0,id,characteristics
0,31.0,"Bowness , James Simeon Assistive artificial in..."
1,32.0,"Burckett-St Laurent , Assistive artificial int..."
2,,"Hernandez , Nadia Assistive artificial intelli..."
3,,"Keane , Pearse Assistive artificial intelligen..."
4,,"Lobo , Clara Assistive artificial intelligence..."


In [None]:
# Build the all-against-all table over the tokenized texts: one row per
# ordered (author, candidate) pair with a 0/1 label.
# label = 1 when both rows share the same id, or when the row is compared
# with itself (needed because NaN ids never compare equal); otherwise 0.
#
# Rows are collected in a plain list and the DataFrame is built once, instead
# of calling pd.concat per pair, which copies the whole frame every time.
token_source = list(zip(
    dataset_tokenized.index,
    dataset_tokenized['id'],
    dataset_tokenized['characteristics'],
))
token_rows = []
for index_author, author_id, author_text in token_source:
    for index_candidate, candidate_id, candidate_text in token_source:
        is_same = author_id == candidate_id or index_author == index_candidate
        token_rows.append((author_text, candidate_text, int(is_same)))
# The previous implementation prepended every new row, so the final table was
# in reverse generation order; keep that order.
token_rows.reverse()
dataset_characteristics_tokenized = pd.DataFrame(token_rows, columns = ["author", "candidate", "label"])

dataset_characteristics_tokenized.head()

Unnamed: 0,author,candidate,label
0,J. Yu Multiscale Weighted Morphological Networ...,J. Yu Multiscale Weighted Morphological Networ...,1
1,J. Yu Multiscale Weighted Morphological Networ...,Zhuang Ye Multiscale Weighted Morphological Ne...,0
2,J. Yu Multiscale Weighted Morphological Networ...,Jianbo Yu Deep Negative Correlation Multisourc...,1
3,J. Yu Multiscale Weighted Morphological Networ...,Z. Ye Deep Negative Correlation Multisource Do...,0
4,J. Yu Multiscale Weighted Morphological Networ...,"Yuan'An Liu DNN Deployment , Task Offloading ,...",0


In [None]:
# Persist the full tokenized pair table to Drive as CSV.
dataset_characteristics_tokenized.to_csv('/content/drive/MyDrive/TFM/notebooks/characteristics_tokenizadas_confrontadas.csv')

### Sin los autores que no se repiten
Para que no haya tanto desbalance entre los que tienen etiqueta 1 y 0 en el dataset de entrenamiento.

In [None]:
# Reduced tokenized pair table: only authors that have an id are used on the
# "author" side; every row is still used as a candidate.
#   clean_characteristics_tokenized - (author text, candidate text, label)
#   plain_characteristics_tokenized - (id, text) of every author with an id
# label = 1 when both rows share the same id or the row is compared with
# itself; otherwise 0.
#
# The rows come from dataset_tokenized. The previous version iterated the raw
# df_authors, so the "tokenized" outputs contained the unfiltered text.
# Rows are collected in lists and each DataFrame is built once, instead of
# calling pd.concat per row.
token_source = list(zip(
    dataset_tokenized.index,
    dataset_tokenized['id'],
    dataset_tokenized['characteristics'],
))
clean_rows = []
plain_rows = []
for index_author, author_id, author_text in token_source:
    if pd.isna(author_id):
        continue
    plain_rows.append((author_id, author_text))
    for index_candidate, candidate_id, candidate_text in token_source:
        is_same = author_id == candidate_id or index_author == index_candidate
        clean_rows.append((author_text, candidate_text, int(is_same)))
# The previous implementation prepended every new row, so both tables were in
# reverse generation order; keep that order.
clean_characteristics_tokenized = pd.DataFrame(clean_rows[::-1], columns = ["author", "candidate", "label"])
plain_characteristics_tokenized = pd.DataFrame(plain_rows[::-1], columns = ["id", "author"])
print(clean_characteristics_tokenized.head())
print(plain_characteristics_tokenized.head())

                                              author  \
0  J. Yu Multiscale Weighted Morphological Networ...   
1  J. Yu Multiscale Weighted Morphological Networ...   
2  J. Yu Multiscale Weighted Morphological Networ...   
3  J. Yu Multiscale Weighted Morphological Networ...   
4  J. Yu Multiscale Weighted Morphological Networ...   

                                           candidate label  
0  J. Yu Multiscale Weighted Morphological Networ...     1  
1  Zhuang Ye Multiscale Weighted Morphological Ne...     0  
2  Jianbo Yu Deep Negative Correlation Multisourc...     1  
3  Z. Ye Deep Negative Correlation Multisource Do...     0  
4  Yuan'An Liu DNN Deployment, Task Offloading, a...     0  
     id                                             author
0  25.0  J. Yu Multiscale Weighted Morphological Networ...
1  24.0  Zhuang Ye Multiscale Weighted Morphological Ne...
2  25.0  Jianbo Yu Deep Negative Correlation Multisourc...
3  24.0  Z. Ye Deep Negative Correlation Multisource Do...
4 

In [None]:
# Persist the reduced pair table to Drive as CSV.
clean_characteristics_tokenized.to_csv('/content/drive/MyDrive/TFM/notebooks/characteristics_tokenizadas_confrontadas_clean.csv')

In [None]:
# Persist the flat (id, text) list of authors that have an id to Drive as CSV.
plain_characteristics_tokenized.to_csv('/content/drive/MyDrive/TFM/notebooks/characteristics_tokenizadas_plain_clean.csv')

# Conjuntos de datos
Vamos a separar la tabla de autores confrontados en 3 subconjuntos:
*   Train (70%)
*   Test (15%)
*   Validation (15%)
En todos los grupos habrá el mismo porcentaje de casos positivos que negativos.





In [None]:
# Load the full tokenized pair table (author, candidate, label) saved earlier in this notebook.
# Note that the name df_authors is reused here for the pair table.
df_authors = pd.read_csv('/content/drive/MyDrive/TFM/notebooks/characteristics_tokenizadas_confrontadas.csv', index_col = 0)

In [None]:
# Separate the pair table by label (1 = same person, 0 = different person)
# and record the size of each group.
pair_labels = df_authors['label']
df_positives = df_authors[pair_labels == 1]
df_negatives = df_authors[pair_labels == 0]
len_positives, len_negatives = len(df_positives), len(df_negatives)

Casos positivos

In [None]:
# Preview the first positive (label == 1) pairs.
df_positives.head()

Unnamed: 0,author,candidate,label
0,J. Yu Multiscale Weighted Morphological Networ...,J. Yu Multiscale Weighted Morphological Networ...,1
2,J. Yu Multiscale Weighted Morphological Networ...,Jianbo Yu Deep Negative Correlation Multisourc...,1
396,Zhuang Ye Multiscale Weighted Morphological Ne...,Zhuang Ye Multiscale Weighted Morphological Ne...,1
398,Zhuang Ye Multiscale Weighted Morphological Ne...,Z. Ye Deep Negative Correlation Multisource Do...,1
790,Jianbo Yu Deep Negative Correlation Multisourc...,J. Yu Multiscale Weighted Morphological Networ...,1


In [None]:
# Split the positive pairs 70 / 15 / 15 into train / test / validation.
# A fixed random_state makes the split reproducible, so re-running the
# notebook regenerates the same files for the notebooks that consume them.
train_positives = df_positives.sample(frac = 0.7, random_state = 42)
rest_positives = df_positives.drop(train_positives.index)
# Half of the remaining 30% goes to test; whatever is left is validation.
test_positives = rest_positives.sample(frac = 0.5, random_state = 42)
val_positives = rest_positives.drop(test_positives.index)

In [None]:
# For each subset print the expected size (truncated fraction of the total)
# followed by the size actually obtained.
print('Valores esperados vs obtenidos')
for fraction, subset in ((0.7, train_positives), (0.15, test_positives), (0.15, val_positives)):
    print(int(len_positives * fraction))
    print(len(subset))

Valores esperados vs obtenidos
555
555
118
119
118
119


Casos negativos

In [None]:
# Preview the first negative (label == 0) pairs.
df_negatives.head()

Unnamed: 0,author,candidate,label
1,J. Yu Multiscale Weighted Morphological Networ...,Zhuang Ye Multiscale Weighted Morphological Ne...,0
3,J. Yu Multiscale Weighted Morphological Networ...,Z. Ye Deep Negative Correlation Multisource Do...,0
4,J. Yu Multiscale Weighted Morphological Networ...,"Yuan'An Liu DNN Deployment , Task Offloading ,...",0
5,J. Yu Multiscale Weighted Morphological Networ...,"B. Tang DNN Deployment , Task Offloading , Res...",0
6,J. Yu Multiscale Weighted Morphological Networ...,"Fan Wu DNN Deployment , Task Offloading , Reso...",0


In [None]:
# Split the negative pairs 70 / 15 / 15 into train / test / validation.
# A fixed random_state makes the split reproducible, so re-running the
# notebook regenerates the same files for the notebooks that consume them.
train_negatives = df_negatives.sample(frac = 0.7, random_state = 42)
rest_negatives = df_negatives.drop(train_negatives.index)
# Half of the remaining 30% goes to test; whatever is left is validation.
test_negatives = rest_negatives.sample(frac = 0.5, random_state = 42)
val_negatives = rest_negatives.drop(test_negatives.index)

In [None]:
# For each subset print the expected size (truncated fraction of the total)
# followed by the size actually obtained.
print('Valores esperados vs obtenidos')
for fraction, subset in ((0.7, train_negatives), (0.15, test_negatives), (0.15, val_negatives)):
    print(int(len_negatives * fraction))
    print(len(subset))

Valores esperados vs obtenidos
108662
108662
23284
23285
23284
23285


Hay demasiadas filas para los casos negativos, así que vamos a escoger solo el 10% de cada conjunto.

In [None]:
# Keep only 10% of the negatives of each subset to reduce the class imbalance.
# A fixed random_state makes the subsample reproducible across runs.
final_train_negatives = train_negatives.sample(frac = 0.1, random_state = 42)
print(len(final_train_negatives))
final_test_negatives = test_negatives.sample(frac = 0.1, random_state = 42)
print(len(final_test_negatives))
final_val_negatives = val_negatives.sample(frac = 0.1, random_state = 42)
print(len(final_val_negatives))

10866
2328
2328


In [None]:
# Training set: all training positives plus the subsampled training negatives.
train_set = pd.concat([train_positives, final_train_negatives])
# Shuffle the rows so positives and negatives are interleaved; the fixed
# random_state keeps the saved file identical across runs.
train_set = train_set.sample(frac = 1, random_state = 42)
train_set.to_csv('/content/drive/MyDrive/TFM/notebooks/train_set.csv')
train_set.head()

Unnamed: 0,author,candidate,label
152215,"Sleep , N Assistive artificial intelligence ul...","Li , F Melatonin inhibits cytosolic mitochondr...",0
86127,"Heslegrave , A. Brain-derived neurotrophic fac...",. B. Guo Deep-Learning-Based Surrogate Model T...,0
27404,M. Wu Bi-LSTM-Based Two-Stream Network Machine...,"Zhang , Diagnostic value alpha-fetoprotein , L...",0
79463,A. Heslegrave Mutant huntingtin neurofilament ...,X. Li Bi-LSTM-Based Two-Stream Network Machine...,0
108579,"Fonteh , Alfred N Alpha desynchronization simp...","Anderson , C Cerebrospinal fluid mutant huntin...",0


In [None]:
# Test set: all test positives plus the subsampled test negatives.
test_set = pd.concat([test_positives, final_test_negatives])
# Shuffle the rows so positives and negatives are interleaved; the fixed
# random_state keeps the saved file identical across runs.
test_set = test_set.sample(frac = 1, random_state = 42)
test_set.to_csv('/content/drive/MyDrive/TFM/notebooks/test_set.csv')
test_set.head()

Unnamed: 0,author,candidate,label
127409,"Kerman , Nutritional metabolism cerebral bioen...",R.I. Scahill Brain-derived neurotrophic factor...,0
132285,"Black , Hailey F Mutant Huntingtin Cleared Bra...","Caron , Nicholas S. Cerebrospinal fluid mutant...",0
82798,E. De Vita Mutant huntingtin neurofilament lig...,"De Vita , E Longitudinal evaluation proton mag...",1
16604,W. D. Lu Physical Unclonable Function Systems ...,J. Liu Joint Task Offloading Resource Allocati...,0
93080,"Wijeratne , Peter . Multi-Study Model-Based Ev...","Shapiro , JI Oxidized HDL , Adipokines , Endot...",0


In [None]:
# Validation set: all validation positives plus the subsampled validation negatives.
val_set = pd.concat([val_positives, final_val_negatives])
# Shuffle the rows so positives and negatives are interleaved; the fixed
# random_state keeps the saved file identical across runs.
val_set = val_set.sample(frac = 1, random_state = 42)
val_set.to_csv('/content/drive/MyDrive/TFM/notebooks/val_set.csv')
val_set.head()

Unnamed: 0,author,candidate,label
78708,"Wild , Edward J Mutant huntingtin neurofilamen...",Cristopher I. Lang One Class Process Anomaly D...,0
155568,"Burckett-St Laurent , Assistive artificial int...","Casal , L Mutant Huntingtin Cleared Brain via ...",0
24775,Z. Wang Image Reconstruction Based Multilevel ...,"Shenoi , Urine dicarboxylic acids change pre-s...",0
83225,M. Arridge Mutant huntingtin neurofilament lig...,"King , KS Alpha desynchronization simple worki...",0
54733,"Oberly , P Melatonin inhibits cytosolic mitoch...",E.B . Johnson Brain-derived neurotrophic facto...,0


# Conjuntos de datos "limpios"