## Data Wrangling a la base de datos

In [1]:
import pandas as pd
import re

import warnings
# Silence every warning for the rest of the session. This also hides the
# deprecation / future-behaviour notices emitted by pandas, so API changes
# in the cells below will not be reported.
warnings.simplefilter("ignore")

In [2]:
# Load the semicolon-separated article table, keeping only the columns used
# below; the `Unnamed: 0` column becomes the row index.
dta = pd.read_csv(
    "./Bases de datos/dta.csv",
    sep=";",
    usecols=[
        "Unnamed: 0",
        "article_name",
        "authors",
        "volume_name",
        "volumen_date",
    ],
    index_col="Unnamed: 0",
)

# Preview the first rows of the loaded table.
dta.head()

Unnamed: 0,article_name,authors,volume_name,volumen_date
0,Editorial Board,[],Volume 74,December 2022
1,Congestion in a public health service: A macro...,"['[Kelly, Mark]', '[Kuhn, Michael]']",Volume 74,December 2022
2,The wage dispersion effects of international m...,"['[Sargent, Kristina]']",Volume 74,December 2022
3,Balanced-budget rules and macroeconomic stabil...,"['[Guo, Jang-Ting]', '[Zhang, Yan]']",Volume 74,December 2022
4,Illiquid investments and the non-monotone rela...,"['[Salas, Sergio]', '[Odell, Kathleen]']",Volume 74,December 2022


Eliminando algunos archivos que no tienen autores, y que posiblemente sean anuncios o portadas

In [3]:
# Rows whose `authors` cell is the literal text "[]" list no author at all;
# collect their index labels and remove them from the table in place.
empty_authors = dta["authors"] == "[]"
dta.drop(index=dta.loc[empty_authors].index, inplace=True)

Renombraré las observaciones de la columna `volumen_date` que contengan información irrelevante, al igual que en `volume_name`

In [4]:
# Keep only the text found between parentheses in `volumen_date`: the pattern
# deletes everything up to and including "(" and everything from ")" onwards.
# Values that contain no parentheses are left untouched.
# `regex=True` is passed explicitly: since pandas 2.0 the default of
# `Series.str.replace` is `regex=False`, which would treat the pattern as a
# literal string and silently leave the column unchanged.
dta["volumen_date"] = dta["volumen_date"].str.replace(
    r'[^(]*\(|\)[^)]*', '', regex=True
)

# Keep only the part of `volume_name` that precedes the first comma.
dta["volume_name"] = dta["volume_name"].str.split(',').str[0]

Creando el año de publicación `year`

In [5]:
# The publication year is taken as the last whitespace-separated token of
# the `volumen_date` text.
dta["year"] = (
    dta["volumen_date"]
    .str.split()
    .str.get(-1)
)

Creación de `keywords`

In [6]:
# Build the `keywords` column: the lower-cased article title with stop words
# and punctuation removed, split into a list of words.
dta["keywords"] = dta["article_name"].str.lower()

# Stop words to remove from the titles.
words = [
    "in", "a", "the", "of", "as", "with", "its",
    "their", "and", "vs."
]

# Match a stop word only when it is not glued to a word character on either
# side, together with any whitespace in front of it, and replace the match
# with a single space.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` defaults to
# `regex=False`, which would search for the pattern text literally and remove
# nothing.
stopword_pattern = r"\s*(?<!\w)(?:{})(?!\w)".format(
    "|".join(re.escape(x) for x in words)
)
dta["keywords"] = dta["keywords"].str.replace(stopword_pattern, " ", regex=True)

# Punctuation characters to strip.
characters = [
    ":", ",", ".", ";", '“','”'
]

# `regex=False` makes the literal intent explicit, so "." can never be
# interpreted as the regex wildcard regardless of the pandas version.
for c in characters:
    dta["keywords"] = dta["keywords"].str.replace(c, '', regex=False)

# Collapse runs of whitespace, trim both ends and split into a list of words.
dta["keywords"] = dta["keywords"].str.replace(r'\s+', ' ', regex=True)
dta["keywords"] = dta["keywords"].str.strip()
# The vectorized split keeps missing titles as missing values instead of
# raising on them, as a plain Python `x.split(" ")` would.
dta["keywords"] = dta["keywords"].str.split(" ")

# The title is no longer needed once the keywords exist.
dta.drop(columns=["article_name"], inplace=True)

In [7]:
# Show the first five rows of the transformed table.
dta.head(n=5)

Unnamed: 0,authors,volume_name,volumen_date,year,keywords
1,"['[Kelly, Mark]', '[Kuhn, Michael]']",Volume 74,December 2022,2022,"[congestion, public, health, service, macro, a..."
2,"['[Sargent, Kristina]']",Volume 74,December 2022,2022,"[wage, dispersion, effects, international, mig..."
3,"['[Guo, Jang-Ting]', '[Zhang, Yan]']",Volume 74,December 2022,2022,"[balanced-budget, rules, macroeconomic, stabil..."
4,"['[Salas, Sergio]', '[Odell, Kathleen]']",Volume 74,December 2022,2022,"[illiquid, investments, non-monotone, relation..."
5,"['[Kasuga, Hidefumi]', '[Morita, Yuichi]']",Volume 74,December 2022,2022,"[health, gap, effect, on, economic, outcomes]"


In [8]:
# Write the transformed table to disk twice: once as CSV, once as an Excel
# workbook.
dta.to_csv(path_or_buf="./Bases de datos/dta_f.csv")
dta.to_excel(excel_writer="./Bases de datos/dta_f.xlsx")