### Import Libraries


In [2]:
import pandas as pd
import numpy as np
import ast
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/c02g40n7q05p/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/c02g40n7q05p/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Functions

In [3]:
wnl = WordNetLemmatizer()

# Characters stripped from the lemmatized text: double quote, single quote,
# comma, dot, colon, percent sign and ASCII digits.
_STRIP_CHARS = re.compile(r"""["',.:%0-9]""")

# Filled on first use by _all_stopwords() so the corpus is read only once.
_STOPWORD_CACHE = None


def _all_stopwords():
    """Return the NLTK stopwords as a frozenset, loading the corpus only once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        # No language is passed (same call as before), so NLTK returns the
        # stopword lists of every language in the corpus, not only English.
        _STOPWORD_CACHE = frozenset(stopwords.words())
    return _STOPWORD_CACHE


def clean_text(text_series):
    """Remove stopwords, lemmatize and strip punctuation/digits from one text.

    Despite the parameter name, the function works on a single string; it is
    meant to be applied element-wise (e.g. through ``Series.apply``).

    Steps, in order:
      1. tokenize with ``word_tokenize``;
      2. drop tokens that are NLTK stopwords or one of the economics-specific
         stopwords ('model', 'using', 'paper');
      3. lemmatize the remaining tokens as verbs (``pos="v"``);
      4. join the lemmas with single spaces;
      5. delete the characters ``" ' , . : %`` and all ASCII digits.

    The stopword comparison is case-sensitive and done on the token as given,
    so callers should lowercase the text first. Because step 5 runs after the
    join, a token made only of removed characters leaves consecutive spaces
    behind; whitespace is not re-normalised.

    Parameters
    ----------
    text_series : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    econ_stopwords = {'model', 'using', 'paper'}
    # Build the exclusion set once per call instead of once per token; the
    # underlying corpus read is cached across calls.
    excluded = _all_stopwords()

    kept_tokens = [
        token
        for token in word_tokenize(text_series)
        if token not in excluded and token not in econ_stopwords
    ]
    lemmas = [wnl.lemmatize(token, pos="v") for token in kept_tokens]

    # Deleting all unwanted characters in one pass gives the same result as
    # removing them one after another.
    return _STRIP_CHARS.sub('', " ".join(lemmas))

In [4]:
# Notebook parameters
# Input: folder and file name of the raw papers table.
data_path, data_name = '../data/', 'papers_w_JELcode.csv'
# Output: file stem and version suffix of the cleaned data set; the file is
# written into data_path with '.csv' appended.
# NOTE(review): 'traning' (sic) is part of the written file name; confirm no
# consumer depends on it before correcting the spelling.
output_data, version = 'traning_data_cleaned', '_v03'

In [5]:
# Read the raw papers table from the configured folder and file name.
input_file = f"{data_path}{data_name}"
data = pd.read_csv(input_file)

# Report the (rows, columns) size of the loaded frame.
print(data.shape)

(3126, 71)


In [6]:
# List every column name of the raw frame.
print(data.columns)
# NOTE(review): disabled drop of a 'Column1' column -- presumably obsolete; confirm and delete.
#data.drop(columns=['Column1'], inplace=True)
# Preview the first rows of the raw frame.
data.head()

Index(['title_x', 'idpaper_2', 'journal', 'journal_num', 'year', 'month',
       'volume', 'issue', 'idpaper', 'jelcodes', 'keywords', 'abstract',
       'start_page', 'end_page', 'num_words', 'num_words_90_flag', 'idauthor',
       'author', 'prop_women', 'num_authors', 'gender_group_type',
       'sole_or_coauthors', 'num_pages', 'num_pages_dmean',
       'flesch_kincaid_grade_level', 'log_flesch_kincaid_grade_level',
       'flesch_kincaid_reading_ease', 'log_flesch_kincaid_reading_ease',
       'dale_chall', 'log_dale_chall', 'coleman_liau_index',
       'log_coleman_liau_index', 'automated_readability_index',
       'log_automated_readability_index', 'american_economic_review',
       'econometrica', 'journal_of_pol_economy',
       'quarterly_journal_of_economics', 'review_of_economic_studies',
       'coauthors', 'single_author', 'both_genders', 'only_females',
       'only_males', 'num_sentences', 'num_syllables', 'observation',
       'authors', 'jelcodes_letter', 'jelcodes_te

Unnamed: 0,title_x,idpaper_2,journal,journal_num,year,month,volume,issue,idpaper,jelcodes,...,jel_dummy_L,jel_dummy_M,jel_dummy_N,jel_dummy_O,jel_dummy_P,jel_dummy_Q,jel_dummy_R,jel_dummy_Y,jel_dummy_Z,tags
0,optimal adoption of complementary technologies,120009011529,American Economic Review,1,2000,March,90,1,1200090000.0,E22|G31|O33|D24,...,0,0,0,1,0,0,0,0,0,"['O', 'D', 'G', 'E']"
1,collateral damage: effects of the japanese ban...,120009013045,American Economic Review,1,2000,March,90,1,1200090000.0,G21|E44,...,0,0,0,0,0,0,0,0,0,"['G', 'E']"
2,endogenous inequality in integrated labor mark...,120009014672,American Economic Review,1,2000,March,90,1,1200090000.0,J41| J71,...,0,0,0,0,0,0,0,0,0,['J']
3,"labor-market integration, investment in risky ...",120009017395,American Economic Review,1,2000,March,90,1,1200090000.0,R23|J24|J31|J61,...,0,0,0,0,0,0,1,0,0,"['J', 'R']"
4,unequal societies: income distribution and the...,1200090196129,American Economic Review,1,2000,March,90,1,1200090000.0,D31|P16|I22|E62,...,0,0,0,0,1,0,0,0,0,"['P', 'E', 'I', 'D']"


In [7]:
# Keep only the text columns, the tags column and the author ids.
# .copy() gives df its own data, so the column assignments made in later cells
# act on df itself instead of on a selection derived from `data` (which pandas
# reports as SettingWithCopyWarning when warnings are not silenced).
df = data[["title_x", "abstract", "tags", "idauthor"]].copy()
df.head()

Unnamed: 0,title_x,abstract,tags,idauthor
0,optimal adoption of complementary technologies,When a production process requires two extreme...,"['O', 'D', 'G', 'E']","['p00681', 'p01338']"
1,collateral damage: effects of the japanese ban...,The Japanese banking crisis provides a natural...,"['G', 'E']","['p01546', 'p02544']"
2,endogenous inequality in integrated labor mark...,We consider a market with red and green worker...,['J'],"['p00544', 'p01874', 'p03092']"
3,"labor-market integration, investment in risky ...",This paper presents a general-equilibrium mode...,"['J', 'R']",['p01266']
4,unequal societies: income distribution and the...,This paper develops a theory of inequality and...,"['P', 'E', 'I', 'D']",['p04639']


In [8]:
# Look at the first tags entry: print its Python type, then display its value.
first_tags = df['tags'].iloc[0]
print(type(first_tags))
first_tags

<class 'str'>


"['O', 'D', 'G', 'E']"

In [9]:
# Check that literal_eval turns the string-encoded tags entry into a list.
ast.literal_eval(df['tags'].iloc[0])

['O', 'D', 'G', 'E']

## Text Preprocessing

In [10]:
# Parse every string-encoded tags entry into a Python list.
# literal_eval is passed directly; wrapping it in a lambda adds nothing.
df['tags'] = df['tags'].apply(ast.literal_eval)
df.head()

Unnamed: 0,title_x,abstract,tags,idauthor
0,optimal adoption of complementary technologies,When a production process requires two extreme...,"[O, D, G, E]","['p00681', 'p01338']"
1,collateral damage: effects of the japanese ban...,The Japanese banking crisis provides a natural...,"[G, E]","['p01546', 'p02544']"
2,endogenous inequality in integrated labor mark...,We consider a market with red and green worker...,[J],"['p00544', 'p01874', 'p03092']"
3,"labor-market integration, investment in risky ...",This paper presents a general-equilibrium mode...,"[J, R]",['p01266']
4,unequal societies: income distribution and the...,This paper develops a theory of inequality and...,"[P, E, I, D]",['p04639']


In [11]:
# Build all_text as "<title> <abstract>" for every row.
title_text = df['title_x'].astype(str)
abstract_text = df['abstract'].astype(str)
df['all_text'] = title_text.str.cat(abstract_text, sep=' ')

In [13]:
# Text preprocessing: lowercase each source column, then pass it through
# clean_text. Keys are the source columns, values the columns that are created.
cleaned_columns = {
    'abstract': 'cleaned_abstract',
    'all_text': 'cleaned_all_text',
}
for source_col, cleaned_col in cleaned_columns.items():
    df[cleaned_col] = df[source_col].apply(
        lambda raw_text: clean_text(str(raw_text).lower())
    )
df.head()


Unnamed: 0,title_x,abstract,tags,idauthor,all_text,cleaned_abstract,cleaned_all_text
0,optimal adoption of complementary technologies,When a production process requires two extreme...,"[O, D, G, E]","['p00681', 'p01338']",optimal adoption of complementary technologies...,production process require two extremely compl...,optimal adoption complementary technologies pr...
1,collateral damage: effects of the japanese ban...,The Japanese banking crisis provides a natural...,"[G, E]","['p01546', 'p02544']",collateral damage: effects of the japanese ban...,japanese bank crisis provide natural experimen...,collateral damage effect japanese bank crisis...
2,endogenous inequality in integrated labor mark...,We consider a market with red and green worker...,[J],"['p00544', 'p01874', 'p03092']",endogenous inequality in integrated labor mark...,consider market red green workers label payof...,endogenous inequality integrate labor market t...
3,"labor-market integration, investment in risky ...",This paper presents a general-equilibrium mode...,"[J, R]",['p01266'],"labor-market integration, investment in risky ...",present general-equilibrium human capital inve...,labor-market integration investment risky hum...
4,unequal societies: income distribution and the...,This paper develops a theory of inequality and...,"[P, E, I, D]",['p04639'],unequal societies: income distribution and the...,develop theory inequality social contract aim ...,unequal societies income distribution social ...


In [14]:
# Save the cleaned frame as the training set: <data_path><output_data><version>.csv
# NOTE(review): index_label=False still writes the index values and only omits
# the index name from the header row -- confirm that index=False was not intended.
output_file = f"{data_path}{output_data}{version}.csv"
df.to_csv(output_file, index_label=False)