### Import Libraries


In [1]:
import pandas as pd
import numpy as np
import ast
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Functions

In [7]:
wnl = WordNetLemmatizer()

# Built once when the cell runs. The previous version evaluated
# `stopwords.words()` inside the list comprehension, i.e. once per token,
# rebuilding the whole word list each time and then scanning it linearly.
# NOTE(review): `stopwords.words()` without a language argument returns the
# stopwords of every language shipped in the corpus, not only English. That is
# kept here so the cleaned text stays identical — confirm it is intended.
_ALL_STOPWORDS = frozenset(stopwords.words())
# Domain-specific filler words removed in addition to the NLTK stopwords.
_ECON_STOPWORDS = frozenset(['model', 'using', 'paper'])
# Deletes these characters from the re-joined text: " ' , . : %
_PUNCTUATION_TABLE = str.maketrans('', '', '"\',.:%')


def clean_text(text_series):
    """Tokenize, filter, lemmatize and strip punctuation/digits from one text.

    Despite the parameter name, this receives a single string (it is applied
    element-wise to a column), not a whole Series.

    Steps, in order:
      1. tokenize with ``word_tokenize``;
      2. drop tokens found in the NLTK stopword list or in the econ stopwords;
      3. lemmatize every remaining token as a verb (``pos="v"``);
      4. join the lemmas with single spaces;
      5. delete the characters ``" ' , . : %`` and every run of digits.

    The stopword filter runs before lemmatization, so a token that only
    becomes a stopword after lemmatization is kept. Deleting characters after
    the join can leave consecutive spaces in the result; they are not
    collapsed.

    Args:
        text_series: The text to clean. Callers in this notebook lower-case
            it first; the stopword comparison is case-sensitive.

    Returns:
        The cleaned text as a single string.
    """
    tokens = word_tokenize(text_series)
    # Single pass replaces the original's three passes (the econ filter was
    # applied twice); the surviving tokens are the same.
    kept_tokens = [
        token for token in tokens
        if token not in _ALL_STOPWORDS and token not in _ECON_STOPWORDS
    ]
    lemmas = [wnl.lemmatize(token, pos="v") for token in kept_tokens]

    joined = " ".join(lemmas)
    # One translate call replaces the chain of str.replace() calls.
    without_punctuation = joined.translate(_PUNCTUATION_TABLE)
    # remove numbers from text
    return re.sub(r'[0-9]+', '', without_punctuation)

def split(word):
    """Return the elements of ``word`` as a list, one entry per character."""
    return list(word)

In [3]:
# Notebook parameters
# File name of the input CSV; it is appended to `data_path` when loading.
data_name = 'papers_wo_JELcode.csv'
# Path prefix used both for reading the input and for writing the output.
data_path = '../data/'
# Base name (without extension) of the CSV written at the end of the notebook.
output_data = 'test_data_cleaned'
# Suffix appended to `output_data` before the '.csv' extension.
version = '_v01'

In [11]:
# Load the input CSV (path prefix + file name from the parameters cell)
# and report its (rows, columns) shape.
df = pd.read_csv(f"{data_path}{data_name}")

print(df.shape)

(1862, 70)


In [12]:
# Print the full column index, then preview the first rows of the loaded frame.
print(df.columns)
df.head()


Index(['title_x', 'idpaper_2', 'journal', 'journal_num', 'year', 'month',
       'volume', 'issue', 'idpaper', 'jelcodes', 'keywords', 'abstract',
       'start_page', 'end_page', 'num_words', 'num_words_90_flag', 'idauthor',
       'author', 'prop_women', 'num_authors', 'gender_group_type',
       'sole_or_coauthors', 'num_pages', 'num_pages_dmean',
       'flesch_kincaid_grade_level', 'log_flesch_kincaid_grade_level',
       'flesch_kincaid_reading_ease', 'log_flesch_kincaid_reading_ease',
       'dale_chall', 'log_dale_chall', 'coleman_liau_index',
       'log_coleman_liau_index', 'automated_readability_index',
       'log_automated_readability_index', 'american_economic_review',
       'econometrica', 'journal_of_pol_economy',
       'quarterly_journal_of_economics', 'review_of_economic_studies',
       'coauthors', 'single_author', 'both_genders', 'only_females',
       'only_males', 'num_sentences', 'num_syllables', 'observation',
       'authors', 'jelcodes_letter', 'jelcodes_te

Unnamed: 0,title_x,idpaper_2,journal,journal_num,year,month,volume,issue,idpaper,jelcodes,...,jel_dummy_K,jel_dummy_L,jel_dummy_M,jel_dummy_N,jel_dummy_O,jel_dummy_P,jel_dummy_Q,jel_dummy_R,jel_dummy_Y,jel_dummy_Z
1012,"the cyclically of sales, regular and effective...",1201510539931029,American Economic Review,1,2015,March,105,3,120000000000.0,,...,0,0,0,0,0,0,0,0,0,0
1500,the econometrics of ultra-high-frequency data,22000681122,Econometrica,2,2000,,68,1,220006800.0,,...,0,0,0,0,0,0,0,0,0,0
1501,a three-step method for choosing the number of...,220006812351,Econometrica,2,2000,,68,1,2200068000.0,,...,0,0,0,0,0,0,0,0,0,0
1502,latent separability: grouping goods without we...,220006815384,Econometrica,2,2000,,68,1,2200068000.0,,...,0,0,0,0,0,0,0,0,0,0
1503,bargaining and reputation,2200068185117,Econometrica,2,2000,,68,1,2200068000.0,,...,0,0,0,0,0,0,0,0,0,0


## Text Preprocessing


In [13]:
# Build `all_text` as "<title> <abstract>", separated by a single space.
# NOTE(review): both columns go through astype(str), so a missing title or
# abstract presumably ends up as the literal text 'nan' — confirm there are
# no nulls in these columns.
title_text = df['title_x'].astype(str)
abstract_text = df['abstract'].astype(str)
df['all_text'] = title_text + ' ' + abstract_text

In [14]:
# Text preprocessing: lower-case each source column, then run it through
# clean_text(), storing the result in a new `cleaned_*` column.
columns_to_clean = (
    ('abstract', 'cleaned_abstract'),
    ('all_text', 'cleaned_all_text'),
)

for source_column, cleaned_column in columns_to_clean:
    df[cleaned_column] = df[source_column].apply(
        lambda raw_value: clean_text(str(raw_value).lower())
    )

df.head()


Unnamed: 0,title_x,idpaper_2,journal,journal_num,year,month,volume,issue,idpaper,jelcodes,...,jel_dummy_N,jel_dummy_O,jel_dummy_P,jel_dummy_Q,jel_dummy_R,jel_dummy_Y,jel_dummy_Z,all_text,cleaned_abstract,cleaned_all_text
1012,"the cyclically of sales, regular and effective...",1201510539931029,American Economic Review,1,2015,March,105,3,120000000000.0,,...,0,0,0,0,0,0,0,"the cyclically of sales, regular and effective...",study cyclical properties sales regular price...,cyclically sales regular effective price bus...
1500,the econometrics of ultra-high-frequency data,22000681122,Econometrica,2,2000,,68,1,220006800.0,,...,0,0,0,0,0,0,0,the econometrics of ultra-high-frequency data ...,ultra-high-frequency data define full record t...,econometrics ultra-high-frequency data ultra-h...
1501,a three-step method for choosing the number of...,220006812351,Econometrica,2,2000,,68,1,2200068000.0,,...,0,0,0,0,0,0,0,a three-step method for choosing the number of...,consider problem choose number bootstrap repet...,three-step method choose number bootstrap repe...
1502,latent separability: grouping goods without we...,220006815384,Econometrica,2,2000,,68,1,2200068000.0,,...,0,0,0,0,0,0,0,latent separability: grouping goods without we...,develop new concept separability overlap group...,latent separability group goods without weak ...
1503,bargaining and reputation,2200068185117,Econometrica,2,2000,,68,1,2200068000.0,,...,0,0,0,0,0,0,0,bargaining and reputation The paper develops a...,develop reputation base theory bargain idea i...,bargain reputation develop reputation base the...


In [15]:
# Write the frame, including the cleaned text columns, to
# <data_path><output_data><version>.csv. index_label=False is kept as in the
# original call.
output_csv_path = f"{data_path}{output_data}{version}.csv"
df.to_csv(output_csv_path, index_label=False)