### Import libraries

In [1]:
import pandas as pd
import numpy as np
import ast

import scipy.stats as stats
from sklearn.utils.fixes import loguniform
import scipy as sp
from scipy.sparse import hstack
from collections import Counter

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
#from xgboost import XGBClassifier
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
#from utility_functions import *
from sklearn.ensemble import RandomForestClassifier, VotingClassifier


In [2]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mejia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mejia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
def Accuracy(y_true, y_pred):
    """
    Accuracy based on Jaccard Similarity Score
    :param y_true: ground truth
    :param y_pred: prediction
    :return: Jaccard Similarity Score
    """
    jaccard = np.minimum(y_true, y_pred).sum(axis=1) / np.maximum(y_true, y_pred).sum(axis=1)
    return jaccard.mean()


def print_ml_score(y_test, y_pred, clf):
    print('Classifier: ', clf.__class__.__name__)
    print('Accuracy Score: {}'.format(Accuracy(y_test, y_pred)))
    print("-----------------------------------")


def train_model(classifier, feature_vector_train, label_train, feature_vector_test, label_test):
    # fit the training set on the classifier
    clf = MultiOutputClassifier(classifier)
    clf.fit(feature_vector_train, label_train)

    # predict the labels on test set
    predictions = clf.predict(feature_vector_test)
    #print(pd.DataFrame(predictions, columns = multilabel.classes_))
    return print_ml_score(label_test, predictions, classifier)

wnl = WordNetLemmatizer()
def clean_text(text_series):
    econ_stopwords = ['model', 'using', 'paper']
    text_tokens = word_tokenize(text_series)
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
    tokens_without_sw = [word for word in tokens_without_sw if not word in econ_stopwords]
    tokens_without_sw_lemma = [wnl.lemmatize(word, pos="v") for word in tokens_without_sw if not word in econ_stopwords]

    # removing stopwords and econ stopwords
    text_series = " ".join(tokens_without_sw_lemma)
    # removing double quotes from text
    text_series = text_series.replace('"', '')
    # removing single quotes from text
    text_series = text_series.replace("'", '')
    # removing comma from text
    text_series = text_series.replace(',', '')
    # removing dot from text
    text_series = text_series.replace('.', '')
    # removing double dot from text
    text_series = text_series.replace(':', '')
    # removing percentage from text
    text_series = text_series.replace('%', '')
    # remove numbers from text
    text_series = re.sub(r'[0-9]+', '', text_series)

    return text_series

In [11]:
# Notebook parameters
data_name = 'traning_data_cleaned_v03.csv'
data_path = 'data/'

In [12]:
# load data

df = pd.read_csv(data_path + data_name)
#df.drop(columns=['Unnamed: 0'], inplace=True)
df['tags'] = df['tags'].apply(lambda x: ast.literal_eval(x))
print(df.shape)
df.head()

(3126, 7)


Unnamed: 0,title_x,abstract,tags,idauthor,all_text,cleaned_abstract,cleaned_all_text
0,optimal adoption of complementary technologies,When a production process requires two extreme...,"[O, D, G, E]","['p00681', 'p01338']",optimal adoption of complementary technologies...,production process require two extremely compl...,optimal adoption complementary technologies pr...
1,collateral damage: effects of the japanese ban...,The Japanese banking crisis provides a natural...,"[G, E]","['p01546', 'p02544']",collateral damage: effects of the japanese ban...,japanese bank crisis provide natural experimen...,collateral damage effect japanese bank crisis...
2,endogenous inequality in integrated labor mark...,We consider a market with red and green worker...,[J],"['p00544', 'p01874', 'p03092']",endogenous inequality in integrated labor mark...,consider market red green workers label payof...,endogenous inequality integrate labor market t...
3,"labor-market integration, investment in risky ...",This paper presents a general-equilibrium mode...,"[J, R]",['p01266'],"labor-market integration, investment in risky ...",present general-equilibrium human capital inve...,labor-market integration investment risky hum...
4,unequal societies: income distribution and the...,This paper develops a theory of inequality and...,"[P, E, I, D]",['p04639'],unequal societies: income distribution and the...,develop theory inequality social contract aim ...,unequal societies income distribution social ...


In [13]:
df['keep'] = df.tags.apply(lambda tags: 'Y' not in tags)
df = df[df.keep == True]
df['keep'] = df.tags.apply(lambda tags: 'A' not in tags)
df = df[df.keep == True]
df['keep'] = df.tags.apply(lambda tags: 'B' not in tags)
df = df[df.keep == True]
df.head()

Unnamed: 0,title_x,abstract,tags,idauthor,all_text,cleaned_abstract,cleaned_all_text,keep
0,optimal adoption of complementary technologies,When a production process requires two extreme...,"[O, D, G, E]","['p00681', 'p01338']",optimal adoption of complementary technologies...,production process require two extremely compl...,optimal adoption complementary technologies pr...,True
1,collateral damage: effects of the japanese ban...,The Japanese banking crisis provides a natural...,"[G, E]","['p01546', 'p02544']",collateral damage: effects of the japanese ban...,japanese bank crisis provide natural experimen...,collateral damage effect japanese bank crisis...,True
2,endogenous inequality in integrated labor mark...,We consider a market with red and green worker...,[J],"['p00544', 'p01874', 'p03092']",endogenous inequality in integrated labor mark...,consider market red green workers label payof...,endogenous inequality integrate labor market t...,True
3,"labor-market integration, investment in risky ...",This paper presents a general-equilibrium mode...,"[J, R]",['p01266'],"labor-market integration, investment in risky ...",present general-equilibrium human capital inve...,labor-market integration investment risky hum...,True
4,unequal societies: income distribution and the...,This paper develops a theory of inequality and...,"[P, E, I, D]",['p04639'],unequal societies: income distribution and the...,develop theory inequality social contract aim ...,unequal societies income distribution social ...,True


## Machine Learning: Abstract

### Feature Engineering
In this step, raw text data will be transformed into feature vectors using different
text representation.

In [14]:
# Convert outcome variable to one-hot encoding type
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['tags'])
pd.DataFrame(y, columns = multilabel.classes_)

Unnamed: 0,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,Z
0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
4,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3087,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3088,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
3089,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3090,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0


In [15]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                   ngram_range=(1,3), max_features=3000)

X_tfidf_ngram = tfidf_vect_ngram.fit_transform(df['cleaned_all_text'])

print('X with TfIdf Ngram Vec Shape: ', X_tfidf_ngram.shape)

X with TfIdf Ngram Vec Shape:  (3092, 3000)


In [16]:
xtrain_tfidf_ngram, xtest_tfidf_ngram, ytrain_tfidf_ngram, ytest_tfidf_ngram = train_test_split(X_tfidf_ngram, y,
                                                                        test_size=0.2,
                                                                        random_state=123)

In [17]:
list(zip(list(np.sum(ytest_tfidf_ngram, axis=0)), list(np.sum(ytrain_tfidf_ngram, axis=0))))

[(128, 523),
 (289, 1215),
 (139, 545),
 (84, 293),
 (94, 431),
 (86, 329),
 (78, 295),
 (136, 524),
 (31, 95),
 (114, 460),
 (21, 97),
 (28, 112),
 (102, 384),
 (12, 61),
 (22, 93),
 (29, 148),
 (12, 78)]

### Building Models

In this section we test several machine learning models in order to get the best one.

In [18]:
# set models
sgd = SGDClassifier(class_weight='balanced')
lr = LogisticRegression(solver = 'lbfgs', class_weight='balanced')
svc = LinearSVC(class_weight='balanced')
rf = RandomForestClassifier()
naive = MultinomialNB()

In [27]:
eclf1 = VotingClassifier(estimators=[
        ('lr', lr), ('rf', rf), ('sgd', sgd), ('naive', naive)], voting='hard')
train_model(eclf1, xtrain_tfidf_ngram, ytrain_tfidf_ngram, xtest_tfidf_ngram, ytest_tfidf_ngram)

Classifier:  VotingClassifier
Accuracy Score: 0.49614970382337104
-----------------------------------


In [25]:
# training using tfidf ngram vectorizer
for classifier in [sgd, lr, svc, naive]:
    train_model(classifier, xtrain_tfidf_ngram, ytrain_tfidf_ngram, xtest_tfidf_ngram, ytest_tfidf_ngram)

Classifier:  SGDClassifier
Accuracy Score: 0.5134625740441573
-----------------------------------
Classifier:  LogisticRegression
Accuracy Score: 0.5748038310639281
-----------------------------------
Classifier:  LinearSVC
Accuracy Score: 0.5434571890145395
-----------------------------------
Classifier:  MultinomialNB
Accuracy Score: 0.4272482498653743
-----------------------------------


In [26]:
clf = ClassifierChain(LogisticRegression(solver = 'lbfgs', class_weight='balanced'))
clf.fit(xtrain_tfidf_ngram, ytrain_tfidf_ngram)

# predict the labels on test set

predictions = clf.predict(xtest_tfidf_ngram)

In [27]:
predictions

array([[1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [28]:
print_ml_score(ytest_tfidf_ngram, predictions, LogisticRegression(solver = 'lbfgs', class_weight='balanced'))

Classifier:  LogisticRegression
Accuracy Score: 0.5490698002410442
-----------------------------------


In [29]:
from sklearn.metrics import classification_report


In [30]:
print(classification_report(ytest_tfidf_ngram, predictions))



              precision    recall  f1-score   support

           0       0.62      0.72      0.67       128
           1       0.76      0.74      0.75       289
           2       0.70      0.79      0.74       139
           3       0.76      0.83      0.80        84
           4       0.66      0.78      0.72        94
           5       0.50      0.66      0.57        86
           6       0.60      0.83      0.70        78
           7       0.62      0.76      0.68       136
           8       0.45      0.45      0.45        31
           9       0.56      0.75      0.64       114
          10       0.20      0.62      0.30        21
          11       0.28      0.43      0.34        28
          12       0.56      0.65      0.60       102
          13       0.10      0.17      0.12        12
          14       0.52      0.55      0.53        22
          15       0.41      0.69      0.51        29
          16       0.17      0.50      0.25        12

   micro avg       0.59   