### Import libraries

In [32]:
import pandas as pd
import numpy as np
import ast

import scipy.stats as stats
from sklearn.utils.fixes import loguniform
import scipy as sp
from scipy.sparse import hstack
from collections import Counter

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.multiclass import OneVsRestClassifier

#from utility_functions import *

In [41]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def Accuracy(y_true, y_pred):
    """
    Accuracy based on Jaccard Similarity Score
    :param y_true: ground truth
    :param y_pred: prediction
    :return: Jaccard Similarity Score
    """
    jaccard = np.minimum(y_true, y_pred).sum(axis=1) / np.maximum(y_true, y_pred).sum(axis=1)
    return jaccard.mean()


def print_ml_score(y_test, y_pred, clf):
    print('Classifier: ', clf.__class__.__name__)
    print('Accuracy Score: {}'.format(Accuracy(y_test, y_pred)))
    print("-----------------------------------")


def train_model(classifier, feature_vector_train, label_train, feature_vector_test, label_test):
    # fit the training set on the classifier
    clf = OneVsRestClassifier(classifier)
    clf.fit(feature_vector_train, label_train)

    # predict the labels on test set
    predictions = clf.predict(feature_vector_test)

    return print_ml_score(label_test, predictions, classifier)

wnl = WordNetLemmatizer()
def clean_text(text_series):
    econ_stopwords = ['model', 'using', 'paper']
    text_tokens = word_tokenize(text_series)
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
    tokens_without_sw = [word for word in tokens_without_sw if not word in econ_stopwords]
    tokens_without_sw_lemma = [wnl.lemmatize(word, pos="v") for word in tokens_without_sw if not word in econ_stopwords]

    # removing stopwords and econ stopwords
    text_series = " ".join(tokens_without_sw_lemma)
    # removing double quotes from text
    text_series = text_series.replace('"', '')
    # removing single quotes from text
    text_series = text_series.replace("'", '')
    # removing comma from text
    text_series = text_series.replace(',', '')
    # removing dot from text
    text_series = text_series.replace('.', '')
    # removing double dot from text
    text_series = text_series.replace(':', '')
    # removing percentage from text
    text_series = text_series.replace('%', '')
    # remove numbers from text
    text_series = re.sub(r'[0-9]+', '', text_series)

    return text_series

In [33]:
# Notebook parameters
data_name = 'traning_data_cleaned_v02.csv'
data_path = '../data/'

In [34]:
# load data

df = pd.read_csv(data_path + data_name)
#df.drop(columns=['Unnamed: 0'], inplace=True)
df['tags'] = df['tags'].apply(lambda x: ast.literal_eval(x))
print(df.shape)
df.head()

(3126, 6)


Unnamed: 0,abstract,tags,title_x,all_text,cleaned_abstract,cleaned_all_text
0,When a production process requires two extreme...,"[O, D, G, E]",optimal adoption of complementary technologies,optimal adoption of complementary technologies...,production process require two extremely compl...,optimal adoption complementary technologies pr...
1,The Japanese banking crisis provides a natural...,"[G, E]",collateral damage: effects of the japanese ban...,collateral damage: effects of the japanese ban...,japanese bank crisis provide natural experimen...,collateral damage effect japanese bank crisis...
2,We consider a market with red and green worker...,[J],endogenous inequality in integrated labor mark...,endogenous inequality in integrated labor mark...,consider market red green workers label payof...,endogenous inequality integrate labor market t...
3,This paper presents a general-equilibrium mode...,"[J, R]","labor-market integration, investment in risky ...","labor-market integration, investment in risky ...",present general-equilibrium human capital inve...,labor-market integration investment risky hum...
4,This paper develops a theory of inequality and...,"[P, E, I, D]",unequal societies: income distribution and the...,unequal societies: income distribution and the...,develop theory inequality social contract aim ...,unequal societies income distribution social ...


## Machine Learning: Abstract

In [35]:
# Convert outcome variable to one-hot encoding type
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['tags'])

In [36]:
# view classes
pd.DataFrame(y, columns = multilabel.classes_)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,Y,Z
0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3121,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3122,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
3123,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3124,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0


### Feature Engineering
In this step, raw text data will be transformed into feature vectors using different
text representation.

In [37]:
# count vectorizer
count_vec = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=3000)
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=3000)
# ngram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                   ngram_range=(1,3), max_features=3000)

# fit to text
X_count = count_vec.fit_transform(df['cleaned_abstract'])
X_tfidf = tfidf_vect.fit_transform(df['cleaned_abstract'])
X_tfidf_ngram = tfidf_vect_ngram.fit_transform(df['cleaned_abstract'])

# print shapes
print('X with Count Vec Shape: ', X_count.shape)
print('X with TfIdf Vec Shape: ', X_tfidf.shape)
print('X with TfIdf Ngram Vec Shape: ', X_tfidf_ngram.shape)

X with Count Vec Shape:  (3126, 3000)
X with TfIdf Vec Shape:  (3126, 3000)
X with TfIdf Ngram Vec Shape:  (3126, 3000)


In [38]:
xtrain_count, xtest_count, ytrain_count, ytest_count = train_test_split(X_count, y,
                                                                        test_size=0.2,
                                                                        random_state=123)

xtrain_tfidf, xtest_tfidf, ytrain_tfidf, ytest_tfidf = train_test_split(X_tfidf, y,
                                                                        test_size=0.2,
                                                                        random_state=123)
xtrain_tfidf_ngram, xtest_tfidf_ngram, ytrain_tfidf_ngram, ytest_tfidf_ngram = train_test_split(X_tfidf_ngram, y,
                                                                        test_size=0.2,
                                                                        random_state=123)

### Building Models

In this section we test several machine learning models in order to get the best one.

In [39]:
# set models
sgd = SGDClassifier()
lr = LogisticRegression(solver = 'lbfgs')
svc = LinearSVC()
rf = RandomForestClassifier()
naive = MultinomialNB()
xgboost = XGBClassifier()

In [42]:
# training using count vectorizer
for classifier in [sgd, lr, svc, rf, naive, xgboost]:
    train_model(classifier, xtrain_count, ytrain_count, xtest_count, ytest_count)

Classifier:  SGDClassifier
Accuracy Score: 0.45645253308991324
-----------------------------------
Classifier:  LogisticRegression
Accuracy Score: 0.47565038795070747
-----------------------------------
Classifier:  LinearSVC
Accuracy Score: 0.45297809219534463
-----------------------------------
Classifier:  RandomForestClassifier
Accuracy Score: 0.40511182108626204
-----------------------------------
Classifier:  MultinomialNB
Accuracy Score: 0.5073124869450748
-----------------------------------
Classifier:  XGBClassifier
Accuracy Score: 0.4725696029210406
-----------------------------------


In [43]:
# training using tfidf vectorizer
for classifier in [sgd, lr, svc, rf, naive, xgboost]:
    train_model(classifier, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

Classifier:  SGDClassifier
Accuracy Score: 0.48872660885440433
-----------------------------------
Classifier:  LogisticRegression
Accuracy Score: 0.3840255591054313
-----------------------------------
Classifier:  LinearSVC
Accuracy Score: 0.4994941427050053
-----------------------------------
Classifier:  RandomForestClassifier
Accuracy Score: 0.4274380039555758
-----------------------------------
Classifier:  MultinomialNB
Accuracy Score: 0.3527156549520767
-----------------------------------
Classifier:  XGBClassifier
Accuracy Score: 0.45638977635782746
-----------------------------------


In [44]:
# training using tfidf ngram vectorizer
for classifier in [sgd, lr, svc, rf, naive, xgboost]:
    train_model(classifier, xtrain_tfidf_ngram, ytrain_tfidf_ngram, xtest_tfidf_ngram, ytest_tfidf_ngram)

Classifier:  SGDClassifier
Accuracy Score: 0.48867336071808914
-----------------------------------
Classifier:  LogisticRegression
Accuracy Score: 0.3898828541001065
-----------------------------------
Classifier:  LinearSVC
Accuracy Score: 0.4976304579339723
-----------------------------------
Classifier:  RandomForestClassifier
Accuracy Score: 0.4519435569755058
-----------------------------------
Classifier:  MultinomialNB
Accuracy Score: 0.3891373801916933
-----------------------------------
Classifier:  XGBClassifier
Accuracy Score: 0.4796135706678838
-----------------------------------


## Machine Learning: Abstract +  Title

In [45]:
# count vectorizer
count_vec = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=3000)
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=3000)
# ngram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                   ngram_range=(1,3), max_features=3000)

# fit to text
X_count = count_vec.fit_transform(df['cleaned_all_text'])
X_tfidf = tfidf_vect.fit_transform(df['cleaned_all_text'])
X_tfidf_ngram = tfidf_vect_ngram.fit_transform(df['cleaned_all_text'])

# print shapes
print('X with Count Vec Shape: ', X_count.shape)
print('X with TfIdf Vec Shape: ', X_tfidf.shape)
print('X with TfIdf Ngram Vec Shape: ', X_tfidf_ngram.shape)

X with Count Vec Shape:  (3126, 3000)
X with TfIdf Vec Shape:  (3126, 3000)
X with TfIdf Ngram Vec Shape:  (3126, 3000)


In [46]:
xtrain_count, xtest_count, ytrain_count, ytest_count = train_test_split(X_count, y,
                                                                        test_size=0.2,
                                                                        random_state=123)

xtrain_tfidf, xtest_tfidf, ytrain_tfidf, ytest_tfidf = train_test_split(X_tfidf, y,
                                                                        test_size=0.2,
                                                                        random_state=123)
xtrain_tfidf_ngram, xtest_tfidf_ngram, ytrain_tfidf_ngram, ytest_tfidf_ngram = train_test_split(X_tfidf_ngram, y,
                                                                        test_size=0.2,
                                                                        random_state=123)

In [48]:
# training using count vectorizer
for classifier in [sgd, lr, svc, rf, naive, xgboost]:
    train_model(classifier, xtrain_count, ytrain_count, xtest_count, ytest_count)

Classifier:  SGDClassifier
Accuracy Score: 0.4698482428115016
-----------------------------------
Classifier:  LogisticRegression
Accuracy Score: 0.4866879659211927
-----------------------------------
Classifier:  LinearSVC
Accuracy Score: 0.4620378822455499
-----------------------------------
Classifier:  RandomForestClassifier
Accuracy Score: 0.43477103301384445
-----------------------------------
Classifier:  MultinomialNB
Accuracy Score: 0.5111364230373814
-----------------------------------
Classifier:  XGBClassifier
Accuracy Score: 0.47750266240681577
-----------------------------------


In [49]:
# training using tfidf vectorizer
for classifier in [sgd, lr, svc, rf, naive, xgboost]:
    train_model(classifier, xtrain_tfidf, ytrain_tfidf, xtest_tfidf, ytest_tfidf)

Classifier:  SGDClassifier
Accuracy Score: 0.5060588772250114
-----------------------------------
Classifier:  LogisticRegression
Accuracy Score: 0.3982428115015974
-----------------------------------
Classifier:  LinearSVC
Accuracy Score: 0.517891373801917
-----------------------------------
Classifier:  RandomForestClassifier
Accuracy Score: 0.43610223642172524
-----------------------------------
Classifier:  MultinomialNB
Accuracy Score: 0.36110223642172523
-----------------------------------
Classifier:  XGBClassifier
Accuracy Score: 0.47839647040925
-----------------------------------


In [50]:
# training using tfidf ngram vectorizer
for classifier in [sgd, lr, svc, rf, naive, xgboost]:
    train_model(classifier, xtrain_tfidf_ngram, ytrain_tfidf_ngram, xtest_tfidf_ngram, ytest_tfidf_ngram)

Classifier:  SGDClassifier
Accuracy Score: 0.5150844363304427
-----------------------------------
Classifier:  LogisticRegression
Accuracy Score: 0.40814696485623
-----------------------------------
Classifier:  LinearSVC
Accuracy Score: 0.5138711395101171
-----------------------------------
Classifier:  RandomForestClassifier
Accuracy Score: 0.46890308839190625
-----------------------------------
Classifier:  MultinomialNB
Accuracy Score: 0.4078274760383387
-----------------------------------
Classifier:  XGBClassifier
Accuracy Score: 0.48876464323748675
-----------------------------------


In [74]:
clf = OneVsRestClassifier(MultinomialNB())
clf.fit(xtrain_tfidf_ngram, ytrain_tfidf_ngram)

# predict the labels on test set
predictions_proba = clf.predict_proba(xtest_tfidf_ngram)
predictions = clf.predict(xtest_tfidf_ngram)

In [75]:
predictions

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [76]:
predictions_proba

array([[5.98415507e-04, 3.06810232e-04, 3.93300957e-01, ...,
        7.12616326e-03, 0.00000000e+00, 2.09828287e-03],
       [5.42272967e-04, 2.51180405e-04, 4.01969759e-02, ...,
        7.60882594e-03, 0.00000000e+00, 1.84714786e-03],
       [1.83267169e-03, 1.05214196e-03, 3.30301945e-01, ...,
        1.47626088e-02, 0.00000000e+00, 5.28516758e-03],
       ...,
       [1.45343208e-03, 6.76623362e-04, 4.75996197e-02, ...,
        1.72370218e-02, 0.00000000e+00, 6.06482133e-03],
       [1.19369403e-03, 5.40495957e-04, 7.70728502e-01, ...,
        5.97033866e-03, 0.00000000e+00, 5.30738098e-03],
       [1.26221051e-03, 6.23002459e-04, 4.18347106e-01, ...,
        1.18097278e-02, 0.00000000e+00, 7.26695412e-03]])

In [105]:
test_pred = (clf.predict_proba(xtest_tfidf_ngram) >= 0.23).astype(int)
test_pred

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [106]:
print_ml_score(ytest_tfidf_ngram, test_pred, MultinomialNB())

Classifier:  MultinomialNB
Accuracy Score: 0.5315343070135403
-----------------------------------


In [107]:
from sklearn.metrics import classification_report


In [109]:
print(classification_report(ytest_tfidf_ngram, test_pred))



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         2
           2       0.57      0.67      0.61       128
           3       0.61      0.95      0.74       309
           4       0.65      0.78      0.71       141
           5       0.85      0.67      0.75        85
           6       0.74      0.75      0.75       101
           7       0.60      0.35      0.44        85
           8       0.72      0.63      0.67        76
           9       0.65      0.79      0.72       134
          10       1.00      0.05      0.09        21
          11       0.61      0.67      0.64       109
          12       0.00      0.00      0.00        28
          13       0.00      0.00      0.00        33
          14       0.63      0.67      0.65       106
          15       0.00      0.00      0.00        21
          16       1.00      0.08      0.15        25
          17       1.00    