In [175]:
# Import libraries

import re
import json
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used further down: the English stop-word list and
# the WordNet data (plus its multilingual extension) read by WordNetLemmatizer.
# Without 'wordnet' the lemmatizer raises LookupError on a machine where the
# corpus has never been installed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import warnings

# NOTE(review): this silences every warning, including pandas / scikit-learn
# deprecation notices; consider narrowing the filter.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guusl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\guusl\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [176]:
# Prediction model using TfidfVectorizer() and SGDClassifier(); CountVectorizer() with LogisticRegression() was insufficient to train on all data from the training set

# Load the raw JSON files. The context managers close both handles as soon as
# parsing finishes, and the encoding is pinned so the result does not depend
# on the platform's default code page.
with open('train.json', encoding='utf-8') as f:
    data = json.load(f)

with open('test.json', encoding='utf-8') as g:
    test = json.load(g)

In [177]:
def createDataFrame(ds, additional_cols=None):
    """Build the full frame and a reduced frame with one combined text column.

    Parameters
    ----------
    ds : list of dict (or anything accepted by ``pd.DataFrame``)
        Records that contain at least 'title', 'abstract', 'year' and 'venue'.
    additional_cols : list of str, optional
        Extra columns to drop from the reduced frame on top of the four text
        columns. Defaults to no extra columns.

    Returns
    -------
    (df, merged) : tuple of pd.DataFrame
        ``df`` holds every input column (with 'year' cast to str) plus 'labels',
        the raw comma-joined text. ``merged`` is ``df`` without the text columns
        and ``additional_cols``; its 'labels' column holds the lowercased,
        letters-only words joined by commas.
    """
    text_cols = ['title', 'abstract', 'year', 'venue']
    # A fresh list per call instead of a shared mutable default argument.
    extra_cols = list(additional_cols) if additional_cols is not None else []

    df = pd.DataFrame(ds)
    df['year'] = df['year'].astype(str)
    # Missing text fields become empty strings so the join cannot hit None/NaN.
    df['labels'] = df[text_cols].fillna('').apply(lambda row: ','.join(row), axis=1)
    merged = df.drop(labels=text_cols + extra_cols, axis=1)

    # regex=True is required: since pandas 2.0 the default is a literal match,
    # which would leave digits and punctuation untouched.
    tokens = (
        merged['labels']
        .str.replace('[^A-Za-z]', ' ', regex=True)
        .str.lower()
        .str.strip()
        .str.split()
    )
    merged['labels'] = [','.join(map(str, words)) for words in tokens]

    return df, merged

In [178]:
# Pre-process the training/validation data.
# `data` keeps every original column plus the raw combined text; `merged_data`
# drops the four text columns and 'paperId', leaving the author columns and a
# single cleaned text column as the only independent variable.
data, merged_data = createDataFrame(data, additional_cols=['paperId'])
data.head()

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue,labels
0,0b341b6938308a6d5f47edf490f6e46eae3835fa,Detecting linguistic idiosyncratic interests i...,3188285,Masoud Rouhizadeh,Children with autism spectrum disorder often e...,2014,CLPsych@ACL,Children with autism spectrum disorder often e...
1,c682727ee058aadbe9dbf838dcb036322818f588,Bigrams and BiLSTMs Two Neural Networks for Se...,2782720,Yuri Bizzoni,We present and compare two alternative deep ne...,2018,Fig-Lang@NAACL-HLT,We present and compare two alternative deep ne...
2,0f9b5b32229a7245e43754430c0c88f8e7f0d8af,In Factuality: Efficient Integration of Releva...,144748442,Peter Vickers,Visual Question Answering (VQA) methods aim at...,2021,ACL,Visual Question Answering (VQA) methods aim at...
3,7e8b4cfdc03b59ece2d6b33a217f0abd47f708d9,Variational Graph Autoencoding as Cheap Superv...,46331602,Irene Li,Coreference resolution over semantic graphs li...,2022,ACL,Coreference resolution over semantic graphs li...
4,07588dd5d0252c7abc99b3834a81bf23741ead4b,LIMIT-BERT : Linguistics Informed Multi-Task BERT,30887404,Junru Zhou,"In this paper, we present Linguistics Informed...",2019,FINDINGS,"In this paper, we present Linguistics Informed..."


In [179]:
# Preview the reduced training frame (author columns plus the combined text).
merged_data.head()

Unnamed: 0,authorId,authorName,labels
0,3188285,Masoud Rouhizadeh,"children,with,autism,spectrum,disorder,often,e..."
1,2782720,Yuri Bizzoni,"we,present,and,compare,two,alternative,deep,ne..."
2,144748442,Peter Vickers,"visual,question,answering,vqa,methods,aim,at,l..."
3,46331602,Irene Li,"coreference,resolution,over,semantic,graphs,li..."
4,30887404,Junru Zhou,"in,this,paper,we,present,linguistics,informed,..."


In [180]:
# Pre-process the test data with the same createDataFrame() helper as the
# training data, but without any extra columns to drop.
test, merged_test = createDataFrame(test)

In [181]:
# Text-cleaning resources used by clean_labels():
#   * `replace`           - punctuation that is turned into a space
#   * `replace_symbols`   - every character that is NOT a digit, a lowercase
#                           letter, a space, '#', '+' or '_' (these are deleted)
#   * `replace_stopwords` - the NLTK English stop-word list (the, a, an, etc.)

# Raw string: '\[', '\]' and '\|' are regex escapes, not Python string escapes;
# a plain literal triggers an invalid-escape-sequence warning.
replace = re.compile(r'[/(){}\[\]\|@,;]')
replace_symbols = re.compile('[^0-9a-z #+_]')
replace_stopwords = set(stopwords.words('english'))

def clean_labels(labels):
    """Normalise one text string: lowercase, strip symbols, drop stop words.

    Characters matched by `replace` become a space, characters matched by
    `replace_symbols` are deleted, and the remaining whitespace-separated words
    are re-joined with single spaces after removing those found in
    `replace_stopwords`.
    """
    spaced = replace.sub(' ', labels.lower())
    stripped = replace_symbols.sub('', spaced)
    kept = [token for token in stripped.split() if token not in replace_stopwords]
    return ' '.join(kept)

# Run clean_labels() over every row of the training and the test text.
merged_data['labels'] = merged_data['labels'].map(clean_labels)
merged_test['labels'] = merged_test['labels'].map(clean_labels)

# Helpers for lemmatize_text(): a whitespace-based word splitter and the
# WordNet lemmatizer.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Return a list with the lemma of every whitespace-separated token in `text`."""
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize BOTH frames so that the text used for prediction goes through the
# same preprocessing as the text used for training. The lemmas of each row are
# re-joined with commas to get back a single string per row.
merged_data['labels'] = [
    ','.join(map(str, lemmas)) for lemmas in merged_data['labels'].apply(lemmatize_text)
]
merged_test['labels'] = [
    ','.join(map(str, lemmas)) for lemmas in merged_test['labels'].apply(lemmatize_text)
]

In [182]:
# Keep only the rows of authors that appear at least 4 times in the dataset.
merged_data = merged_data.groupby('authorId').filter(lambda papers: len(papers) >= 4)

In [183]:
merged_data['labels']  # preview of the cleaned, comma-joined text of the remaining rows

1        present,compare,two,alternative,deep,neural,ar...
6        describe,grammarless,method,simultaneously,bra...
8        paper,describes,limsi,participation,wmt,shared...
10       little,work,modeling,morphological,well,formed...
13       transfer,learning,pre,trained,neural,language,...
                               ...                        
12104    multilingual,representation,embed,word,many,la...
12112    count,based,distributional,semantic,model,suff...
12116    study,event,detection,problem,new,type,extensi...
12119    transformer,widely,used,state,art,machine,tran...
12120    pre,trained,word,embeddings,improve,performanc...
Name: labels, Length: 3268, dtype: object

In [184]:
# Determine independent and dependent variables:
# X is the cleaned text of each row, y is the authorId to predict.
X = merged_data['labels']
y = merged_data['authorId']

In [185]:
# Quick look at the first feature strings.
X.head()

1     present,compare,two,alternative,deep,neural,ar...
6     describe,grammarless,method,simultaneously,bra...
8     paper,describes,limsi,participation,wmt,shared...
10    little,work,modeling,morphological,well,formed...
13    transfer,learning,pre,trained,neural,language,...
Name: labels, dtype: object

In [186]:
# Hold out 15% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.15, random_state=42)
# Word unigrams and bigrams, with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
# NOTE(review): TfidfVectorizer already emits TF-IDF weights, so this transformer
# re-weights them a second time — confirm this is intended.
tfidf_transformer = TfidfTransformer()  # second weighting step applied to the vectorizer output

# Fit the vocabulary on the training text only, then reuse it for the held-out text.
X_train_hashed = vectorizer.fit_transform(X_train)
X_test_hashed = vectorizer.transform(X_test)

# From here on X_train / X_test hold the transformed matrices, not the text Series.
X_train = tfidf_transformer.fit_transform(X_train_hashed)
X_test = tfidf_transformer.transform(X_test_hashed)
X_train

<2777x143803 sparse matrix of type '<class 'numpy.float64'>'
	with 349581 stored elements in Compressed Sparse Row format>

In [187]:
def create_model(batchsize, epoch, random_state=42):
    """Train an SGDClassifier on the training matrix in mini-batches.

    Reads the module-level ``X_train``, ``Y_train`` and ``y``.

    Parameters
    ----------
    batchsize : int
        Number of rows passed to each ``partial_fit`` call.
    epoch : int
        Number of full passes over the training rows.
    random_state : int or None, default 42
        Seed forwarded to SGDClassifier so that repeated runs are comparable.

    Returns
    -------
    SGDClassifier
        The fitted classifier.
    """
    # The classifier to use, this is supposed to be best applicable to large datasets
    model = SGDClassifier(random_state=random_state)
    samples = X_train.shape[0]
    # partial_fit needs the full class list; it is loop-invariant, so compute it once.
    classes = np.unique(y)
    # Plain array => slicing is positional, independent of the Series index.
    targets = np.asarray(Y_train)

    for _ in range(epoch):
        # Stepping by batchsize never yields an empty trailing batch, which the
        # former "int(samples / batchsize) + 1" count produced whenever
        # batchsize divided the number of samples exactly.
        for start in range(0, samples, batchsize):
            stop = min(samples, start + batchsize)
            model.partial_fit(X_train[start:stop], targets[start:stop], classes=classes)
    return model

def best_model():
    """Grid-search batch size and epoch count and return the best classifier.

    Every combination is trained with create_model() and scored on the
    module-level ``X_test`` / ``Y_test``. The classifier that achieved the
    highest accuracy is returned itself, rather than a freshly retrained one
    whose score could differ from the reported best.

    NOTE(review): the split used for selection here is the same one the
    notebook reports accuracy on, so that figure is optimistic.
    """
    best_acc = -1
    best_epoch = -1
    best_batch = -1
    best_clf = None

    # The outer values are passed to create_model() as `batchsize`, i.e. rows
    # per partial_fit() call (the log text calls them "batches").
    for n_batches in [50, 100, 250, 500, 1000]:
        for n_epoch in [5, 10, 25, 50]:
            print("Testing with ", n_batches, " batches and ", n_epoch, " epochs")
            attempt = create_model(n_batches, n_epoch)

            accuracy = attempt.score(X_test, Y_test)
            if accuracy > best_acc:
                best_acc = accuracy
                best_epoch = n_epoch
                best_batch = n_batches
                best_clf = attempt

            print("Accuracy:", accuracy)

    print("Determined best: batches=", best_batch, ", epochs=", best_epoch)
    return best_clf

# Run the grid search and keep the selected classifier.
# NOTE(review): X_test / Y_test are also used inside best_model() to pick the
# hyperparameters, so this accuracy is not an unbiased estimate.
best = best_model()
print("Accuracy on testing data :", best.score(X_test, Y_test))

Testing with  50  batches and  5  epochs
Accuracy: 0.2505091649694501
Testing with  50  batches and  10  epochs
Accuracy: 0.26272912423625255
Testing with  50  batches and  25  epochs
Accuracy: 0.2545824847250509
Testing with  50  batches and  50  epochs
Accuracy: 0.2525458248472505
Testing with  100  batches and  5  epochs
Accuracy: 0.2525458248472505
Testing with  100  batches and  10  epochs
Accuracy: 0.25865580448065173
Testing with  100  batches and  25  epochs
Accuracy: 0.25661914460285135
Testing with  100  batches and  50  epochs
Accuracy: 0.2545824847250509
Testing with  250  batches and  5  epochs
Accuracy: 0.2606924643584521
Testing with  250  batches and  10  epochs
Accuracy: 0.26476578411405294
Testing with  250  batches and  25  epochs
Accuracy: 0.2545824847250509
Testing with  250  batches and  50  epochs
Accuracy: 0.2525458248472505
Testing with  500  batches and  5  epochs
Accuracy: 0.2525458248472505
Testing with  500  batches and  10  epochs
Accuracy: 0.2545824847250

KeyboardInterrupt: 

In [None]:
# https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html

# https://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html#sphx-glr-auto-examples-applications-plot-out-of-core-classification-py
# https://medium.com/mlearning-ai/out-of-core-multi-label-text-classification-with-scikit-learn-14afa4c1bb75
# https://towardsdatascience.com/how-to-make-sgd-classifier-perform-as-well-as-logistic-regression-using-parfit-cc10bca2d3c4

In [None]:
# Turn the cleaned test text into features with the same fitted steps that
# produced X_train: the vectorizer followed by tfidf_transformer. Skipping the
# second step would feed the classifier differently weighted features than
# the ones it was trained on.
new_data = merged_test['labels']
X_new = tfidf_transformer.transform(vectorizer.transform(new_data))
y_pred = best.predict(X_new)  # predicted authorId for every test row

In [None]:
# Attach the predictions to the test frame.
merged_test['prediction'] = y_pred.tolist()
# Rename the three columns; set_axis() returns a new frame by default. The
# `inplace` keyword was dropped because pandas 2.0 removed it (TypeError).
final = merged_test.set_axis(['paperId', 'labels', 'authorId'], axis=1)
# Only paperId and the predicted authorId remain in the final result.
final = final.drop(labels=['labels'], axis=1)

In [None]:
# Display the final paperId / authorId table.
final