In [1]:
# Import libraries

import re
import json
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score

nltk.download('stopwords')
nltk.download('omw-1.4')

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import RandomizedSearchCV
from sklearn import linear_model

from collections import Counter
from sklearn.metrics import r2_score, mean_absolute_error
from datetime import datetime
from scipy import sparse
import scipy as sp

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jasoncharnock/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jasoncharnock/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Opening JSON files

f = open('train.json')
data = json.load(f)

g = open('test.json', )
test = json.load(g)

In [3]:
data = pd.DataFrame(data)
test = pd.DataFrame(test)
data.head()

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue
0,0b341b6938308a6d5f47edf490f6e46eae3835fa,Detecting linguistic idiosyncratic interests i...,3188285,Masoud Rouhizadeh,Children with autism spectrum disorder often e...,2014,CLPsych@ACL
1,c682727ee058aadbe9dbf838dcb036322818f588,Bigrams and BiLSTMs Two Neural Networks for Se...,2782720,Yuri Bizzoni,We present and compare two alternative deep ne...,2018,Fig-Lang@NAACL-HLT
2,0f9b5b32229a7245e43754430c0c88f8e7f0d8af,In Factuality: Efficient Integration of Releva...,144748442,Peter Vickers,Visual Question Answering (VQA) methods aim at...,2021,ACL
3,7e8b4cfdc03b59ece2d6b33a217f0abd47f708d9,Variational Graph Autoencoding as Cheap Superv...,46331602,Irene Li,Coreference resolution over semantic graphs li...,2022,ACL
4,07588dd5d0252c7abc99b3834a81bf23741ead4b,LIMIT-BERT : Linguistics Informed Multi-Task BERT,30887404,Junru Zhou,"In this paper, we present Linguistics Informed...",2019,FINDINGS


In [4]:
replace = re.compile('[/(){}\[\]\|@,;]')
replace_symbols = re.compile('[^0-9a-z #+_]')
replace_stopwords = set(stopwords.words('english'))

def clean_labels(labels):
    labels = labels.lower()  # lowercase labels
    labels = labels.replace('-', ' ')
    labels = replace.sub(' ', labels)  # replace REPLACE_BY_SPACE_RE symbols by space in labels
    labels = replace_symbols.sub('', labels)  # delete symbols which are in BAD_SYMBOLS_RE from labels
    labels = ' '.join(word for word in labels.split() if word not in replace_stopwords)  # delete stopwords from labels
    labels = labels.replace(' ', ',')
    return labels

data['title'] = data['title'].apply(clean_labels)
data['abstract'] = data['abstract'].apply(clean_labels)

test['title'] = test['title'].apply(clean_labels)
test['abstract'] = test['abstract'].apply(clean_labels)

In [5]:
data.head(20)
data.loc[data['authorId'] == '4160376']

# Use text blob to filter out words that are not in dictionary 

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue
4588,b04ad4b86f46b0c6670f0da898a1300f385bf3d2,"combining,recurrent,convolutional,neural,netwo...",4160376,Ngoc Thang Vu,"paper,investigates,two,different,neural,archit...",2016,NAACL


In [6]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()  # seperates the sentences to get single words
lemmatizer = nltk.stem.WordNetLemmatizer()  # lemmatizes text

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

data['title'] = data['title'].apply(lemmatize_text)
data['title'] = [','.join(map(str, l)) for l in data['title']]
data['abstract'] = data['abstract'].apply(lemmatize_text)
data['abstract'] = [','.join(map(str, l)) for l in data['abstract']]

test['title'] = test['title'].apply(lemmatize_text)
test['title'] = [','.join(map(str, l)) for l in test['title']]
test['abstract'] = test['abstract'].apply(lemmatize_text)
test['abstract'] = [','.join(map(str, l)) for l in test['abstract']]

In [7]:
#data.head(20)
data.loc[data['authorId'] == '4160376']

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue
4588,b04ad4b86f46b0c6670f0da898a1300f385bf3d2,"combining,recurrent,convolutional,neural,netwo...",4160376,Ngoc Thang Vu,"paper,investigates,two,different,neural,archit...",2016,NAACL


In [8]:
test.head(20)

Unnamed: 0,paperId,title,abstract,year,venue
0,86e1aaa0c47659e08a896e9889384eb1e5401e6a,"exploring,linear,subspace,hypothesis,gender,bi...","bolukbasi,et,al,2016,presents,one,first,gender...",2020,EMNLP
1,8d3076c38f56df22052567f4783c670d8e860f09,"hyknow,end,end,task,oriented,dialog,modeling,h...","task,oriented,dialog,tod,systems,typically,man...",2021,FINDINGS
2,7c400ee676d427eeda1aad5c1c54c316f0b9773d,"multilingual,information,extraction,pipeline,i...","introduce,advanced,information,extraction,pipe...",2018,EMNLP
3,185e7d2a761594451b02ace240356dadad2aef78,"dice,loss,data,imbalanced,nlp,tasks","many,nlp,tasks,tagging,machine,reading,compreh...",2019,ACL
4,e4363d077a890c8d5c5e66b82fe69a1bbbdd5c80,"attention,guided,graph,convolutional,networks,...","dependency,trees,convey,rich,structural,inform...",2019,ACL
5,ca20f8146adf21294f048121f234c449299ef67e,"combining,multiple,knowledge,sources,discourse...","predict,discourse,segment,boundaries,linguisti...",1995,ACL
6,cbdb0d682a59933fb144124f1bfaec0ee3f3b04c,"unsupervised,opinion,summarization,noising,den...","supervised,training,high,capacity,models,large...",2020,ACL
7,477cc0e5533ed08768f83d41718feaf7cbcaf3a6,"towards,understanding,geometry,knowledge,graph...","knowledge,graph,kg,embedding,emerged,active,ar...",2018,ACL
8,53d8b356551a2361020a948f64454a6d599af69f,"prefix,tuning,optimizing,continuous,prompts,ge...","fine,tuning,de,facto,way,leveraging,large,pret...",2021,ACL
9,b9608e053896874d4f7d62f744057cf7105c5c90,"data,driven,enough,revisiting,interactive,inst...","modeling,traditional,nlg,tasks,data,driven,tec...",2018,HRI 2018


In [9]:
#data = data.sample(1000, replace = True)

In [10]:
data['authorId'].value_counts()

1750769       13
1747849       13
51042088      12
2854981       12
3422953       11
              ..
40192974       1
2013172        1
2106294609     1
5677323        1
144928136      1
Name: authorId, Length: 5625, dtype: int64

In [11]:
data['authorId'].head(20)

0        3188285
1        2782720
2      144748442
3       46331602
4       30887404
5       46649145
6        2390150
7      151474408
8        1696542
9      144518416
10    1667898858
11    2072874946
12    1721683964
13     145938140
14      31333199
15     145482266
16      46177105
17       1768065
18      25062613
19       1736049
Name: authorId, dtype: object

In [12]:
title = data['title']
abstract = data['abstract']
year = data['year']
venue = data['venue']
author_id = data['authorId']

In [13]:
title_test = test['title']
abstract_test = test['abstract']
year_test = test['year']
venue_test = test['venue']

In [14]:
vectorizer = HashingVectorizer(ngram_range=(1,2), n_features=2**18) # applies the hashing vectorizer (found to be the best for large text datasets)
transformer = TfidfTransformer()
encoder = LabelEncoder()
count = CountVectorizer()

In [15]:
title_vect = vectorizer.transform(title)
title = transformer.fit_transform(title_vect)
print("Transformed title shape: ", title.shape)
abstract_vect = vectorizer.transform(abstract)
abstract = transformer.fit_transform(abstract_vect)
print("Transformed abstract shape: ", abstract.shape)
venue_vect = vectorizer.transform(venue)
venue = transformer.fit_transform(venue_vect)
print("Transformed venue shape: ", venue.shape)
year = encoder.fit_transform(year)
print("Transformed year shape: ", year.shape)
print(year)

Transformed title shape:  (12129, 262144)
Transformed abstract shape:  (12129, 262144)
Transformed venue shape:  (12129, 262144)
Transformed year shape:  (12129,)
[35 39 42 ... 42 39 35]


In [16]:
title_vect = vectorizer.transform(title_test)
title_test = transformer.transform(title_vect)
print("Transformed title shape: ", title_test.shape)
abstract_vect = vectorizer.transform(abstract_test)
abstract_test = transformer.transform(abstract_vect)
print("Transformed abstract shape: ", abstract_test.shape)
venue_vect = vectorizer.transform(venue_test)
venue_test = transformer.transform(venue_vect)
print("Transformed venue shape: ", venue_test.shape)
year_test = encoder.fit_transform(year_test)
print("Transformed year shape: ", year_test.shape)
print(year_test)

Transformed title shape:  (6531, 262144)
Transformed abstract shape:  (6531, 262144)
Transformed venue shape:  (6531, 262144)
Transformed year shape:  (6531,)
[41 42 39 ... 39  0 40]


In [17]:
# Converting year to a numpy array so that it can be stacked to the label matrix

author_id = np.array(author_id)
author_id = np.reshape(author_id, (12129, 1))

print("Author id array shape: ", author_id.shape)

year = np.array(year)
year = np.reshape(year, (12129, 1))

print("Year array shape: ", year.shape)


Author id array shape:  (12129, 1)
Year array shape:  (12129, 1)


In [18]:
# Converting year to a numpy array so that it can be stacked to the label matrix

year_test = np.array(year_test)
year_test = np.reshape(year_test, (6531, 1))

print("Year array shape: ", year_test.shape)


Year array shape:  (6531, 1)


In [19]:
data = sp.sparse.hstack((abstract, title, venue, year))
X = abstract
y = author_id

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.01, random_state=42)
print(X_train)

  (0, 258795)	0.035023989585094055
  (0, 257647)	0.08848060101451363
  (0, 255850)	-0.060536479459789745
  (0, 255817)	-0.07618787982815582
  (0, 255613)	-0.0471491508805175
  (0, 254172)	0.09528209917977884
  (0, 252630)	0.09528209917977884
  (0, 251637)	-0.035500102219976244
  (0, 250310)	-0.06535445086022985
  (0, 249666)	0.06495128320749166
  (0, 247006)	-0.04006113636975368
  (0, 242543)	0.07770048147384456
  (0, 239330)	-0.06870938996185565
  (0, 237400)	0.02895130010528154
  (0, 231177)	-0.0696921031988147
  (0, 230022)	-0.049662008349155976
  (0, 229945)	-0.03855862468657214
  (0, 228395)	-0.08629100766778991
  (0, 225536)	0.05596019169550275
  (0, 225288)	-0.08450197963910976
  (0, 220555)	-0.060395290150901726
  (0, 219198)	-0.08450197963910976
  (0, 218769)	0.08629100766778991
  (0, 218287)	-0.043267924723303854
  (0, 217564)	-0.04859364954906101
  :	:
  (12006, 31739)	0.030951656807556124
  (12006, 30384)	-0.12544010648852477
  (12006, 26887)	0.07663939610124455
  (12006, 2

In [20]:
model = SGDClassifier(penalty = 'l2', loss = 'huber', learning_rate = 'adaptive', eta0 = 1, alpha = 0.001, random_state =42)
model.fit(X_train, y_train)
print("Accuracy on testing data :", model.score(X_val, y_val))

Accuracy on testing data : 0.08196721311475409


In [21]:
y_pred = model.predict(abstract_test) # predicts the new values of the test set

In [22]:
test['prediction'] = y_pred.tolist() # puts the predicted data to list
#final = test.set_axis(['paperId', 'title', 'abstract', 'year', 'venue', 'prediction'], axis=1, inplace=False) # changes the axis labels
#final = final.drop(labels = ['title', 'year', 'abstract', 'venue'], axis = 1) # drops the labels column to get final result of only paperId & authorId
test

Unnamed: 0,paperId,title,abstract,year,venue,prediction
0,86e1aaa0c47659e08a896e9889384eb1e5401e6a,"exploring,linear,subspace,hypothesis,gender,bi...","bolukbasi,et,al,2016,presents,one,first,gender...",2020,EMNLP,31461304
1,8d3076c38f56df22052567f4783c670d8e860f09,"hyknow,end,end,task,oriented,dialog,modeling,h...","task,oriented,dialog,tod,systems,typically,man...",2021,FINDINGS,146950185
2,7c400ee676d427eeda1aad5c1c54c316f0b9773d,"multilingual,information,extraction,pipeline,i...","introduce,advanced,information,extraction,pipe...",2018,EMNLP,1788050
3,185e7d2a761594451b02ace240356dadad2aef78,"dice,loss,data,imbalanced,nlp,tasks","many,nlp,tasks,tagging,machine,reading,compreh...",2019,ACL,3043830
4,e4363d077a890c8d5c5e66b82fe69a1bbbdd5c80,"attention,guided,graph,convolutional,networks,...","dependency,trees,convey,rich,structural,inform...",2019,ACL,51044403
...,...,...,...,...,...,...
6526,069ebed0ba7adec30faa5c5e008053cf3eefc589,"cas,french,corpus,clinical,cases","textual,corpora,extremely,important,various,nl...",2018,Louhi@EMNLP,2105490
6527,b6e9fdc3e7bc4d379ee733b07199fe2a8336dd94,"dependency,based,bilingual,language,models,reo...","paper,presents,novel,approach,improve,reorderi...",2014,EMNLP,143707112
6528,5019da491732e412fafea4e1511818fd684cc1f1,"complementary,strategies,low,resourced,morphol...","morphologically,rich,languages,challenging,nat...",2018,,2814303
6529,eca16c1c776406abd0d966653a705f945bd4b520,"ungrammaticality,extra,grammaticality,natural,...","among,components,included,natural,language,und...",1979,ACL,3326473


In [23]:
test['prediction'].value_counts()

15161448      110
1747849        86
1683363        68
2854981        66
1750769        65
             ... 
65737670        1
1678747         1
49276525        1
2060291042      1
153371561       1
Name: prediction, Length: 1741, dtype: int64

In [24]:
# To get the file into the predicted.json file required by teachers
#output = final.to_dict(orient='records')
#jsonString = json.dumps(output)
#jsonFile = open('predicted.json', 'w')
#jsonFile.write(jsonString)
#jsonFile.close()

In [25]:
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'huber', 
        'epsilon_insensitive', 'squared_epsilon_insensitive']
penalty = ['l1', 'l2', 'elasticnet']
alpha = [0.0001, 0.001, 0.01, 0.1]
learning_rate = ['constant', 'optimal', 'invscaling', 'adaptive']
l1 = np.arange(0,1,0.01)
l2 = np.arange(0,1,0.01)
class_weight = [{0:i,1:j} for i,j in zip(l1,l2)]
eta0 = [0.1, 1, 10, 100]

param_distributions = dict(loss=loss,
                           penalty=penalty,
                           alpha=alpha,
                           learning_rate=learning_rate,
                           eta0=eta0)
sgd = linear_model.SGDClassifier()
random = RandomizedSearchCV(estimator=sgd, param_distributions=param_distributions, verbose=1, n_jobs=-1, n_iter=50, cv=3)
random_result = random.fit(X_train, y_train)

print('Best Score: ', random_result.best_score_)
print('Best Params: ', random_result.best_params_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  y = column_or_1d(y, warn=True)
Process LokyProcess-4:
Traceback (most recent call last):
  File "/Users/jasoncharnock/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 595, in __call__
    return self.func(*args, **kwargs)
  File "/Users/jasoncharnock/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "/Users/jasoncharnock/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py", line 262, in <listcomp>
    return [func(*args, **kwargs)
  File "/Users/jasoncharnock/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py", line 117, in __call__
    return self.function(*args, **kwargs)
  File "/Users/jasoncharnock/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jasoncharnock/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_stocha

KeyboardInterrupt: 

In [None]:
epoch = 1 # the amount of times that it will run
batchsize = 10000 # the amount of data you put in per batch
model = SGDClassifier(penalty = 'l2', loss = 'huber', learning_rate = 'adaptive', eta0 = 1, alpha = 0.001, random_state = 42) # The classifier to use, this is best applicable to large datasets
batches = int(X_train.shape[0]/batchsize) + 1
samples = X_train.shape[0]
for i in range(epoch):
    for j in range(batches):
        print('in j...', j, j*batchsize, '----2is:',samples, (j+1)*batchsize )
        model.partial_fit(X_train[j*batchsize:min(samples,(j+1)*batchsize)], 
                          y_train[j*batchsize:min(samples,(j+1)*batchsize)], 
                          classes=np.unique(y))
print("Done")
print ("Accuracy on testing data :", model.score(X_val, y_val))

In [None]:
#1.  We are convinced that this is a classification problem. Is there any way to approach this with regression? 
# Or is that only possible for binary classification

#2. Does it make sense to oversample the data that only occurs once? 
# Since we will split the data and this dataset is quite unbalanced, we will get unknown authors in the 
# validation set that are not trained yet. But when we read about oversampling they say that 
# this should be done after splitting

#3. Will it improve performance to add columns such as abstract length, amount of stopwords used, average word count, etc?

# 4. Is it adviced to use the tfidf or the countvectorizer? 

# 5. Is there anything we could improve for the preprocessing of the text? 
# We have already normalized, lemmatized and tokenized the text

# 6. Are we required to use BERT to get an accuracy above 15? This seems more like a deep learning method than ML

# 7. Wouldn't it be better to use all our training data to train for the test data?