In [1]:
# Import libraries

import re
import json
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, HashingVectorizer

nltk.download('stopwords')
nltk.download('omw-1.4')

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import RandomizedSearchCV
from sklearn import linear_model

from collections import Counter

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jasoncharnock/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jasoncharnock/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [13]:
# Opening JSON files

f = open('train.json')
data = json.load(f)

g = open('test.json', )
test = json.load(g)

In [14]:
def createDataFrame(ds, additional_cols=[]):
    df = pd.DataFrame(ds)
    df['year'] = df['year'].astype(str)
    df['labels'] = df[['title', 'abstract', 'year', 'venue']].apply(lambda x: ','.join(x), axis=1)
    merged = df.drop(labels=['title', 'abstract', 'year', 'venue'] + additional_cols, axis=1)
    merged.head()

    # This replaces capital letters, and some symbols. It lowers all the texts, strips and splits and converts it to strings
    labels = merged['labels'].str.replace('[^A-Za-z]', ' ').str.lower().str.strip().str.split()
    merged['labels'] = [','.join(map(str, l)) for l in labels]

    return df, merged

In [15]:
# Pre-processing training/validation data
# It combines the columns that are given in the original dataset, to have only one independent variable column
data, merged_data = createDataFrame(data, ['paperId'])
merged_data.describe()
merged_data.head(1)

Unnamed: 0,authorId,authorName,labels
0,3188285,Masoud Rouhizadeh,"detecting,linguistic,idiosyncratic,interests,i..."


In [16]:
# Pre-processing test data (same process as above, but for test dataset)
test, merged_test = createDataFrame(test, ['paperId'])

In [17]:
merged_data.head()

Unnamed: 0,authorId,authorName,labels
0,3188285,Masoud Rouhizadeh,"detecting,linguistic,idiosyncratic,interests,i..."
1,2782720,Yuri Bizzoni,"bigrams,and,bilstms,two,neural,networks,for,se..."
2,144748442,Peter Vickers,"in,factuality,efficient,integration,of,relevan..."
3,46331602,Irene Li,"variational,graph,autoencoding,as,cheap,superv..."
4,30887404,Junru Zhou,"limit,bert,linguistics,informed,multi,task,ber..."


In [18]:
# Replace stopwords, symbols
# cleans out all the stopwords (the, a, an, etc.)
# removes all symbols and numbers

replace = re.compile('[/(){}\[\]\|@,;]')
replace_symbols = re.compile('[^0-9a-z #+_]')
replace_stopwords = set(stopwords.words('english'))

def clean_labels(labels):
    labels = labels.lower()  # lowercase labels
    labels = replace.sub(' ', labels)  # replace REPLACE_BY_SPACE_RE symbols by space in labels
    labels = replace_symbols.sub('', labels)  # delete symbols which are in BAD_SYMBOLS_RE from labels
    labels = ' '.join(word for word in labels.split() if word not in replace_stopwords)  # delete stopwords from labels
    return labels

merged_data['labels'] = merged_data['labels'].apply(clean_labels)  # applies the above loop to the data
merged_test['labels'] = merged_test['labels'].apply(clean_labels)  # applies the loop to test data

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()  # seperates the sentences to get single words
lemmatizer = nltk.stem.WordNetLemmatizer()  # lemmatizes text

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

merged_data['labels'] = merged_data['labels'].apply(lemmatize_text)
merged_data['labels'] = [','.join(map(str, l)) for l in merged_data['labels']]

merged_test['labels'] = merged_test['labels'].apply(lemmatize_text)
merged_test['labels'] = [','.join(map(str, l)) for l in merged_test['labels']]

In [96]:
merged_data['labels'] = merged_data.labels.apply(lambda x: ', '.join(i[0] for i in Counter(x).most_common(5)))

In [19]:
merged_data['labels']

0        detecting,linguistic,idiosyncratic,interest,au...
1        bigram,bilstms,two,neural,network,sequential,m...
2        factuality,efficient,integration,relevant,fact...
3        variational,graph,autoencoding,cheap,supervisi...
4        limit,bert,linguistics,informed,multi,task,ber...
                               ...                        
12124    smbop,semi,autoregressive,bottom,semantic,pars...
12125    uw,stanford,system,description,aesw,shared,tas...
12126    raw,text,enhanced,universal,dependency,parsing...
12127    neural,network,acceptability,judgment,abstract...
12128    bridging,text,knowledge,frame,framenet,best,cu...
Name: labels, Length: 12129, dtype: object

In [20]:
merged_data = merged_data.sample(100000, replace = True)

In [21]:
# Determine independent and dependent variables 

X = merged_data['labels']
y = merged_data['authorId']

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.15, random_state=42)
vectorizer = HashingVectorizer(ngram_range=(1,2), n_features=2**18) 
# applies the hashing vectorizer (found to be the best for large text datasets)
X_train_hashed = vectorizer.transform(X_train)
tfidf_transformer=TfidfTransformer() # transforms the hashed vectorized texted by using tfidf transformer
# TFIDF works by proportionally increasing the number of times a word appears in the document 
# but is counterbalanced by the number of documents in which it is present
# https://www.analyticsvidhya.com/blog/2021/07/bag-of-words-vs-tfidf-vectorization-a-hands-on-tutorial/
X_train = tfidf_transformer.fit_transform(X_train_hashed)
X_test_hashed = vectorizer.transform(X_test)
X_test = tfidf_transformer.transform(X_test_hashed)
X_train

<85000x262144 sparse matrix of type '<class 'numpy.float64'>'
	with 12418115 stored elements in Compressed Sparse Row format>

In [15]:
# this is the part where the hyperparameter tuning happens

loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
penalty = ['l1', 'l2', 'elasticnet']
alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
learning_rate = ['constant', 'optimal', 'invscaling', 'adaptive']
l1 = np.arange(0,1,0.01)
l2 = np.arange(0,1,0.01)
class_weight = [{0:i,1:j} for i,j in zip(l1,l2)]
eta0 = [0.1, 1, 10, 100]

param_distributions = dict(loss=loss,
                           penalty=penalty,
                           alpha=alpha,
                           learning_rate=learning_rate,
                           eta0=eta0)
sgd = linear_model.SGDClassifier()
random = RandomizedSearchCV(estimator=sgd, param_distributions=param_distributions, verbose=1, n_jobs=-1, n_iter=30, cv=3)
random_result = random.fit(X_train, Y_train)

print('Best Score: ', random_result.best_score_)
print('Best Params: ', random_result.best_params_)

# https://www.kaggle.com/code/tboyle10/hyperparameter-tuning

# dict_keys(['alpha', 'average', 'class_weight', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 
# 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter_no_change', 'n_jobs', 'penalty', 'power_t', 
# 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

Fitting 3 folds for each of 30 candidates, totalling 90 fits


KeyboardInterrupt: 

In [None]:
# This trains the model

epoch = 1 # the amount of times that it will run
batchsize = 1500 # the amount of data you put in per batch
model = SGDClassifier() # The classifier to use, this is supposed to be best applicable to large datasets
batches = int(X_train.shape[0]/batchsize) + 1
samples = X_train.shape[0]
for i in range(epoch):
    for j in range(batches):
        print('in j...', j, j*batchsize, '----2is:',samples, (j+1)*batchsize )
        model.partial_fit(X_train[j*batchsize:min(samples,(j+1)*batchsize)], 
                          Y_train[j*batchsize:min(samples,(j+1)*batchsize)], 
                          classes=np.unique(y))
print ("Accuracy on testing data :", model.score(X_test, Y_test))

In [None]:
# https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html

# https://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html#sphx-glr-auto-examples-applications-plot-out-of-core-classification-py
# https://medium.com/mlearning-ai/out-of-core-multi-label-text-classification-with-scikit-learn-14afa4c1bb75
# https://towardsdatascience.com/how-to-make-sgd-classifier-perform-as-well-as-logistic-regression-using-parfit-cc10bca2d3c4

In [None]:
new_data = merged_test['labels'] # the test data labels that we will apply to transform to numerical
X_new = vectorizer.transform(new_data) # transforms the data by using the vectorizer
y_pred = model.predict(X_new) # predicts the new values of the test set

In [None]:
merged_test['prediction'] = y_pred.tolist() # puts the predicted data to list
final = merged_test.set_axis(['paperId', 'labels', 'authorId'], axis=1, inplace=False) # changes the axis labels
final = final.drop(labels = ['labels'], axis = 1) # drops the labels column to get final result of only paperId & authorId

In [None]:
final

In [None]:
merged_data.loc[merged_data['authorId'] == '1747849']

In [None]:
# To get the file into the predicted.json file required by teachers
#output = final.to_dict(orient='records')
#jsonString = json.dumps(output)
#jsonFile = open('predicted.json', 'w')
#jsonFile.write(jsonString)
#jsonFile.close()