In [9]:
import pandas as pd
from tqdm import tqdm
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import spacy
from scipy.sparse import hstack
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler() 
from sklearn.preprocessing import LabelEncoder
import gensim.downloader as api
word_vectors = api.load("word2vec-google-news-300")

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def js_to_df(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            obj = json.loads(line.strip())
            data.append({'string': obj['string'], 'label': obj['label']})
    return pd.DataFrame(data)

train_df = js_to_df('scicite/train.jsonl')
test_df = js_to_df('scicite/test.jsonl')

In [11]:
X_train, y_train = train_df['string'], train_df['label']
X_test, y_test = test_df['string'], test_df['label']

# NN

In [12]:
classifier = MLPClassifier(alpha=0.1)

In [13]:
vectorizer = CountVectorizer()

In [15]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [17]:
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

classifier.fit(X_train_vec, y_train_encoded)

y_test_pred = classifier.predict(X_test_vec)
test_report = classification_report(y_test_encoded, y_test_pred)
print("Test Performence:\n", test_report)

Test Performence:
               precision    recall  f1-score   support

           0       0.82      0.85      0.83       997
           1       0.81      0.75      0.78       605
           2       0.69      0.71      0.70       259

    accuracy                           0.80      1861
   macro avg       0.77      0.77      0.77      1861
weighted avg       0.80      0.80      0.80      1861



## POS+DEP

In [19]:
nlp = spacy.load("en_core_web_sm")

def extract_pos_dep_features(corpus):
    pos_features = []
    dep_features = []
    for sentence in tqdm(corpus):
        doc = nlp(sentence)
        pos_tags = [token.pos_ for token in doc]
        dep_rels = [token.dep_ for token in doc]
        pos_features.append(" ".join(pos_tags))
        dep_features.append(" ".join(dep_rels))
    return pos_features, dep_features


def feature_engineering(corpus, fit_vectorizers=False):
    pos_features, dep_features = extract_pos_dep_features(corpus)
    
    global pos_vectorizer, dep_vectorizer, vectorizer
    if fit_vectorizers:
        pos_vectorizer = TfidfVectorizer()
        dep_vectorizer = TfidfVectorizer()
        vectorizer = CountVectorizer()
        pos_tfidf = pos_vectorizer.fit_transform(pos_features)
        dep_tfidf = dep_vectorizer.fit_transform(dep_features)
        tfidf_features = vectorizer.fit_transform(corpus)
    else:
        pos_tfidf = pos_vectorizer.transform(pos_features)
        dep_tfidf = dep_vectorizer.transform(dep_features)
        tfidf_features = vectorizer.transform(corpus)

    combined_features = hstack([pos_tfidf, dep_tfidf, tfidf_features])
    combined_features = combined_features.tocsr()
    return combined_features

In [20]:
train_features =  feature_engineering(X_train, fit_vectorizers=True)
test_features = feature_engineering(X_test, fit_vectorizers=False)

100%|██████████████████████████████████████| 8243/8243 [01:03<00:00, 128.90it/s]
100%|██████████████████████████████████████| 1861/1861 [00:14<00:00, 131.66it/s]


In [22]:
classifier.fit(train_features, y_train_encoded)

y_test_pred = classifier.predict(test_features)
test_report = classification_report(y_test_encoded, y_test_pred)
print("Test Performence:\n", test_report)

Test Performence:
               precision    recall  f1-score   support

           0       0.83      0.81      0.82       997
           1       0.80      0.77      0.78       605
           2       0.66      0.76      0.71       259

    accuracy                           0.79      1861
   macro avg       0.76      0.78      0.77      1861
weighted avg       0.79      0.79      0.79      1861



## number -> < NUM >

In [24]:
nlp = spacy.load('en_core_web_sm')

def number_tokenize(text, token_replacement='<NUM>'):
    doc = nlp(text)
    return " ".join([token_replacement if token.like_num else token.text for token in doc])

X_train_tokenized = X_train.apply(lambda x: number_tokenize(x))
X_test_tokenized = X_test.apply(lambda x: number_tokenize(x))

In [45]:
classifier = MLPClassifier(alpha=0.1)

vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train_tokenized)
X_test_vec = vectorizer.transform(X_test_tokenized)

classifier.fit(X_train_vec, y_train_encoded)

y_test_pred = classifier.predict(X_test_vec)
test_report = classification_report(y_test_encoded, y_test_pred)
print("Test Performence:\n", test_report)

Test Performence:
               precision    recall  f1-score   support

           0       0.83      0.83      0.83       997
           1       0.81      0.76      0.79       605
           2       0.65      0.74      0.69       259

    accuracy                           0.80      1861
   macro avg       0.77      0.78      0.77      1861
weighted avg       0.80      0.80      0.80      1861



## word embedding

In [29]:
def vectorize_text(text_list, model):
    vectorized = []
    for text in text_list:
        words = [word for word in text.lower().split() if word in model.key_to_index]
        if words:
            vectorized.append(np.mean(model[words], axis=0))
        else:
            vectorized.append(np.zeros(model.vector_size))
    return np.array(vectorized)

In [41]:
classifier = MLPClassifier(alpha=0.1, max_iter=400)

X_train_vec = vectorize_text(X_train, word_vectors)
X_test_vec = vectorize_text(X_test, word_vectors)

classifier.fit(X_train_vec, y_train_encoded)

y_test_pred = classifier.predict(X_test_vec)
test_report = classification_report(y_test_encoded, y_test_pred)
print("Test Performence:\n", test_report)

Test Performence:
               precision    recall  f1-score   support

           0       0.80      0.82      0.81       997
           1       0.81      0.75      0.78       605
           2       0.60      0.65      0.63       259

    accuracy                           0.77      1861
   macro avg       0.74      0.74      0.74      1861
weighted avg       0.78      0.77      0.77      1861



In [43]:
X_train_vec = vectorize_text(X_train_tokenized, word_vectors)
X_test_vec = vectorize_text(X_test_tokenized, word_vectors)

classifier.fit(X_train_vec, y_train_encoded)

y_test_pred = classifier.predict(X_test_vec)
test_report = classification_report(y_test_encoded, y_test_pred)
print("Test Performence:\n", test_report)

Test Performence:
               precision    recall  f1-score   support

           0       0.81      0.79      0.80       997
           1       0.79      0.78      0.78       605
           2       0.61      0.71      0.65       259

    accuracy                           0.77      1861
   macro avg       0.74      0.76      0.75      1861
weighted avg       0.78      0.77      0.77      1861

