## 0. Imports Preliminaries

In [5]:
#0. Preliminaries
import pandas as pd
import nltk
nltk.download('punkt')
from readability import Readability

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
import numpy as np
from sklearn.model_selection import train_test_split
import re

In [3]:
import spacy

In [47]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [48]:
from transformers import pipeline
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student", max_length = 512, truncation = True)

distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    tokenizer = tokenizer,
    return_all_scores=True
)

  from .autonotebook import tqdm as notebook_tqdm


## 1. Load and Split Data

In [49]:
#1. Import Data
df = pd.read_csv("../data/chunked_author_data.csv")

In [93]:
df_train_val, df_test = train_test_split(df, train_size = 0.8, random_state = 42)
df_train, df_val = train_test_split(df_train_val, train_size = 0.8, random_state = 42)

## 2. Add Features

In [51]:
#Flesch 
def flesch_readability_scale(text):
    try:
      r = Readability(text)
      f = r.flesch()
      score_out = f.score
    #not possible if less than 100 words
    except:
      score_out = np.nan
      
    return score_out

In [52]:
# Sentiment Analysis (Positive Score)
tokenizer_kwargs = {'truncation':True,'max_length':512}
def sentiment_analysis_score(text):
    results_senti = distilled_student_sentiment_classifier(text, **tokenizer_kwargs)
    positive_score = [x['score'] for x in results_senti[0] if x['label'] == 'positive']
    score_out = positive_score[0] if len(positive_score) == 1 else np.nan
    return score_out                          

In [53]:
#punctuation n-grams
punct_vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize, use_idf=False, norm='l1', vocabulary=string.punctuation)

In [54]:
pos_vectorizer = TfidfVectorizer(ngram_range=(2, 4))

In [55]:
stopword_vectorizer = CountVectorizer(ngram_range=(1, 3), tokenizer=nltk.word_tokenize, vocabulary=stopwords.words("english"))

In [56]:
def remove_bullet_points(text):
    bulletpoint_delimiters = re.compile(r'(\(i\)|\(ii\)|•)')
    text = re.sub(bulletpoint_delimiters, "", text)
    return text

In [10]:
nlp = spacy.load("en_core_web_sm")

In [63]:
#remove proper nouns and POS-tag n grams
def POS_preprocessing(text):
    POS_string = ""
    cleaned_string = ""
    list_sentences = nltk.tokenize.sent_tokenize(text)
    for sentence in list_sentences:
        doc = nlp(sentence)
        for token in doc:
            #first add the text back
            string_out = "Propname" if token.pos_ == "PROPN" else token.text
            sep_out = "" if token.pos_ == "PUNCT" else " "
            cleaned_string = cleaned_string + sep_out + string_out
            #second 
            #POS_out = "" if token.pos_ == "PUNCT" else token.pos_
            POS_string = POS_string + " " + token.pos_
    return pd.Series({
        'cleaned_string': cleaned_string,
        'POS_string': POS_string
    })

## 3. Apply Features to Datasets

In [143]:
def preprocess_data(df, text_col, train = False):
    df = df.reset_index(drop = True)
    #these should be run first before cleaning punctuation and private words and stuff
    #df['flesch_score'] = df[text_col].apply(flesch_readability_scale)
    #commented out for now as takes long to run
    #df['sent_score'] = df[text_col].apply(sentiment_analysis_score)
    #train has to be run first - a catch statement for that
    df[text_col] = df[text_col].apply(remove_bullet_points)
    #removing double space should be after removing bullet points! leaves a double space sometimes
    df[['text', 'POS_string']] = df[text_col].apply(POS_preprocessing)

    try:
        punct_features = punct_vectorizer.fit_transform(df[text_col]) if train else punct_vectorizer.transform(df[text_col])
        punct_features_df = pd.DataFrame(punct_features.toarray(), columns=punct_vectorizer.get_feature_names_out()).reset_index(drop = True)
        df = pd.concat([df, punct_features_df], axis = 1)
    except Exception as e:
        print(f"Error In Generating Punctuation N-grams: {e}")

    try:
        POS_features = pos_vectorizer.fit_transform(df['POS_string']) if train else pos_vectorizer.transform(df['POS_string'])
        POS_features_df = pd.DataFrame(POS_features.toarray(), columns=pos_vectorizer.get_feature_names_out()).reset_index(drop = True)
        df = pd.concat([df, POS_features_df], axis = 1)
    except Exception as e:
        print(f"Error In Generating Punctuation N-grams: {e}")
    try:
        stopwords_features = stopword_vectorizer.fit_transform(df[text_col]) if train else stopword_vectorizer.transform(df[text_col])
        stopwords_features_df = pd.DataFrame(stopwords_features.toarray(), columns=stopword_vectorizer.get_feature_names_out()).reset_index(drop = True)
        df = pd.concat([df, stopwords_features_df], axis = 1)
    except Exception as e:
        print(f"Error In Generating Stopword N-grams: {e}")
    
    return df

In [120]:
df_train_processed = preprocess_data(df_train, 'Chunk', train = True)



In [144]:
df_val_processed = preprocess_data(df_val, 'Chunk', train = False)

# 4. Run Models

In [145]:
df_val_processed

Unnamed: 0.1,Unnamed: 0,Author,Pub,Chunk,text,POS_string,!,"""",#,$,...,shouldn,shouldn't,wasn,wasn't,weren,weren't,won,won't,wouldn,wouldn't
0,211,Timo Schick,[' Providing pretrained language models with s...,Providing pretrained language models with sim...,Providing pretrained language models with s...,SPACE NOUN VERB NOUN NOUN ADP ADJ NOUN NOUN C...,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,43,Aman Madaan,[' Reasoning about events and tracking their i...,Humans are adept at anticipating and reasonin...,Humans are adept at anticipating and reason...,SPACE NOUN AUX ADJ ADP VERB CCONJ VERB ADP NO...,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,200,Timo Schick,"[' When trained on large, unfiltered crawls fr...","In this paper, we have shown that large langu...","In this paper, we have shown that large lan...",SPACE ADP DET NOUN PRON AUX VERB SCONJ ADJ N...,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,98,Hugo Touvron,"[' Recently, neural networks purely based on a...","In this paper, we have introduced DeiT, which...","In this paper, we have introduced Propname,...",SPACE ADP DET NOUN PRON AUX VERB PROPN PRON...,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,205,Timo Schick,[' To obtain high-quality sentence embeddings ...,While pretrained language models (PLMs) achie...,While pretrained language models( Propname)...,SPACE SCONJ VERB NOUN NOUN PROPN VERB ADJ N...,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,210,Timo Schick,[' Pretraining deep neural network architectur...,"We have introduced WNLaMPro, a new dataset th...","We have introduced WNLaMPro, a new dataset ...",SPACE PRON AUX VERB ADJ DET ADJ NOUN PRON VE...,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,148,Zhiqing Sun,[' Numerical simulation of non-linear partial ...,"Specifically, in this paper, we focus on traje...","Specifically, in this paper, we focus on traj...",ADV ADP DET NOUN PRON VERB ADP VERB ADJ NO...,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,150,Zhiqing Sun,[' We propose a new paradigm to help Large Lan...,We propose a new paradigm to help Large Langu...,We propose a new paradigm to help Large Pro...,SPACE PRON VERB DET ADJ NOUN PART VERB ADJ PR...,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8,114,Hugo Touvron,[' We propose a simple architecture to address...,Powers of layers consists in iterating a resi...,Propname of layers consists in iterating a ...,SPACE PROPN ADP NOUN VERB ADP VERB DET ADJ NO...,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,129,Zhiqing Sun,[' Supervised Fine-Tuning (SFT) on response de...,The vanilla (stand-alone) reward models in RLH...,The vanilla( stand- alone) reward models in P...,DET NOUN VERB ADV NOUN NOUN ADP PROPN CCON...,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [126]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [146]:
X_train = df_train_processed.drop(columns = ["Author", "Chunk", "text", "POS_string", "Pub"])
y_train = df_train_processed['Author']
X_val = df_val_processed.drop(columns = ["Author", "Chunk", "text", "POS_string", "Pub"])
y_val = df_val_processed['Author']

In [134]:
def run_classifier(X_train, y_train, X_val, y_val, model):
    # Make predictions on the test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # Evaluate the accuracy of the model
    return y_pred

In [147]:
#1. SVC
SVC_model = SVC()
predictions_SVC = run_classifier(X_train, y_train, X_val, y_val, SVC_model)

In [155]:
len(np.where(y_val == predictions_SVC)[0])

39

In [156]:
clf = LogisticRegression(random_state=0)
predictions_LR = run_classifier(X_train, y_train, X_val, y_val, clf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
