## 0. Imports and Preliminaries

In [1]:
!pip install nltk py-readability-metrics spacy==3.6.1 keras==2.8

In [2]:
!pip install textatistic

In [3]:
#0. Preliminaries
import pandas as pd
import nltk, warnings
nltk.download('punkt')
warnings.filterwarnings('ignore')
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
from readability import Readability

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
import numpy as np
from sklearn.model_selection import train_test_split
import re
import spacy

In [5]:
from nltk.corpus import stopwords
nltk.download('stopwords')

In [6]:
from textatistic import Textatistic

In [7]:
from transformers import pipeline
from transformers import AutoTokenizer
# NOTE(review): `max_length`/`truncation` are passed at load time here, and again per call via
# `tokenizer_kwargs` in `sentiment_analysis_score` — confirm the load-time arguments are needed.
tokenizer = AutoTokenizer.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student", max_length = 512, truncation = True)

# Sentiment pipeline using the tokenizer above. `sentiment_analysis_score` reads the per-label
# scores it returns and keeps the one labelled 'positive'.
distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    tokenizer = tokenizer,
    return_all_scores=True
)

## 1. Load Data

In [8]:
#1. Import Data
# Load the chunked corpus; later cells read its 'Author', 'Chunk' and 'Type' columns.
df = pd.read_csv("../data/chunked_author_data_UTF8.csv")

In [9]:
# Number of rows per distinct 'Author' value.
df['Author'].value_counts()

In [10]:
# Total number of rows loaded.
len(df)

## 2. Clean Data 

In [11]:
def regex(string: str):
    """
    Normalise a raw text chunk with a fixed sequence of regular-expression substitutions.

    The substitutions run in this order (each step sees the output of the previous one):
    the abbreviations "e.g." / "i.e." lose their dots, URLs become the token "weblink",
    hyphens become spaces, every digit becomes "0", parenthesised and bracketed spans are
    dropped, newlines and tabs become spaces, every character other than ASCII letters,
    digits, whitespace and ``.,?!:;`` is removed, and runs of whitespace collapse to one space.

    :param string: The input string.
    :return: The modified string, stripped of leading and trailing whitespace.
    """
    # NOTE: the parameter name shadows the imported `string` module, but only inside this function.
    string = re.sub(r'e\.g\.', 'eg', string)  # replace e.g. with eg
    string = re.sub(r'i\.e\.', 'ie', string)  # replace i.e. with ie
    # The character class contains \n, so a match runs across line breaks until the first
    # character outside the class (e.g. a space).
    string = re.sub(r'https?://[a-zA-Z0-9\n./-]+', "weblink", string)
    string = re.sub(r'-', ' ', string)  # replace - with space
    string = re.sub(r'[0-9]', '0', string)  # each digit will be represented as a 0
    # NOTE(review): `.` does not match a newline and newlines are only replaced further down,
    # so a parenthetical/bracketed span that crosses a line break is not removed here.
    string = re.sub(r'\(.*?\)', '', string)  # remove parentheses and the text within
    string = re.sub(r'\[.*?\]', '', string)  # remove brackets and the text within
    string = re.sub(r'[\n\t]', " ", string)  # newlines and tabs become spaces
    # Raw string: the previous plain literal relied on invalid escape sequences (\s, \., \?).
    string = re.sub(r'[^A-Za-z0-9\s\.,\?!:;]+', '', string)  # remove special characters, so math formulas are simplified
    string = re.sub(r'\s\s+', ' ', string)  # collapse runs of two or more whitespace characters
    return string.strip()  # remove extra spaces at the beginning and end

assert regex("a\t\n b") == "a b"
assert regex("q123") == "q000"
assert regex("a (something something) b (sth th)") == "a b"
assert regex("a [something something] b [sth12 th]") == "a b"
assert regex("2 + 5 % 2") == "0 0 0"
assert regex(",!;:.?") == ",!;:.?"

## 3. Add Features

In [12]:
# Sentiment Analysis (Positive Score)
tokenizer_kwargs = dict(truncation=True, max_length=512)

def sentiment_analysis_score(text):
    """
    Return the 'positive' score the sentiment classifier assigns to ``text``.

    The classifier is called with truncation to 512 tokens. Only the first element of its
    result is inspected; if it does not contain exactly one entry labelled 'positive',
    ``np.nan`` is returned instead.
    """
    label_scores = distilled_student_sentiment_classifier(text, **tokenizer_kwargs)[0]
    positives = [entry['score'] for entry in label_scores if entry['label'] == 'positive']
    if len(positives) != 1:
        return np.nan
    return positives[0]

In [13]:
def lexical_diversity(text):
    """
    Type/token ratio of ``text``: number of distinct tokens divided by number of tokens.

    Tokens come from ``nltk.word_tokenize`` and are compared case-sensitively. Text that
    yields no tokens returns ``np.nan`` instead of raising ``ZeroDivisionError``, so one
    empty chunk cannot abort a whole ``DataFrame.apply``.
    """
    words = nltk.word_tokenize(text)
    if not words:
        return np.nan
    return len(set(words)) / len(words)

assert lexical_diversity('a a b') == 2/3
assert np.isnan(lexical_diversity(''))

In [14]:
from nltk.tokenize import sent_tokenize, word_tokenize

def avg_word_per_sentence(text):
    """
    Mean number of tokens per sentence in ``text``.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``, so punctuation
    tokens are counted too (see the example below).
    """
    sentence_lengths = []
    for sentence in sent_tokenize(text):
        sentence_lengths.append(len(word_tokenize(sentence)))
    return np.mean(sentence_lengths)

assert avg_word_per_sentence('I like muffins. Please buy me two of them.') == 5.5

In [15]:
def avg_word_length(text):
    """
    Mean character length of the tokens in ``text``.

    The text is split into sentences and each sentence into tokens; punctuation tokens
    are included in the average (see the example below).
    """
    token_lengths = []
    for sentence in sent_tokenize(text):
        token_lengths.extend(len(word) for word in word_tokenize(sentence))
    return np.mean(token_lengths)

assert avg_word_length('I like giant muffins. Please buy me two of them.') == 3.25

In [16]:
# spaCy pipeline used by POS_preprocessing for part-of-speech tags.
# NOTE(review): the install cells above do not download `en_core_web_sm` — confirm it is available.
nlp = spacy.load("en_core_web_sm")

In [17]:
#remove proper nouns and POS-tag n grams
def POS_preprocessing(text):
    """
    Mask proper nouns in ``text`` and build the matching part-of-speech tag sequence.

    The text is split into sentences and each sentence is run through the spaCy pipeline
    ``nlp``. Returns a ``pd.Series`` with two entries:

    * ``cleaned_string``: the tokens re-joined, with every PROPN token replaced by the
      literal "Propname"; a space precedes every token except PUNCT tokens.
    * ``POS_string``: the ``pos_`` tag of every token (punctuation included), each
      preceded by a space.
    """
    text_pieces = []
    tag_pieces = []
    for sentence in nltk.tokenize.sent_tokenize(text):
        for token in nlp(sentence):
            surface = "Propname" if token.pos_ == "PROPN" else token.text
            separator = "" if token.pos_ == "PUNCT" else " "
            text_pieces.append(separator + surface)
            tag_pieces.append(" " + token.pos_)
    return pd.Series({
        'cleaned_string': "".join(text_pieces),
        'POS_string': "".join(tag_pieces)
    })

In [18]:
def remove_bullet_points(text):
    """Delete the list markers "(i)", "(ii)" and "•" from ``text``; nothing else is changed."""
    return re.sub(r'(\(i\)|\(ii\)|•)', "", text)

In [19]:
#df['flesch_score'] = df['Chunk'].apply(flesch_readability_scale)
def updated_Flesch_score(text):
    """
    Compute readability statistics for ``text`` with Textatistic.

    Returns a 4-tuple: (flesch_score, word_count, sent_count, sybl_count).
    """
    stats = Textatistic(text)
    return (stats.flesch_score, stats.word_count, stats.sent_count, stats.sybl_count)

# Expand the per-row tuples into four columns, computed on the raw 'Chunk' text.
df[['flesch_score_v2','word_count', 'sent_count', 'sybl_count' ]] = (
    df['Chunk']
    .apply(updated_Flesch_score)
    .apply(pd.Series)
)

In [20]:
# Positive-sentiment score per raw chunk (np.nan when no single 'positive' entry is returned).
df['sent_score'] = df['Chunk'].apply(sentiment_analysis_score)

In [21]:
# Normalise the raw chunk text (abbreviations, URLs, digits, parentheticals, special characters, whitespace).
df['re_text'] = df['Chunk'].apply(regex)
# (original note, meaning unclear) train has to be run first - a catch statement for that
# NOTE(review): `regex` has already removed "(...)" spans and the "•" character by this point,
# so this call likely has nothing left to remove — confirm the intended order.
df['re_text'] = df['re_text'].apply(remove_bullet_points)
#removing double space should be after removing bullet points! leaves a double space sometimes
# Overwrite 're_text' with the proper-noun-masked text and add the matching POS tag string.
df[['re_text', 'POS_string']] = df['re_text'].apply(POS_preprocessing)
# The remaining stylometric features are computed on the masked text.
df['lexical_diversity'] = df['re_text'].apply(lexical_diversity)
df['avg_word_per_sentence'] = df['re_text'].apply(avg_word_per_sentence)
df['avg_word_length'] = df['re_text'].apply(avg_word_length)

In [22]:
# Spot-check the cleaned text of the row labelled 38.
df.loc[38, 're_text']

In [23]:
# Persist the feature frame.
# NOTE(review): the index is written as well, so reading this file back adds an unnamed index column.
df.to_csv("../data/df_with_features.csv")

In [24]:
#df = pd.read_csv("../data/df_with_features.csv")

In [25]:
#df = df.drop(columns="text")

## Inspect Features

In [26]:
# Per-author mean of every crafted feature and raw count.
feature_stats = (
    df[['Author', 'flesch_score_v2', 'word_count', 'sent_count', 'sybl_count', 'sent_score', 'lexical_diversity', 'avg_word_per_sentence', 'avg_word_length']]
    .groupby('Author')
    .mean()
    .reset_index()
)

In [27]:
feature_stats

In [28]:
# Per-author mean syllable, word and sentence counts ...
statistics = (
    df[['Author', 'sybl_count', 'word_count', 'sent_count']]
    .groupby("Author")
    .mean()
    .reset_index()
)
# ... and the ratios of those means (words per sentence, syllables per word).
statistics['word_sent_ratio'] = statistics['word_count'] / statistics['sent_count']
statistics['syl_word_ratio'] = statistics['sybl_count'] / statistics['word_count']

In [29]:
statistics

In [30]:
#feature_stats.to_clipboard(float_format='%.2f', index=False)

In [31]:
#statistics.to_clipboard(float_format='%.2f', index=False)

### Compare with GPT-data

In [32]:
# Table of generated texts; the next cell reads its 'Author', 'GPT_abstract',
# 'GPT_introduction' and 'GPT_conclusion' columns.
GPT_data = pd.read_csv("../data/GPT_data_UTF8_additional_2.csv")

In [33]:
# Use a fresh 0..n-1 index in both frames.
GPT_data = GPT_data.reset_index(drop = True)
# Rows of the feature frame whose author label is "GPT-3.5".
df_w_features_GPT = df[df['Author'] == "GPT-3.5"].reset_index(drop = True)
# Will hold the 'Author' value of the GPT_data row in which each chunk is found;
# stays "" when no row contains the chunk.
df_w_features_GPT['Author_org'] = ""

for df_row in df_w_features_GPT.index:
    # Search key: characters 1..499 of the chunk (the character at index 0 is skipped).
    text = df_w_features_GPT.loc[df_row, 'Chunk'][1:500]
    type_text= df_w_features_GPT.loc[df_row, 'Type']
    # Column to search; any type other than the two named here falls through to "GPT_conclusion".
    col_type = "GPT_abstract" if type_text == "abstract_chunked" else ("GPT_introduction" if type_text == "intro_chunked" else "GPT_conclusion")
    for texts in GPT_data.index:
        # Substring test. There is no `break`, so if several rows match, the last one wins.
        if text in GPT_data.loc[texts, col_type]:
            df_w_features_GPT.loc[df_row, 'Author_org'] = GPT_data.loc[texts, 'Author']

In [34]:
# Generated chunks for which the substring search above found no source row.
df_w_features_GPT.loc[df_w_features_GPT['Author_org'] == "", :]

In [35]:
#checked manually
# Every still-unmatched row is assigned to one fixed author.
# NOTE(review): this hard-coded value only holds for the data the manual check was done on.
df_w_features_GPT.loc[df_w_features_GPT['Author_org'] == "", "Author_org"] = "Hugo Touvron"

In [36]:
# Mean crafted features of the generated chunks, grouped by the matched source author.
stats_GPT_v_authors = (
    df_w_features_GPT[['Author_org', 'flesch_score_v2', 'sent_score', 'lexical_diversity', 'avg_word_per_sentence', 'avg_word_length']]
    .groupby('Author_org')
    .mean()
    .reset_index()
)

In [37]:
stats_GPT_v_authors

In [38]:
# Write the comparison table to the working directory, two decimals, without the index.
stats_GPT_v_authors.to_csv("test_gpt.csv", float_format='%.2f', index=False)

# 3. Split Data

In [39]:
def select_equal_groups(group: pd.DataFrame, n: int, random_state: int = 42):
    """
    Sample at most ``n`` rows from one group.

    Intended for ``DataFrame.groupby(...).apply``, which passes each group as a DataFrame.
    Groups with fewer than ``n`` rows are returned in full (in sampled order).

    :param group: The rows of a single group.
    :param n: Maximum number of rows to keep.
    :param random_state: Seed forwarded to ``DataFrame.sample``; the default reproduces
        the previously hard-coded value.
    :return: A DataFrame with ``min(n, len(group))`` rows.
    """
    return group.sample(min(n, len(group)), random_state=random_state)

share_train = 0.7
share_test = 0.3
# Training budget: len(df) * 0.7 * (1 - 0.3), i.e. 49% of all rows.
# NOTE(review): confirm that multiplying both shares is intended.
samples_train = int(len(df)*(share_train*(1-share_test)))
# Per-author cap. NOTE(review): 5 is presumably the number of distinct authors — confirm.
samples_per_group = int(samples_train/5)

# Row identifier, used below to exclude the sampled training rows.
df['ID'] = range(0, len(df))
# Training candidates: rows whose 'Type' does not contain "conclusion_chunked".
df_abstract_intro = df[~df['Type'].str.contains("conclusion_chunked")]
# Sample up to `samples_per_group` rows per author from the candidates.
df_train= df_abstract_intro.groupby("Author", group_keys=False)\
        .apply(select_equal_groups, samples_per_group)\
        .reset_index(drop=True)

# Everything that was not sampled into the training set.
df_test_val = df[~df['ID'].isin(df_train['ID'])]
# Non-conclusion remainder, split 50/50 (stratified by author) into validation and test set 0.
df_val = df_test_val[~df_test_val['Type'].str.contains("conclusion_chunked")]
df_val, df_test_0  = train_test_split(df_val, train_size = 0.5, stratify = df_val['Author'], random_state = 42)
# Test set 1: the conclusion chunks, none of which were candidates for training.
df_test_1 = df_test_val[df_test_val['Type'].str.contains("conclusion_chunked")]
#df_test, df_val  = train_test_split(df_test_val, train_size = 0.5, stratify = df_test_val['Author'], random_state = 42)

In [40]:
# Rows per author in the training set.
df_train['Author'].value_counts()

In [41]:
# Rows per author in the validation set.
df_val['Author'].value_counts()

In [42]:
# Rows per author in test set 0 (non-conclusion chunks).
df_test_0['Author'].value_counts()

In [43]:
# Rows per author in test set 1 (conclusion chunks).
df_test_1['Author'].value_counts()

In [44]:
# Combined non-conclusion evaluation rows, test set 0 first.
# NOTE(review): the feature matrices and `df_test_final` later concatenate in the opposite
# order (validation first); `df_test` is not referenced again in the visible cells.
df_test = pd.concat([df_test_0, df_val])
print(df_test['Author'].value_counts())

In [45]:
# Number of conclusion chunks in test set 1.
len(df_test_1)

In [46]:
#punctuation n-grams
# Term frequencies without IDF, L1-normalised per document, over the fixed vocabulary
# `string.punctuation`.
# NOTE(review): `ngram_range` is left at its default although the header says n-grams, and the
# vocabulary is a plain string (presumably read as one term per character) — confirm both.
punct_vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize, use_idf=False, norm='l1', vocabulary=string.punctuation)

In [47]:
# TF-IDF over 2- to 4-grams; `preprocess_data` applies it to the 'POS_string' column.
pos_vectorizer = TfidfVectorizer(ngram_range=(2, 4))

In [48]:
# TF-IDF restricted to NLTK's English stop-word list as a fixed vocabulary.
# NOTE(review): ngram_range=(1, 3) is requested, but presumably only n-grams that literally
# appear in that vocabulary are counted — confirm whether 2-/3-grams contribute anything.
stopword_vectorizer = TfidfVectorizer(ngram_range=(1, 3), tokenizer=nltk.word_tokenize, vocabulary=stopwords.words("english"))

In [49]:
# TF-IDF over word 1- to 3-grams; `preprocess_data` applies it after lowercasing and after
# punctuation and stop words have been removed.
word_vectorizer = TfidfVectorizer(ngram_range=(1, 3))

# 4. Tokenize Datasets

In [50]:
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
def clean_punct(test_str):
    for ele in test_str:
        if ele in punc:
            test_str = test_str.replace(ele, "")
    return test_str

In [51]:
def remove_stopwords(example_text):
    """
    Drop English stop words from ``example_text``.

    The text is tokenised with ``word_tokenize``; tokens whose lowercase form is in NLTK's
    English stop-word list are removed and the rest are joined with single spaces.
    """
    # Build the lookup once per call: the previous version evaluated
    # stopwords.words("english") for every token and searched the resulting list linearly.
    stop_set = set(stopwords.words("english"))
    word_tokens = word_tokenize(example_text)
    return " ".join(w for w in word_tokens if w.lower() not in stop_set)

In [52]:
def _append_ngram_features(df, vectorizer, column, prefix, label, train):
    """
    Vectorise ``df[column]`` and append the result to ``df`` as ``<prefix>_<term>`` columns.

    Best effort, as before: any exception is printed with the feature family's ``label``
    and ``df`` is returned unchanged.
    """
    try:
        texts = df[column]
        features = vectorizer.fit_transform(texts) if train else vectorizer.transform(texts)
        columns = [f'{prefix}_{c}' for c in vectorizer.get_feature_names_out()]
        features_df = pd.DataFrame(features.toarray(), columns=columns)
        return pd.concat([df, features_df], axis = 1)
    except Exception as e:
        print(f"Error In Generating {label} N-grams: {e}")
        return df


def preprocess_data(df, text_col, train = False):
    """
    Append the four n-gram feature families to a copy of ``df``.

    Steps, in order:
      1. punctuation features from ``df[text_col]`` (``punct_*`` columns),
      2. POS n-grams from ``df['POS_string']`` (``pos_*`` columns),
      3. lowercase ``text_col`` and strip punctuation with ``clean_punct``,
      4. stop-word features from the lowercased text (``stop_*`` columns),
      5. remove stop words, then word n-grams (``word_*`` columns).

    :param df: Frame holding ``text_col`` and 'POS_string'; it is re-indexed 0..n-1 first,
        so the caller's frame is not modified.
    :param text_col: Name of the text column; in the returned frame this column holds the
        lowercased, punctuation- and stop-word-free text.
    :param train: If True the module-level vectorizers are fitted on this data; otherwise
        the already fitted vectorizers are applied.
    :return: The re-indexed frame with the feature columns appended. A feature family whose
        generation raised is reported via ``print`` and left out.
    """
    df = df.reset_index(drop = True)
    #these should be run first before cleaning punctuation and private words and stuff
    df = _append_ngram_features(df, punct_vectorizer, text_col, 'punct', 'Punctuation', train)
    df = _append_ngram_features(df, pos_vectorizer, 'POS_string', 'pos', 'POS', train)

    #lowercase everything
    df[text_col] = df[text_col].apply(str.lower)
    df[text_col] = df[text_col].apply(clean_punct)
    df = _append_ngram_features(df, stopword_vectorizer, text_col, 'stop', 'Stopword', train)

    #remove stopwords here
    df[text_col] = df[text_col].apply(remove_stopwords)
    df = _append_ngram_features(df, word_vectorizer, text_col, 'word', 'Word', train)
    return df

In [53]:
# train=True: the four module-level vectorizers are fitted on the training text here.
df_train_processed = preprocess_data(df_train, 're_text', train = True)

In [54]:
# train=False: reuse the vectorizers fitted on the training set.
df_val_processed = preprocess_data(df_val, 're_text', train = False)

In [55]:
# train=False: reuse the vectorizers fitted on the training set.
df_test_0_processed = preprocess_data(df_test_0, 're_text', train = False)

In [56]:
# train=False: reuse the vectorizers fitted on the training set (conclusion chunks).
df_test_1_processed = preprocess_data(df_test_1, 're_text', train = False)

In [57]:
# df_val_processed.to_pickle("./df_val_processed.pkl")
# df_test_0_processed.to_pickle("./df_test_0_processed.pkl")
# df_test_1_processed.to_pickle("./df_test_1_processed.pkl")

In [58]:
# Columns dropped from every split before modelling: labels, raw/cleaned text, metadata
# and the raw word/sentence/syllable counts.
non_feature_columns = ["Author", "Chunk", "re_text", "POS_string", "Type", "Pub", "ID", "Unnamed: 0", "word_count", "sent_count", "sybl_count"]

X_train_df = df_train_processed.drop(columns = non_feature_columns)
y_train = df_train_processed['Author']
# Feature names in training-column order, reused for importances and explanations.
colnames_reg = X_train_df.columns

X_val_df = df_val_processed.drop(columns = non_feature_columns)
y_val = df_val_processed['Author']

X_test_0_df = df_test_0_processed.drop(columns = non_feature_columns)
y_test_0 = df_test_0_processed['Author']

X_test_1_df = df_test_1_processed.drop(columns = non_feature_columns)
y_test_1 = df_test_1_processed['Author']

In [59]:
from sklearn.preprocessing import StandardScaler
# Standardise the features; the scaler is fitted on the training set only and reused below.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_df)
X_val_scaled = scaler.transform(X_val_df)

In [60]:
# Apply the training-set scaling to both test sets.
X_test_0_scaled = scaler.transform(X_test_0_df)
X_test_1_scaled = scaler.transform(X_test_1_df)

# 5. Exploring the features

## Feature correlation

In [61]:
import matplotlib.pyplot as plt
import scipy
import scipy.cluster.hierarchy as sch

def plot_corr(df, size=10):
    '''Draw the feature-by-feature correlation matrix of a dataframe as a heat map.

    Input:
        df: pandas DataFrame whose columns are the variables to correlate
        size: width and height of the (square) figure'''

    # np.corrcoef on the transposed frame is used instead of df.corr(), which is far slower here.
    corr_matrix = np.corrcoef(df.T)
    n_variables = len(corr_matrix)

    fig, ax = plt.subplots(figsize=(size, size))
    heatmap = ax.matshow(corr_matrix, cmap='RdYlGn')
    # Only the two end ticks are drawn on each axis.
    for set_ticks in (ax.set_xticks, ax.set_yticks):
        set_ticks([0, n_variables])

    # Colour-bar legend
    fig.colorbar(heatmap, ticks=[-1, 0, 1], aspect=40, shrink=.8)

plot_corr(X_train_df, size=18)

### Crafted features

In [62]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# NOTE(review): `feature_names` is not used in this cell.
feature_names = colnames_reg.values.tolist()

ax = plt.axes()

# The hand-crafted (non n-gram) features.
target_features = ['flesch_score_v2', 'sent_score', 'lexical_diversity', 'avg_word_per_sentence', 'avg_word_length']
# Guard: these must be exactly the X_train_df columns without an n-gram prefix, in this order.
assert [f for f in X_train_df.columns if not f.startswith('punct_') and not f.startswith('word_') and not f.startswith('pos_') and not f.startswith('stop_')] == target_features

# Correlation matrix of the crafted features, colour scale fixed to [-1, 1].
im = ax.imshow(np.corrcoef(X_train_df[target_features].T), cmap="RdBu_r", vmin=-1, vmax=1)
ax.set_xticks(list(range(len(target_features))))
ax.set_xticklabels(target_features, rotation=90)
ax.set_yticks(list(range(len(target_features))))
ax.set_yticklabels(list(target_features))

plt.colorbar(im).ax.set_ylabel("$r$", rotation=0)
ax.set_title("Feature correlation matrix")
plt.tight_layout()

### Principal Component Analysis

In [63]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

feature_names = colnames_reg.values.tolist()

# Fit a PCA with as many components as the data allows, to inspect the variance curve.
# NOTE(review): it is fitted on the unscaled `X_train_df`, not `X_train_scaled` — confirm.
pca = PCA(n_components=min(len(feature_names), len(X_train_df.index)), random_state = 42)
# Only the fitted state is used below; the transformed output is discarded.
pca.fit_transform(X_train_df)
fig, ax = plt.subplots()
ax.set_ylim([0, 1])
# Cumulative explained-variance ratio, with a leading 0 so the curve starts at zero components.
var_explained = pca.explained_variance_ratio_.cumsum()
var_explained = np.insert(var_explained, 0, 0)

components = list(range(len(var_explained)))
ax.plot(components, var_explained)
# NOTE(review): the x axis counts principal components although the label says "features".
ax.set(xlabel='Number of features', ylabel='Variance explained.',
       title='PCA analysis')
ax.grid()

fig.savefig("pca_analysis.png")
plt.show()

In [64]:
# Keep only the first 25 components.
# Fitted on the training set only, then applied to validation and both test sets.
# NOTE(review): the inputs are the unscaled feature frames, not the standardised arrays — confirm.
pca = PCA(n_components=25, random_state = 42)
X_train_pca = pca.fit_transform(X_train_df)
X_val_pca = pca.transform(X_val_df)
X_test_0_pca = pca.transform(X_test_0_df)
X_test_1_pca = pca.transform(X_test_1_df)


# 6. Modelling

In [65]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold 

## Grid search

### Logistic regression

In [66]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Also suppress warnings in other processes during grid search.
os.environ["PYTHONWARNINGS"] = "ignore"
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

def grid_search(model, params, X_scaled, X_pca, y, cv=KFold(5, shuffle=True, random_state=42), n_jobs=-1):
    """
    Run the same cross-validated grid search on two feature representations.

    :param model: Estimator (here a Pipeline) to tune; GridSearchCV works on clones of it.
    :param params: Parameter grid passed to GridSearchCV.
    :param X_scaled: Standardised feature matrix.
    :param X_pca: PCA-reduced feature matrix.
    :param y: Target labels.
    :param cv: Cross-validation splitter; defaults to a seeded, shuffled 5-fold split.
    :param n_jobs: Number of parallel workers. Defaults to -1 (all available cores) instead
        of the previously hard-coded 128, which oversubscribes machines with fewer cores.
    :return: Tuple ``(search_on_scaled, search_on_pca)`` of fitted GridSearchCV objects.
    """
    # Kept from the original; NaN/inf in the feature matrices are replaced the same way below.
    y = np.nan_to_num(y)
    searches = []
    for X in (X_scaled, X_pca):
        search = GridSearchCV(model, params, cv=cv, verbose=1, n_jobs=n_jobs)
        search.fit(np.nan_to_num(X), y)
        searches.append(search)
    gridSearch_scaled, gridSearch_pca = searches
    return gridSearch_scaled, gridSearch_pca

In [67]:
# Logistic regression: L2 vs. L1 penalty with the saga solver, searched on both the scaled
# and the PCA features.
# NOTE(review): `max_iter` is left at its default and warnings are silenced in this notebook,
# so a convergence warning would not be visible — confirm the fits converge.
logi_scaled, logi_pca = grid_search(
    Pipeline([('logi', LogisticRegression())]),
    {'logi__penalty': ['l2', 'l1'],
     'logi__solver': ['saga'],
     'logi__random_state': [42]},
    X_train_scaled,
    X_train_pca,
    y_train)

In [68]:
# Cross-validation results on the scaled features, best-ranked configuration first.
logi_scaled_best_params = pd.DataFrame.from_dict(logi_scaled.cv_results_).sort_values(by=['rank_test_score'])
logi_scaled_best_params

In [69]:
# Cross-validation results on the PCA features, best-ranked configuration first.
logi_pca_best_params = pd.DataFrame.from_dict(logi_pca.cv_results_).sort_values(by=['rank_test_score'])
logi_pca_best_params

### Random forest

In [70]:
# Random forest: number of trees x split criterion, searched on both feature representations.
rf_scaled, rf_pca = grid_search(
    Pipeline([('rf', RandomForestClassifier())]),
    {'rf__n_estimators': [10, 100, 500, 1000],
     'rf__criterion': ['gini', 'entropy'],
     'rf__random_state': [42]},
    X_train_scaled,
    X_train_pca,
    y_train)

In [71]:
# Cross-validation results on the scaled features, best-ranked configuration first.
rf_scaled_best_params = pd.DataFrame.from_dict(rf_scaled.cv_results_).sort_values(by=['rank_test_score'])
rf_scaled_best_params

In [72]:
# Cross-validation results on the PCA features, best-ranked configuration first.
rf_pca_best_params = pd.DataFrame.from_dict(rf_pca.cv_results_).sort_values(by=['rank_test_score'])
rf_pca_best_params

### Support vector classification

In [73]:
# Support vector classifier: kernel x C, searched on both feature representations.
svc_scaled, svc_pca = grid_search(
    Pipeline([('svc', SVC())]),
    {'svc__kernel': ['linear', 'rbf', 'poly'],
     'svc__C': [0.5, 1., 2.],
     'svc__random_state': [42]},
    X_train_scaled,
    X_train_pca,
    y_train)

In [74]:
# Cross-validation results on the scaled features, best-ranked configuration first.
svc_scaled_best_params = pd.DataFrame.from_dict(svc_scaled.cv_results_).sort_values(by=['rank_test_score'])
svc_scaled_best_params

In [75]:
# Cross-validation results on the PCA features, best-ranked configuration first.
svc_pca_best_params = pd.DataFrame.from_dict(svc_pca.cv_results_).sort_values(by=['rank_test_score'])
svc_pca_best_params

## Run the best models from each gridsearch

In [76]:
#merge val and test_0
# Row order: validation rows first, then test set 0.
X_test_scaled = np.concatenate((X_val_scaled, X_test_0_scaled))

In [77]:
# Sanity check: the merged matrix should have the rows of both parts and the same columns.
print(X_val_scaled.shape)
print(X_test_0_scaled.shape)
print(X_test_scaled.shape)

In [78]:
# Same merge (validation first, then test set 0) for the PCA features.
X_test_pca = np.concatenate((X_val_pca, X_test_0_pca))

In [79]:
# Labels in the same order as the merged feature matrices.
y_test = np.concatenate((y_val, y_test_0))

In [80]:
def run_classifier(X_val, X_test, model):
    """
    Predict with an already fitted ``model`` on two feature matrices.

    No fitting and no scoring happens here.

    :param X_val: First feature matrix.
    :param X_test: Second feature matrix.
    :param model: Fitted estimator exposing ``predict``.
    :return: Tuple ``(predictions for X_val, predictions for X_test)``.
    """
    first_predictions = model.predict(X_val)
    second_predictions = model.predict(X_test)
    return first_predictions, second_predictions

In [81]:
# from sklearn.inspection import permutation_importance
# perm_importance = permutation_importance(SVC_model, X_test_scaled, y_test)
# features_array = np.array(colnames_reg)
# sorted_idx = perm_importance.importances_mean.argsort()
# sorted_idx_10 = sorted_idx[0:10]
# plt.barh(features_array[sorted_idx_10], perm_importance.importances_mean[sorted_idx_10])
# plt.xlabel("Permutation Importance")

In [82]:
import eli5

In [83]:
#1. SVC
# Best-ranked parameter set from the scaled-feature grid search.
params = svc_scaled_best_params.iloc[0].params
# Pins the expected winner (presumably from a recorded run); fails if a re-run selects differently.
assert params == {'svc__C': 0.5, 'svc__kernel': 'linear', 'svc__random_state': 42}
# Refit on the full training set with those parameters.
SVC_model = SVC(kernel = params['svc__kernel'], C = params['svc__C'], random_state = 42)
SVC_model.fit(X_train_scaled, y_train)
# Predict on the merged val+test_0 set and on the conclusion set (test_1).
predictions_SVC, predictions_SVC_conclusion = run_classifier(X_test_scaled, X_test_1_scaled, SVC_model)

In [84]:
#1a. SVC PCA
# Best-ranked parameter set from the PCA-feature grid search.
params = svc_pca_best_params.iloc[0].params
# Pins the expected winner (presumably from a recorded run); fails if a re-run selects differently.
assert params == {'svc__C': 1.0, 'svc__kernel': 'linear', 'svc__random_state': 42}
# Refit on the full training set (PCA features) with those parameters.
SVC_model_pca = SVC(kernel = params['svc__kernel'], C = params['svc__C'], random_state = 42)
SVC_model_pca.fit(X_train_pca, y_train)
# Predict on the merged val+test_0 set and on the conclusion set (test_1).
predictions_SVC_pca, predictions_SVC_pca_conclusion = run_classifier(X_test_pca, X_test_1_pca, SVC_model_pca)

In [85]:
#2. RF
# Best-ranked parameter set from the scaled-feature grid search.
params = rf_scaled_best_params.iloc[0].params
# Pins the expected winner (presumably from a recorded run); fails if a re-run selects differently.
assert params == {'rf__criterion': 'gini', 'rf__n_estimators': 500, 'rf__random_state': 42}
# Refit on the full training set with those parameters.
RF_model = RandomForestClassifier(criterion = params['rf__criterion'], n_estimators = params['rf__n_estimators'], random_state = 42)
RF_model.fit(X_train_scaled, y_train)
# Predict on the merged val+test_0 set and on the conclusion set (test_1).
predictions_RF, predictions_RF_conclusion = run_classifier(X_test_scaled, X_test_1_scaled, RF_model)

In [86]:
#2a. RF PCA
# Best-ranked parameter set from the PCA-feature grid search.
params = rf_pca_best_params.iloc[0].params
# Pins the expected winner (presumably from a recorded run); fails if a re-run selects differently.
assert params == {'rf__criterion': 'gini', 'rf__n_estimators': 1000, 'rf__random_state': 42}
# Refit on the full training set (PCA features) with those parameters.
RF_model_pca = RandomForestClassifier(criterion = params['rf__criterion'], n_estimators = params['rf__n_estimators'], random_state = 42)
RF_model_pca.fit(X_train_pca, y_train)
# Predict on the merged val+test_0 set and on the conclusion set (test_1).
predictions_RF_pca, predictions_RF_pca_conclusion = run_classifier(X_test_pca, X_test_1_pca, RF_model_pca)

In [87]:
# Random-forest feature importances, labelled with the training column names.
forest_importances = pd.Series(RF_model.feature_importances_, index=colnames_reg)

In [88]:
# The 50 features with the highest importance.
forest_importances.sort_values(ascending=False).head(50)

In [89]:
#3. LR
#same parameters for all data and final 
# Best-ranked parameter set from the scaled-feature grid search.
params = logi_scaled_best_params.iloc[0].params
# Pins the expected winner (presumably from a recorded run); fails if a re-run selects differently.
assert params == {'logi__penalty': 'l2', 'logi__random_state': 42, 'logi__solver': 'saga'}
LR_model = LogisticRegression(penalty = params['logi__penalty'], solver = params['logi__solver'], random_state = 42)
LR_model.fit(X_train_scaled, y_train)

# Same procedure for the PCA features.
params = logi_pca_best_params.iloc[0].params
assert params == {'logi__penalty': 'l2', 'logi__random_state': 42, 'logi__solver': 'saga'}
LR_model_pca = LogisticRegression(penalty = params['logi__penalty'], solver = params['logi__solver'], random_state = 42)

LR_model_pca.fit(X_train_pca, y_train)
# Predict on the merged val+test_0 set and on the conclusion set (test_1) with both models.
predictions_LR, predictions_LR_conclusion = run_classifier(X_test_scaled, X_test_1_scaled, LR_model)
predictions_LR_pca, predictions_LR_pca_conclusion = run_classifier(X_test_pca, X_test_1_pca, LR_model_pca)

In [90]:
# Same row order as X_test_scaled / X_test_pca (validation rows first, then test set 0), so
# the prediction arrays are expected to line up with the rows positionally.
df_test_final = pd.concat([df_val, df_test_0])
df_test_final['prediction_SVC'] = predictions_SVC
df_test_final['prediction_SVC_pca'] = predictions_SVC_pca
df_test_final['prediction_RF'] = predictions_RF
df_test_final['prediction_RF_pca'] = predictions_RF_pca
df_test_final['prediction_LR'] = predictions_LR
df_test_final['prediction_LR_pca'] = predictions_LR_pca
df_test_final.to_csv("../data/predictions_test_v2.csv")

In [91]:
# Attach the conclusion-set predictions to test set 1 and save them.
# NOTE(review): df_test_1 was created by filtering df_test_val, so these assignments may raise
# pandas' SettingWithCopyWarning (warnings are silenced in this notebook) — consider an explicit copy.
df_test_1.loc[:, 'prediction_SVC'] = predictions_SVC_conclusion
df_test_1.loc[:, 'prediction_SVC_pca'] = predictions_SVC_pca_conclusion
df_test_1.loc[:, 'prediction_RF'] = predictions_RF_conclusion
df_test_1.loc[:, 'prediction_RF_pca'] = predictions_RF_pca_conclusion
df_test_1.loc[:, 'prediction_LR'] = predictions_LR_conclusion
df_test_1.loc[:, 'prediction_LR_pca'] = predictions_LR_pca_conclusion
df_test_1.to_csv("../data/predictions_test_conclusion_v2.csv")

In [92]:
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, recall_score, f1_score, precision_score

# Metric name -> callable(y_true, y_pred). Recall and precision are micro-averaged,
# F1 is macro-averaged.
default_metrics = {
    'accuracy': accuracy_score,
    'recall': lambda y_t, y_p: recall_score(
        y_t, y_p, zero_division="warn", average='micro'),
    'precision': lambda y_t, y_p: precision_score(
        y_t, y_p, zero_division="warn", average='micro'),
    'f1': lambda y_t, y_p: f1_score(
        y_t, y_p, zero_division= "warn", average ='macro'),
}

# Author name -> integer class code used for the confusion matrix and the metrics.
mapping_authors = {
    'Aman Madaan': 1,
    'Hugo Touvron': 2,
    'Timo Schick': 3,
    'Zhiqing Sun': 4,
    'GPT-3.5': 5,
}

def plot_accuracy(df, column, metrics):
    """
    Show a confusion matrix and print metrics for one prediction column.

    :param df: Frame holding the true label in 'Author' and predictions in ``column``.
    :param column: Name of the prediction column to evaluate.
    :param metrics: Mapping of metric name to a callable taking ``(y_true, y_pred)``.
    :return: None; the plot is shown and the computed metric dict is printed.
    """
    # Both label columns are translated to the integer codes above.
    y_true, y_pred = (df[name].map(mapping_authors) for name in ('Author', column))

    # Count confusion matrix over all five classes, with anonymised display names.
    ConfusionMatrixDisplay.from_predictions(
        y_true,
        y_pred,
        labels=[1, 2, 3, 4, 5],
        display_labels=['Author 1', 'Author 2', 'Author 3', 'Author 4', 'GPT-3.5'],
    )
    plt.show()

    scores = {}
    for metric_name, metric_func in metrics.items():
        scores[metric_name] = metric_func(y_true, y_pred)
    print(scores)

In [93]:
# Merged validation + test_0 set: logistic regression on scaled features.
plot_accuracy(df_test_final, 'prediction_LR', default_metrics)

In [94]:
# Merged validation + test_0 set: logistic regression on PCA features.
plot_accuracy(df_test_final, 'prediction_LR_pca', default_metrics)

In [95]:
# Merged validation + test_0 set: SVC on scaled features.
plot_accuracy(df_test_final, 'prediction_SVC', default_metrics)

In [96]:
# Merged validation + test_0 set: SVC on PCA features.
plot_accuracy(df_test_final, 'prediction_SVC_pca', default_metrics)

In [97]:
# Merged validation + test_0 set: random forest on scaled features.
plot_accuracy(df_test_final, 'prediction_RF', default_metrics)

In [98]:
# Merged validation + test_0 set: random forest on PCA features.
plot_accuracy(df_test_final, 'prediction_RF_pca', default_metrics)

In [99]:
# Conclusion chunks (test set 1): SVC on PCA features.
plot_accuracy(df_test_1, 'prediction_SVC_pca', default_metrics)

In [100]:
# Conclusion chunks (test set 1): SVC on scaled features.
plot_accuracy(df_test_1, 'prediction_SVC', default_metrics)

In [101]:
# Conclusion chunks (test set 1): logistic regression on scaled, then on PCA features.
plot_accuracy(df_test_1, 'prediction_LR', default_metrics)
plot_accuracy(df_test_1, 'prediction_LR_pca', default_metrics)

In [102]:
# Conclusion chunks (test set 1): random forest on scaled, then on PCA features.
plot_accuracy(df_test_1, 'prediction_RF', default_metrics)
plot_accuracy(df_test_1, 'prediction_RF_pca', default_metrics)

In [103]:
import eli5

In [104]:
# Feature names as a plain list, passed to eli5 as `feature_names` below.
colnames_reg_list = colnames_reg.tolist()

In [105]:
# Weights of the logistic-regression model trained on the scaled features, by feature name.
eli5.show_weights(LR_model, feature_names = colnames_reg_list)

In [106]:
predictions_LR_conclusion

In [107]:
# Replace the index carried over from `df` with 0..n-1.
df_test_final = df_test_final.reset_index(drop = True)

In [108]:
# Evaluation rows whose true author is GPT-3.5, with all model predictions.
df_test_final[df_test_final['Author'] == "GPT-3.5"]

In [109]:
# Top 20 entries of eli5's weight table for the random forest trained on scaled features.
eli5.show_weights(RF_model, feature_names = colnames_reg_list, top = 20)

In [110]:
df_test_final

In [111]:
# Rebuilds the same feature-name list as the earlier `colnames_reg_list` cell.
colnames_reg_list = colnames_reg.to_list()
# Explain the random-forest prediction for the row at position 100 of the merged evaluation matrix.
# NOTE(review): 100 is an arbitrary example index — confirm which sample is meant.
eli5.show_prediction(RF_model, X_test_scaled[100], top =20, feature_names = colnames_reg_list)

In [112]:
df_test_1

# Agglomerative clustering

In [113]:
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering

def plot_dendrogram(model, **kwargs):
    """
    Plot a dendrogram for a fitted agglomerative-clustering ``model``.

    Builds a linkage matrix with the columns (child 1, child 2, merge distance, number of
    samples under the merged node) from ``model.children_``, ``model.distances_`` and
    ``model.labels_`` and passes it, with ``kwargs``, to ``dendrogram``.
    """
    n_leaves = len(model.labels_)
    subtree_sizes = np.zeros(model.children_.shape[0])
    for merge_idx, children in enumerate(model.children_):
        # An index below n_leaves is an original sample (size 1); anything else refers to
        # an earlier merge whose size is already known.
        subtree_sizes[merge_idx] = sum(
            1 if child < n_leaves else subtree_sizes[child - n_leaves]
            for child in children
        )

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, subtree_sizes]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)


# distance_threshold=0 with n_clusters=None, as set up for plot_dendrogram, which reads
# `model.distances_`.
model = AgglomerativeClustering(distance_threshold = 0, n_clusters=None)

# NOTE(review): clustering runs on the unscaled training features — confirm.
model = model.fit(X_train_df)
plt.title("Hierarchical Clustering Dendrogram")
# plot the top levels of the dendrogram (truncated with truncate_mode="level", p=2)
plot_dendrogram(model, truncate_mode="level", p=2)
plt.xlabel("Number of papers in node (or index of paper if the point has no parenthesis).")
plt.show()