In [22]:
import json
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import re
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MultiLabelBinarizer
import nlp_preproc_functions as preproc
import ast

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

os.chdir('/Users/jon/Documents/DSDM/term_2/adv_meth_nlp/nlp-final/')

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package punkt to /Users/jon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/jon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/jon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data exploration

## Compare small and full dataset

First load the dataset.

In [None]:
# Fetch the English configuration of MultiEURLEX, one split at a time.
# NOTE: the original author reported that this call breaks on their machine.
# Later cells rebind these three names to DataFrames read from local CSV files.
train_dataset, test_dataset, val_dataset = [
    load_dataset('multi_eurlex', 'en', split=split_name)
    for split_name in ('train', 'test', 'validation')
]

In [None]:
# Small scratch frame of timestamps, including values that cannot be parsed.
# errors='coerce' turns an unparseable string (here a French month name) into
# NaT instead of raising, so the cell runs and the frame holds two valid
# timestamps followed by two missing ones.
_raw_dates = ['2022-01-05', '2021-5-23', '13 octobre 1968']
test_df = pd.DataFrame(
    [pd.to_datetime(raw, errors='coerce') for raw in _raw_dates] + [pd.NaT]
)

`eurovoc_id` is the ID of a certain area of government activity.

`level` refers to how specific a topic is. For example, "European Parliament" is level 0, whereas "quantum computing" is level 7.

In [None]:
# Descriptor lookup: indexed further down as eurovoc_descriptors[eurovoc_id]['en']
with open('./data/eurovoc_descriptors.json') as descriptors_fh:
    eurovoc_descriptors = json.load(descriptors_fh)

# Label feature object of each split; process_sample calls its int2str()
# to turn an integer label id into a EuroVoc id.
train_classlabel, test_classlabel, val_classlabel = [
    split.features["labels"].feature
    for split in (train_dataset, test_dataset, val_dataset)
]

# Concept lookup: process_sample iterates it as (level, ids) pairs and checks
# whether a EuroVoc id is contained in `ids`.
with open('./data/eurovoc_concepts.json') as concepts_fh:
    eurovoc_concepts = json.load(concepts_fh)


Let's take a look at the class imbalance across our target labels.

In [None]:
def process_sample(sample, classlabel, descriptors=None, concepts=None):
    """Expand one document row into one record per assigned label.

    Parameters
    ----------
    sample : mapping (e.g. a DataFrame row)
        Must provide ``'celex_id'`` and ``'labels'``. ``'labels'`` may be a
        string such as ``"[3 7 12]"`` or ``"[3, 7, 12]"``, or any object whose
        ``str()`` contains the integer label ids.
    classlabel : object with ``int2str(label_id)``
        Translates an integer label id into a EuroVoc id.
    descriptors : dict, optional
        ``{eurovoc_id: {'en': description, ...}}``. Defaults to the
        notebook-level ``eurovoc_descriptors``.
    concepts : dict, optional
        ``{level: collection_of_eurovoc_ids}``. Defaults to the notebook-level
        ``eurovoc_concepts``.

    Returns
    -------
    list of dict
        One dict per label with keys ``celex_id``, ``label_id``,
        ``eurovoc_id``, ``eurovoc_desc`` and ``eurovoc_level`` (``None`` when
        the id is not listed under any level). Empty when there are no labels.

    Raises
    ------
    KeyError
        If a EuroVoc id has no entry (or no ``'en'`` entry) in ``descriptors``.
    """
    if descriptors is None:
        descriptors = eurovoc_descriptors
    if concepts is None:
        concepts = eurovoc_concepts

    # Pull out every run of digits so that both "[1 2 3]" and "[1, 2, 3]"
    # (as well as actual lists/arrays) parse correctly.
    label_ids = [int(token) for token in re.findall(r'\d+', str(sample['labels']))]

    results = []
    for label_id in label_ids:
        # Resolve the EuroVoc id once and reuse it for every lookup below.
        eurovoc_id = classlabel.int2str(label_id)
        results.append({
            'celex_id': sample['celex_id'],
            'label_id': label_id,
            'eurovoc_id': eurovoc_id,
            'eurovoc_desc': descriptors[eurovoc_id]['en'],
            # First level whose id collection contains this id, else None
            'eurovoc_level': next(
                (level for level, ids in concepts.items() if eurovoc_id in ids),
                None,
            ),
        })
    return results

def get_agg_df(dataset, classlabel):
    """Build a long-format frame with one row per (document, label) pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows are passed one by one to ``process_sample``.
    classlabel : object with ``int2str(label_id)``
        Forwarded unchanged to ``process_sample``.

    Returns
    -------
    pandas.DataFrame
        All records returned by ``process_sample`` stacked under a fresh
        0..n-1 index. Empty (no columns) when no row yields a record.
    """
    # Flatten every per-row record list and build the frame once; this avoids
    # creating one small DataFrame per document and also copes with an empty
    # input, for which pd.concat([]) would raise.
    records = [
        record
        for _, sample in dataset.iterrows()
        for record in process_sample(sample, classlabel)
    ]
    return pd.DataFrame(records)

# Read datasets.
# NOTE: this rebinds train_dataset / test_dataset / val_dataset, which an
# earlier cell assigned from load_dataset(), to plain pandas DataFrames.
train_dataset = pd.read_csv('./data/full_english_dataset/train_en')
test_dataset = pd.read_csv('./data/full_english_dataset/test_en')
val_dataset = pd.read_csv('./data/full_english_dataset/validation')

# Work on a 10% sample of each split. The seed is fixed so that the chart
# below shows the same sample on every run.
SAMPLE_FRAC = 0.1
SAMPLE_SEED = 42
train_sample = train_dataset.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
test_sample = test_dataset.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
val_sample = val_dataset.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)

# Generate aggregated DataFrames (one row per document/label pair)
train_agg_df = get_agg_df(train_sample, train_classlabel)
test_agg_df = get_agg_df(test_sample, test_classlabel)
val_agg_df = get_agg_df(val_sample, val_classlabel)

# Tag each frame with the split it came from, then stack them
df = pd.concat(
    [
        train_agg_df.assign(source='train'),
        test_agg_df.assign(source='test'),
        val_agg_df.assign(source='val'),
    ],
    ignore_index=True,
)
df['source'] = df['source'].astype('string')

# Count label occurrences per split: rows are descriptors, columns are splits.
label_counts = (
    df.groupby(['eurovoc_desc', 'source'])
    .size()
    .reset_index(name='count')
    .pivot_table(index='eurovoc_desc', columns='source', values='count')
)

# Normalize each split's column by its own total so the splits are comparable.
# The totals are taken from the counts table itself; the previous version
# referenced df_pivot inside its own definition, which fails on a fresh kernel
# and silently reuses stale totals on a re-run.
df_pivot = (
    label_counts
    .divide(label_counts.sum(axis=0), axis=1)
    .sort_values(by='train', ascending=False)
)

# Grouped bar chart: for every descriptor, three side-by-side bars
# (train / test / val) showing that split's share of label occurrences.
fig, ax = plt.subplots(figsize=(10,6))

width = 0.2
x = np.arange(len(df_pivot))

# Bars are drawn in train, test, val order, shifted around each x position
for offset, source_name in ((-width, 'train'), (0, 'test'), (width, 'val')):
    ax.bar(x + offset, df_pivot[source_name], width, label=source_name)

ax.set_title('Occurrences of eurovoc_desc by source')
ax.set_xlabel('eurovoc_desc')
ax.set_ylabel('Percentage')
ax.legend(title='Source', loc='upper right')

# Tick positions are offset from the group centres; the rotated labels are
# right-aligned on them.
xticks = x - width / 2 + 0.5
ax.set_xticks(xticks)
ax.set_xticklabels(df_pivot.index, rotation=45, ha='right')

plt.show()


Note that this graph is based on a 10% sample of each split, so the exact share of each individual label may differ in the full data. Even so, it is clear that the dataset is class-imbalanced.

# Model and Pipeline

I haven't built a preprocessing pipeline yet, but here is a pipeline with the Logistic Regression model, and all the preprocessing steps are collected in one place below.

For our baseline modeling, we will evaluate the efficiency of three methods:
* Random assignment
* Word frequency
* Logistic regression

## Preprocessing

In [None]:
# Load the complete train / test / validation splits from disk
df_train, df_test, df_val = [
    pd.read_csv(csv_path)
    for csv_path in (
        './data/full_english_dataset/train_en',
        './data/full_english_dataset/test_en',
        './data/full_english_dataset/validation',
    )
]



In [None]:
# Preview the first 20 rows of the raw training frame
df_train.head(20)

In [None]:
# Check the Python type of the first entry of the raw `labels` column
type(df_train['labels'][0])

In [48]:
def process_all(df):
    """Run every preprocessing step on one raw split and return a new frame.

    The steps are delegated to helpers in ``nlp_preproc_functions``; the
    step descriptions below are the original author's, since the helpers
    themselves are not visible from this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw split with at least the columns ``labels``, ``text`` and
        ``celex_id``. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed frame: ``labels`` is replaced by one ``label_<class>``
        indicator column per class seen in this split, and ``pub_date`` and
        ``pp_text`` plus whatever columns the helpers add are included.

    Notes
    -----
    The label binarizer is fitted on the frame that is passed in, so the set
    of ``label_*`` columns depends on which labels occur in that split.
    """
    # Work on a copy so the column assignments below do not leak into the
    # caller's raw frame.
    df = df.copy()

    # fix labels
    df['labels'] = df['labels'].apply(preproc.clean_label)
    # get publishing date feature
    df['pub_date'] = df['text'].apply(preproc.extract_date)
    df = preproc.impute_timestamps(df, 'pub_date')
    # apply standard preprocessing
    df['pp_text'] = df['text'].apply(preproc.preprocess_text)
    # add stopwords that occur once in >=50% of documents
    df = preproc.remove_common_words(df, 'pp_text', 'celex_id', threshold=0.5)

    # Get labels as dummies. The indicator frame reuses df's index so that the
    # column-wise concat pairs each row with its own labels even if a helper
    # above dropped or reordered rows.
    mlb = MultiLabelBinarizer()
    labels_df = pd.DataFrame(
        mlb.fit_transform(df['labels']),
        columns=mlb.classes_,
        index=df.index,
    ).add_prefix('label_')
    df = pd.concat([df.drop(columns='labels'), labels_df], axis=1)

    # get document type feature
    df = preproc.get_eu_legal_type(df, 'text')
    return df


# read the full dataframes
df_train = pd.read_csv('./data/full_english_dataset/train_en')
df_test = pd.read_csv('./data/full_english_dataset/test_en')
df_val = pd.read_csv('./data/full_english_dataset/validation')

# process every split; the validation split is included because the
# evaluation cells further down read df_val_pp
df_test_pp = process_all(df_test)
print('test done')
df_val_pp = process_all(df_val)
print('val done')
df_train_pp = process_all(df_train)
print('train done')

#save
df_train_pp.to_csv('train_pp.csv', index=False)
df_test_pp.to_csv('test_pp.csv', index=False)
df_val_pp.to_csv('val_pp.csv', index=False)


test done


## Logistic Regression Model

In [None]:
# Restore the previously fitted model from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# that you produced yourself.
with open('./log_reg_model.pkl', mode='rb') as model_fh:
    best_model = pickle.load(model_fh)



In [None]:
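# NOTE: everything in this cell is intentionally commented out. It appears to
# be kept as a record of how the model stored in log_reg_model.pkl (loaded in
# the cell above) was trained and saved. Re-running it would also require
# importing RandomizedSearchCV from sklearn.model_selection, and the dump
# below would have to run after best_model has been fitted.

# import pickle

# with open('log_reg_model.pkl', 'wb') as file:
#     pickle.dump(best_model, file)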


# # Define the preprocessing steps for the textual data
# text_preprocessor = Pipeline(steps=[
#     ('count_vect', CountVectorizer(ngram_range=(1,2))), 
#     ('scale', StandardScaler(with_mean=False))
#     ])

# # Define the preprocessing steps for the categorical data
# categorical_preprocessor = Pipeline(steps=[('scaler', StandardScaler())])

# # Define the column transformer to apply the different preprocessing steps to the different columns
# preprocessor = ColumnTransformer(transformers=[
#     ('text', text_preprocessor, 'pp_text'),
#     ('date', StandardScaler(), ['pub_date']),
#     ('categorical', categorical_preprocessor, ['commission', 'regulation', 
#                                                 'decision', 'council', 'directive', 
#                                                 'parliament', 'committee'])
# ])

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(df_train_pp[['pp_text', 'pub_date', 
#                                                          'commission', 'regulation', 'decision', 
#                                                          'council', 'directive', 'parliament', 
#                                                          'committee']], 
#                                                     df_train_pp[['label_0', 'label_1', 'label_2', 'label_3', 
#                                                         'label_4', 'label_5', 'label_6', 'label_7', 
#                                                         'label_8', 'label_9', 'label_10', 'label_11', 
#                                                         'label_12', 'label_13', 'label_14', 'label_15', 
#                                                         'label_16', 'label_17', 'label_18', 'label_19', 
#                                                         'label_20']], test_size=0.2, random_state=42)

# # Create a pipeline with a placeholder for the classifier
# clf = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', None)])

# # Set up the grid search
# param_grid = [
#     {
#         'classifier': [MultiOutputClassifier(LogisticRegression(max_iter=500))],
#     },
#     {
#         'classifier': [LogisticRegression(multi_class='multinomial', max_iter=500)],
#     }
# ]

# grid_search = RandomizedSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# # Fit the grid search to the training data
# grid_search.fit(X_train, y_train)

# # Get the best model and its score
# best_model = grid_search.best_estimator_
# best_score = grid_search.best_score_
# print('Best model:', best_model)
# print('Best cross-validation accuracy: {:.2f}%'.format(best_score*100))

# Evaluate the performance of the best model on the testing data
# score = best_model.score(X_test, y_test)
# print('Test accuracy: {:.2f}%'.format(score*100))


Saved output from the above model:
```python
Best model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('text',
                                                  Pipeline(steps=[('count_vect',
                                                                   CountVectorizer(ngram_range=(1,
                                                                                                2))),
                                                                  ('scale',
                                                                   StandardScaler(with_mean=False))]),
                                                  'pp_text'),
                                                 ('date', StandardScaler(),
                                                  ['pub_date']),
                                                 ('categorical',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['commission', 'regulation',
                                                   'decision', 'council',
                                                   'directive', 'parliament',
                                                   'committee'])])),
                ('classifier',
                 MultiOutputClassifier(estimator=LogisticRegression(max_iter=500)))])
Best cross-validation accuracy: 32.24%
Test accuracy: 32.05%
```

Now let's evaluate its performance on the validation set. 

In [None]:
# List the columns of the preprocessed validation frame
df_val_pp.columns

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Model inputs: preprocessed text, publication date and the document-type columns
feature_cols = [
    'pp_text', 'pub_date',
    'commission', 'regulation', 'decision',
    'council', 'directive', 'parliament',
    'committee',
]
# Targets: the 21 indicator columns label_0 ... label_20
label_cols = ['label_{}'.format(idx) for idx in range(21)]

# Split the validation frame into inputs and targets
X_val = df_val_pp[feature_cols]
y_val = df_val_pp[label_cols]

# Predict with the loaded model
y_pred = best_model.predict(X_val)

# Score the predictions (micro-averaged F1 over all label columns)
# val_accuracy = accuracy_score(y_val, y_pred, average='micro')
val_f1 = f1_score(y_val, y_pred, average='micro')
# print('Validation accuracy: {:.2f}%'.format(val_accuracy*100))
print('Validation F1: {:.2f}%'.format(val_f1*100))

Validation F1 is at 68.63%. Let's see which categories it predicts well, and which it predicts poorly. 

In [None]:
# Wrap the prediction matrix in a frame with one pred_label_<k> column per output
pred_columns = ['pred_label_{}'.format(i) for i in range(y_pred.shape[1])]
df_y_pred = pd.DataFrame(y_pred, columns=pred_columns)

# Give all three frames a fresh 0..n-1 index so that the column-wise concat
# below pairs rows by position.
X_val, y_val, df_y_pred = [
    frame.reset_index(drop=True) for frame in (X_val, y_val, df_y_pred)
]

# Side-by-side view: text, true label indicators, predicted label indicators
df_results = pd.concat([X_val[['pp_text']], y_val, df_y_pred], axis=1)

In [None]:
# One (label_id, eurovoc_desc) row per distinct label id, keeping its first occurrence
labels_desc = df[['label_id', 'eurovoc_desc']].drop_duplicates(subset='label_id')
labels_desc.columns

In [None]:
# Look up the English descriptor of each integer label id. labels_desc comes
# from a sampled frame, so an id may be missing; such labels fall back to
# their column name.
# NOTE(review): this assumes column label_<i> corresponds to label_id i —
# confirm against preproc.clean_label.
desc_by_label_id = dict(zip(labels_desc['label_id'], labels_desc['eurovoc_desc']))

# Calculate the F1 score for each label directly from df_results. The default
# binary average scores the positive class of that one label; micro-averaging
# a single binary column would just report accuracy. zero_division=0 scores a
# label with no positive truth and no positive prediction as 0 without a warning.
f1_scores = []
labels = []
for i in range(21):
    score = f1_score(
        df_results['label_{}'.format(i)],
        df_results['pred_label_{}'.format(i)],
        zero_division=0,
    )
    f1_scores.append(score)
    # Keep the x-axis name in the same position as its score
    labels.append(desc_by_label_id.get(i, 'label_{}'.format(i)))

# Sort the F1 scores in descending order
sorted_indices = np.argsort(f1_scores)[::-1]
f1_scores_sorted = [f1_scores[i] for i in sorted_indices]
labels_sorted = [labels[i] for i in sorted_indices]

# Create a bar chart of the F1 scores
plt.figure(figsize=(10, 5))
plt.bar(labels_sorted, f1_scores_sorted)
plt.xticks(rotation=90)
plt.xlabel('eurovoc_desc')
plt.ylabel('F1 Score')
plt.title('F1 Scores by eurovoc_desc (Sorted by Score)')
plt.show()


In [None]:
# Display the per-split label share table built in the data-exploration section
df_pivot