# Baseline models

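This code loads the labelled data (already split into train, validation, and test sets) and generates predictions using 3 baseline models: a rule-based dictionary, Naive Bayes, and SVM (BoW). Naive Bayes and SVM are each run with and without back-translation data augmentation.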

Hyperparameters are set using a grid search, and the best models are further statistically compared by re-running training and evaluation on different random splits of the training data.

## Set-up

### Import modules

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from textwrap import wrap

# sklearn
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.utils import shuffle

# NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

### Import train, val, and test sets

In [None]:
# Load the three pre-split partitions (train / validation / test) from disk.
train, val, test = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('../Data/train.pkl', '../Data/val.pkl', '../Data/test.pkl')
)

### Functions and variables

In [None]:
# Shared text-preprocessing resources used by the cells below.
lemmatizer = WordNetLemmatizer()         # used by the keyword matcher
stop_words = stopwords.words('english')  # passed to every TfidfVectorizer

### Check dataset sizes

In [None]:
# Report how many paragraphs each partition contains.
partitions = {'Train': train, 'Val': val, 'Test': test}
for name, dataset in partitions.items():
    print(f"{name}: {len(dataset)} paragraphs")

In [None]:
# For every partition, print the share (and count) of positive rows per label.
label_cols = ['Process_action','Market_action','Environment','Social']
for header, frame in (("TRAIN:", train), ("\nVAL:", val), ("\nTEST:", test)):
    print(header)
    for col in label_cols:
        n_true = len(frame[frame[col]==1])
        print(f"{col}: {n_true*100/len(frame):.2f}% ({n_true}) are true")

### Import augmented data

In [None]:
# Import data
# Read the back-translation workbook into a DataFrame.
translated_data = pd.read_excel('../Data/Back translation with GT.xlsx')
print(translated_data.dtypes)  # list each column's dtype
translated_data.head(2)  # trailing expression: the notebook displays the first two rows

In [None]:
# Build the augmentation table from the back-translations:
#   1. keep rows whose 'Exact match' flag is 'No' (presumably back-translations
#      that differ from the original paragraph -- confirm against the workbook),
#   2. expose the back-translated text under the 'Paragraph' column name,
#   3. drop the columns that are no longer needed,
#   4. index by 'Index' so rows can be looked up by label.
# Written as one method chain without inplace operations, so the result is an
# independent frame; mutating a boolean-filtered slice in place is what makes
# pandas emit SettingWithCopyWarning.
data_for_augmentation = (
    translated_data.loc[translated_data['Exact match'] == 'No']
    .rename(columns={'Back-translation': 'Paragraph'})
    .drop(columns=['Original Paragraph', 'Exact match'])
    .set_index('Index')
)
data_for_augmentation.head(2)  # trailing expression: notebook preview

In [None]:
def get_augmented_dataset(df):
    """Return `df` with its matching back-translated rows appended and shuffled.

    A row of the module-level `data_for_augmentation` table is added when its
    index label also occurs in `df.index`. The combined frame is shuffled with
    a fixed seed (42).
    """
    # Index labels present in both the augmentation table and `df`
    shared_labels = list(
        set(data_for_augmentation.index.to_list()) & set(df.index.to_list())
    )
    extra_rows = data_for_augmentation.loc[shared_labels]
    combined = pd.concat([df, extra_rows])
    return shuffle(combined, random_state=42)

In [None]:
# Augment the train and validation partitions with their back-translations.
augmented_train, augmented_val = (
    get_augmented_dataset(partition) for partition in (train, val)
)

## Model fitting functions

In [None]:
def check_if_has_keyword(text, unigrams, bigrams):
    """Return 1 if `text` contains any keyword unigram or bigram, else 0.

    The text is normalised before matching: the punctuation marks ``, . ! ?``
    are deleted, every other character that is not an ASCII letter or digit
    becomes a space, the text is lower-cased and split on whitespace, and each
    token is lemmatised with the module-level `lemmatizer`.

    Parameters
    ----------
    text : str
        Paragraph to scan.
    unigrams : iterable of str
        Single-token keywords, compared to the lemmatised tokens by equality.
    bigrams : iterable of 2-item sequences of str, or None
        Keyword pairs, compared to consecutive lemmatised token pairs.
        ``None`` means "no bigrams".

    Returns
    -------
    int
        1 when at least one unigram or bigram matches, otherwise 0.
    """
    text = text.strip()
    text = re.sub(r'[,.!?]', '', text)  # remove punctuation
    # Digits are kept on purpose: the keyword lists contain numeric entries
    # (e.g. '14001', '16949', 'co2') that could never match if digits were
    # stripped here.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    words = [lemmatizer.lemmatize(token) for token in text.lower().split()]

    # check unigrams (set lookup instead of a nested scan)
    unigram_set = set(unigrams)
    if any(word in unigram_set for word in words):
        return 1

    # check bigrams against every pair of consecutive tokens
    bigram_set = {tuple(pair) for pair in (bigrams or ())}
    if any(pair in bigram_set for pair in zip(words, words[1:])):
        return 1
    return 0

def fit_cv(param_grid, X, y):
    '''
    Grid-search a TF-IDF + SVC pipeline over ``param_grid`` (scored by macro F1)
    and return ``(best_params, best_score)``.
    '''
    tfidf_step = ('tfidf', TfidfVectorizer(stop_words=stop_words))
    svm_step = ('clf-svm', SVC(random_state=42))
    search = GridSearchCV(Pipeline([tfidf_step, svm_step]), param_grid, scoring='f1_macro')
    search.fit(X, y)
    return search.best_params_, search.best_score_

def svm_grid_search(x_val, y_val, param_grids):
    """Run `fit_cv` once per grid in `param_grids` and return the overall best params.

    The winner is the (params, score) pair with the highest score; both the
    score and the parameters are printed before the parameters are returned.
    """
    # One (best_params, best_score) pair per grid
    candidates = [fit_cv(grid, x_val, y_val) for grid in param_grids]
    best_params_final, best_score_final = max(candidates, key=lambda pair: pair[1])
    print(f"Best score: {best_score_final}")
    print(f"Best params: {best_params_final}")
    return best_params_final

def get_best_parameters(pipeline, col, parameters, data_val):
    """Grid-search `pipeline` over `parameters` on `data_val` and return the best params.

    The 'Paragraph' column is the input text and `col` the target label; the
    search is scored by macro F1. The best score and parameters are printed.
    """
    texts, labels = data_val['Paragraph'], data_val[col]
    search = GridSearchCV(pipeline, parameters, scoring='f1_macro')
    search.fit(texts, labels)
    print(f"Best score: {search.best_score_}")
    print(f"Best params: {search.best_params_}")
    return search.best_params_

def run_best_parameters(model, best_params, col, data_train):
    """Configure `model` with `best_params`, fit it, and evaluate on the test set.

    `model` is modified in place (parameters set, then fitted on the
    'Paragraph' column of `data_train` against `col`). Predictions are made on
    the module-level `test` frame and a report/heatmap is printed for them.

    Returns the fitted model and its test predictions.
    """
    model.set_params(**best_params)
    train_texts, train_labels = data_train['Paragraph'], data_train[col]
    model.fit(train_texts, train_labels)
    preds = model.predict(test['Paragraph'])
    print_classification_report_heatmap(test[col], preds)
    return model, preds

def print_classification_report_heatmap(actuals, preds):
    """Print a classification report and ROC AUC, then plot a confusion-matrix heatmap."""
    # Text report with 4-digit precision
    report = classification_report(actuals, preds, digits = 4)
    print(report)

    # ROC AUC computed from the predicted labels passed in
    auc = roc_auc_score(actuals, preds)
    print(f"ROC AUC score: {auc}")

    # Confusion matrix, transposed so columns are true labels and rows are predictions
    fig, ax = plt.subplots(1,1, figsize=(5,5))
    matrix = confusion_matrix(actuals, preds)
    sns.heatmap(matrix.T, square = True, annot=True, fmt = "d", ax=ax)
    ax.set_xlabel("true labels")
    ax.set_ylabel("predicted labels")
    plt.show()
    
def find_and_run_best_model(pipeline, data_train, data_val, col, parameters, svm=False):
    """Tune hyperparameters on `data_val`, then fit on `data_train` and evaluate.

    With ``svm`` falsy, `parameters` is a single grid searched via
    `get_best_parameters` on `pipeline`; otherwise it is a list of grids handed
    to `svm_grid_search`. Returns ``(model, preds, best_params)``.
    """
    if not svm:
        best_params = get_best_parameters(pipeline, col, parameters, data_val)
    else:
        best_params = svm_grid_search(data_val['Paragraph'], data_val[col], parameters)
    fitted_model, test_preds = run_best_parameters(pipeline, best_params, col, data_train)
    return fitted_model, test_preds, best_params

## Find and run best baseline models

### Keyword based

In [None]:
# All keyword sets
# Each classification task has a list of single-token keywords and a list of
# (first, second) token pairs; both are matched by check_if_has_keyword.
process_keywords = ['policy', 'audit','visit','questionnaire','certification','policies',
                    'requirement','assessment','certify','certifies',
                    'certified','compliance','complies','complied','comply',
                    'noncompliance','approved','approval','termination','terminate','terminated',
                    'terminates','corrective','code',
                    'award','screening','recycled','fsc',
                    'preferred','monitor','zdhc','remediation','remediate','remediated','remediates']

process_bigrams = [('preferred','supplier'), ('supplier','diversity'), ('local','supplier')]

# 'affiliated' was previously misspelled as 'affliated' and so could never match
market_keywords = ['reformulate','reformulated','reformulates','reformulation',
                   'circular','restructuring','platform','disclose','disclosed','discloses',
                   'coalition','partnership','redesign','publish','published','publishes',
                   'affiliated','member','partner']

market_bigrams = [('closed','loop')]

social_keywords = ['health', 'safety','labor','labour', 'local', 'quality', 
                   'corruption', 'human', 'occupational', 'conflict', 'collective', 
                   'discrimination', 'community', 'donation', 'donate', 'donates', 
                   'donated', 'antibiotic', 'rana', 'uzbekistan', 'drc', 'zimbabwe', 
                   '2600', 'rmap', 'sedex', '16949','accident']

social_bigrams = [('better','cotton'), ('responsible','mineral'), ('ethical','trade'), 
                  ('collective','bargaining'), ('lost','day'), ('data','confidentiality')]

env_keywords = ['waste','energy','emission','biodiversity','gmo','genetic','ipm','chemical',
                'bepi', 'imds', '14001', 'msc', 'eutr', 'eco', 'carbon', 'co', 'co2', 
                'environmental', 'water', 'pollution', 'plastic', 'packaging']

env_bigrams = [('better','cotton')]

In [None]:
# Process: flag each test paragraph with the process keywords, then report
process_predicted_kw = test['Paragraph'].map(
    lambda paragraph: check_if_has_keyword(paragraph, process_keywords, process_bigrams)
)
print_classification_report_heatmap(test['Process_action'], process_predicted_kw)

In [None]:
# Market: flag each test paragraph with the market keywords, then report
market_predicted_kw = test['Paragraph'].map(
    lambda paragraph: check_if_has_keyword(paragraph, market_keywords, market_bigrams)
)
print_classification_report_heatmap(test['Market_action'], market_predicted_kw)

In [None]:
# Social: flag each test paragraph with the social keywords, then report
social_predicted_kw = test['Paragraph'].map(
    lambda paragraph: check_if_has_keyword(paragraph, social_keywords, social_bigrams)
)
print_classification_report_heatmap(test['Social'], social_predicted_kw)

In [None]:
# Environment: flag each test paragraph with the environment keywords, then report
env_predicted_kw = test['Paragraph'].map(
    lambda paragraph: check_if_has_keyword(paragraph, env_keywords, env_bigrams)
)
print_classification_report_heatmap(test['Environment'], env_predicted_kw)

### NB

In [None]:
# Hyperparameter grid for the TF-IDF + Complement Naive Bayes pipeline
nb_parameters = {'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                 'tfidf__use_idf': (True, False),
                 'nb__norm': (True, False),
                 'nb__alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]}

# Tune, fit and evaluate one Naive Bayes model per classification task
nb_dict = dict()
for col in ['Process_action','Market_action','Social','Environment']:
    print(f"Classification task: {col}")
    # Build a fresh pipeline for every task: find_and_run_best_model returns the
    # very object it was given, so reusing one pipeline would make every
    # nb_dict[col]['Model'] point at the same estimator, fitted on the last task.
    nb_pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words=stop_words)),
                            ('nb', ComplementNB())])
    model, preds, best_params = find_and_run_best_model(nb_pipeline, train, val, col, nb_parameters, svm=False)
    nb_dict[col] = {'Model': model, 'Preds': preds, 'Best_params': best_params}

### SVM

In [None]:
# Settings common to all three kernel-specific grids
shared_svm_grid = {'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                   'tfidf__use_idf': (True, False),
                   'clf-svm__C': [0.1, 1, 10, 100]}
svm_gammas = [1, 0.1, 0.01, 0.001]

# One grid per kernel, each adding only the parameters that kernel uses
lin_param_grid = {**shared_svm_grid,
                  'clf-svm__kernel': ['linear']}

rbf_param_grid = {**shared_svm_grid,
                  'clf-svm__gamma': list(svm_gammas),
                  'clf-svm__kernel': ['rbf']}

poly_param_grid = {**shared_svm_grid,
                   'clf-svm__gamma': list(svm_gammas),
                   'clf-svm__kernel': ['poly'],
                   'clf-svm__degree': [2,3,4]}
svm_parameters = [lin_param_grid, rbf_param_grid, poly_param_grid]

# Tune, fit and evaluate one SVM per classification task (fresh pipeline each time)
svm_dict = {}
for col in ['Process_action','Market_action','Social','Environment']:
    print(f"Classification task: {col}")
    svm_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stop_words)),
        ('clf-svm', SVC(random_state=42)),
    ])
    model, preds, best_params = find_and_run_best_model(
        svm_pipeline, train, val, col, svm_parameters, svm=True
    )
    svm_dict[col] = {'Model': model, 'Preds': preds, 'Best_params': best_params}

### Augmented NB

In [None]:
# Hyperparameter grid for the TF-IDF + Complement Naive Bayes pipeline
nb_parameters = {'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                 'tfidf__use_idf': (True, False),
                 'nb__norm': (True, False),
                 'nb__alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]}

# Tune, fit and evaluate one Naive Bayes model per task on the augmented data
aug_nb_dict = dict()
for col in ['Process_action','Market_action','Social','Environment']:
    # Label the printed report, as the non-augmented cells do
    print(f"Classification task: {col}")
    # Build a fresh pipeline for every task: find_and_run_best_model returns the
    # very object it was given, so reusing one pipeline would make every
    # aug_nb_dict[col]['Model'] point at the same estimator, fitted on the last task.
    nb_pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words=stop_words)),
                            ('nb', ComplementNB())])
    model, preds, best_params = find_and_run_best_model(nb_pipeline, augmented_train, 
                                                        augmented_val, col, nb_parameters, svm=False)
    aug_nb_dict[col] = {'Model': model, 'Preds': preds, 'Best_params': best_params}

### Augmented SVM

In [None]:
# One hyperparameter grid per SVM kernel; each lists only the parameters
# that kernel uses.
lin_param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                  'tfidf__use_idf': (True, False),
                  'clf-svm__C': [0.1, 1, 10, 100],
                  'clf-svm__kernel': ['linear']}

rbf_param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                  'tfidf__use_idf': (True, False),
                  'clf-svm__C': [0.1, 1, 10, 100],
                  'clf-svm__gamma': [1, 0.1, 0.01, 0.001],
                  'clf-svm__kernel': ['rbf']}

poly_param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                   'tfidf__use_idf': (True, False),
                   'clf-svm__C': [0.1, 1, 10, 100],
                   'clf-svm__gamma': [1, 0.1, 0.01, 0.001],
                   'clf-svm__kernel': ['poly'], 
                   'clf-svm__degree': [2,3,4]}
svm_parameters = [lin_param_grid, rbf_param_grid, poly_param_grid]

# Tune, fit and evaluate one SVM per task on the augmented data
aug_svm_dict = dict()
for col in ['Process_action','Market_action','Social','Environment']:
    # Label the printed report, as the non-augmented cells do
    print(f"Classification task: {col}")
    # Build a fresh pipeline for every task: find_and_run_best_model returns the
    # very object it was given, so reusing one pipeline would make every
    # aug_svm_dict[col]['Model'] point at the same estimator, fitted on the last task.
    svm_pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words=stop_words)),
                             ('clf-svm', SVC(random_state=42))])
    model, preds, best_params = find_and_run_best_model(svm_pipeline, augmented_train, 
                                                        augmented_val, col, svm_parameters, svm=True)
    aug_svm_dict[col] = {'Model': model, 'Preds': preds, 'Best_params': best_params}

## Save dicts

In [None]:
# `pickle` is not imported in the notebook's import cell, so bring it in here;
# without it pickle.dump below raises NameError.
import pickle

# Bundle the fitted models, test predictions and best parameters of all four
# model families into one object.
baseline_models = {'nb_dict': nb_dict, 'svm_dict': svm_dict,
                   'aug_nb_dict': aug_nb_dict, 'aug_svm_dict': aug_svm_dict}

# NOTE: the file holds a binary pickle despite its .txt extension; the path is
# left unchanged so anything that already reads it keeps working.
with open("../Data/baseline_models.txt", "wb") as f:
    pickle.dump(baseline_models, f)

## Statistical test

In [None]:
def run_model_n30(model, params, data, col, keywords=None, bigrams=None, augmented=False):
    """Score one approach on 30 different stratified 80/20 splits of `data`.

    For seeds 0..29, `data` is split with ``test_size=0.2``, stratified on its
    'All4' column, and the macro F1 for label `col` is computed on the
    held-out part.

    Parameters
    ----------
    model : estimator or None
        Object with ``set_params`` / ``fit`` / ``predict``. Not used when
        `keywords` is non-empty.
    params : dict or None
        Keyword arguments for ``model.set_params``. Not used when `keywords`
        is non-empty.
    data : DataFrame
        Must provide the 'Paragraph', 'All4' and `col` columns.
    col : str
        Name of the target label column.
    keywords, bigrams : optional
        When `keywords` is non-empty, predictions come from
        `check_if_has_keyword` with these lists and no model is fitted.
    augmented : bool
        When truthy, the training part of every split is extended with
        `get_augmented_dataset` before fitting.

    Returns
    -------
    list of float
        The 30 macro-F1 scores, in seed order.
    """
    use_dictionary = bool(keywords)
    if not use_dictionary:
        # The parameters are the same for every split, so apply them once
        model.set_params(**params)

    f1_scores = []
    for seed in range(30):
        train_stat, test_stat = train_test_split(data,
                                                 test_size=0.2,
                                                 random_state=seed,
                                                 stratify=data['All4'])
        if use_dictionary:
            preds = test_stat['Paragraph'].map(lambda x: check_if_has_keyword(x, keywords, bigrams))
        else:
            if augmented:
                # augment only the training part with back-translations
                train_stat = get_augmented_dataset(train_stat)
            model.fit(train_stat['Paragraph'], train_stat[col])
            preds = model.predict(test_stat['Paragraph'])
        f1_scores.append(f1_score(test_stat[col], preds, average='macro'))

    return f1_scores

In [None]:
# Initialize dict of dicts to store results: label -> approach -> list of F1 scores
model_stat_test = {'Process_action':{}, 
                   'Market_action':{},
                   'Social':{},
                   'Environment':{}}

# Run models 30 times on different random splits of train data (using best hyperparameters)


def _new_nb_pipeline():
    # Unfitted TF-IDF + Complement Naive Bayes pipeline
    return Pipeline([('tfidf', TfidfVectorizer(stop_words=stop_words)),
                     ('nb', ComplementNB())])


def _new_svm_pipeline():
    # Unfitted TF-IDF + SVC pipeline
    return Pipeline([('tfidf', TfidfVectorizer(stop_words=stop_words)),
                     ('clf-svm', SVC(random_state=42))])


# Keyword
# The dictionary baseline fits nothing, so no estimator or params are passed.
# Passing None also removes the dependency on a `model` variable left over
# from an earlier cell.
keyword_sets = {'Process_action': (process_keywords, process_bigrams),
                'Market_action': (market_keywords, market_bigrams),
                'Social': (social_keywords, social_bigrams),
                'Environment': (env_keywords, env_bigrams)}
for col, (keywords, bigrams) in keyword_sets.items():
    f1_scores = run_model_n30(None, None, train, col, keywords=keywords, bigrams=bigrams)
    model_stat_test[col]['Dictionary'] = f1_scores

# Naive Bayes, SVM, Augmented Naive Bayes, Augmented SVM
# Each row: (approach name, tuned results per label, pipeline factory, augment?)
experiments = [('Naive Bayes', nb_dict, _new_nb_pipeline, False),
               ('SVM', svm_dict, _new_svm_pipeline, False),
               ('Augmented Naive Bayes', aug_nb_dict, _new_nb_pipeline, True),
               ('Augmented SVM', aug_svm_dict, _new_svm_pipeline, True)]
for approach, tuned_results, new_pipeline, augmented in experiments:
    for col, inner_dict in tuned_results.items():
        best_params = inner_dict['Best_params']
        model = new_pipeline()  # fresh, unfitted pipeline for every label
        f1_scores = run_model_n30(model, best_params, train, col, augmented=augmented)
        model_stat_test[col][approach] = f1_scores

In [None]:
# Reshape the nested results dict into a long table with one row per
# (Approach, Module, F1) observation.
model_stat_test_df = (
    pd.DataFrame(model_stat_test)               # rows: approaches, columns: labels
    .reset_index()
    .rename(columns={'index':'Approach'})
    .melt(id_vars=['Approach'], var_name='Module', value_name='F1')
    .dropna()
    .explode('F1')                              # one row per individual F1 score
    .reset_index(drop=True)
)
model_stat_test_df.head()

In [None]:
# Write the long-format F1 results table to disk as a pickle
model_stat_test_df.to_pickle('../Data/baseline_model_stat_test.pkl')