# importing acl-anthology bib file & creating csv

In [None]:
import requests
import gzip
import shutil
import os

# download anthology file
url = "https://aclanthology.org/anthology+abstracts.bib.gz"
# stream the archive to disk in chunks instead of holding the whole download in memory
with requests.get(url, stream=True, timeout=60) as r:
    # fail loudly on an HTTP error instead of saving an error page as .gz
    r.raise_for_status()
    with open('anthology+abstracts.bib.gz', 'wb') as f_gz:
        for chunk in r.iter_content(chunk_size=1 << 20):
            f_gz.write(chunk)

# open and unpack gz
with gzip.open('anthology+abstracts.bib.gz', 'rb') as f_in:
    with open('anthology+abstracts.bib', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# the compressed archive is no longer needed once it has been unpacked
os.remove("anthology+abstracts.bib.gz")

In [None]:
# watch out: this cell takes a very long time to run
# if you've run this + the following cell once, there should be a csv in your directory
# in that case you can uncomment the cell after the next one and read straight from the csv
import bibtexparser

# parse bib file, output: list of dicts (read later via bib_database.entries)
# the encoding is pinned so parsing does not depend on the platform default
# NOTE(review): assumes the anthology export is UTF-8 - confirm
with open('anthology+abstracts.bib', encoding='utf-8') as bibtex_file:
    parser = bibtexparser.bparser.BibTexParser(common_strings=True)
    bib_database = parser.parse_file(bibtex_file)


In [None]:
import pandas as pd

# define keywords & words to exclude (matched as case-sensitive substrings of the abstract)
keywords = ['fair', 'fairness', 'race', 'gender', 'bias', 'biases', 'protected attribute', 'protected categor']
excludes = ['hate', 'hate speech']
# only papers published in this year or later are kept
min_year = 2016

# collect the matching entries in a plain list and build the DataFrame once:
# DataFrame.append copied the whole frame on every call and no longer exists in pandas >= 2.0
rows = []
for paper in bib_database.entries:
    abstract = paper.get('abstract')
    if abstract is None:
        continue
    if not any(keyword in abstract for keyword in keywords):
        continue
    if any(exclude in abstract for exclude in excludes):
        continue

    # entries without a year, or with a non-numeric one, cannot be dated and are skipped
    try:
        year = int(paper['year'])
    except (KeyError, ValueError):
        continue
    if year < min_year:
        continue

    # one row per paper, one column per bibtex field
    rows.append(dict(paper))

df = pd.DataFrame(rows)
df.to_csv('bias_paper.csv')

In [None]:
# uncomment if csv is already in directory (and save time parsing the bib file)
# import pandas as pd
# df = pd.read_csv('bias_paper.csv')

# tf-idf & clustering

In [None]:
# inspiration from: https://medium.com/mlearning-ai/text-clustering-with-tf-idf-in-python-c94cd26a31e7

In [None]:
import re
import nltk
from nltk.corpus import stopwords

def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """Normalise a text for TF-IDF vectorisation.

    Links (tokens starting with ``http``) are removed, every run of
    non-letter characters is collapsed to a single space, and the result is
    lower-cased and stripped of surrounding whitespace.

    Args:
        text: raw text, e.g. a paper abstract.
        remove_stopwords: if True, additionally drop NLTK's English
            stopwords (compared case-insensitively).

    Returns:
        The cleaned text.
    """
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^A-Za-z]+", " ", text)

    if remove_stopwords:
        # load the stopwords only when they are needed; a set makes the
        # per-token membership test O(1) instead of a scan over a list
        stopword_set = set(stopwords.words("english"))
        # to also drop the words that were used as filters, extend the set, e.g.:
        # stopword_set.update(['approach', 'bias', 'biases', 'data', 'fair', 'fairness', 'human',
        #                      'language', 'languages', 'method', 'paper', 'task', 'tasks', 'well',
        #                      'word', 'words'])
        tokens = nltk.word_tokenize(text)
        text = " ".join(w for w in tokens if w.lower() not in stopword_set)

    return text.lower().strip()

In [None]:
# add a 'cleaned' column: each abstract normalised by preprocess_text, stopwords dropped
df['cleaned'] = df['abstract'].map(
    lambda abstract: preprocess_text(abstract, remove_stopwords=True)
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned abstracts, with the vocabulary capped at 80 features
vectorizer = TfidfVectorizer(
    max_features=80,
    max_df=0.2,
    sublinear_tf=True,
)
# learn the vocabulary and build the document-term matrix in one step
X = vectorizer.fit_transform(df['cleaned'])

In [None]:
from sklearn.cluster import KMeans

# n_clusters is the knob to vary here; random_state is fixed for repeatable runs
kmeans = KMeans(random_state=42, n_clusters=5)
# fit() hands back the fitted estimator, so the per-document labels can be read directly
clusters = kmeans.fit(X).labels_

In [None]:
from sklearn.decomposition import PCA

# 2-component PCA, used to get plottable coordinates for every abstract
pca = PCA(random_state=42, n_components=2)
# PCA is fed the dense form of the TF-IDF matrix; the result has one row per document
pca_vecs = pca.fit_transform(X.toarray())
# first and second principal component as separate vectors
x0, x1 = pca_vecs[:, 0], pca_vecs[:, 1]

In [None]:
# store the cluster label and the two PCA coordinates next to each paper
df['cluster'] = clusters
df['x0'] = x0
df['x1'] = x1

# notebook display: no row/column truncation, cell text cut at 100 characters
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 100)

In [None]:
import numpy as np

def get_top_keywords(n_terms):
    """Print, for each KMeans cluster, the n_terms terms with the highest mean TF-IDF weight.

    Reads the notebook-level X (TF-IDF matrix), clusters (one label per row
    of X) and vectorizer. Terms are printed in ascending order of weight, so
    the strongest term comes last. Nothing is returned.

    Args:
        n_terms: number of terms to print per cluster; 0 prints no terms.
    """
    # mean TF-IDF vector per cluster (one row per cluster label)
    cluster_means = pd.DataFrame(X.toarray()).groupby(clusters).mean()

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # where available and fall back only on old installations
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    for label, mean_weights in cluster_means.iterrows():
        print('\nCluster {}'.format(label))
        order = np.argsort(mean_weights.to_numpy())
        # slice from an explicit start index: order[-0:] would select every term
        top = order[max(order.size - n_terms, 0):]
        print(','.join(terms[t] for t in top))

get_top_keywords(20)

In [None]:
# map clusters to appropriate labels
# put hand-picked names here, e.g. {0: "gender bias"}; every cluster without an
# entry gets the generic label "cluster NN" (numbered from 01)
cluster_names = {}
# derive the map from the fitted model so it always covers exactly the clusters
# that exist - a hard-coded map silently yields NaN for any label it misses
cluster_map = {label: cluster_names.get(label, "cluster {:02d}".format(label + 1))
               for label in range(kmeans.n_clusters)}
# apply mapping, starting from the raw KMeans labels rather than df['cluster'],
# so that re-running this cell does not turn already-mapped labels into NaN
df['cluster'] = pd.Series(clusters, index=df.index).map(cluster_map)

# visualizing outputs

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# one 12x7 figure holding a single axes
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title("TF-IDF + KMeans bias abstracts clustering", fontdict={"fontsize": 18})
# axis captions are set before plotting, as in the pyplot version
ax.set_xlabel("X0", fontdict={"fontsize": 16})
ax.set_ylabel("X1", fontdict={"fontsize": 16})
# one point per paper at its PCA coordinates, coloured by cluster label
sns.scatterplot(data=df, x='x0', y='x1', hue='cluster', palette="viridis", ax=ax)
plt.show()

In [None]:
# hand-crafted list of bias terms (feel free to change)
bias_list = ['race', 'gender', 'social', 'ethnic', 'religion']

# one boolean column per term: True where the cleaned abstract contains it
# NOTE: str.contains matches substrings, so 'race' also flags e.g. 'embrace'
for term in bias_list:
    df[term] = df['cleaned'].str.contains(term)

In [None]:
# cluster per bias
# summing a boolean term column per cluster counts the papers that mention the term;
# unlike pd.crosstab(...).loc[True] this does not raise a KeyError when a term
# matches no paper at all (the count is simply 0 for every cluster)
temp_df = pd.DataFrame(
    {term: df.groupby('cluster')[term].sum() for term in bias_list}
)

# transpose so the bars are grouped by bias term, with one bar per cluster
temp_df.transpose().plot.bar()
plt.title('Paper per bias term by cluster')
plt.ylabel("# of paper")
plt.grid(linestyle='--')

In [None]:
# papers per year, split by cluster: rows are years, columns are clusters
year_by_cluster = pd.crosstab(df['year'], df['cluster'])
ax = year_by_cluster.plot.bar()
ax.set_title('Paper per year by cluster', fontdict={"fontsize": 10})
ax.set_xlabel("year", fontdict={"fontsize": 12})
ax.set_ylabel("# of paper")
ax.grid(linestyle='--')

In [None]:
# bias per year
# summing a boolean term column per year counts the papers that mention the term;
# unlike pd.crosstab(...).loc[True] this does not raise a KeyError when a term
# matches no paper at all (the count is simply 0 for every year)
temp_df = pd.DataFrame(
    {term: df.groupby('year')[term].sum() for term in bias_list}
)

# transpose so the bars are grouped by bias term, with one bar per year
temp_df.transpose().plot.bar()
plt.title('Paper per bias term by year')
plt.ylabel("# of paper")
plt.grid(linestyle='--')