In [None]:
# Install the Dbias package and the d4data `en_pipeline` wheel hosted on the Hugging Face hub.
# NOTE(review): neither install is version-pinned, so a later run may pull different versions.
!pip install Dbias
!pip install https://huggingface.co/d4data/en_pipeline/resolve/main/en_pipeline-any-py3-none-any.whl

In [None]:
#Datasets
# Clone the sysconf repository into the current working directory.
# Later cells read from /content/sysconf/..., so this assumes the notebook runs in /content.
!git clone https://github.com/eitanf/sysconf.git

In [None]:
from Dbias.bias_classification import *
from Dbias.bias_recognition import *
from Dbias.text_debiasing import *;

# Smoke test of the three Dbias entry points (classifier, recognizer, run) on one news
# headline and one paper-style sentence. Each input is printed before its model output so
# the output can be attributed to the right sentence.
headline = "Giddy Democrats Planning to Exploit the Economic Crisis to Hurt Trump"
# The debiasing call was written without the leading "Giddy"; that input is kept unchanged
# so the output of run() stays the same as before.
headline_for_debiasing = "Democrats Planning to Exploit the Economic Crisis to Hurt Trump"
paper_sentence = "We show the existence of exploitable side channels in modern multi-tenant search."

# returns classification label for a given sentence fragment.
print(headline)
print(classifier(headline))
print(recognizer(headline))
print(run(headline_for_debiasing))

# Fixed: the header printed here used to be a different sentence than the one that is
# actually passed to the three calls below, which made the output misleading.
print(paper_sentence)
print(classifier(paper_sentence))
print(recognizer(paper_sentence))
print(run(paper_sentence))

In [None]:
#Fallback for when I did not find any match:
#use gender-guesser for now, which guesses the gender; I guess this is the closest I can get for now
# !pip install gender-guesser
# import gender_guesser.detector as gender
# print(d.get_gender(u"{}".format(name[0], 's')))

# !rm /content/myData -r
!mkdir /content/myData
!mkdir /content/myData/papersAuthors
!mkdir /content/myData/papersTexts

In [None]:
import pandas as pd
import numpy as np
import os
#Split coauthors.csv into one csv file per paper.
#Each <paperkey>.csv lists the name + email pair of the authors found for that paper.

coauthors = pd.read_csv('/content/sysconf/features/coauthors.csv')

#walk through the rows paper by paper so the names can be collected per individual paper
for paper, group in coauthors.groupby('paper'):
  #keep only the first row of every distinct name1 so that each name is written once
  authors = group.drop_duplicates(subset='name1')
  #shorter column names, just to make the output a bit more readable
  data = authors[['name1', 'gs_email1']].rename(columns={'name1': 'name', 'gs_email1': 'gs_email'})
  filename = os.path.join("/content/myData/papersAuthors", paper + ".csv")
  with open(filename, 'w') as f:
    data.to_csv(f, index=False)


#, 'gender', 'country', 'sector'
# #"The common key columns used to combine tables are conference ID, paper ID, and for people, the
# #combination of their normalized name column and gs_email column." -- ReadMe in Features

In [None]:
#Extend every paperkey.csv file created earlier with information about each author.
#The author information comes from persons.csv and is matched on the name+email pair,
#as required by the data documentation.
#The resulting file contains name, email, gender, country and sector of each author.

authorFeatures = pd.read_csv('/content/sysconf/features/persons.csv')
#keep only relevant columns and fill nan values with "-" to make merging possible
authorFeatures = authorFeatures[['name', 'gs_email', 'gender', 'country', 'sector']].fillna('-')
for filename in os.listdir("/content/myData/papersAuthors/"):
  path = '/content/myData/papersAuthors/' + filename
  #Use only the two key columns of the existing file. If this cell is run a second time the
  #file already contains gender/country/sector from the first run; merging those again would
  #create suffixed duplicates (gender_x, gender_y, ...) and break the cells further down.
  #nan values are filled here as well to allow merging.
  authors = pd.read_csv(path)[['name', 'gs_email']].fillna('-')
  #merge on the name and email pair
  #how='left' keeps every row of the left frame (authors); authors without a match in
  #persons.csv keep empty gender/country/sector values
  found = authors.merge(authorFeatures, on=['name', 'gs_email'], how='left')
  with open(path, 'w') as f:
    #write the matches back to the file
    found.to_csv(f, index=False)

In [None]:
#Here we create a new csv file that will be our final dataset
#For each paperkey.csv file we will:
#count males, females and unknown genders
#count unique number of countries and unknown countries
#the result will be a csv file with the columns
#paperkey, authors_total, N_males, perc_M, N_females, perc_F, gender_unknown, variety_gender, N_countries, country_unknown, variety_countries

def returnCount(data, column, key):
  """Return how many times `key` occurs in data[column].

  Returns 0 when the lookup raises KeyError (the value does not occur, or the column
  is not present in `data`).
  """
  try:
    tally = data[column].value_counts()
    hits = tally[key]
  except KeyError:
    hits = 0
  return hits

#Build one summary row per paper from its paperkey.csv file and write combinedData.csv.
summaryColumns = ['paperkey', 'authors_total',
                  'N_males', 'perc_M', 'N_females', 'perc_F', 'gender_unknown', 'variety_gender',
                  'N_countries', 'country_unknown', 'variety_countries']
rows = []
#sorted() gives a reproducible row order; os.listdir returns the entries in arbitrary order
for filename in sorted(os.listdir("/content/myData/papersAuthors/")):
  path = '/content/myData/papersAuthors/' + filename
  #Authors without a match in persons.csv come out of the left merge with empty (NaN)
  #gender/country. Map those to '-' so they are counted as unknown; before, they were not
  #counted as unknown and therefore ended up in the denominators below as if known.
  authors = pd.read_csv(path).fillna('-')
  authorsTotal = len(authors.index)
  N_males = returnCount(authors, 'gender', 'M')
  N_females = returnCount(authors, 'gender', 'F')
  gender_unknown = returnCount(authors, 'gender', '-')
  #percentages are calculated over the authors whose gender is known;
  #they stay None when no gender is known at all (avoids dividing by zero)
  perc_M = None
  perc_F = None
  if authorsTotal != gender_unknown:
    perc_M = N_males/(authorsTotal-gender_unknown)
    perc_F = N_females/(authorsTotal-gender_unknown)
  #gender variety as a function of the male percentage x (0..100):
  #f(x) =  1/50*x      if x <= 50
  #     = -1/50*x + 2  if x >  50
  #so it is 0 when perc_M is 0 or 1 and 1 when perc_M is 0.5.
  #perc_M is stored as a fraction, that is why it is multiplied by 100 here.
  variety_gender = 0
  if perc_M is not None:
    if perc_M*100 <= 50:
      variety_gender = 1/50*(perc_M*100)
    else:
      variety_gender = -1/50*(perc_M*100)+2
  country_unknown = returnCount(authors, 'country', '-')
  #number of distinct known countries; '-' is excluded so it is not counted as a country
  N_countries = authors.loc[authors['country'] != '-', 'country'].nunique()
  #Reset for every paper. Previously this was only assigned inside the if below, so a paper
  #without any known country silently reused the value of the previous paper (or raised a
  #NameError when it was the first file).
  variety_countries = None
  if country_unknown != authorsTotal:
    variety_countries = N_countries/(authorsTotal-country_unknown)
  paperkey = filename.replace('.csv', '')
  rows.append([paperkey, authorsTotal,
               N_males, perc_M, N_females, perc_F, gender_unknown, variety_gender,
               N_countries, country_unknown, variety_countries])
#build the frame once instead of growing it row by row
data = pd.DataFrame(rows, columns=summaryColumns)
with open('/content/myData/combinedData.csv', 'w') as f:
  data.to_csv(f, index=False)


#Sanity checks: the number of per-paper files should equal the number of rows written
count = len(os.listdir("/content/myData/papersAuthors/"))
print(count)
combinedData = pd.read_csv('/content/myData/combinedData.csv')
print(combinedData.shape[0])
#spot check one known paper and the first few rows
print(combinedData.loc[combinedData['paperkey'] == 'ASPLOS_17_001'])
print(combinedData.head(5))

In [None]:
#Retrieve all abstracts (and the entities of each paper) and add them to the combined datatable

data = pd.read_csv('/content/myData/combinedData.csv')
paperData = pd.read_csv('/content/sysconf/features/papers.csv')
paperData = paperData.set_index('key')

listNotFound = []  #file names of the abstracts that could not be read
texts = []         #one entry per paper: the abstract text, or '-' when it could not be read
entities = []      #one entry per paper: the entities value, or '-' when the key is unknown
for p in data['paperkey']:
  filename = p + '.txt'
  try:
    #the with-block closes the file even when reading fails
    with open('/content/sysconf/data/abstract/' + filename, 'r') as f:
      t = f.read()
  except IOError:    #This means that the file does not exist (or some other IOError)
    #Fixed: this used to be `listNotFound + filename` (list + str), which raises a TypeError
    #as soon as a single abstract is missing.
    listNotFound.append(filename)
    texts.append('-')
  else:
    texts.append(t)
  #add entities of papers to have an idea about topic
  try:
    ent = paperData.at[p, 'entities']
  except KeyError:   #paper key not present in papers.csv
    entities.append('-')
  else:
    entities.append(ent)
data['entities'] = entities
data['abstract'] = texts
print(len(listNotFound), 'abstracts not found')

#writing all data back to the file
with open('/content/myData/combinedData.csv', 'w') as f:
  data.to_csv(f, index=False)

In [None]:
# #Calculate all bias scores per abstract
# #This runs for about 30 minutes

import datetime

def adaptedClassifier(text, paperkey):
  """Classify one abstract and tag the result with its paper key.

  Returns a copy of the first element of classifier(text) with an added 'paperkey' entry.
  When classification fails (or yields no result) the paper key and the reason are printed
  and {'paperkey': paperkey, 'label': None, 'score': None} is returned instead.
  """
  try:
    result = dict(classifier(text)[0])
  except Exception as err:
    #`except Exception` instead of a bare except: a bare except also swallows
    #KeyboardInterrupt, which made this long-running loop impossible to stop.
    print(paperkey, '-', type(err).__name__ + ':', err)
    return {'paperkey': paperkey, 'label': None, 'score': None}
  result['paperkey'] = paperkey
  return result

# note the start time so the total runtime can be reported at the end
start_time = datetime.datetime.now()
print("The program started running at: ", start_time)

data = pd.read_csv('/content/myData/combinedData.csv')
# classify every abstract; each call yields one result dict for one paper
records = [adaptedClassifier(abstract, key)
           for abstract, key in zip(data['abstract'], data['paperkey'])]
df = pd.DataFrame(records)
with open('/content/myData/biasData.csv', 'w') as f:
  df.to_csv(f, index=False)

# note the end time and report how long the run took
end_time = datetime.datetime.now()
print("The program finished running at: ", end_time)
print('The program ran for: ' +  str(end_time-start_time))
# Difference: 0:25:17.151336
# The program ran for: 0:34:48.961188

# #25 abstracts are not classified for some reason

In [None]:
# This goes through the texts that we found to be biased (above)
# and creates a new dataframe that contains the list of words that it recognizes as biased (if any)

#This runs for about 55 minutes

#Actually could have also only run this as it also implicitly does the classification

from Dbias.bias_recognition import *;

def adaptedRecognizer(text, paperkey):
  """Collect the words that recognizer(text) marks as biased for one paper.

  Returns {'paperkey': paperkey, 'biased words': [...]} with the 'entity' value of every
  element returned by the recognizer. When recognition fails, 'biased words' is None.
  """
  try:
    output = recognizer(text)
    biasedWords = [hit['entity'] for hit in output]
  except Exception:
    #Fixed: the failure result used the key 'words' while the success result uses
    #'biased words', which produced an extra, always-empty column in the output file.
    #`except Exception` (not a bare except) keeps the long-running loop interruptible.
    return {'paperkey': paperkey, 'biased words': None}
  return {'paperkey': paperkey, 'biased words': biasedWords}

# note the start time so the total runtime can be reported at the end
start_time = datetime.datetime.now()
print("The program started running at: ", start_time)

biasData = pd.read_csv('/content/myData/biasData.csv')
abstractData = pd.read_csv('/content/myData/combinedData.csv')
#restrict the bias recognition to the abstracts that were classified as biased (to minimise runtime a little bit)
isBiased = biasData['label'] == 'Biased'
toRecognise = biasData.loc[isBiased, ['paperkey']]
#merge with abstractData so that the abstracts are in the same dataframe as the paper keys
toRecognise = toRecognise.merge(abstractData, on=['paperkey'], how='left')[['paperkey', 'abstract']]
records = [adaptedRecognizer(abstract, key)
           for abstract, key in zip(toRecognise['abstract'], toRecognise['paperkey'])]
df = pd.DataFrame(records)
with open('/content/myData/biasedWordsData.csv', 'w') as f:
  df.to_csv(f, index=False)

# note the end time and report how long the run took
end_time = datetime.datetime.now()
print("The program finished running at: ", end_time)
print('The program ran for: ' +  str(end_time-start_time))

In [None]:
# Merge biasData.csv (the classifier output per paper) into combinedData.csv on paperkey.
# biasedWordsData.csv is not merged here; the plotting cells below read that file directly.
import pandas as pd
import numpy as np
import os

data = pd.read_csv('/content/myData/combinedData.csv')
biasLabels = pd.read_csv('/content/myData/biasData.csv')

# Make the cell safe to re-run: combinedData.csv is overwritten below, so on a second run it
# already contains the bias columns and merging again would create label_x / label_y etc.
# Drop any column that is about to be merged in (except the key) before merging.
alreadyMerged = [col for col in biasLabels.columns if col != 'paperkey' and col in data.columns]
data = data.drop(columns=alreadyMerged)

mergeAll = data.merge(biasLabels, on=['paperkey'], how='left')
with open('/content/myData/combinedData.csv', 'w') as f:
  mergeAll.to_csv(f, index=False)

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

colors = ['#FF000B', '#911C20', '#393E49', '#A1938A']
TOP_N = 10  #number of most frequent words shown in the bar chart

# ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

data = pd.read_csv('/content/myData/combinedData.csv')
biasLabels = pd.read_csv('/content/myData/biasData.csv')
biasedWords = pd.read_csv('/content/myData/biasedWordsData.csv')
#Biased        2228
#Non-biased     138
#total         2391

biasLabels = biasLabels.sort_values('paperkey').reset_index(drop=True)

#Every cell of 'biased words' holds a list written out as text, e.g. "['a', 'b']".
#Rows without a value (recognition failed) are dropped first: str() of a missing value is
#'nan', which used to be counted as if it were a biased word.
wordLists = biasedWords['biased words'].dropna().apply(
    lambda x: str(x).replace('[', '').replace(']', '').replace("'", ''))

#flatten to one entry per word; empty lists are skipped
words = []
for item in wordLists[wordLists != '']:
  words.extend(item.split(', '))

biasedWords = pd.DataFrame(words, columns =['biased words',])
#count words by their lemma so that inflected forms are grouped together
biasedWords['lemma'] = biasedWords['biased words'].apply(lambda x: lemmatizer.lemmatize(x))
frequencies = biasedWords['lemma'].value_counts().rename_axis('word').reset_index(name='frequency')
#Fixed: the slice [0:11] selected eleven rows for the "top ten"
topten = frequencies.head(TOP_N)

#horizontal bar chart with the most frequent word at the top
ax = topten.plot.barh('word', 'frequency', color=colors[1], figsize=(10, 6), fontsize=20)
ax.invert_yaxis()
plt.tight_layout()
plt.savefig('/content/myData/frequencies.png')

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib as mpl

colors = ['#FF000B', '#911C20', '#393E49', '#A1938A']
#the second argument of ListedColormap is the name of the colormap (a string);
#a list of colour names was passed there before
c = mpl.colors.ListedColormap(colors, name='bias_palette')

biasedWords = pd.read_csv('/content/myData/biasedWordsData.csv')

#Every cell of 'biased words' holds a list written out as text, e.g. "['a', 'b']".
#Split each cell into its words and lemmatize every word on its own. Before, the whole
#"a, b, c" string was handed to the lemmatizer at once, and rows without a value ended up
#in the cloud as the word 'nan' (dropna() ran only after str() had been applied).
#NOTE: `lemmatizer` is the WordNetLemmatizer created in the frequency-plot cell above.
words = []
for item in biasedWords['biased words'].dropna():
  cleaned = str(item).replace('[', '').replace(']', '').replace('"', '').replace("'", '')
  words.extend(lemmatizer.lemmatize(word) for word in cleaned.split(', ') if word)

text = ' '.join(words)

# Create and generate a word cloud image:
wordcloud = WordCloud(width=2000, height=1500, background_color="white", colormap=c).generate(text)

# Display the generated image:
plt.figure(figsize=(20,15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout()
plt.savefig('/content/myData/wordcloud.png')
# plt.show()

In [None]:
# Balance my dataset
# Installs imbalanced-learn; the next cell imports RandomOverSampler from `imblearn`.
# NOTE(review): the install is not version-pinned.
!pip install imbalanced-learn

In [None]:
# Balance my dataset and fit a logistic regression of the bias label on the team features
# from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
# Fixed: `sm` is used below (sm.add_constant, sm.Logit) but was never imported in this
# notebook, which raises a NameError on a fresh kernel.
import statsmodels.api as sm
ITERATIONS = 300
SEED = 0  # base seed, so that the averaged results are the same on every run

featureNames = ['authors_total','variety_gender','variety_countries']

data = pd.read_csv('/content/myData/combinedData.csv')
data = data.dropna(subset =['label'])
# encode only the label column (Non-biased -> 0, Biased -> 1) instead of replacing these
# strings in every column of the frame
data = data.assign(label=data['label'].map({'Non-biased': 0, 'Biased': 1}))
data = data[featureNames + ['label']]
data = data.dropna()
data = data.astype({'label': int})

# X -> Training vector, so features and weights
# y -> Target, so outcome according to the training vector
x = data[featureNames]
y = data['label']

# running sums of the coefficient and the p-value of every feature over all iterations
feature_list = np.zeros(len(featureNames))
pValue_list = np.zeros(len(featureNames))
for i in range(ITERATIONS):
  # a different but fixed seed per iteration: every iteration draws another oversample,
  # yet the whole experiment is reproducible
  oversample = RandomOverSampler(sampling_strategy='minority', random_state=SEED + i)
  x_over, y_over = oversample.fit_resample(x, y)
  x_over_const = sm.add_constant(x_over)
  log_reg = sm.Logit(y_over, x_over_const).fit(disp=0)
  # position 0 is the constant added above; the features follow in column order
  feature_list += np.asarray(log_reg.params)[1:]
  pValue_list += np.asarray(log_reg.pvalues)[1:]

# NOTE(review): random oversampling duplicates minority rows, so the p-values are likely
# optimistic; treat the averaged values as descriptive.
print('Average results over ' + str(ITERATIONS) + ' iterations:')
print('feature', '\t\t\tcoefficient', '\t\t\tp-value')
for name, coefSum, pSum in zip(featureNames, feature_list, pValue_list):
  print(name,'\t\t\t', round(coefSum/ITERATIONS, 3),'\t\t\t', round(pSum/ITERATIONS, 3))



In [None]:
import plotly.express as px

# Load the merged per-paper table; the columns plotted below are variety_gender,
# variety_countries, score and label.
data = pd.read_csv('/content/myData/combinedData.csv')

# Interactive 3D scatter plot: gender variety (x) against country variety (y) against the
# bias score (z), coloured by the bias label.
# (An earlier, commented-out matplotlib version of this figure was removed here.)
axes = {'x': 'variety_gender', 'y': 'variety_countries', 'z': 'score'}
fig = px.scatter_3d(data, color='label', opacity=0.5, **axes)
fig.update_traces(marker={'size': 5})
fig.show()