<a href="https://colab.research.google.com/github/paulsedille/NeurIPS-Broader-Impact-Statements/blob/main/BIS_analysis_for_release.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import glob
import os
import re
import shutil
import string
from collections import Counter
from os import sep

import gspread
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.sentiment.util import mark_negation
from oauth2client.client import GoogleCredentials
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from google.colab import auth
from google.colab import drive




In [None]:
# Authenticate the Colab user, then build the gspread client `gc` that the
# rest of the notebook uses to open Google Sheets.
# NOTE(review): oauth2client appears to be deprecated upstream -- confirm the
# installed gspread version still accepts these credentials.
auth.authenticate_user()

gc = gspread.authorize(GoogleCredentials.get_application_default())

In [None]:
# Mount Google Drive; the keyword/synonym list files are read from
# '/content/drive/My Drive/...' further down.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# It should be authenticated and able to get the NeurIPS 2020 BIS spreadsheet

### Run these next three cells

In [None]:
# Open the first worksheet of the spreadsheet and build the master DataFrame.
# The first spreadsheet row carries the column names: it is promoted to the
# header and then removed from the data rows.
sheet = gc.open('NeurIPS 2020 BIS').sheet1
rows = sheet.get_all_values()
megaDf = pd.DataFrame.from_records(rows)
megaDf.columns = megaDf.iloc[0]
megaDf = megaDf.drop(megaDf.index[0])
#megaDf = megaDf.drop('Random Check', axis=1)

# Convert the string cells of the flag/count columns to numeric
for numericCol in ['academic', 'industry', 'mixed',
                   'word count', 'sentence count', 'citation count']:
  megaDf[numericCol] = pd.to_numeric(megaDf[numericCol])

# Valid split groupings ('primary' is not included but can be added)
splits = ['all','aff','cluster','loc','us-ch','bigTech']


In [None]:
# Generate statements with negated terms.
# Step 1: surround sentence punctuation with spaces so that every mark
# becomes its own whitespace-separated token.
punctPadding = str.maketrans({'.':' . ','!':' ! ','?':' ? ',',':' , ',':':' : ',';':' ; '})
megaDf['bis_with_neg'] = megaDf['impact statement'].map(lambda stmt: stmt.translate(punctPadding))

# Step 2: split on whitespace and run NLTK's mark_negation over the tokens.
# Each cell of 'bis_with_neg' ends up holding the returned token list.
useDoubleNeg = True
megaDf['bis_with_neg'] = megaDf['bis_with_neg'].map(
    lambda stmt: mark_negation(stmt.split(), double_neg_flip=useDoubleNeg))

# Step 3: save the tagged statements, one space-joined string per row.
megaDf['bis_with_neg'].map(' '.join).to_csv('negated statements.csv')

#### Helper functions


In [1]:
# Helper functions to group by continent
def makeDf(relList):
  """Collect the rows of megaDf whose 'country' cell names a country in relList.

  Country names are read from the cell as single-quoted tokens of at most two
  words. A row is kept once, as soon as any of its countries is in relList.

  @relList: collection of country names to keep
  Returns: a DataFrame built from the matching rows
  """
  quotedName = r"'(\w*\s*\w*)'"
  keptRows = [
      row for _, row in megaDf.iterrows()
      if any(name in relList for name in re.findall(quotedName, row['country']))
  ]
  return pd.DataFrame(keptRows)


# Helper function to generate dataframes for each continent
def generateContinentDfs():
  """Build one DataFrame of megaDf rows per continent.

  The country-to-continent mapping is read from the first worksheet of the
  'labelled_countries' spreadsheet (header in its first row, first column
  discarded). Rows are selected per continent with makeDf.

  Returns: (list of DataFrames, list of continent names), in matching order
  """
  continentNames = ['Asia','Africa','North America','South America','Europe','Oceania']

  ccRows = gc.open('labelled_countries').sheet1.get_all_values()
  upContDf = pd.DataFrame.from_records(ccRows)
  upContDf.columns = upContDf.iloc[0]
  upContDf = upContDf.drop(upContDf.index[0])
  upContDf = upContDf.drop(upContDf.columns[0],axis=1)

  # country names belonging to each continent
  countryLists = [
      list(upContDf[upContDf['continent'] == continent]['country'].values)
      for continent in continentNames
  ]

  continentDfs = [makeDf(countryList) for countryList in countryLists]
  return (continentDfs, continentNames)


# Helper function to generate dataframes grouped by a column
# @col: the column name
# @top10: True to limit to 10 largest groupings
def generateSbjDfs(col,top10=False):
  """Split megaDf into one DataFrame per distinct value of a column.

  Groups are ordered by size (number of 'title' entries), largest first.

  @col: the column name to group by
  @top10: True to keep only the 10 largest groups
  Returns: (list of DataFrames, list of group values), in matching order
  """
  groupSizes = megaDf.groupby(by=col)['title'].count().sort_values(ascending=False)
  if top10:
    groupSizes = groupSizes[:10]

  labels = groupSizes.keys()
  frames = [megaDf[megaDf[col] == str(label)] for label in labels]
  return (frames, labels.tolist())


# Generates a subset of the megaDf
# @sub: 
# 'all' = all papers
# 'aff' = split by affiliation (academic, industry, mixed)
# 'primary' = primary subject area (top 10)
# 'cluster' = clustering subject area (top 10)
# 'loc' = continent
# 'us-ch' = US and China affiliations
# 'bigTech' = Apple, Amazon, Microsoft, Google, Facebook, Huawei
def generateSubset(sub):
  """Split megaDf according to a named grouping.

  @sub: one of 'all', 'aff', 'primary', 'cluster', 'loc', 'us-ch', 'bigTech'
  Returns: [list of DataFrames, list of names] in matching order, or None
  (after printing a message) when sub is not recognized.
  """
  if sub == 'all':
    return [[megaDf], ['all']]

  if sub == 'aff':
    # academic-only, industry-only, and mixed-affiliation papers
    academicOnly = megaDf.loc[(megaDf['academic'] == 1) & (megaDf['mixed'] == 0)]
    industryOnly = megaDf.loc[(megaDf['industry'] == 1) & (megaDf['mixed'] == 0)]
    mixed = megaDf[megaDf['mixed'] == 1]
    return [[academicOnly, industryOnly, mixed],
            ['academic','industry','mixed-affiliation']]

  if sub == 'primary':
    (frames, labels) = generateSbjDfs('primary subject area')
    return [frames, labels]

  if sub == 'cluster':
    (frames, labels) = generateSbjDfs('clustering subject preference')
    return [frames, labels]

  if sub == 'loc':
    (frames, labels) = generateContinentDfs()
    return [frames, labels]

  if sub == 'us-ch':
    # papers whose country cell is exactly the single country
    usOnly = megaDf[megaDf['country'] == "{'USA'}"]
    chinaOnly = megaDf[megaDf['country'] == "{'China'}"]
    return [[usOnly, chinaOnly], ['United States','China']]

  if sub == 'bigTech':
    companies = ['Apple', 'Amazon', 'Microsoft', 'Google', 'Facebook', 'Huawei']
    frames = [megaDf[megaDf['affiliations'].str.contains(company)]
              for company in companies]
    return [frames, companies]

  print("Subset string not recognized.")
  return


# Helper function to make a list of words from a string separated by /
# @words: the word string
def makeWordList(words):
  """Turn a '/'-separated string into a list of lower-cased, trimmed words.

  @words: the word string
  Returns: list of the pieces between slashes, lower-cased and stripped
  """
  cleaned = []
  for piece in words.split('/'):
    cleaned.append(piece.lower().strip())
  return cleaned


# Make frequency DataFrames with a wordlist
# @df: the DataFrame to calculate frequencies for
# @wordlist: the wordlist containing the words to calculate frequencies for
# @useNeg: whether or not to also use the negated versions of the wordlist 
def makeFreqDfsWithWordlist(df,wordlist,useNeg=False):
  """Count occurrences of wordlist entries in the statements of df.

  Statements are lower-cased and every non-word, non-space character is
  replaced by a space before counting. A target is counted only when it is
  delimited by whitespace on both sides, so 'harm' does not match inside
  'charm' and does not match the negated token 'harm_neg'.

  Two summary rows are always present:
    '<any>'        -- any wordlist entry
    '<any_no_neg>' -- only entries that do not contain '_neg'

  @df: the DataFrame to calculate frequencies for
  @wordlist: the words/phrases to count (assumed to contain no punctuation);
             empty entries are ignored
  @useNeg: True to read the pre-tokenised 'bis_with_neg' column,
           False to read the raw 'impact statement' column
  Returns: (wordFreqs, stmtFreqs, avgOccs) DataFrames indexed by word:
    wordFreqs -- 'word_freq' (total occurrences) and 'word_prop'
                 (occurrences / total number of tokens)
    stmtFreqs -- 'stmt_freq' (statements containing the word) and
                 'stmt_prop' (that count / number of statements)
    avgOccs   -- 'avg_occs' (occurrences / number of rows in df; NaN when
                 df is empty)
  """
  anyKey = '<any>' # any words count here
  anyNoNegKey = '<any_no_neg>' # only words that aren't negated count here

  wordFreqDict = {anyKey: 0, anyNoNegKey: 0}
  stmtFreqDict = {anyKey: 0, anyNoNegKey: 0}
  totalLen = 0

  # Normalise the targets and compile one whole-token pattern per target.
  # The lookarounds do not consume the delimiting spaces, so consecutive
  # occurrences ('harm harm') are each counted.
  targets = [x.strip().lower() for x in wordlist]
  patterns = [(target, re.compile(r'(?<!\S)' + re.escape(target) + r'(?=\s)'))
              for target in targets if target]

  # get version with negation tags if necessary
  if useNeg:
    statements = df['bis_with_neg'].values
  else:
    statements = df['impact statement'].values

  for statement in statements:
    # the negated column is already a token list; raw text is split on spaces
    words = statement if useNeg else statement.split(' ')
    totalLen += len(words)

    # lower-case and blank out punctuation so targets can be matched between spaces
    words = [re.sub(r"[^\w\s]", " ", x.strip().lower()) for x in words]
    joinedStmt = ' ' + ' '.join(words) + ' '

    componentWords = set() # targets found in this statement
    for target, pattern in patterns:
      numOccs = len(pattern.findall(joinedStmt))
      if numOccs == 0:
        continue

      componentWords.add(target)
      wordFreqDict[anyKey] += numOccs
      if '_neg' not in target:
        wordFreqDict[anyNoNegKey] += numOccs
      wordFreqDict[target] = wordFreqDict.get(target, 0) + numOccs

    # per-statement counts: each target at most once per statement
    if componentWords:
      stmtFreqDict[anyKey] += 1
      if any('_neg' not in word for word in componentWords):
        stmtFreqDict[anyNoNegKey] += 1
    for word in componentWords:
      stmtFreqDict[word] = stmtFreqDict.get(word, 0) + 1

  # make dataframes
  wordFreqs = pd.DataFrame.from_dict(wordFreqDict,orient='index',columns=['word_freq'])
  wordFreqs['word_prop'] = wordFreqs['word_freq']/totalLen
  wordFreqs = wordFreqs.sort_values(by='word_prop',ascending=False)

  stmtFreqs = pd.DataFrame.from_dict(stmtFreqDict,orient='index',columns=['stmt_freq'])
  stmtFreqs['stmt_prop'] = stmtFreqs['stmt_freq']/len(statements)
  stmtFreqs = stmtFreqs.sort_values(by='stmt_prop',ascending=False)

  # average number of occurrences per row; an empty subset gives NaN
  # instead of raising ZeroDivisionError
  numRows = len(df)
  avgs = {word: (count/numRows if numRows else float('nan'))
          for word, count in wordFreqDict.items()}
  avgOccs = pd.DataFrame.from_dict(avgs,orient='index',columns=['avg_occs'])
  avgOccs = avgOccs.sort_values(by='avg_occs',ascending=False)

  return (wordFreqs, stmtFreqs, avgOccs)

#### The important functions

In [None]:
# Function to print proportions of records containing words and save to csv.
# @sub:
# all = all papers
# aff = split by affiliation
# primary = primary subject area (top 10)
# cluster = clustering subject area (top 10)
# loc = continent
# (potentially to come: countries of interest)
# @wordlist: list of words to consider
# @filename: '<path/to/file.csv>'
def getProportions(sub, wordlist, fileName):
  """Write the proportion of statements containing each word, per subset.

  Each word is lower-cased, stripped, and cleared of punctuation, then looked
  up as a case-insensitive literal substring of 'impact statement' (so
  'Privacy' at the start of a sentence is found, and 'harm' also matches
  inside longer words).

  @sub: subset name understood by generateSubset
        ('all', 'aff', 'primary', 'cluster', 'loc', 'us-ch', 'bigTech')
  @wordlist: list of words to consider
  @fileName: '<path/to/file.csv>'
  Returns: None. Writes a CSV with columns word, category, proportion; the
  proportion is NaN for an empty subset. If sub is not recognized a message
  is printed and nothing is written.
  """
  # reuse the shared subset logic instead of another copy of the dispatch
  subset = generateSubset(sub)
  if subset is None:
    return
  dfList, names = subset

  # calculate proportions and write to csv
  records = []
  for word in wordlist:
    word = re.sub(r"[^\w\s]", "", word.lower().strip())
    for df, name in zip(dfList,names):
      if len(df) == 0:
        prop = float('nan') # avoid ZeroDivisionError on an empty subset
      else:
        # the word was lower-cased above, so the match must ignore case
        hits = df['impact statement'].str.contains(word, case=False, regex=False)
        prop = int(hits.sum())/len(df)
      records.append([word, name, prop])

  allWordsDf = pd.DataFrame(records,columns=['word','category','proportion'])
  allWordsDf.to_csv(fileName)


# Describe citations for a subset of data
# @sub: 
# 'all' = all papers
# 'aff' = split by affiliation
# 'primary' = primary subject area (top 10)
# 'cluster' = clustering subject area (top 10)
# 'loc' = continent
# 'us-ch' = US and China affiliations
# 'bigTech' = Apple, Amazon, Microsoft, Google, Facebook, Huawei
def describeCitations(sub):
  """Print summary statistics of 'citation count' for each part of a subset.

  @sub: subset name understood by generateSubset
        ('all', 'aff', 'primary', 'cluster', 'loc', 'us-ch', 'bigTech')
  Returns: None. If sub is not recognized a message is printed and nothing
  else happens.
  """
  # reuse the shared subset logic instead of another copy of the dispatch
  subset = generateSubset(sub)
  if subset is None:
    return
  dfList, names = subset

  for df,name in zip(dfList,names):
    print(f"{name} citations:\n {df['citation count'].describe()}")


# Calculate the word frequencies for a pregenerated subset and outputs to CSV.
# @dfList: a list of dataframes
# @names: a list of names for the dataframes in the dfList
# @fileName: '<path/to/file.csv>'
# @wordList: the list of words to get frequencies for
# @numWords: the max number of word frequencies to output. Defaults to all words.
# @useNeg: whether or not to use negated words in the list. Defaults to no. 
def getFreqsForGeneratedSubset(dfList,names,fileName,wordList,numWords=None,useNeg=False):
  """Compute word frequencies for each pregenerated subset and write one CSV.

  All frequencies are subset frequencies, i.e. relative to the statements of
  the individual DataFrame (academic, South American, NLP, ...).

  @dfList: a list of dataframes
  @names: a list of names for the dataframes in the dfList
  @fileName: '<path/to/file.csv>'
  @wordList: the list of words to get frequencies for
  @numWords: the max number of rows kept per subset and table. Defaults to all.
  @useNeg: whether or not to use the negation-tagged statements. Defaults to no.
  Returns: None. Writes a CSV indexed by (word, category).
  """
  keyCols = ['word','category']

  def labelled(freqDf, label, valueCols):
    # move the word index into a column, tag with the subset name, truncate
    freqDf['category'] = label
    freqDf['word'] = freqDf.index
    freqDf = freqDf.reset_index(drop=True)
    return freqDf[['category','word'] + valueCols][:numWords]

  def stacked(parts):
    # one long table over all subsets, indexed by (word, category)
    return pd.concat(parts).sort_values(keyCols).set_index(keyCols)

  wordParts, stmtParts, avgParts = [], [], []
  for subsetDf, label in zip(dfList, names):
    (wordFreqs, stmtFreqs, avgOccs) = makeFreqDfsWithWordlist(subsetDf,wordList,useNeg)
    wordParts.append(labelled(wordFreqs, label, ['word_freq','word_prop']))
    stmtParts.append(labelled(stmtFreqs, label, ['stmt_freq','stmt_prop']))
    avgParts.append(labelled(avgOccs, label, ['avg_occs']))

  merged = stacked(wordParts).merge(stacked(stmtParts),how='outer',on=keyCols)
  merged = merged.merge(stacked(avgParts),how='outer',on=keyCols)
  # columns arrive as word_freq, word_prop, stmt_freq, stmt_prop, avg_occs
  merged.columns = ['total word frequency (count)','total word frequency (proportion)',
                    'statement frequency (count)', 'statement frequency (proportion)',
                    'average number of occurrences']
  merged.to_csv(fileName)

### Generating the big database of frequencies

In [2]:
# Generate big database for standard wordlists with negations.
# Each non-empty line of the keyword file is one '/'-separated word list;
# every list is extended with the '_neg' variant of each of its words.

fileBase = 'BFD_norm_2'
with open('/content/drive/My Drive/NeurIPS_BIS_Analysis/BIS_keywords.txt','r') as listFile:
  wordLists = listFile.read().split('\n')

wordLists = [makeWordList(x) for x in wordLists if x != '']
for l in wordLists:
  l.extend([x + '_neg' for x in l])

# One [dfList, names] pair per split grouping. splitList was not defined
# anywhere before this cell, so build it here from `splits`.
splitList = [generateSubset(sub) for sub in splits]

# to_csv does not create missing directories
os.makedirs(f'./{fileBase}', exist_ok=True)

idx = 1

for wl in wordLists:
  for ind, zipped in enumerate(splitList):
    if idx%20 == 0: print(f'working on {idx}')
    # file name: running number, first three words of the list, split index
    fileName = f'./{fileBase}/{idx + 1000}_{str(wl[:3])}_{ind}.csv'

    df = zipped[0]
    names = zipped[1]

    getFreqsForGeneratedSubset(df,names,fileName,wl,useNeg=True)
    idx += 1
print('Completed.')

FileNotFoundError: ignored

In [None]:
# Zip the output directory named by fileBase into '<fileBase>.zip'.
# The conversion into a single Excel file happens in a separate file.
shutil.make_archive(fileBase, 'zip', fileBase)

'/content/BFD_norm_2.zip'

In [None]:
# Generate big database for synonym wordlists
# For each list of words, generate for each cluster:
# Total frequency
# Statement frequency
# Average number of occurrences
# Each non-empty line of the synonym file is one '/'-separated word list;
# every list is extended with the '_neg' variant of each of its words.
fileRoot = 'BFD_exp_5-8'
with open('/content/drive/My Drive/NeurIPS_BIS_Analysis/synonym_list.txt','r') as listFile:
  wordLists = listFile.read().split('\n')

wordLists = [makeWordList(x) for x in wordLists if x != '']
for l in wordLists:
  l.extend([x + '_neg' for x in l])

# One [dfList, names] pair per split grouping. splitList was not defined
# anywhere before this cell, so build it here from `splits`.
splitList = [generateSubset(sub) for sub in splits]

# to_csv does not create missing directories
os.makedirs(f'./{fileRoot}', exist_ok=True)

idx = 1

for wl in wordLists:
  for ind, zipped in enumerate(splitList):
    if idx%20 == 0: print(f'working on {idx}')
    # file name: running number, first three words of the list, split index
    fileName = f'./{fileRoot}/{idx + 1000}_{str(wl[:3])}_{ind}.csv'

    df = zipped[0]
    names = zipped[1]

    getFreqsForGeneratedSubset(df,names,fileName,wl,useNeg=True)
    idx += 1
print('Completed.')

working on 20
Completed.


In [None]:
# Zip the output directory named by fileRoot into '<fileRoot>.zip'.
# Conversion to excel took place in separate file
shutil.make_archive(fileRoot, 'zip', fileRoot)

'/content/BFD_exp_5-8.zip'

### Analysis


In [None]:
# Most common affiliations
affs = megaDf['affiliations']
# single-quoted names of at most two words; this pattern is also reused by
# the country-frequency cell further down
regex = r"'\w*\s*\w*'"
affDict = {}

# count every quoted affiliation name (lower-cased, quotes removed)
for row in affs:
  for match in re.findall(regex,row):
    name = match.lower()[1:-1]
    affDict[name] = affDict.get(name, 0) + 1

# find and combine universities: for each '... university' entry, add the
# count of the first other entry whose name is contained in it
toPop = []
for k in affDict:
  if 'university' in k and k != 'shanghaitech university':
    for k1,v1 in affDict.items(): # find prefix
      if k1 in k and k != k1:
        affDict[k] += v1
        toPop.append(k1)
        break

# manual cleaning: (canonical name, variant to fold into it).
# Applied strictly in this order -- e.g. 'intel' first absorbs
# 'intel corporation' and is itself folded into 'intel labs' later.
# A name that is missing from affDict raises KeyError, which flags that this
# table no longer matches the data.
manualMerges = [
    ('deepmind', 'google deepmind'),
    ('element ai', 'elementai'),
    ('eth zürich', 'eth zurich'),
    ('facebook', 'facebook inc'),
    ('facebook', 'facebook research'),
    ('facebook', 'facebook ai'),
    ('facebook', 'fair'),
    ('google', 'google inc'),
    ('google', 'google llc'),
    ('google', 'google research'),
    ('google', 'google brain'),
    ('google', 'google ai'),
    ('google', 'google health'),
    ('amazon', 'amazon aws'),
    ('huawei', 'huawei technologies'),
    ('huawei', 'huawei noah'),
    ('ibm', 'ibm corp'),
    ('ibm', 'ibm research'),
    ('salesforce', 'salesforce research'),
    ('intel', 'intel corporation'),
    ('jp morgan', 'jpmorgan'),
    ('linkedin', 'linkedin corporation'),
    ('megvii', 'megvii technology'),
    ('microsoft', 'microsoft corporation'),
    ('microsoft', 'microsoft research'),
    ('baidu', 'baidu research'),
    ('adobe', 'adobe research'),
    ('amazon', 'amazon research'),
    ('vinai', 'vinai research'),
    ('sensetime', 'sensetime research'),
    ('samsung', 'samsung research'),
    ('samsung', 'samsung sds'),
    ('sas institute', 'sas'),
    ('mit', 'mit '),
    ('éts montréal', 'ets montreal'),
    ('nvidia', 'nvidia corporation'),
    ('nvidia', 'nvidia research'),
    ('nnaisense', 'nnaisense sa'),
    ('intel labs', 'intel'),
    ('cmu', 'carnegie'),
    ('radboud university', 'radboud universiteit'),
    ('riken', 'riken aip'),
    ('eth zürich', 'ethz'),
    ('sungkyunkwan university', 'sunkyunkwan university'),
    ('uber', 'uber atg'),
    ('georgia tech', 'gatech'),
    ('telecom paristech', 'telecom paristec'),
    ('tsinghua university', 'tsinghua univeristy'),
    ('tsinghua university', 'tsinghua univiersity'),
    ('oxford university', 'u oxford'),
    ('uw madison', 'uw'),
    ('waymo', 'waymo llc'),
    ('weizmann institute', 'weizmann'),
    ('weizmann institute', 'institute weizmann'),
    ('yale university', 'yale univ'),
]

for canonical, variant in manualMerges:
  affDict[canonical] += affDict[variant]
  toPop.append(variant)

# A name can be queued more than once (e.g. by the university pass and by
# the manual table), so tolerate entries that were already removed.
for p in toPop:
  affDict.pop(p, None)


In [None]:
# Save the cleaned affiliation counts, one row per affiliation name.
pd.DataFrame.from_dict(affDict,orient='index').to_csv('aff_freqs_updated_3-8.csv')

In [None]:
# Country frequencies
# Counts every quoted country name (lower-cased, quotes removed) across all
# rows. Relies on `regex` from the affiliations cell above.
countries = megaDf['country']
countryDict = {}

for row in countries:
  for quoted in re.findall(regex,row):
    countryName = quoted.lower()[1:-1]
    countryDict[countryName] = countryDict.get(countryName, 0) + 1

pd.DataFrame.from_dict(countryDict,orient='index').to_csv('country_freqs.csv')

In [None]:
# Make list of DataFrames and names for a split
# @sub: 
# 'all' = all papers
# 'aff' = split by affiliation
# 'primary' = primary subject area (top 10)
# 'cluster' = clustering subject area (top 10)
# 'loc' = continent
# 'us-ch' = US and China affiliations
# 'bigTech' = Apple, Amazon, Microsoft, Google, Facebook, Huawei
# Returns: list of DataFrames and list of names corresponding to each DataFrame
def makeDfandNames(sub):
  """Make the list of DataFrames and matching names for a split.

  Thin wrapper around generateSubset so the subset definitions live in one
  place; only the return type differs (tuple instead of list).

  @sub: 'all', 'aff', 'primary', 'cluster', 'loc', 'us-ch' or 'bigTech'
  Returns: (list of DataFrames, list of names), or None (after a message is
  printed) when sub is not recognized.
  """
  subset = generateSubset(sub)
  if subset is None:
    return

  dfList, names = subset
  return (dfList, names)




In [None]:
# Generate descriptive stats: one CSV per split with a row for the full set
# and a row for each subset.

# Denominator for 'proportion of total'.
# NOTE(review): presumably the number of statements in the whole dataset --
# confirm against len(megaDf).
TOTAL_STATEMENTS = 1898

cols = ['split','subset','statement count','proportion of total','proportion of split','average word count',
        'median word count','max word count','min word count',
        'average sentence count','median sentence count','max sentence count','min sentence count',
        'explicit opt out count', 'marked explicit opt out count', 'unmarked opt out count',
        'explicit opt out (proportion of marked)','explicit opt out (proportion of total)',
        'ambiguous opt out count','marked ambiguous opt out count', 'unmarked ambiguous opt out count',
        'ambiguous opt out (proportion of marked)','ambiguous opt out (proportion of total)']

# to_csv does not create missing directories
os.makedirs('./descriptive_stats', exist_ok=True)

for sub in ['cluster','primary']:
  (dfs, names) = makeDfandNames(sub)
  fullDf = pd.concat(dfs)
  if sub != 'all':
    dfs.insert(0,fullDf)
    names.insert(0,'<full set>')
  listForDf = []

  for (df,name) in zip(dfs,names):
    numStmts = len(df)
    pctTot = numStmts/TOTAL_STATEMENTS
    pctSplit = numStmts/len(fullDf)
    avgWc = df['word count'].mean()
    medWc = df['word count'].median()
    maxWc = df['word count'].max()
    minWc = df['word count'].min()

    avgSc = df['sentence count'].mean()
    medSc = df['sentence count'].median()
    maxSc = df['sentence count'].max()
    minSc = df['sentence count'].min()

    # explicit opt outs: cells are the strings 'TRUE'/'FALSE'; anything else
    # counts as unmarked
    numOptOut = len(df[df['opt out']=='TRUE'])
    numNoOptOut = len(df[df['opt out']=='FALSE'])
    unOptOut = numStmts - numOptOut - numNoOptOut
    if numOptOut + numNoOptOut != 0: optOutMProp = numOptOut/(numOptOut + numNoOptOut)
    else: optOutMProp = float('nan')
    optOutTProp = numOptOut/numStmts if numStmts else float('nan')

    # ambiguous opt outs, same scheme
    numAmbOptOut = len(df[df['ambiguous opt out']=='TRUE'])
    numNoAmbOptOut = len(df[df['ambiguous opt out']=='FALSE'])
    unAmbOptOut = numStmts - numNoAmbOptOut - numAmbOptOut
    if numAmbOptOut + numNoAmbOptOut != 0: ambOptOutMProp = numAmbOptOut/(numAmbOptOut + numNoAmbOptOut)
    else: ambOptOutMProp = float('nan')
    ambOptOutTProp = numAmbOptOut/numStmts if numStmts else float('nan')

    listForDf.append([sub,name,numStmts,pctTot,pctSplit,avgWc,medWc,maxWc,minWc,
                      avgSc,medSc,maxSc,minSc,
                      numOptOut,numOptOut+numNoOptOut,unOptOut,
                      optOutMProp,optOutTProp,
                      numAmbOptOut,numAmbOptOut+numNoAmbOptOut,unAmbOptOut,
                      ambOptOutMProp,ambOptOutTProp])

  subDf = pd.DataFrame.from_records(listForDf,columns=cols)
  subDf.to_csv(f'./descriptive_stats/{sub}_descr.csv')




NameError: ignored

In [None]:
# Zip the 'descriptive_stats' output directory into 'descriptive_stats.zip'.
shutil.make_archive('descriptive_stats', 'zip', 'descriptive_stats')

'/content/descriptive_stats.zip'

In [None]:
# Creates CSV with top numWords words, excluding stopwords
def getTopWords(numWords, stopwords, fileName, col='impact statement'):
  """Write the most common words of every split in `splits` to CSV.

  For each split a row is produced for the full set (except for 'all') and
  for each subset. Words are stripped of punctuation and lower-cased before
  counting.

  @numWords: how many of the most common words to keep per subset
  @stopwords: collection of words to exclude
  @fileName: suffix of the output files './top_words/<split>_<fileName>.csv'
  @col: column to read. 'impact statement' holds raw text; any other column
        is treated as holding token lists (e.g. 'bis_with_neg').
  Returns: None
  """
  useList = col != 'impact statement'
  # NOTE(review): string.punctuation includes '_', so a token such as
  # 'harm_NEG' is counted as 'harmneg' -- confirm this is intended.
  stripPunct = str.maketrans('', '', string.punctuation)

  # to_csv does not create missing directories
  os.makedirs('./top_words', exist_ok=True)

  for sub in splits:
    (dfs, names) = makeDfandNames(sub)
    fullDf = pd.concat(dfs)

    if sub != 'all':
      dfs.insert(0,fullDf)
      names.insert(0,'<full set>')

    listForDf = []

    for (df,name) in zip(dfs,names):
      if useList:
        # flatten the per-statement token lists into one list of tokens;
        # the previous code joined them into a single string and then
        # iterated over its characters
        tokens = [token for statement in df[col] for token in statement]
      else:
        # concatenate all statements and split into words
        tokens = ' '.join(statement for statement in df[col]).split(' ')
      tokens = [x.translate(stripPunct).lower().strip() for x in tokens]

      words = [x for x in tokens if x not in stopwords] # exclude stopwords
      top = Counter(words).most_common(numWords)
      listForDf.append([sub,name,top])

    cols = ['split','subset','words']
    topWordsDf = pd.DataFrame.from_records(listForDf,columns=cols)
    topWordsDf.to_csv(f'./top_words/{sub}_{fileName}.csv')

def create_human_readable(array_tuple):
    """Turn (word, count) pairs into a two-column DataFrame.

    The first item of each element becomes 'Word' and the second 'Count'.
    If a word occurs more than once, its last count is kept.
    """
    counts_by_word = {pair[0]: pair[1] for pair in array_tuple}
    return pd.DataFrame(counts_by_word.items(), columns=['Word', 'Count'])

In [None]:
# Filler words excluded from every ranking, on top of wordcloud's STOPWORDS
stops = set(STOPWORDS)
stops.update(['the','we','may','in','this','our','','\n'])
getTopWords(20,stops,'top_20_regStops')

getTopWords(40,stops,'top_40_regStops')

# set.update() returns None, so the old `techStops = stops.update(...)` bound None.
# Build the extended set explicitly and leave `stops` untouched.
techStops = stops | {'algorithm','algorithms','model','data','models','neural','system','systems'}
getTopWords(20,techStops,'top_20_techStops')


In [None]:
# Top 500 tokens of the 'bis_with_neg' column, using the regular stopword set
stops = set(STOPWORDS) | {'the','we','may','in','this','our','','\n'}
getTopWords(500,stops,'top_500_regStops_with_double_negs',col='bis_with_neg')

In [None]:
# Same ranking as the '..._with_double_negs' export, written under a different file name
stops = {*STOPWORDS, 'the','we','may','in','this','our','','\n'}
getTopWords(500,stops,'top_500_regStops_with_negs',col='bis_with_neg')

In [None]:
# Zip the top_words directory into top_words.zip
shutil.make_archive(base_name='top_words', format='zip', root_dir='top_words')

'/content/top_words.zip'

### Academic vs Industry

In [None]:
# Split by affiliation flags: academic and not mixed, industry and not mixed, mixed
notMixed = megaDf['mixed'] == 0
acaDf = megaDf[(megaDf['academic'] == 1) & notMixed]
indDf = megaDf[(megaDf['industry'] == 1) & notMixed]
mixedDf = megaDf.loc[megaDf['mixed'] == 1]


In [None]:
# Length statistics for the academic frame
acaWords = acaDf['word count']
acaSents = acaDf['sentence count']
acaWC, acaWM = acaWords.mean(), acaWords.median()
acaSC, acaSM = acaSents.mean(), acaSents.median()

print("Word count:")
display(acaWords.describe())
print("Sentence count:")
display(acaSents.describe())

Word count:


count    1163.000000
mean      161.680997
std       118.719620
min         2.000000
25%        81.000000
50%       132.000000
75%       217.000000
max       800.000000
Name: word count, dtype: float64

Sentence count:


count    1163.000000
mean        7.665520
std         5.755405
min         1.000000
25%         4.000000
50%         6.000000
75%        10.000000
max        49.000000
Name: sentence count, dtype: float64

In [None]:
# Length statistics for the industry frame
indWC = indDf['word count'].mean()
indSC = indDf['sentence count'].mean()
# The medians were previously taken from acaDf (copy-paste slip), so indWM/indSM
# held the academic medians. They now come from indDf like the means.
indWM = indDf['word count'].median()
indSM = indDf['sentence count'].median()

print("Word count:")
display(indDf['word count'].describe())
print("Sentence count:")
display(indDf['sentence count'].describe())

Word count:


count    122.000000
mean     187.139344
std      142.700479
min        5.000000
25%       90.250000
50%      140.000000
75%      242.000000
max      730.000000
Name: word count, dtype: float64

Sentence count:


count    122.000000
mean       8.680328
std        6.685703
min        1.000000
25%        4.000000
50%        7.000000
75%       12.000000
max       38.000000
Name: sentence count, dtype: float64

In [None]:
# Length statistics for the mixed-affiliation frame
mixWords = mixedDf['word count']
mixSents = mixedDf['sentence count']
mixWC, mixWM = mixWords.mean(), mixWords.median()
mixSC, mixSM = mixSents.mean(), mixSents.median()

print("Word count:")
display(mixWords.describe())
print("Sentence count:")
display(mixSents.describe())

Word count:


count     613.000000
mean      178.683524
std       209.075191
min         5.000000
25%        88.000000
50%       142.000000
75%       227.000000
max      4337.000000
Name: word count, dtype: float64

Sentence count:


count    613.000000
mean       8.797716
std       11.376426
min        1.000000
25%        4.000000
50%        7.000000
75%       11.000000
max      241.000000
Name: sentence count, dtype: float64

### US vs China

In [None]:
# Every paper whose 'country' string mentions USA, possibly alongside other countries
mentionsUsa = megaDf['country'].str.contains('USA')
usaDf = megaDf[mentionsUsa]
for (heading, column) in (("Word count:", 'word count'), ("Sentence count:", 'sentence count')):
  print(heading)
  display(usaDf[column].describe())

Word count:


count    1229.000000
mean      171.765663
std       170.871689
min         2.000000
25%        84.000000
50%       138.000000
75%       220.000000
max      4337.000000
Name: word count, dtype: float64

Sentence count:


count    1229.000000
mean        8.161920
std         8.941211
min         1.000000
25%         4.000000
50%         6.000000
75%        11.000000
max       241.000000
Name: sentence count, dtype: float64

In [None]:
# Papers whose 'country' value is exactly the single-country set {'USA'}
usaOnlyDf = megaDf.loc[megaDf['country'].eq("{'USA'}")]
usaOnlyWords = usaOnlyDf['word count']
usaOnlySents = usaOnlyDf['sentence count']
print("Word count:")
display(usaOnlyWords.describe())
print("Sentence count:")
display(usaOnlySents.describe())

Word count:


count     795.000000
mean      182.262893
std       196.001671
min         5.000000
25%        89.000000
50%       143.000000
75%       233.000000
max      4337.000000
Name: word count, dtype: float64

Sentence count:


count    795.00000
mean       8.50566
std       10.28772
min        1.00000
25%        4.00000
50%        6.00000
75%       11.00000
max      241.00000
Name: sentence count, dtype: float64

In [None]:
# Papers whose 'country' value is exactly the single-country set {'China'}
isChinaOnly = megaDf['country'] == "{'China'}"
chinaOnlyDf = megaDf.loc[isChinaOnly]
for (heading, column) in (("Word count:", 'word count'), ("Sentence count:", 'sentence count')):
  print(heading)
  display(chinaOnlyDf[column].describe())

Word count:


count     97.000000
mean     135.608247
std       97.296856
min        9.000000
25%       72.000000
50%      122.000000
75%      163.000000
max      576.000000
Name: word count, dtype: float64

Sentence count:


count    97.000000
mean      7.546392
std       5.879025
min       1.000000
25%       4.000000
50%       7.000000
75%      10.000000
max      38.000000
Name: sentence count, dtype: float64

### Keywords

In [None]:
def statsForWord(word):
  """Print how often `word` occurs and how long the statements containing it are.

  Works on the module-level frames megaDf, acaDf, indDf and mixedDf. For each one
  it prints the share of statements whose 'impact statement' contains `word`,
  then the mean word and sentence counts of those matching statements.
  `word` is passed straight to Series.str.contains.

  @word: the word to consider
  """
  groups = [('all', 'All', megaDf),
            ('academic', 'Academic', acaDf),
            ('industry', 'Industry', indDf),
            ('mixed', 'Mixed', mixedDf)]

  matched = []
  for (label, title, frame) in groups:
    hits = frame[frame['impact statement'].str.contains(word)]
    print(f"proportion of {label} statements containing '{word}': {len(hits)/len(frame)}")
    matched.append((title, hits))

  for (title, hits) in matched:
    print(f"{title} '{word}' statements means:")
    print(f"word count: {hits['word count'].mean()}")
    print(f"sentence count: {hits['sentence count'].mean()}")

In [None]:
# benefit
statsForWord('benefit')

proportion of all statements containing benefit: 0.23182297154899895
proportion of academic statements containing benefit: 0.22785898538263114
proportion of industry statements containing benefit: 0.22950819672131148
proportion of mixed statements containing benefit: 0.2398042414355628
All 'benefit' statements means:
word count: 214.05454545454546
sentence count: 10.377272727272727
Academic 'benefit' statements means:
word count: 214.2377358490566
sentence count: 10.211320754716981
Industry 'benefit' statements means:
word count: 224.64285714285714
sentence count: 10.321428571428571
Mixed 'benefit' statements means:
word count: 211.7074829931973
sentence count: 10.687074829931973


In [None]:
statsForWord('method')

proportion of all statements containing method: 0.4315068493150685
proportion of academic statements containing method: 0.40498710232158214
proportion of industry statements containing method: 0.45901639344262296
proportion of mixed statements containing method: 0.4763458401305057
All 'method' statements means:
word count: 206.07570207570208
sentence count: 9.87912087912088
Academic 'method' statements means:
word count: 197.21868365180467
sentence count: 9.250530785562633
Industry 'method' statements means:
word count: 232.85714285714286
sentence count: 10.928571428571429
Mixed 'method' statements means:
word count: 215.22602739726028
sentence count: 10.691780821917808


### Clustering

In [None]:
# Number of papers per primary subject area, largest first; show the ten biggest
sbjDf = (
    megaDf.groupby(by='primary subject area')['title']
    .count()
    .sort_values(ascending=False)
)
top10 = sbjDf.head(10)

top10

primary subject area
Applications -> Computer Vision                                  96
Deep Learning                                                    63
Reinforcement Learning and Planning -> Reinforcement Learning    57
Reinforcement Learning and Planning                              57
Deep Learning -> Analysis and Understanding of Deep Networks     56
Deep Learning -> Generative Models                               54
Algorithms -> Representation Learning                            50
Theory -> Statistical Learning Theory                            48
Algorithms -> Bandit Algorithms                                  40
Applications -> Natural Language Processing                      35
Name: title, dtype: int64

In [None]:
def makeWordCloud(df, filename, stopwords=STOPWORDS):
  """Build a word cloud from a dataframe's impact statements, show it and save it.

  @df: the dataframe to look at (its 'impact statement' column is joined with spaces)
  @filename: the root of the file to output image to (outputs to <filename>.png)
  @stopwords: the words to exclude
  """
  corpus = ' '.join(df['impact statement'])
  cloud = WordCloud(stopwords=stopwords, background_color='white').generate(text=corpus)

  # draw without axes, then write the same cloud to disk
  plt.imshow(cloud, interpolation='bilinear')
  plt.axis('off')
  plt.show()
  cloud.to_file(f'{filename}.png')

## Top Words

### Geography


In [None]:
# Collect every distinct quoted name that appears in the 'country' column
regex = r"'(\w*\s*\w*)'"
countries = megaDf['country']

countrySet = set()
for row in countries:
  countrySet.update(re.findall(regex,row))
print("set")
countrySet


set


{'Australia',
 'Austria',
 'Belgium',
 'Brazil',
 'Canada',
 'Chile',
 'China',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Egypt',
 'Finland',
 'France',
 'Germany',
 'Greece',
 'Hong Kong',
 'Independent',
 'India',
 'Iran',
 'Israel',
 'Italy',
 'Japan',
 'Malaysia',
 'Netherlands',
 'Norway',
 'Pakistan',
 'Poland',
 'Portugal',
 'Qatar',
 'Romania',
 'Russia',
 'Saudi Arabia',
 'Singapore',
 'South Africa',
 'South Korea',
 'Spain',
 'Sweden',
 'Switzerland',
 'Taiwan',
 'Thailand',
 'Turkey',
 'UAE',
 'UK',
 'USA',
 'Vietnam'}

In [None]:
# Captures a single-quoted name of at most two words separated by optional whitespace
# (e.g. 'USA', 'South Korea'). Read by makeDf to pull country names out of the 'country' strings.
countryRegex = r"'(\w*\s*\w*)'"


In [None]:
# Export countries for labelling: 'country' is filled in, 'continent' is left empty
contDf = pd.DataFrame(columns=['country','continent'])
# sorted() gives a stable row order; a set's iteration order can differ between
# kernel sessions, which made the exported file change from run to run
contDf['country'] = sorted(countrySet)
contDf.to_csv('countries_with_continents.csv')

In [None]:
# Read in updated list
#ccSheet = gc.open('/content/drive/MyDrive/NeurIPS_BIS_Analysis/countries_with_continents_labelled.csv')
# Authenticate again and rebuild the gspread client before opening the sheet
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())

# First worksheet of the spreadsheet named 'labelled_countries'
ccSheet = gc.open('labelled_countries').sheet1

In [None]:
# Build the labelled country/continent table from the sheet's raw rows
ccRows = ccSheet.get_all_values()
upContDf = pd.DataFrame.from_records(ccRows)
# the first row holds the column headers: promote it to the header, then drop it from the data
upContDf.columns = upContDf.iloc[0]
upContDf = upContDf.drop(upContDf.index[0])
# also drop the first column (the saved output suggests it is an exported row index -- TODO confirm)
upContDf = upContDf.drop(upContDf.columns[0],axis=1)
upContDf  # display the resulting table

Unnamed: 0,country,continent
1,Iran,Asia
2,Denmark,Europe
3,India,Asia
4,China,Asia
5,South Africa,Africa
6,South Korea,Asia
7,Cyprus,Europe
8,Independent,N/a
9,Austria,Europe
10,Brazil,South America


In [None]:
def _continentSlice(continentName):
  """Return the upContDf rows labelled continentName and their country names as a list."""
  rows = upContDf[upContDf['continent'] == continentName]
  return rows, list(rows['country'].values)

asia, asiaList = _continentSlice('Asia')
africa, africaList = _continentSlice('Africa')
northAm, northAmList = _continentSlice('North America')
southAm, southAmList = _continentSlice('South America')
europe, europeList = _continentSlice('Europe')
oceania, oceaniaList = _continentSlice('Oceania')


In [None]:
# Per-continent country-name lists, in the order Asia, Africa, North America, South America, Europe, Oceania
lists = [asiaList,africaList,northAmList,southAmList,europeList,oceaniaList]

def makeDf(relList):
  """Select the rows of megaDf whose 'country' string names a country in relList.

  Country names are extracted from each row's 'country' value with the
  module-level countryRegex. A row is selected once however many of its
  countries match. The result is an independent copy that keeps megaDf's index,
  column order and dtypes. When nothing matches it is an empty frame that still
  has every column, where the earlier row-by-row build returned a frame with
  no columns at all.

  @relList: iterable of country names to keep
  """
  wanted = set(relList)
  mask = megaDf['country'].apply(
      lambda cell: any(country in wanted for country in re.findall(countryRegex, cell))
  ).astype(bool)  # an empty frame would otherwise produce an object-dtype mask
  return megaDf[mask].copy()
                         

In [None]:
# One frame per continent, each built by makeDf from that continent's country list
(asiaDf, africaDf, northAmDf,
 southAmDf, europeDf, oceaniaDf) = map(makeDf, (asiaList, africaList, northAmList,
                                                 southAmList, europeList, oceaniaList))

In [None]:
# Total number of rows across the six continent frames
contDfs = [asiaDf,africaDf,northAmDf,southAmDf,europeDf,oceaniaDf]
x = sum(len(df) for df in contDfs)
x
# We expect some overlap: makeDf puts a paper in every continent that one of its countries belongs to

2418

### Top Words

In [None]:
# make lists of keywords
# Each group is written as one ' / '-separated string and then passed to makeWordList
# (makeWordList is not defined in this part of the notebook).
# Some terms are written twice in the literals ('Help', 'Advantage', 'Damage', 'Destructive').
goodWords = 'Positive / Benefit / Beneficial / Good / Help / Profit / Gain / Advantage / Utility / Help / Improve / Enhance / Advance / Advantage / Assist / Promote / Effective / Useful / Productive / Constructive'
goodWordList = makeWordList(goodWords)

badWords = 'Cost / Bad / Hurt / Abuse / Damage / Ruin / Impair / Injury / Suffering / Damage / Ill / Loss / Detriment / Wrong / Oppress / Impose upon / Maltreat / Damaging / Dangerous / Evil / Destructive / Hazardous / Detrimental / Disadvantage / Toxic / Destructive'
badWordList = makeWordList(badWords)

# Sector / application-area keyword groups.
# NOTE(review): despite the *WordList names, every variable in this group is bound to the raw
# ' / '-separated string -- makeWordList is not applied here, unlike goodWordList/badWordList.
# Confirm whether the consumers expect strings or lists before using them.
agWordList = 'Agriculture / Farming / Crops'

# NOTE(review): 'Customer service/ Human resources' has no space before the slash, unlike the
# other separators -- check how the splitter treats it.
busWordList = 'Products / Services / Business / Industry / Corporate / Advertising / marketing / Logistics / Customer service/ Human resources / Hiring'

culWordList = 'Culture / Media / Sport / News / Internet / Search / Personal assistant / Social media / Art'

energyWordList = 'Energy'

eduWordList = 'Education / Teach / School / Research / Science / Technology / Engineering'

envWordList = 'Environment / Climate change / Climate'

devWordList = 'Development / Charity / Charities / Non-profit / SDG / Sustainable development goals / Poverty'

tradeWordList = 'Trade'

irWordList = 'International relations / Institutional relations'

transitWordList = 'Transport / Autonomous vehicles / driverless car'

workWordList = 'Work / Worker / Labour / Labor / Job / Employment / Automation / Automated'

healthWordList = 'Health / Medicine / Pharmaceuticals / Drugs / Government / Public sector / Public services / Social care / Social credit'

econWordList = 'Finance / Financial / Economy / Economic / Banking / Banks / Credit / Loan / Mortgage'

secWordList = 'Security / Defence / Cybersecurity / Military / Surveillance / Tracking / Face recognition / Facial recognition / Emotion recognition / Affect recognition / Biometric / Weapons / Warfare / War / Disinformation / misinformation'

# NOTE(review): 'Accomodation' is spelled with a single 'm'; as a search keyword it would not
# match the usual spelling 'Accommodation' -- confirm whether that is intended.
commWordList = 'Community / Housing / Accomodation'

crimjustWordList = 'Crime / Criminal / Terrorism / Terrorist / Justice / Court / Recidivism / Risk score / Prison / Police / Policing / Cybercrime / Hacking / Phishing / Spam / Scam / Pornography / Deepfake'




# Thematic keyword groups. Each one is written as a ' / '-separated string and converted
# with makeWordList (not defined in this part of the notebook); the *List variable holds the result.

# Applications, grouped as positive / negative / neutral / "cont" (presumably contested -- TODO confirm)
posApps = 'Health / Science / Engineering / Education / Climate change / Charity / Development Goals / Art'
posAppList = makeWordList(posApps)

negApps = 'Weapon / Porn / Scam / Spam / Surveillance'
negAppList = makeWordList(negApps)

neutApps = 'Customer service / Advertising / Finance / Logistics / Industry / Robotics'
neutAppList = makeWordList(neutApps)

contApps = 'Policing / Military / Government'
contAppList = makeWordList(contApps)

# Technical vocabulary
techExpl = 'Limitation / Weakness / Defect / Drawback / Shortcoming / Strength / Improvement / Enhance / Increase / Improve / Advance'
techExplList = makeWordList(techExpl)

techNonSoc = 'Accuracy / Performance / Efficiency / Speed / Optimal / Compute / Memory / Data / Storage / AUC / FPR / TPR / Recall / Precision / BLEU / Benchmark / Sensitivity / Score / Metric'
techNonSocList = makeWordList(techNonSoc)

# 'Robustness' is written twice in this literal
techSoc = 'Explainable / Interpretable / Fairness / Discrimination / Bias / Equality / Privacy / Anonymity / Consent / Data / Personal data / GDPR / Robustness / Adversarial robustness / Verification / Safety / Generalisable / Side effects / Reward hacking / Scalable Supervision / Safe exploration / Robustness / Distributional shift / Scalable oversight / Security / Vulnerability / Cybersecurity / Feedback loops'
techSocList = makeWordList(techSoc)

nonTechLim = 'Accountable / Responsibility / Transparency / Human Values / Human Rights / Ethical / Moral decisions / Trust / Trustworthy'
nonTechLimList = makeWordList(nonTechLim)

# Risk vocabulary
explRisk = 'Accident / Error / Unintended / Misuse / Malicious / Structural / Second-order / Diffuse / Immediate / Short / Medium / Long / Minor / Major / Severe / Extreme / Tail / X-risk / Existential / Extinction'
explRiskList = makeWordList(explRisk)

implRisk = 'Collision / Hacker / terrorist / criminal / War / AGI / TAI / Superintelligence'
implRiskList = makeWordList(implRisk)

# 'Employment' and 'Autonomy' are each written twice in this literal
structRisk = 'Environment / Energy / Electricity / Compute / Economy / Economic Growth / Jobs / Labour / Employment / Labor / Markets / Employment / Worker / Worker rights / Political / Democracy / Power / National politics / Public opinion / Repression / International relations / Conflict / International security / National security / War / Weapons / Arms race / Misinformation / Fake news / Manipulation / Direct attention / Persuasion / Synthetic media / Generated media / Wellbeing / Autonomy / Health / Mental health / Legal / Liability / Law / Regulation / Governance / Oversight / Policies / Human agency / Autonomy / Dignity / Global inequality / Fairness / Discrimination / Inequality / Bias / Fair'
structRiskList = makeWordList(structRisk)

stakeholders = 'Users / Owners / Developers / Regulators / Public / Researchers / Tech companies / Technology companies / Google / Facebook / EU / Impacted communities / Minorities / Marginalised / Underrepresented / Gender / Race / Demographics'
stakeholdersList = makeWordList(stakeholders)

# Double-quoted because the literal contains an apostrophe ("Don't know")
epistemics = "Uncertain / Maybe / Perhaps / Might / May / Don't know / Could / Unknown / Further investigation / Future work / Further work / More work"
epistemicsList = makeWordList(epistemics)

# NOTE(review): two spaces follow the last slash ('Implications /  Initiatives') -- check the splitter tolerates it
usedGuide = 'Impact stack / Applications / Implications /  Initiatives'
usedGuideList = makeWordList(usedGuide)

makingRecs = 'Should / Recommend / Recommendation'
makingRecsList = makeWordList(makingRecs)


In [None]:
# Good-word and bad-word frequency exports for the 'aff' and 'loc' groupings
for subset in ('aff','loc'):
  for (tone, toneWords) in (('good',goodWordList),('bad',badWordList)):
    getFreqsForSubset(subset,f'{subset}_{tone}_total_freqs.csv',f'{subset}_{tone}_stmt_freqs.csv',wordlist=toneWords)



In [None]:
# Bad-word frequency exports for the 'primary' and 'cluster' groupings
for (subset, prefix) in (('primary','prim'),('cluster','clust')):
  getFreqsForSubset(subset,f'{prefix}_bad_total_freqs.csv',f'{prefix}_bad_stmt_freqs.csv',wordlist=badWordList)


### Citations


In [None]:
# Statements whose citation count is anything other than zero
hasCitations = megaDf['citation count'].ne(0)
megaDf[hasCitations]

Unnamed: 0,paper title (separate scrape),paper authors (separate scrape),title,paper identifier,paper link,impact statement,impact title,word count,sentence count,citation count,has positive,has negative,has opt out,has NA,has impact statement,manually corrected,human review,Image based PDF,paper title (subjects),primary subject area,secondary subject areas,clustering subject preference,authors,affiliations,academic,industry,mixed,country
7,Fast and Flexible Temporal Point Processes wit...,"Oleksandr Shchur, Nicholas Gao, Marin Biloš, S...",Fast and Flexible Temporal Point Processes wit...,00ac8ed3b4327bdd4ebbebcb2ba10a00,https://proceedings.neurips.cc/paper/2020/file...,Existing works have applied TPPs and MJPs for ...,Broader impact,120,5,6,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,Fast and Flexible Temporal Point Processes wit...,Probabilistic Methods,Algorithms -> Density Estimation; Applications...,Probabilistic methods and inference,"['Oleksandr Shchur', ' Nicholas Gao', ' Marin ...",{'Technical University of Munich'},1,0,0,{'Germany'}
12,Synbols: Probing Learning Algorithms with Synt...,"Alexandre Lacoste, Pau Rodríguez López, Freder...",Synbols: Probing Learning Algorithms with Synt...,0169cf885f882efd795951253db5cdfb,https://proceedings.neurips.cc/paper/2020/file...,The introduction of benchmark new datasets has...,Broader Impact,386,13,3,TRUE,TRUE,FALSE,FALSE,TRUE,TRUE,TRUE,FALSE,Synbols: Probing Learning Algorithms with Synt...,"Data, Challenges, Implementations, and Softwar...",Algorithms -> Active Learning; Algorithms -> F...,"Datasets, challenges, software","['Alexandre Lacoste', ' Pau Rodríguez López', ...","{'ElementAI', 'MILA', 'University of British C...",1,1,1,{'Canada'}
15,Cascaded Text Generation with Markov Transformers,"Yuntian Deng, Alexander Rush",Cascaded Text Generation with Markov Transformers,01a0683665f38d8e5e567b3b15ca98bf,https://proceedings.neurips.cc/paper/2020/file...,Our work proposes an alternative approach to b...,Broader Impact,204,9,11,TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,Cascaded Text Generation with Markov Transformers,Applications -> Natural Language Processing,Deep Learning -> Generative Models,Natural language processing,"['Yuntian Deng', ' Alexander Rush']","{'Harvard University', 'Cornell University'}",1,0,0,{'USA'}
28,Coresets for Regressions with Panel Data,"Lingxiao Huang, K Sudhir, Nisheeth Vishnoi",Coresets for Regressions with Panel Data,03287fcce194dbd958c2ec5b33705912,https://proceedings.neurips.cc/paper/2020/file...,Many organizations have to routinely outsource...,Broader impact,257,11,4,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,Coresets for Regressions with Panel Data,Algorithms -> Data Compression,Algorithms -> Regression,"Core machine learning methods (e.g., supervise...","['Lingxiao Huang', ' K Sudhir', ' Nisheeth Vis...","{'Yale University', 'EPFL'}",1,0,0,"{'USA', 'Switzerland'}"
32,Multi-Robot Collision Avoidance under Uncertai...,"Wenhao Luo, Wen Sun, Ashish Kapoor",Multi-Robot Collision Avoidance under Uncertai...,03793ef7d06ffd63d34ade9d091f1ced,https://proceedings.neurips.cc/paper/2020/file...,The objective of this work is to provide an ex...,Broader Impact,329,13,2,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,Multi-Robot Collision Avoidance under Uncertai...,Social Aspects of Machine Learning -> AI Safety,Applications -> Robotics; Theory -> Control Th...,Safety and Robustness for Autonomous Systems,"['Wenhao Luo', ' Wen Sun', ' Ashish Kapoor']","{'Microsoft', 'Carnegie Mellon University', 'M...",1,1,1,{'USA'}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1864,Measuring Systematic Generalization in Neural ...,"Nicolas Gontier, Koustuv Sinha, Siva Reddy, Ch...",Measuring Systematic Generalization in Neural ...,fc84ad56f9f547eb89c72b9bac209312,https://proceedings.neurips.cc/paper/2020/file...,Transformer based models have been very effect...,Broader Impact,211,11,1,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,Measuring Systematic Generalization in Neural ...,Applications -> Natural Language Processing,Algorithms -> Structured Prediction; Deep Lear...,Natural language processing,"['Nicolas Gontier', ' Koustuv Sinha', ' Siva R...","{'McGill University / Mila / FAIR', 'Montreal ...",1,1,1,"{'Canada', 'France', 'USA'}"
1867,Fast Matrix Square Roots with Applications to ...,"Geoff Pleiss, Martin Jankowiak, David Eriksson...",Fast Matrix Square Roots with Applications to ...,fcf55a303b71b84d326fb1d06e332a26,https://proceedings.neurips.cc/paper/2020/file...,This paper introduces an algorithm to improve ...,Broader Impact,413,20,8,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,Fast Matrix Square Roots with Applications to ...,Probabilistic Methods -> Gaussian Processes,,Probabilistic methods and inference,"['Geoff Pleiss', ' Martin Jankowiak', ' David ...","{'Facebook', 'University of Pennsylvania', 'Co...",1,1,1,{'USA'}
1889,Multi-agent Trajectory Prediction with Fuzzy Q...,"Nitin Kamra, Hao Zhu, Dweep Kumarbhai Trivedi,...",Multi-agent Trajectory Prediction with Fuzzy Q...,fe87435d12ef7642af67d9bc82a8b3cd,https://proceedings.neurips.cc/paper/2020/file...,We have presented a general architecture for m...,Broader Impact,132,9,1,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,Multi-agent Trajectory Prediction with Fuzzy Q...,Deep Learning -> Attention Models,Algorithms -> Relational Learning; Deep Learni...,"Other applications (e.g., robotics, biology, c...","['Nitin Kamra', ' Hao Zhu', ' Dweep Kumarbhai ...","{'University of Southern California', 'Peking ...",1,0,0,"{'USA', 'China'}"
1892,Can the Brain Do Backpropagation? --- Exact Im...,"Yuhang Song, Thomas Lukasiewicz, Zhenghua Xu, ...",Can the Brain Do Backpropagation? — Exact Impl...,fec87a37cdeec1c6ecf8181c0aa2d3bf,https://proceedings.neurips.cc/paper/2020/file...,This work shows that backpropagation in artifi...,Broader Impact,280,12,2,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,Can the Brain Do Backpropagation? --- Exact Im...,Deep Learning -> Biologically Plausible Deep N...,Deep Learning -> Analysis and Understanding of...,Neuroscience and cognitive science,,"{'University of Oxford', 'Hebei University of ...",1,0,0,"{'UK', 'China'}"


In [None]:
# Citation statistics for the 'primary' grouping. Judging by the saved output this prints a
# describe() summary of 'citation count' per primary subject area; describeCitations itself
# is not defined in this part of the notebook.
describeCitations('primary')

Applications -> Computer Vision citations:
 count    96.000000
mean      0.750000
std       2.357519
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max      15.000000
Name: citation count, dtype: float64
Deep Learning citations:
 count    63.000000
mean      0.841270
std       2.343187
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max      10.000000
Name: citation count, dtype: float64
Reinforcement Learning and Planning -> Reinforcement Learning citations:
 count    57.000000
mean      0.964912
std       1.945369
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max      10.000000
Name: citation count, dtype: float64
Reinforcement Learning and Planning citations:
 count    57.000000
mean      0.912281
std       2.523325
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max      12.000000
Name: citation count, dtype: float64
Deep Learning -> Analysis and Understanding of Deep 

## NLTK


In [None]:
# Negating words: fetch NLTK's 'punkt' data package first
# NOTE(review): the negation cell below tokenises with str.split(), so confirm punkt is actually needed
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# dfWithNeg is a second name for megaDf (no copy is made), so megaDf receives the new column too
dfWithNeg = megaDf
# pad sentence punctuation with spaces so each mark becomes its own token and
# the negation marking knows where to stop
punctSpacing = str.maketrans({'.':' . ','!':' ! ','?':' ? ',',':' , ',':':' : ',';':' ; '})
dfWithNeg['bis_with_neg'] = dfWithNeg['impact statement'].map(lambda text: text.translate(punctSpacing))
# split on whitespace and apply negation; each cell becomes the token list returned by mark_negation
dfWithNeg['bis_with_neg'] = dfWithNeg['bis_with_neg'].map(lambda text: mark_negation(text.split()))

In [None]:
# Save the frame, including the 'bis_with_neg' column, to BIS_with_negations.csv in the working directory
dfWithNeg.to_csv('BIS_with_negations.csv')