# Build LGBTQ corpus texts

In [11]:
# Load article IDs
fpath = '/data/news/now2010-2021/lgbtq/article_ids.txt'
with open(fpath) as f:
    article_ids = [int(aid) for aid in f.read().splitlines()]
len(article_ids)

218809

## Assemble articles

In [None]:
# Strip out articles that don't match
text_dirpath = '/data/news/now2010-2021/lgbtq/article_ids.txt'
for fname in sorted(os.listdir(text_dirpath), )

## Determine which month/country files to search for articles in (more trouble than its worth)

In [37]:
import csv
source_dirpath = '/data/news/now2010-2021/sources/'
month_countries = []
for fname in tqdm(sorted(os.listdir(source_dirpath))):
    fpath = os.path.join(source_dirpath, fname)
    sources = pd.read_csv(fpath, sep='\t', header=None, names=['article_id', 'word_count', 'date', 'country', 'source', 'url', 'title'], 
                          index_col='article_id', engine='c', error_bad_lines=False, quoting=csv.QUOTE_NONE)
    matching_sources = sources.loc[sources.index.isin(article_ids)]
    matching_sources.loc[:, 'month'] = matching_sources.date.str.slice(0, 5)
    month_countries += list(matching_sources[['month', 'country']].itertuples(index=False, name=None))
month_countries = set(month_countries)
len(month_countries)

  0%|          | 0/53 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


2318

In [43]:
list(month_countries)[:10]

[('11-09', 'PK'),
 ('13-04', 'MY'),
 ('18-12', 'CA'),
 ('15-12', 'JM'),
 ('19-11', 'AU'),
 ('11-01', 'IN'),
 ('10-05', 'JM'),
 ('17-11', 'HK'),
 ('17-04', 'BD'),
 ('15-10', 'GB')]

In [45]:
# formatted = [f'{month}-{country}'.lower() for month, country in month_countries]
formatted = [(month.split('-')[0], month.split('-')[1], country) for month, country in month_countries]
formatted[:20]

[('11', '09', 'PK'),
 ('13', '04', 'MY'),
 ('18', '12', 'CA'),
 ('15', '12', 'JM'),
 ('19', '11', 'AU'),
 ('11', '01', 'IN'),
 ('10', '05', 'JM'),
 ('17', '11', 'HK'),
 ('17', '04', 'BD'),
 ('15', '10', 'GB'),
 ('17', '02', 'BD'),
 ('14', '06', 'ZA'),
 ('12', '05', 'SG'),
 ('13', '07', 'KE'),
 ('20', '01', 'TZ'),
 ('20', '12', 'LK'),
 ('13', '11', 'GH'),
 ('12', '06', 'PH'),
 ('16', '10', 'AU'),
 ('14', '03', 'KE')]

In [47]:
text_fnames = os.listdir('/data/news/now2010-2021/text')
selected_fnames = []
for f in formatted:
    for fname in text_fnames:
        if f[0] in fname.lower() and f[1] in fname.lower() and f[2].lower() in fname.lower():
            selected_fnames.append(fname)
            break
    else:
        print(f)
len(selected_fnames)

('20', '07', '??')
('19', '07', 'y ')
('20', '02', '??')
('19', '12', '??')
('20', '06', '??')
('20', '03', '??')
('19', '07', '? ')
('19', '08', '??')
('19', '09', '??')
('19', '07', '  ')
('19', '10', '??')
('19', '11', '??')
('20', '01', '??')
('19', '12', 'ZA')


2304

In [42]:
[name for name in text_fnames if '20-06' in name]

[]

# Filter NOW corpus to articles that mention LGBTQ issues

## Load, check terms for presence

In [1]:
# Load lexicons, get term IDs
import os
import pandas as pd

# Load old lexicon
lex_dirpath = '/data/news/now2010-2021/lexicon'
fname = 'now_dic.txt'
old_lex = pd.read_csv(os.path.join(lex_dirpath, fname), sep='\t', skiprows=[0,1], index_col='wID', names=['wID', 'word', 'lemma', 'PoS'],
                     keep_default_na=False)

# Load specific lexica
import pdb
from tqdm.notebook import tqdm
lexica = [old_lex]
for fname in sorted(tqdm(os.listdir(lex_dirpath))):
    if not fname.startswith('lexicon'): continue
    fpath = os.path.join(lex_dirpath, fname)
    current_lex = pd.read_csv(fpath, sep='\t', header=None, index_col=0, names=['wID', 'word', 'lemma', 'PoS'], keep_default_na=False,
                             dtype={'wID': int, 'word': str})
    lexica.append(current_lex)

lex = pd.concat(lexica)
lex.sort_index(inplace=True)
lex

  0%|          | 0/51 [00:00<?, ?it/s]

Unnamed: 0_level_0,word,lemma,PoS
wID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,the,the,at
2,",",,","
3,.,,.
4,q!,q!,q!
5,of,of,io
...,...,...,...
63471367,torso--minus,,jj_nn1
63471368,LIPBUBBLE,,np1_nn1_vv0
63471369,**40;3671;TOOLONG,,fu
63471370,@@42681482,,fo


In [2]:
# LGBTQ terms
mendelsohn_terms = [
    'gay',
    'gays',
    'lesbian',
    'lesbians',
    'bisexual',
    'bisexuals',
    'homosexual',
    'homosexuals',
    'transgender',
    'transgenders',
    'transsexual',
    'transsexuals',
    'transexual',
#     'transexuals',
    'transvestite',
    'transvestites',
    'transgendered',
    'asexual',
    'agender',
    'aromantic',
    'lgb',
    'lgbt',
    'lgbtq',
    'lgbtqia',
    'glbt',
    'lgbtqqia',
    'genderqueer',
    'genderfluid',
    'intersex',
    'pansexual',
]
added_terms = [
    'nonbinary',
    'non-binary', 
    'demisexual',
    'demi-sexual',
    'gender-expansive',
    'homophobia',
    'homophobic',
    'transphobia',
    'transphobic',
    'queerphobia',
#     'queerphobic',
    'heteronormativity',
    'heterosexism',
    'heterosexist',
#     'cissexism',
#     'cissexist',
    'transmisogyny',
    'bi-sexual',
#     'bi-sexuals',
    'pan-sexual',
#     'pan-sexuals',
    'pansexuals'
]
terms = mendelsohn_terms + added_terms
# len(terms) 

# Check for presence in lexicon
matches = lex[(lex.word.str.lower().isin(terms)) | (lex.lemma.str.lower().isin(terms))]
matches


Unnamed: 0_level_0,word,lemma,PoS
wID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2275,gay,gay,jj
9616,LGBT,lgbt,np1
15177,transgender,transgender,nn1
17204,gays,gay,nn2
18119,lesbian,lesbian,jj_nn1
...,...,...,...
61709097,TRANSPHOBIA,,nn1
61946145,TRANSGENDER,,vvi
62280693,Nonbinary,,nn1_np1_jj
62614837,Genderfluid,,np1_jj_nn1


In [3]:
# See which terms did not occur
[w for w in terms if w not in matches.word.str.lower().unique()]


[]

## Find articles that contain keywords (build LGBTQ news corpus)

In [4]:
lgbtq_wids = matches.index.tolist()
len(lgbtq_wids)

230

In [7]:
# Search articles for matches
from tqdm.notebook import tqdm
from multiprocessing import Pool
import pdb

db_dirpath = '/data/news/now2010-2021/db'
out_dirpath = '/data/news/now2010-2021/lgbtq/text_ids/'
# n_files = len(os.listdir(db_dirpath)) #2687
            
def search_file(fname):
    fpath = os.path.join(db_dirpath, fname)
    outpath = os.path.join(out_dirpath, f'{fname[:-4]}_text_ids.txt')
    if os.path.exists(outpath):
        return
    try:
        data = pd.read_csv(fpath, sep='\t', header=None, names=['textID',  'ID', 'wID'], index_col='ID', 
                           error_bad_lines=False, low_memory=False)
        # Convert to int datatypes
        data.wID = pd.to_numeric(data.wID, errors='coerce')
        data.textID = pd.to_numeric(data.textID, errors='coerce')
        data = data[data.wID.notnull()]
        data.wID = data.wID.astype(int)
        data.textID = data.textID.astype(int)
    except:
        tqdm.write(fname)
#         pdb.set_trace()
        return
    matches = set(data.loc[data.wID.isin(lgbtq_wids), 'textID'].unique())
    with open(outpath, 'w') as f:
        for tid in matches:
            f.write(f'{tid}\n')
            

fnames = sorted(os.listdir(db_dirpath))
# list(tqdm(map(search_file, fnames), total=len(fnames))) # debug
with Pool(15) as p:
    list(tqdm(p.imap(search_file, fnames), total=len(fnames)))

  0%|          | 0/2687 [00:00<?, ?it/s]

  self.run()
  self.run()
  self.run()
  self.run()
  self.run()
  self.run()
  self.run()
  self.run()
  self.run()
  self.run()
  self.run()
  self.run()
  self.run()
  self.run()
  self.run()


In [9]:
# Concatenate all article ID matches
parts = []
out_dirpath = '/data/news/now2010-2021/lgbtq/text_ids/'
for fname in tqdm(sorted(os.listdir(out_dirpath))):
    fpath = os.path.join(out_dirpath, fname)
    with open(fpath) as f:
        parts.append(f.read().splitlines())
all_parts = [int(p) for part in parts for p in part]
len(all_parts)

  0%|          | 0/2687 [00:00<?, ?it/s]

218809

In [10]:
# Save out
outpath = '/data/news/now2010-2021/lgbtq/article_ids.txt'
with open(outpath, 'w') as f:
    for aid in all_parts:
        f.write(f'{aid}\n')

# Download NOW corpus

In [2]:
# Extract all txt links, save to a file for wget
from bs4 import BeautifulSoup
import urllib

html = urllib.request.urlopen('https://www.corpusdata.org/a3047_now/x.asp?e=yoder@cs.cmu.edu').read()
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a')
len(links)

508

In [3]:
links[:10]

[<a href="https://www.english-corpora.org/now/" target="_blank">NOW</a>,
 <a href="https://www.corpusdata.org/restrictions.asp?r=y">Restrictions on use</a>,
 <a href="mailto:corpus@byu.edu">please email us</a>,
 <a href="https://www.corpusdata.org/formats.asp">
 <font color="#0000FF">Format</font></a>,
 <a href="https://www.corpusdata.org/a3047_now/db/db_10-01-kus.zip">10-01</a>,
 <a href="https://www.corpusdata.org/a3047_now/db/db_10-02-kvz.zip">10-02</a>,
 <a href="https://www.corpusdata.org/a3047_now/db/db_10-03-ovi.zip">10-03</a>,
 <a href="https://www.corpusdata.org/a3047_now/db/db_10-04-laq.zip">10-04</a>,
 <a href="https://www.corpusdata.org/a3047_now/db/db_10-05-wbb.zip">10-05</a>,
 <a href="https://www.corpusdata.org/a3047_now/db/db_10-06-bwq.zip">10-06</a>]

In [5]:
links = [link['href'] for link in links]
links[:10]

['https://www.english-corpora.org/now/',
 'https://www.corpusdata.org/restrictions.asp?r=y',
 'mailto:corpus@byu.edu',
 'https://www.corpusdata.org/formats.asp',
 'https://www.corpusdata.org/a3047_now/db/db_10-01-kus.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-02-kvz.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-03-ovi.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-04-laq.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-05-wbb.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-06-bwq.zip']

In [6]:
text_links = [link for link in links if 'text' in link]
len(text_links)

133

In [7]:
text_links

['http://www.corpusdata.org/a3047_now/text/text_10-01-kus.zip',
 'http://www.corpusdata.org/a3047_now/text/text_10-02-kvz.zip',
 'http://www.corpusdata.org/a3047_now/text/text_10-03-ovi.zip',
 'http://www.corpusdata.org/a3047_now/text/text_10-04-laq.zip',
 'http://www.corpusdata.org/a3047_now/text/text_10-05-wbb.zip',
 'http://www.corpusdata.org/a3047_now/text/text_10-06-bwq.zip',
 'http://www.corpusdata.org/a3047_now/text/text_10-07-uek.zip',
 'http://www.corpusdata.org/a3047_now/text/text_10-08-lqd.zip',
 'http://www.corpusdata.org/a3047_now/text/text_10-09-udn.zip',
 'http://www.corpusdata.org/a3047_now/text/text_10-10-nln.zip',
 'http://www.corpusdata.org/a3047_now/text/text_10-11-weq.zip',
 'http://www.corpusdata.org/a3047_now/text/text_10-12-ixv.zip',
 'http://www.corpusdata.org/a3047_now/text/text_11-01-pct.zip',
 'http://www.corpusdata.org/a3047_now/text/text_11-02-keb.zip',
 'http://www.corpusdata.org/a3047_now/text/text_11-03-lup.zip',
 'http://www.corpusdata.org/a3047_now/te

In [8]:
# Save out
with open('/data/news/now_urls_text.txt', 'w') as f:
    for url in text_links:
        f.write(f'{url}\n')

In [9]:
# Download sources
sources = [link for link in links if 'source' in link]
print(len(sources))
sources

53


['http://www.corpusdata.org/a3047_now/shared/now_sources_pt1.zip',
 'http://www.corpusdata.org/a3047_now/shared/now_sources_pt2.zip',
 'http://www.corpusdata.org/a3047_now/16-11-hsk/sources-16-11.zip',
 'http://www.corpusdata.org/a3047_now/16-12-kds/sources-16-12.zip',
 'http://www.corpusdata.org/a3047_now/17-01-ske/sources-17-01.zip',
 'http://www.corpusdata.org/a3047_now/17-02-skr/sources-17-02.zip',
 'http://www.corpusdata.org/a3047_now/17-03-wwt/sources-17-03.zip',
 'http://www.corpusdata.org/a3047_now/17-04-mks/sources-17-04.zip',
 'http://www.corpusdata.org/a3047_now/17-05-lop/sources-17-05.zip',
 'http://www.corpusdata.org/a3047_now/17-06-lei/sources-17-06.zip',
 'http://www.corpusdata.org/a3047_now/17-07-jjr/sources-17-07.zip',
 'http://www.corpusdata.org/a3047_now/17-08-nsi/sources-17-08.zip',
 'http://www.corpusdata.org/a3047_now/17-09-isn/sources-17-09.zip',
 'http://www.corpusdata.org/a3047_now/17-10-kll/sources-17-10.zip',
 'http://www.corpusdata.org/a3047_now/17-11-mmm/so

In [10]:
# Save out
with open('/data/news/now_urls_sources.txt', 'w') as f:
    for url in sources:
        f.write(f'{url}\n')

In [11]:
# Download sources
dbs = [link for link in links if 'db' in link]
print(len(dbs))
dbs

133


['https://www.corpusdata.org/a3047_now/db/db_10-01-kus.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-02-kvz.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-03-ovi.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-04-laq.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-05-wbb.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-06-bwq.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-07-uek.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-08-lqd.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-09-udn.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-10-nln.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-11-weq.zip',
 'https://www.corpusdata.org/a3047_now/db/db_10-12-ixv.zip',
 'https://www.corpusdata.org/a3047_now/db/db_11-01-pct.zip',
 'https://www.corpusdata.org/a3047_now/db/db_11-02-keb.zip',
 'https://www.corpusdata.org/a3047_now/db/db_11-03-lup.zip',
 'https://www.corpusdata.org/a3047_now/db/db_11-04-fpt.zip',
 'https://www.corpusdata

In [12]:
# Save out
with open('/data/news/now_urls_db.txt', 'w') as f:
    for url in dbs:
        f.write(f'{url}\n')

In [13]:
# Download lexicon
lex = [link for link in links if 'lexicon' in link]
print(len(lex))
lex

52


['http://www.corpusdata.org/a3047_now/shared/now_lexicon.zip',
 'http://www.corpusdata.org/a3047_now/16-11-hsk/lexicon-16-11.zip',
 'http://www.corpusdata.org/a3047_now/16-12-kds/lexicon-16-12.zip',
 'http://www.corpusdata.org/a3047_now/17-01-ske/lexicon-17-01.zip',
 'http://www.corpusdata.org/a3047_now/17-02-skr/lexicon-17-02.zip',
 'http://www.corpusdata.org/a3047_now/17-03-wwt/lexicon-17-03.zip',
 'http://www.corpusdata.org/a3047_now/17-04-mks/lexicon-17-04.zip',
 'http://www.corpusdata.org/a3047_now/17-05-lop/lexicon-17-05.zip',
 'http://www.corpusdata.org/a3047_now/17-06-lei/lexicon-17-06.zip',
 'http://www.corpusdata.org/a3047_now/17-07-jjr/lexicon-17-07.zip',
 'http://www.corpusdata.org/a3047_now/17-08-nsi/lexicon-17-08.zip',
 'http://www.corpusdata.org/a3047_now/17-09-isn/lexicon-17-09.zip',
 'http://www.corpusdata.org/a3047_now/17-10-kll/lexicon-17-10.zip',
 'http://www.corpusdata.org/a3047_now/17-11-mmm/lexicon-17-11.zip',
 'http://www.corpusdata.org/a3047_now/17-12-usu/lexic

In [14]:
# Save out
with open('/data/news/now_urls_lexicon.txt', 'w') as f:
    for url in lex:
        f.write(f'{url}\n')

# Investigate All the News 1 corpus

In [2]:
import pandas as pd
import sqlite3

datapath = '/data/news/all-the-news.db'
with sqlite3.connect(datapath) as con:
    cursor = con.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    cursor.close()
tables

[('longform',)]

In [5]:
with sqlite3.connect(datapath) as con:
    data = pd.read_sql_query('SELECT * from longform', con)
data

Unnamed: 0,id,title,author,date,content,year,month,publication,category,digital,section,url
0,1,Agent Cooper in Twin Peaks is the audience: on...,\nTasha Robinson\n,2017-05-31,And never more so than in Showtime’s new...,2017,5,Verge,Longform,1.0,,
1,2,"AI, the humanity!",\nSam Byford\n,2017-05-30,AlphaGo’s victory isn’t a defeat for hum...,2017,5,Verge,Longform,1.0,,
2,3,The Viral Machine,\nKaitlyn Tiffany\n,2017-05-25,Super Deluxe built a weird internet empi...,2017,5,Verge,Longform,1.0,,
3,4,How Anker is beating Apple and Samsung at thei...,\nNick Statt\n,2017-05-22,Steven Yang quit his job at Google in th...,2017,5,Verge,Longform,1.0,,
4,5,Tour Black Panther’s reimagined homeland with ...,\nKwame Opam\n,2017-05-15,Ahead of Black Panther’s 2018 theatrical...,2017,5,Verge,Longform,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
204130,271689,Opinion | The West’s Schism Over Liberal Value...,,,,,,,,,,
204131,271690,"Sammy Stewart, Pitcher Whose Life Took a Downt...",Richard Sandomir,2018-03-07,"Sammy Stewart, who set a record when he struck...",2018,3,New York Times,newspaper,0.0,,
204132,271692,California Today: Down to the Wire in Hollywoo...,Mike McPhate,2017-05-01,Good morning. (Want to get California Today by...,2017,5,New York Times,newspaper,0.0,,
204133,271697,An Export From Sweden: Art Treasures Collected...,,,,,,,,,,


In [6]:
data.year.value_counts()

2016    95997
2017    75034
2015     9838
2018     4966
2014     3362
2013     1142
2012      948
2011      163
2010       65
2008        3
2009        3
2005        2
2004        2
2003        2
            2
2007        1
2006        1
2000        1
Name: year, dtype: int64

In [7]:
data.publication.value_counts()

New York Times         30257
Breitbart              23791
New York Post          18144
Reuters                15055
CNN                    14288
Washington Post        12051
NPR                    12001
Buzzfeed News          10699
Guardian                9920
Los Angeles Times       9036
Business Insider        8344
Atlantic                7386
National Review         6258
Talking Points Memo     5323
Vox                     4953
Fox News                4384
New Inquiry             3305
Verge                   1225
Name: publication, dtype: int64

# Investigate All the News 2 corpus

In [12]:
import pandas as pd

datapath = '/data/news/all-the-news-2-1.csv'
chunked = pd.read_csv(datapath, iterator=True)
sample = chunked.get_chunk(200)
sample

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,year,month,day,author,title,article,url,section,publication
0,0,0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,1,1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2,2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,3,3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,4,4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ
...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,195,2018-03-24 00:00:00,2018,3.0,24,Perry Bard,The Places Rex Tillerson Didn’t Go,"Trump’s Secretary of State, ex-CEO of ExxonMob...",https://hyperallergic.com/433978/perry-bard-th...,,Hyperallergic
196,196,196,2016-12-03 00:00:00,2016,12.0,3,,White House: no change to 'one China' policy a...,WASHINGTON (Reuters) - The White House on Frid...,https://www.reuters.com/article/us-usa-trump-t...,World News,Reuters
197,197,197,2018-03-18 00:00:00,2018,3.0,18,,Putin says 'nonsense' to think Russia would po...,MOSCOW (Reuters) - Russian President Vladimir ...,https://www.reuters.com/article/us-britain-rus...,World News,Reuters
198,198,198,2019-06-25 00:00:00,2019,6.0,25,Melissa Fares,Wayfair employee walkout called over alleged f...,(Reuters) - Wayfair Inc came under pressure on...,https://www.reuters.com/article/us-usa-immigra...,U.S.,Reuters


In [42]:
# Publication and year metadata
from tqdm.notebook import tqdm
from IPython.display import display

data = pd.DataFrame(columns=['publication', 'year'])
chunksize = 10**5
pbar = tqdm(total=2.7e6)
vals = []
for chunk in pd.read_csv(datapath, chunksize=chunksize):
    vals.append(chunk.loc[:, ['publication', 'year']].values)
    pbar.update(chunksize)
pbar.close()
len(vals)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2700000.0), HTML(value='')))




27

In [44]:
import numpy as np
data = pd.DataFrame(np.concatenate(vals), columns=['publication', 'year'])
data

Unnamed: 0,publication,year
0,Vox,2016
1,Business Insider,2016
2,Reuters,2018
3,Reuters,2019
4,TMZ,2016
...,...,...
2688874,TMZ,2020
2688875,TMZ,2020
2688876,TMZ,2020
2688877,TMZ,2020


In [46]:
pd.crosstab(data.publication, data.year, margins=True)

year,2016,2017,2018,2019,2020,2016,2017,2018,2019,All
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Axios,7,15659,16543,15605,1,0,0,0,0,47815
Business Insider,3243,2874,2204,7067,15903,0,0,0,26662,57953
Buzzfeed News,8459,9706,8194,5068,1392,0,0,0,0,32819
CNBC,36562,45065,42964,85878,23266,0,0,1,4360,238096
CNN,23546,25315,29736,38332,10673,0,0,0,0,127602
Economist,6819,6182,5942,5684,1380,0,0,0,220,26227
Fox News,56,2464,17624,0,0,0,0,0,0,20144
Gizmodo,9755,6345,6590,4537,0,0,0,1,0,27228
Hyperallergic,3363,3259,3045,2972,736,0,0,0,176,13551
Mashable,33120,27459,15958,13757,3063,2,21,21,706,94107


In [40]:
# Overview of publications and years
from tqdm.notebook import tqdm
from IPython.display import display

# pubs = set()
# years = set()
pubs = pd.Series(dtype=int)
years = pd.Series(dtype=int)
chunksize = 10**5
pbar = tqdm(total=2.7e6)
for chunk in pd.read_csv(datapath, chunksize=chunksize):
#     pubs |= set(chunk['publication'].unique())
#     years |= set(chunk['year'].unique())
    pubs = pubs.add(chunk.publication.value_counts(), fill_value=0)
    years = years.add(chunk.year.value_counts(), fill_value=0)
    pbar.update(chunksize)
pbar.close()
display(pubs)
display(years)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2700000.0), HTML(value='')))




Axios                  47815.0
Business Insider       57953.0
Buzzfeed News          32819.0
CNBC                  238096.0
CNN                   127602.0
Economist              26227.0
Fox News               20144.0
Gizmodo                27228.0
Hyperallergic          13551.0
Mashable               94107.0
New Republic           11809.0
New Yorker              4701.0
People                136488.0
Politico               46377.0
Refinery 29           111433.0
Reuters               840094.0
TMZ                    49595.0
TechCrunch             52095.0
The Hill              208411.0
The New York Times    252259.0
The Verge              52424.0
Vice                  101137.0
Vice News              15539.0
Vox                    47272.0
Washington Post        40882.0
Wired                  20243.0
dtype: float64

2016                                                                                                                                                                                  604503.0
2017                                                                                                                                                                                  640389.0
2018                                                                                                                                                                                  553563.0
2019                                                                                                                                                                                  590058.0
2020                                                                                                                                                                                  234830.0
2016                                         

In [None]:
import plotly.express as px

# Investigate existing LGBT news corpus (used by Mendelsohn et al 2020)

In [2]:
# Load corpus
import pandas as pd

background_fpath = '/data/fanfiction_ao3/lgbt_news/background_news_corpus.csv'
bg = pd.read_csv(background_fpath, encoding='latin-1')
bg

Unnamed: 0,Text,Title,Id,Count,Date,Category
0,UNION TOWNSHIP The Karl and Helen Burger Galle...,Events in New Jersey,53f8952d7988105a42595735,87,2012-07-08T00:00:00Z,N.Y. / Region
1,"In his view, the European Union ÛÓ for all it...",Quest for Balance in Joining European Union,51fbd80e79881007d5901216,3,2013-08-03T17:45:00Z,World
2,Potential overseas researchers and teachers fa...,Simplifying Italy to Lure Global Graduates,53f9b77b79881001aab76f02,5,2012-08-13T14:00:00Z,World
3,"Perhaps, however, it is time to look at demand...",Oil Spurt: A Rally That Few Believe,4fd1d2e58eb7c8105d70e775,5,1996-03-24T00:00:00Z,Business
4,"It is also east of Church Street, on the outsk...",Big Ticket | Sold for $9.5 Million,4fd3a2b18eb7c8105d8eae60,6,2012-01-20T13:01:31Z,N.Y. / Region
...,...,...,...,...,...,...
34615,Disney officials promised that if the roads be...,A New Battle of Manassas Is Under Way in the S...,4fd1dbfd8eb7c8105d71d2bc,15,1994-06-22T00:00:00Z,U.S.
34616,Adopting such a model will again restrict cash...,"In Death of Rain Forests, We're All Accomplices",4fd1b5e98eb7c8105d6dbcd2,4,1994-06-20T00:00:00Z,Opinion
34617,"\""Your letter,\"" Fortas said to Wolfson, \""and...",Masterful Meltdown,4fd1be1f8eb7c8105d6eb220,11,1994-06-05T00:00:00Z,Magazine
34618,"Last week, in the busy, final days of budget n...",A Reconditioned East Side G. O. P. Flexes Its ...,4fd1e45f8eb7c8105d72ecfb,0,1994-06-26T00:00:00Z,New York and Region


In [5]:
lgbt_articles = pd.read_csv('/data/fanfiction_ao3/lgbt_news/lgbt_news_corpus.csv', encoding='latin-1')
lgbt_articles # each row is a paragraph with a term from their list in it

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Text,Title,Id,Count,Date,Category,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,Re ÛÏObama Alters Hospital Rules for Gay Righ...,Hospital Visiting Rights for Same-Sex Partners,54457293798810347c1a843e,1,2010-04-26T00:00:00Z,Opinion,,,,,,,,,,,,,
1,You cannot have people who have no expertise i...,"For Police Union Head, Review Board Proposal I...",4fd1d13d8eb7c8105d70bb37,3,1992-06-28T00:00:00Z,New York and Region,,,,,,,,,,,,,
2,In an interview with Tracy Clark-Flory on Salo...,Why Asexuals DonÛªt Want to Be Invisible Anymore,54232178798810604c505435,4,2014-09-24T15:44:50Z,Unknown,,,,,,,,,,,,,
3,After we filed back out into the Tennessee sun...,"Dollywood: A Little Bit Country, a Little Bit Gay",53f782e67988105a42594f8f,28,2014-08-24T00:00:00Z,Travel,,,,,,,,,,,,,
4,''Gays and lesbians are a very good group of p...,San Francisco Toasts Gay Weddings,4fd24bb98eb7c8105d7eacb6,20,2004-02-29T00:00:00Z,Unknown,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34615,Infection data are sparse. Estimates of the nu...,HETEROSEXUALS AND AIDS: THE CONCERN IS GROWING,4fd170d58eb7c8105d663f0a,5,1986-10-28T00:00:00Z,Science; Health,,,,,,,,,,,,,
34616,The Council notes that throughout the history ...,TEXT OF NEW RIGHTS BILL,4fd14a698eb7c8105d627e7c,2,1986-01-23T00:00:00Z,New York and Region,,,,,,,,,,,,,
34617,Another reason throws light on C.D.C. reluctan...,The Mysteries Of Belle Glade,4fd155988eb7c8105d6398da,3,1986-06-11T00:00:00Z,Opinion,,,,,,,,,,,,,
34618,The executive director of the National Gay and...,CITY'S HOMOSEXUALS PROTEST HIGH COURT SODOMY R...,4fd1662a8eb7c8105d653865,7,1986-07-03T00:00:00Z,New York and Region,,,,,,,,,,,,,


In [8]:
len(lgbt_articles['Title'].unique())

17820

In [10]:
vc = lgbt_articles['Title'].value_counts()
vc[vc>1]

NEWS SUMMARY                                                            640
INSIDE                                                                  128
Corrections                                                             110
METRO DIGEST                                                             77
News Summary                                                             71
                                                                       ... 
New Top Cop to Keep an Eye on How Hollywood Depicts Homosexuality         2
Fugitive in Gay Bar Attacks Dies After Shootout With Arkansas Police      2
SOUTHWARD GOES RIVER OF FINANCE                                           2
Gay Users of Internet Play Down Concerns Over New Strain of AIDS          2
Name America's Most Liberal City                                          2
Name: Title, Length: 6450, dtype: int64

In [13]:
pd.set_option('display.max_colwidth', None)
lgbt_articles.loc[lgbt_articles['Title'] == 'New Top Cop to Keep an Eye on How Hollywood Depicts Homosexuality', ['Text', 'Title', 'Id', 'Count', 'Date']]

Unnamed: 0,Text,Title,Id,Count,Date
9671,"In recent years, the organization has shown a reluctance to battle Hollywood too publicly, opting instead to work back channels. Its public approach seems built more around praising positive depictions of gay men and lesbians (ÛÏMilkÛ) than decrying negative ones (ÛÏI Now Pronounce You Chuck & LarryÛ). In its reaction to the forthcoming film ÛÏBruno,Û which stars Sacha Baron Cohen as a flamboyantly gay fashion journalist, G.L.A.A.D. has taken more of a middle-of-the-road tack, saying that some people would very likely find the stereotypes in the movie funny and others would find them offensive.",New Top Cop to Keep an Eye on How Hollywood Depicts Homosexuality,4fd3960f8eb7c8105d8ccc45,5,2009-06-18T15:53:44Z
12431,"The organization that polices how Hollywood depicts homosexuality has a new top cop. The Gay & Lesbian Alliance Against Defamation on Wednesday hired Jarrett T. Barrios, a former state senator from Massachusetts, as its president. The question for moviemakers and television networks is: How loudly will his G.L.A.A.D. complain when it spots problematic treatment?",New Top Cop to Keep an Eye on How Hollywood Depicts Homosexuality,4fd3960f8eb7c8105d8ccc45,0,2009-06-18T15:53:44Z


In [7]:
lgbt_articles.loc[1,'Text']

"You cannot have people who have no expertise in police work sit in judgment of police officers, because police work by its very nature is esoteric. When you have to have represented on that board every segment of the community, the gay segment, black, the Hispanic and so on, they tend to pursue a political agenda as opposed to looking at police behavior in an objective and fair fashion. So they can't be coming down with decisions that are fair."