In [2]:
#install bnc corpus reader from nltk to read xml files in zip folder
#zip of full xml bnc text is accessible from http://www.natcorp.ox.ac.uk/  
#database containing metadata on all files by file ID available for download directly from here: https://www.english-corpora.org/bnc/files/sources_bnc.zip
#this was used to filter on audience age for kids books, and for smaller sample sizes 
import nltk
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk import ngrams, FreqDist
import pandas as pd

In [60]:
#first define all readers for different samples from bnc:
#bnc full text, kids books in bnc, a 10% sample (general sample), a 1% sample (baby) and a sample with equal spoken and written (bnc sampler)

In [2]:
#read all files in bnc corpus (2554.zip) to create reader
#direct download link for 2554.zip: http://hdl.handle.net/20.500.12024/2554
bnc_reader=BNCCorpusReader(root='2554.zip/download/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

In [15]:
#all ids in the bnc database with audience age as 'child' for sample of written works intended for children
kids_sample=['A/A7/A7A.xml','A/AB/ABX.xml','A/AC/AC5.xml','A/AC/AC4.xml','A/AC/ACB.xml','A/AC/ACV.xml','A/AE/AEB.xml','A/AL/ALS.xml','A/AM/AMB.xml','A/AP/APW.xml','A/AT/AT4.xml','B/B0/B0B.xml','B/B1/B1S.xml','B/B2/B2N.xml','B/BM/BMU.xml','B/BP/BPD.xml','C/C8/C85.xml','C/C8/C8N.xml','C/CA/CA3.xml','C/CA/CAB.xml','C/CA/CAX.xml','C/CC/CCA.xml','C/CE/CE0.xml','C/CE/CEU.xml','C/CF/CFJ.xml','C/CH/CH4.xml','C/CH/CH9.xml','C/CH/CHR.xml','E/EF/EFJ.xml','F/FE/FEH.xml','F/FN/FNS.xml','F/FP/FPT.xml','F/FP/FPV.xml','F/FS/FSL.xml','F/FU/FUB.xml','G/G2/G22.xml','G/G2/G23.xml','G/G2/G24.xml','G/G2/G25.xml','H/H9/H93.xml','H/H9/H9E.xml']
kids_reader=BNCCorpusReader(root='2554.zip/download/Texts/', fileids=kids_sample)

In [5]:
#10% sample of BNC to investigate differences in sample sizes
#general sample size - file ids of books within 'whole' sample to give ~10 million words 
general_sample=['A/A6/A69.xml', 'A/A7/A79.xml', 'A/A7/A7C.xml','A/AB/ABU.xml','A/AB/ABV.xml','A/AD/AD9.xml','A/AD/ADB.xml','A/AL/AL8.xml','A/AL/ALJ.xml','A/AL/ALL.xml','A/AL/ALU.xml','A/AM/AM4.xml','A/AM/AMU.xml','A/AN/AN3.xml','A/AN/AN5.xml','A/AN/ANB.xml','A/AN/ANC.xml','A/AN/ANU.xml','A/AP/APC.xml','A/AP/APP.xml','A/AP/APT.xml','A/AR/AR0.xml','A/AS/AS3.xml', 'A/AS/ASN.xml', 'A/AS/ASV.xml', 'A/AT/AT3.xml', 'A/AT/ATE.xml', 'A/AY/AYJ.xml', 'B/B0/B07.xml', 'B/B0/B0H.xml', 'B/B0/B0K.xml' 'B/B0/B0N.xml', 'B/B0/B0P.xml', 'B/B0/B0U.xml', 'B/B1/B11.xml', 'B/B1/B1D.xml', 'B/B1/B1N.xml', 'B/B1/B1X.xml', 'B/B2/B20.xml', 'B/B2/B22.xml', 'B/B2/B24.xml', 'B/B2/B2G.xml', 'B/B2/B2S.xml', 'B/B2/B2U.xml', 'B/B3/B32.xml', 'B/BL/BLW.xml', 'B/BL/BLY.xml', 'B/BM/BM1.xml', 'B/BM/BM6.xml', 'B/BM/BM9.xml', 'B/BM/BMN.xml', 'B/BM/BMR.xml', 'B/BM/BMY.xml', 'B/BN/BN4.xml', 'B/BN/BN5.xml', 'B/BN/BNB.xml', 'B/BN/BNF.xml', 'B/BN/BNG.xml', 'B/BN/BNN.xml', 'B/BN/BNU.xml', 'B/BP/BP0.xml', 'B/BP/BP1.xml', 'B/BP/BP7.xml', 'B/BP/BP9.xml', 'B/BP/BPK.xml', 'C/C8/C8S.xml', 'C/C9/C98.xml', 'C/C9/C9U.xml', 'C/CA/CA0.xml', 'C/CA/CA5.xml', 'C/CA/CA7.xml', 'C/CA/CA8.xml', 'C/CA/CAC.xml', 'C/CA/CAM.xml', 'C/CB/CB5.xml', 'C/CB/CB6.xml', 'C/CB/CBJ.xml', 'C/CB/CBN.xml', 'C/CC/CCB.xml', 'C/CC/CCD.xml', 'C/CC/CCK.xml', 'C/CC/CCW.xml', 'C/CD/CD8.xml', 'C/CD/CDB.xml', 'C/CD/CDD.xml', 'C/CD/CDE.xml','C/CD/CDG.xml', 'C/CD/CDS.xml', 'C/CD/CDX.xml', 'C/CE/CE7.xml', 'C/CE/CE9.xml', 'C/CE/CEB.xml', 'C/CE/CEC.xml', 'C/CE/CEE.xml', 'C/CE/CEF.xml', 'C/CE/CEG.xml', 'C/CF/CFY.xml', 'C/CJ/CJX.xml', 'C/CK/CKE.xml', 'C/CM/CML.xml', 'E/E9/E9V.xml', 'E/EA/EA3.xml', 'E/EA/EA6.xml', 'E/EA/EA7.xml', 'E/EA/EAW.xml', 'E/EC/EC3.xml', 'E/ED/EDK.xml', 'E/EE/EEE.xml', 'E/EE/EEF.xml', 'E/EE/EEM.xml', 'E/EE/EEY.xml', 'E/EF/EF4.xml', 'E/EF/EFA.xml', 'E/EF/EFP.xml', 'E/EF/EFS.xml', 'E/EF/EFU.xml', 'E/EF/EFV.xml', 'E/EU/EUX.xml', 'E/EW/EW1.xml', 'E/EW/EW7.xml', 'E/EW/EWM.xml', 'E/EW/EWR.xml','E/EX/EX7.xml', 'F/FA/FAV.xml', 'F/FB/FB2.xml', 'F/FN/FNX.xml', 'F/FP/FP0.xml', 'F/FP/FPJ.xml', 'F/FP/FPN.xml', 'F/FP/FPR.xml', 'F/FR/FR3.xml', 'F/FR/FRB.xml', 'F/FR/FRL.xml', 'F/FS/FS0.xml', 'F/FS/FS6.xml', 'F/FS/FSE.xml', 'F/FS/FSF.xml', 'F/FS/FST.xml', 'G/G0/G0D.xml', 'G/G0/G0L.xml', 'G/G1/G10.xml', 'G/G1/G19.xml', 'G/G1/G1C.xml', 'G/G1/G1G.xml', 'G/G1/G1H.xml', 'G/G2/G20.xml', 'G/G3/G3F.xml', 'G/GU/GU8.xml', 'G/GU/GUC.xml', 'G/GU/GUG.xml', 'G/GU/GUY.xml', 'G/GV/GV5.xml', 'G/GV/GV6.xml', 'G/GV/GVJ.xml', 'G/GV/GVR.xml', 'G/GV/GVS.xml', 'G/GV/GVU.xml', 'G/GV/GVW.xml', 'G/GW/GW3.xml', 'G/GW/GW6.xml', 'G/GW/GW9.xml', 'G/GW/GWK.xml', 'G/GW/GWL.xml', 'H/H1/H10.xml', 'H/H7/H78.xml', 'H/H7/H7T.xml', 'H/H7/H7U.xml', 'H/H7/H7X.xml', 'H/H8/H82.xml', 'H/H8/H84.xml', 'H/H8/H85.xml', 'H/H8/H86.xml', 'H/H8/H88.xml', 'H/H8/H89.xml', 'H/H8/H8A.xml', 'H/H8/H8D.xml', 'H/H8/H8M.xml', 'H/H8/H8T.xml', 'H/H8/H8U.xml', 'H/H8/H8X.xml', 'H/H9/H90.xml', 'H/H9/H92.xml', 'H/H9/H98.xml', 'H/H9/H99.xml', 'H/H9/H9C.xml', 'H/H9/H9G.xml', 'H/H9/H9J.xml', 'H/H9/H9M.xml', 'H/H9/H9Y.xml', 'H/HA/HA0.xml', 'H/HG/HGG.xml', 'H/HG/HGJ.xml', 'H/HH/HH0.xml', 'H/HH/HH2.xml', 'H/HJ/HJH.xml', 'H/HN/HNJ.xml', 'H/HN/HNW.xml', 'H/HP/HP0.xml', 'H/HR/HR4.xml', 'H/HR/HR7.xml', 'H/HR/HR8.xml', 'H/HR/HRA.xml', 'H/HR/HRB.xml', 'H/HR/HRC.xml', 'H/HR/HRM.xml', 'H/HT/HTK.xml', 'H/HT/HTR.xml', 'H/HT/HTT.xml', 'H/HT/HTU.xml', 'H/HT/HTX.xml', 'H/HU/HU0.xml', 'H/HW/HW8.xml', 'H/HW/HWA.xml', 'H/HW/HWD.xml', 'H/HW/HWE.xml', 'H/HW/HWL.xml', 'H/HW/HWM.xml', 'H/HW/HWN.xml', 'J/J0/J0P.xml', 'J/J0/J0W.xml', 'J/J1/J13.xml', 'J/J1/J16.xml', 'J/J5/J56.xml', 'K/K8/K8R.xml', 'K/K8/K8S.xml', 'K/K9/K95.xml']
#define reader using general sample files 
general_reader=BNCCorpusReader(root='2554.zip/download/Texts/', fileids=general_sample)

In [12]:
#BNC sampler file for small but representative sample of corpus 
#~ 2 million written words of both spoken and written (2551.zip) 
#direct download link for 2551.zip: https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/2551/2551.zip?sequence=3&isAllowed=y
small_sample=BNCCorpusReader(root='2551.zip/download/XML/', fileids=r'[A-K]\w*\w*\.xml')

In [53]:
#baby bnc sample -  4 samples of 1 million words in different domains - for small sample of bnc will use the fiction section
#file names in the baby bnc 'fiction' sample of ~1million written words
#direct download link for 2553.zip: https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/2553/2553.zip?sequence=3&isAllowed=y
baby_ids=['AB9.xml', 'AC2.xml', 'BMW.xml', 'BPA.xml', 'C8T.xml', 'CB5.xml', 'CCW.xml', 'CDB.xml', 'CFY.xml', 'FAJ.xml', 'FET.xml', 'FPB.xml', 'G0L.xml', 'G0S.xml', 'G0Y.xml','G01.xml', 'GUU.xml', 'GVL.xml', 'H9C.xml', 'H9D.xml', 'H85.xml', 'HR9.xml', 'J10.xml', 'J54.xml', 'K8V.xml']
#BNC baby sampler 
baby_reader=BNCCorpusReader(root='2553.zip/download/Texts/fic/', fileids=baby_ids)

In [3]:
#extract all words from files in each reader (subcorpus) - returns list of words
#bnc full text may take some time as over 100 million words 
all_words=bnc_reader.words()

In [4]:
#count number words in the sample - may also take some time to run 
len(all_words) #bnc full text total word count 

111978070

In [5]:
#count the number of times each word appears in each sample - returns dict of unique word counts
counts=nltk.FreqDist(all_words) #762481 unique words 
#convert dict of unique word counts to dataframe and export to csv
df=pd.DataFrame.from_dict(counts, orient='index').reset_index()
df=df.rename(columns={'index':'Term', 0:'All_BNC_Counts'})
df.to_csv('allBNC_counts.csv')

In [7]:
df

Unnamed: 0,index,0
0,FACTSHEET,17
1,WHAT,1001
2,IS,1767
3,AIDS,1836
4,?,387952
...,...,...
762476,sleeptalk,1
762477,Danii,2
762478,poshy,1
762479,Fridged,1


In [8]:
#get all the words from kids books in the corpus
kids_words=kids_reader.words()

In [9]:
#count total number of words in all the texts 
len(kids_words) #1095907 words in total of kids subcorpus 

1095907

In [11]:
#unique word count - returns dict of frequency of each words appearance
kids_counts=nltk.FreqDist(kids_words)
#convert dict to dataframe and export to csv
df1=pd.DataFrame.from_dict(kids_counts, orient='index').reset_index()
df1 #31450 unique words 
df1=df1.rename(columns={'index':'Term', 0:'KBNC_Counts'})
df1.to_csv('KBNC_wordCounts.csv')

In [14]:
#BNC sampler - returns the 1 million spoken and 1 million written words in files
ss_words=small_sample.words()

In [16]:
#counts the number of words ~50/50 split spoken/written
len(ss_words) #2279356 in total

2279356

In [37]:
#unique word count of each words occurence 
ss_counts=nltk.FreqDist(ss_words)
df2=pd.DataFrame.from_dict(ss_counts, orient='index').reset_index()
df2=df2.rename(columns={'index':'Term', 0:'BNC_Sampler_Counts'})
df2 #64526 unique words 
df2.to_csv('BNC_Sampler_Counts.csv')

Unnamed: 0,Term,BNC_Sampler_Counts
0,Lebanon,50
1,leader,193
2,builds,15
3,cabinet,66
4,.,107567
...,...,...
64521,Cambri,1
64522,humping,1
64523,Wrens,2
64524,policewoman,1


In [55]:
#Baby BNC counts - 1 million word sample of fiction texts 
baby_words=baby_reader.words()

In [61]:
len(baby_words)

1211132

In [56]:
baby_counts=nltk.FreqDist(baby_words)

In [57]:
df3=pd.DataFrame.from_dict(baby_counts, orient='index').reset_index()
df3=df3.rename(columns={'index':'Term', 0:'BNC_Baby_Counts'})
df3 #39753 unique words 
df3.to_csv('BNC_baby_counts.csv')

Unnamed: 0,Term,BNC_Baby_Counts
0,Detective,20
1,Chief,107
2,Inspector,71
3,John,207
4,McLeish,302
...,...,...
39748,well-typed,1
39749,missives,1
39750,Toothpaste,1
39751,Memories,1


In [7]:
general_words=general_reader.words()
len(general_words) #10 million total words ~10% of total corpus 

10022421

In [8]:
word_counts=nltk.FreqDist(general_words)

In [10]:
df1=pd.DataFrame.from_dict(word_counts, orient='index').reset_index()
.rename(columns={'index':'Term', 0:'BNC_Gen_Counts'})
df1.to_csv('BNC_large_sample_counts.csv')

In [11]:
df1

Unnamed: 0,index,0
0,SOCIAL,31
1,POLICIES,10
2,The,56112
3,business,2320
4,of,278401
...,...,...
147871,Burghgeshes,1
147872,ploughlands,1
147873,barbered,1
147874,bloody-handed,1
