# No More Silence -- Get Sentences

This file segments and preprocesses sentences for use by Word2Vec. This includes removing stop words, numbers and converting to lowercase.

By convention, we filter out all documents spanning more than 3 years.

Creates a file where each row contains a sentence and the year(s) in which the sentence occurred. Each row has the following format: the year(s) and the sentence are separated by a tab "\t", words in the sentence are separated by a space " ", and years are separated by a dash "-".

In [1]:
# Load modules
from time import time
import pandas as pd
import spacy
import re

In [2]:
# Load the spaCy pipeline once; the sentence-writing cells reuse `nlp`
spacy_model = "en_core_web_lg"
nlp = spacy.load(spacy_model)

In [3]:
# Read the tab-separated project data into `df`
data_path = "../NoMoreSilence_ProjectDataV2.tsv"
df = pd.read_csv(data_path, delimiter="\t")

In [4]:
# Write one row per sentence to sents1year.txt for every document whose
# "Date " field resolves to exactly one year.
# Row format: "<year>\t<word> <word> ...\n"
with open("sents1year.txt", "w+") as ofile:

    for i, (date, text) in enumerate(zip(df["Date "], df["Corrected Text"])):
        t0 = time()

        print("Processing document: " + str(i))
        print("\tCharacters: " + str(len(text)))

        # a one-character text is skipped as empty
        # (presumably a placeholder for a missing transcript -- TODO confirm)
        if len(text) == 1:
            print("\tWords: 0")
            print("\tProcess Time:" + str(time() - t0))
            continue

        # get date for document: any all-digit field greater than 1000 is a year
        years = [
            int(datum)
            for datum in re.split(r'\W+', date)
            if datum.isdigit() and int(datum) > 1000
        ]
        # if more than one year found, create range from min to max
        if len(years) > 1:
            years = list(range(min(years), max(years) + 1))

        # keep only documents dated to exactly one year
        if len(years) != 1:
            print("\tWords: 0")
            print("\tProcess Time:" + str(time() - t0))
            continue

        # the label is constant for the whole document, so build it once
        label = "-".join(map(str, years))

        # segment sentences (materialised once: used for the count and the loop)
        sentences = list(nlp(text).sents)

        print("\tSentences: " + str(len(sentences)))

        words = set()
        for sent in sentences:
            # tokenize sentence; make_doc runs only the tokenizer, which is all
            # that is needed because only token.text and token.is_stop are read
            tokens = nlp.make_doc(sent.text)

            # convert to lowercase and remove stop words
            lowered = [token.text.lower() for token in tokens if not token.is_stop]

            # remove numbers/punctuation and words shorter than three characters
            kept = [word for word in lowered if word.isalpha() and len(word) > 2]

            # write sentence to file (sentences with fewer than two words are dropped)
            if len(kept) > 1:
                ofile.write(label + "\t" + " ".join(kept) + "\n")
                words.update(kept)

        print("\tWords: " + str(len(words)))
        print("\tProcess Time:" + str(time() - t0))

Processing document: 0
	Characters: 110069
	Sentences: 996
	Words: 2341
	Process Time:10.715761661529541
Processing document: 1
	Characters: 9699
	Sentences: 89
	Words: 359
	Process Time:1.1236793994903564
Processing document: 2
	Characters: 364812
	Words: 0
	Process Time:0.006016731262207031
Processing document: 3
	Characters: 377092
	Words: 0
	Process Time:0.005010843276977539
Processing document: 4
	Characters: 10192
	Sentences: 111
	Words: 470
	Process Time:1.2812371253967285
Processing document: 5
	Characters: 481539
	Words: 0
	Process Time:0.007018566131591797
Processing document: 6
	Characters: 81024
	Words: 0
	Process Time:0.007018566131591797
Processing document: 7
	Characters: 42956
	Words: 0
	Process Time:0.005012989044189453
Processing document: 8
	Characters: 153037
	Sentences: 2240
	Words: 2819
	Process Time:20.494321584701538
Processing document: 9
	Characters: 73155
	Words: 0
	Process Time:0.0040094852447509766
Processing document: 10
	Characters: 102808
	Sentences: 151

Processing document: 174
	Characters: 6457
	Words: 0
	Process Time:0.01316976547241211
Processing document: 175
	Characters: 64440
	Words: 0
	Process Time:0.0030090808868408203
Processing document: 176
	Characters: 725057
	Sentences: 6580
	Words: 13699
	Process Time:69.50256037712097
Processing document: 177
	Characters: 17878
	Words: 0
	Process Time:0.01630854606628418
Processing document: 178
	Characters: 31333
	Words: 0
	Process Time:0.0030083656311035156
Processing document: 179
	Characters: 155453
	Sentences: 2076
	Words: 4222
	Process Time:19.237274646759033
Processing document: 180
	Characters: 26224
	Words: 0
	Process Time:0.011703968048095703
Processing document: 181
	Characters: 19734
	Sentences: 320
	Words: 985
	Process Time:2.6587705612182617
Processing document: 182
	Characters: 148114
	Words: 0
	Process Time:0.007122516632080078
Processing document: 183
	Characters: 8461
	Sentences: 104
	Words: 488
	Process Time:0.9291889667510986
Processing document: 184
	Characters: 316

	Process Time:0.004009246826171875
Processing document: 354
	Characters: 1
	Words: 0
	Process Time:0.00400233268737793
Processing document: 355
	Characters: 1
	Words: 0
	Process Time:0.004006147384643555
Processing document: 356
	Characters: 1
	Words: 0
	Process Time:0.0040056705474853516
Processing document: 357
	Characters: 1
	Words: 0
	Process Time:0.00400543212890625
Processing document: 358
	Characters: 1
	Words: 0
	Process Time:0.004007101058959961
Processing document: 359
	Characters: 1
	Words: 0
	Process Time:0.008008718490600586
Processing document: 360
	Characters: 1
	Words: 0
	Process Time:0.00400543212890625
Processing document: 361
	Characters: 1
	Words: 0
	Process Time:0.0040051937103271484
Processing document: 362
	Characters: 1
	Words: 0
	Process Time:0.004004716873168945
Processing document: 363
	Characters: 1
	Words: 0
	Process Time:0.004004001617431641
Processing document: 364
	Characters: 1
	Words: 0
	Process Time:0.00400543212890625
Processing document: 365
	Charac

	Process Time:8.51949429512024
Processing document: 523
	Characters: 143007
	Words: 0
	Process Time:0.003977060317993164
Processing document: 524
	Characters: 53521
	Sentences: 605
	Words: 1768
	Process Time:6.54082989692688
Processing document: 525
	Characters: 63880
	Sentences: 769
	Words: 2310
	Process Time:8.283151865005493
Processing document: 526
	Characters: 77559
	Words: 0
	Process Time:0.0039708614349365234
Processing document: 527
	Characters: 7240
	Words: 0
	Process Time:0.004011631011962891
Processing document: 528
	Characters: 67965
	Words: 0
	Process Time:0.008008480072021484
Processing document: 529
	Characters: 49824
	Sentences: 567
	Words: 1425
	Process Time:6.3124895095825195
Processing document: 530
	Characters: 9636
	Sentences: 167
	Words: 497
	Process Time:1.401914358139038
Processing document: 531
	Characters: 105048
	Words: 0
	Process Time:0.004006624221801758
Processing document: 532
	Characters: 45596
	Sentences: 979
	Words: 903
	Process Time:7.834543704986572


	Characters: 254783
	Sentences: 6301
	Words: 3853
	Process Time:51.22527241706848
Processing document: 686
	Characters: 129202
	Sentences: 1396
	Words: 2238
	Process Time:14.924180746078491
Processing document: 687
	Characters: 1
	Words: 0
	Process Time:0.0040051937103271484
Processing document: 688
	Characters: 7682
	Sentences: 57
	Words: 423
	Process Time:0.6648979187011719
Processing document: 689
	Characters: 11607
	Sentences: 237
	Words: 462
	Process Time:1.8665523529052734
Processing document: 690
	Characters: 6378
	Sentences: 79
	Words: 307
	Process Time:0.8250882625579834
Processing document: 691
	Characters: 26905
	Words: 0
	Process Time:0.0040056705474853516
Processing document: 692
	Characters: 5077
	Sentences: 60
	Words: 302
	Process Time:0.6248712539672852
Processing document: 693
	Characters: 25248
	Words: 0
	Process Time:0.0040056705474853516
Processing document: 694
	Characters: 133695
	Sentences: 1802
	Words: 1927
	Process Time:18.537068605422974
Processing document: 6

In [5]:
# Consecutive 5-year windows starting at 1968: 1968-1972, 1973-1977, ...
start = 1968
buckets = [list(range(start + 5*i, start + 5*(i+1))) for i in range(10)]

# Write one row per sentence to sents5year.txt for every document whose
# years all fall inside a single bucket.
# Row format: "<first>-<last>\t<word> <word> ...\n"
with open("sents5year.txt", "w+") as ofile:

    for i, (date, text) in enumerate(zip(df["Date "], df["Corrected Text"])):
        t0 = time()

        print("Processing document: " + str(i))
        print("\tCharacters: " + str(len(text)))

        # a one-character text is skipped as empty
        # (presumably a placeholder for a missing transcript -- TODO confirm)
        if len(text) == 1:
            print("\tWords: 0")
            print("\tProcess Time:" + str(time() - t0))
            continue

        # get date for document: any all-digit field greater than 1000 is a year
        years = [
            int(datum)
            for datum in re.split(r'\W+', date)
            if datum.isdigit() and int(datum) > 1000
        ]
        # if more than one year found, create range from min to max
        if len(years) > 1:
            years = list(range(min(years), max(years) + 1))

        # no year could be extracted from the date field
        if not years:
            print("\tWords: 0")
            print("\tProcess Time:" + str(time() - t0))
            continue

        print(years)
        # label the document with the bucket that contains all of its years;
        # buckets are disjoint, so the first match is the only match
        y = ''
        year_set = set(years)
        for bucket in buckets:
            if year_set.issubset(bucket):
                y = str(bucket[0]) + "-" + str(bucket[-1])
                break

        print(y)
        # the document straddles buckets or lies outside all of them
        if y == "":
            print("\tWords: 0")
            print("\tProcess Time:" + str(time() - t0))
            continue

        # segment sentences (materialised once: used for the count and the loop)
        sentences = list(nlp(text).sents)

        print("\tSentences: " + str(len(sentences)))

        words = set()
        for sent in sentences:
            # tokenize sentence; make_doc runs only the tokenizer, which is all
            # that is needed because only token.text and token.is_stop are read
            tokens = nlp.make_doc(sent.text)

            # convert to lowercase and remove stop words
            lowered = [token.text.lower() for token in tokens if not token.is_stop]

            # remove numbers/punctuation and words shorter than three characters
            kept = [word for word in lowered if word.isalpha() and len(word) > 2]

            # write sentence to file (sentences with fewer than two words are dropped)
            if len(kept) > 1:
                ofile.write(y + "\t" + " ".join(kept) + "\n")
                words.update(kept)

        print("\tWords: " + str(len(words)))
        print("\tProcess Time:" + str(time() - t0))

Processing document: 0
	Characters: 110069
[1986]
1983-1987
	Sentences: 996
	Words: 2341
	Process Time:10.099878787994385
Processing document: 1
	Characters: 9699
[1992]
1988-1992
	Sentences: 89
	Words: 359
	Process Time:0.9625601768493652
Processing document: 2
	Characters: 364812
[1995, 1996]
1993-1997
	Sentences: 4746
	Words: 4763
	Process Time:42.52310276031494
Processing document: 3
	Characters: 377092
[1985, 1986, 1987]
1983-1987
	Sentences: 3712
	Words: 5035
	Process Time:36.26141023635864
Processing document: 4
	Characters: 10192
[1993]
1993-1997
	Sentences: 111
	Words: 470
	Process Time:1.0207481384277344
Processing document: 5
	Characters: 481539
	Words: 0
	Process Time:0.003008127212524414
Processing document: 6
	Characters: 81024
[1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000]

	Words: 0
	Process Time:0.005013465881347656
Processing document: 7
	Characters: 42956
[1988, 1989, 1990, 1991, 1992]
1988-1992
	Sentences: 364
	Words: 609
	Process Time:3.5544486045837402
Processin

Processing document: 122
	Characters: 483
[1993]
1993-1997
	Sentences: 4
	Words: 60
	Process Time:0.11229777336120605
Processing document: 123
	Characters: 12225
[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994]

	Words: 0
	Process Time:0.005015850067138672
Processing document: 124
	Characters: 21517
[1975]
1973-1977
	Sentences: 429
	Words: 1242
	Process Time:3.1713461875915527
Processing document: 125
	Characters: 29641
[1990]
1988-1992
	Sentences: 547
	Words: 1211
	Process Time:4.626374006271362
Processing document: 126
	Characters: 2337
[1994]
1993-1997
	Sentences: 36
	Words: 119
	Process Time:0.32808828353881836
Processing document: 127
	Characters: 5718
[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986]

	Words: 0
	Process Time:0.004011869430541992
Processing document: 128
	Characters: 32422
[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 19

	Process Time:0.0049059391021728516
Processing document: 170
	Characters: 1413
[1971]
1968-1972
	Sentences: 30
	Words: 107
	Process Time:0.23987150192260742
Processing document: 171
	Characters: 29410
[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994]

	Words: 0
	Process Time:0.015565156936645508
Processing document: 172
	Characters: 40808
[1993, 1994]
1993-1997
	Sentences: 813
	Words: 1699
	Process Time:6.502331495285034
Processing document: 173
	Characters: 2957
[1994]
1993-1997
	Sentences: 40
	Words: 152
	Process Time:0.3919553756713867
Processing document: 174
	Characters: 6457
[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994]

	Words: 0
	Process Time:0.015761375427246094
Processing document: 175
	Characters: 64440
[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 199

[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994]

	Words: 0
	Process Time:0.00400996208190918
Processing document: 261
	Characters: 87560
[1989, 1990, 1991, 1992]
1988-1992
	Sentences: 1640
	Words: 2615
	Process Time:13.034037590026855
Processing document: 262
	Characters: 23789
[1989]
1988-1992
	Sentences: 263
	Words: 863
	Process Time:2.5557467937469482
Processing document: 263
	Characters: 166355
[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994]

	Words: 0
	Process Time:0.018984556198120117
Processing document: 264
	Characters: 58759
[1994]
1993-1997
	Sentences: 676
	Words: 2226
	Process Time:5.855358600616455
Processing document: 265
	Characters: 12901
[1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994]

	Words: 0
	Process Time:0.011548280715942383
Processing document: 266
	Characters:

Processing document: 407
	Characters: 77474
[1995]
1993-1997
	Sentences: 784
	Words: 2143
	Process Time:7.766165256500244
Processing document: 408
	Characters: 65673
[1994]
1993-1997
	Sentences: 1174
	Words: 2192
	Process Time:9.264350652694702
Processing document: 409
	Characters: 201052
[1993]
1993-1997
	Sentences: 3469
	Words: 3467
	Process Time:29.750077962875366
Processing document: 410
	Characters: 38994
[1985, 1986, 1987]
1983-1987
	Sentences: 1068
	Words: 1339
	Process Time:10.45792818069458
Processing document: 411
	Characters: 8592
[1992]
1988-1992
	Sentences: 137
	Words: 356
	Process Time:1.542222023010254
Processing document: 412
	Characters: 26190
[1988]
1988-1992
	Sentences: 394
	Words: 1198
	Process Time:4.625215291976929
Processing document: 413
	Characters: 20963
[1992]
1988-1992
	Sentences: 127
	Words: 505
	Process Time:2.5046589374542236
Processing document: 414
	Characters: 11250
[1986, 1987, 1988, 1989, 1990]

	Words: 0
	Process Time:0.01458120346069336
Processing 

	Characters: 83674
[1989]
1988-1992
	Sentences: 2793
	Words: 2226
	Process Time:20.84042739868164
Processing document: 540
	Characters: 54192
[1992]
1988-1992
	Sentences: 1272
	Words: 1790
	Process Time:9.765932559967041
Processing document: 541
	Characters: 19513
[1985]
1983-1987
	Sentences: 560
	Words: 583
	Process Time:4.01467490196228
Processing document: 542
	Characters: 54391
[1985]
1983-1987
	Sentences: 494
	Words: 1344
	Process Time:5.060454845428467
Processing document: 543
	Characters: 1
	Words: 0
	Process Time:0.0030105113983154297
Processing document: 544
	Characters: 9525
[1987]
1983-1987
	Sentences: 70
	Words: 438
	Process Time:0.8091506958007812
Processing document: 545
	Characters: 206019
[1990]
1988-1992
	Sentences: 2584
	Words: 3723
	Process Time:24.228419303894043
Processing document: 546
	Characters: 145260
[1987]
1983-1987
	Sentences: 2375
	Words: 2521
	Process Time:19.855782508850098
Processing document: 547
	Characters: 2383
[1988]
1988-1992
	Sentences: 184
	Word

	Process Time:9.479205846786499
Processing document: 674
	Characters: 82579
[1983, 1984]
1983-1987
	Sentences: 2895
	Words: 2524
	Process Time:20.631775617599487
Processing document: 675
	Characters: 97904
[1984, 1985, 1986, 1987, 1988, 1989]

	Words: 0
	Process Time:0.005002260208129883
Processing document: 676
	Characters: 86798
[1983, 1984, 1985, 1986, 1987]
1983-1987
	Sentences: 1294
	Words: 1766
	Process Time:10.613218307495117
Processing document: 677
	Characters: 289470
[1991]
1988-1992
	Sentences: 7626
	Words: 4051
	Process Time:57.35449290275574
Processing document: 678
	Characters: 272344
[1992]
1988-1992
	Sentences: 3409
	Words: 3512
	Process Time:31.247079849243164
Processing document: 679
	Characters: 166783
	Words: 0
	Process Time:0.0030107498168945312
Processing document: 680
	Characters: 146966
[1989]
1988-1992
	Sentences: 2303
	Words: 2987
	Process Time:20.216978073120117
Processing document: 681
	Characters: 21182
	Words: 0
	Process Time:0.0019812583923339844
Processi

In [29]:
# Consecutive decades starting at 1960: 1960-1969, 1970-1979, ..., 2010-2019
start = 1960
buckets = [list(range(start + 10*i, start + 10*(i+1))) for i in range(6)]

# Write one row per sentence to sents10year.txt for every document whose
# years all fall inside a single decade.
# Row format: "<first>-<last>\t<word> <word> ...\n"
with open("sents10year.txt", "w+") as ofile:

    for i, (date, text) in enumerate(zip(df["Date "], df["Corrected Text"])):
        t0 = time()

        print("Processing document: " + str(i))
        print("\tCharacters: " + str(len(text)))

        # a one-character text is skipped as empty
        # (presumably a placeholder for a missing transcript -- TODO confirm)
        if len(text) == 1:
            print("\tWords: 0")
            print("\tProcess Time:" + str(time() - t0))
            continue

        # get date for document: any all-digit field greater than 1000 is a year
        years = [
            int(datum)
            for datum in re.split(r'\W+', date)
            if datum.isdigit() and int(datum) > 1000
        ]
        # if more than one year found, create range from min to max
        if len(years) > 1:
            years = list(range(min(years), max(years) + 1))

        # no year could be extracted from the date field
        if not years:
            print("\tWords: 0")
            print("\tProcess Time:" + str(time() - t0))
            continue

        print(years)
        # label the document with the decade that contains all of its years;
        # buckets are disjoint, so the first match is the only match
        y = ''
        year_set = set(years)
        for bucket in buckets:
            if year_set.issubset(bucket):
                y = str(bucket[0]) + "-" + str(bucket[-1])
                break

        print(y)
        # the document straddles decades or lies outside all of them
        if y == "":
            print("\tWords: 0")
            print("\tProcess Time:" + str(time() - t0))
            continue

        # segment sentences (materialised once: used for the count and the loop)
        sentences = list(nlp(text).sents)

        print("\tSentences: " + str(len(sentences)))

        words = set()
        for sent in sentences:
            # tokenize sentence; make_doc runs only the tokenizer, which is all
            # that is needed because only token.text and token.is_stop are read
            tokens = nlp.make_doc(sent.text)

            # convert to lowercase and remove stop words
            lowered = [token.text.lower() for token in tokens if not token.is_stop]

            # remove numbers/punctuation and words shorter than three characters
            kept = [word for word in lowered if word.isalpha() and len(word) > 2]

            # write sentence to file (sentences with fewer than two words are dropped)
            if len(kept) > 1:
                ofile.write(y + "\t" + " ".join(kept) + "\n")
                words.update(kept)

        print("\tWords: " + str(len(words)))
        print("\tProcess Time:" + str(time() - t0))

Processing document: 0
	Characters: 110069
[1986]
1980-1989
	Sentences: 996
	Words: 2341
	Process Time:13.416133642196655
Processing document: 1
	Characters: 9699
[1992]
1990-1999
	Sentences: 89
	Words: 359
	Process Time:1.261507511138916
Processing document: 2
	Characters: 364812
[1995, 1996]
1990-1999
	Sentences: 4746
	Words: 4763
	Process Time:51.64207053184509
Processing document: 3
	Characters: 377092
[1985, 1986, 1987]
1980-1989
	Sentences: 3712
	Words: 5035
	Process Time:42.007826805114746
Processing document: 4
	Characters: 10192
[1993]
1990-1999
	Sentences: 111
	Words: 470
	Process Time:1.423121452331543
Processing document: 5
	Characters: 481539
	Words: 0
	Process Time:0.011029243469238281
Processing document: 6
	Characters: 81024
[1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000]

	Words: 0
	Process Time:0.011027812957763672
Processing document: 7
	Characters: 42956
[1988, 1989, 1990, 1991, 1992]

	Words: 0
	Process Time:0.008020401000976562
Processing document: 8
	Characters:

	Process Time:0.006017446517944336
Processing document: 121
	Characters: 65312
[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994]

	Words: 0
	Process Time:0.006014585494995117
Processing document: 122
	Characters: 483
[1993]
1990-1999
	Sentences: 4
	Words: 60
	Process Time:0.13335561752319336
Processing document: 123
	Characters: 12225
[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994]

	Words: 0
	Process Time:0.006013393402099609
Processing document: 124
	Characters: 21517
[1975]
1970-1979
	Sentences: 429
	Words: 1242
	Process Time:3.960645914077759
Processing document: 125
	Characters: 29641
[1990]
1990-1999
	Sentences: 547
	Words: 1211
	Process Time:4.627392292022705
Processing document: 126
	Characters: 2337
[1994]
1990-1999
	Sentences: 36
	Words: 119
	Process Time:0.33188366889953613
Processing document: 127
	Characters: 5718
[1974

[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994]

	Words: 0
	Process Time:0.0040111541748046875
Processing document: 215
	Characters: 5461
[1983]
1980-1989
	Sentences: 75
	Words: 314
	Process Time:0.7860944271087646
Processing document: 216
	Characters: 189750
[1990]
1990-1999
	Sentences: 2582
	Words: 3169
	Process Time:25.887146949768066
Processing document: 217
	Characters: 9055
[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994]

	Words: 0
	Process Time:0.00500941276550293
Processing document: 218
	Characters: 59276
[1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994]

	Words: 0
	Process Time:0.004010200500488281
Processing document: 219
	Characters: 10508
[1971]
1970-1979
	Sentences: 343
	Words: 589
	Process Time:2.5919313430786133
Processing document: 220
	C

	Characters: 1
	Words: 0
	Process Time:0.003005504608154297
Processing document: 320
	Characters: 1
	Words: 0
	Process Time:0.0020051002502441406
Processing document: 321
	Characters: 8973
[1983]
1980-1989
	Sentences: 94
	Words: 483
	Process Time:1.019705057144165
Processing document: 322
	Characters: 1
	Words: 0
	Process Time:0.0030083656311035156
Processing document: 323
	Characters: 1
	Words: 0
	Process Time:0.0030078887939453125
Processing document: 324
	Characters: 1
	Words: 0
	Process Time:0.003007650375366211
Processing document: 325
	Characters: 1
	Words: 0
	Process Time:0.003007650375366211
Processing document: 326
	Characters: 1
	Words: 0
	Process Time:0.0030083656311035156
Processing document: 327
	Characters: 1
	Words: 0
	Process Time:0.003008127212524414
Processing document: 328
	Characters: 1
	Words: 0
	Process Time:0.002005338668823242
Processing document: 329
	Characters: 1
	Words: 0
	Process Time:0.0030078887939453125
Processing document: 330
	Characters: 1
	Words: 0
	

	Characters: 23771
[1994]
1990-1999
	Sentences: 293
	Words: 1156
	Process Time:3.043090581893921
Processing document: 473
	Characters: 16294
[1992, 1993]
1990-1999
	Sentences: 183
	Words: 733
	Process Time:1.8609330654144287
Processing document: 474
	Characters: 47705
[1993, 1994]
1990-1999
	Sentences: 601
	Words: 1223
	Process Time:5.483960866928101
Processing document: 475
	Characters: 131663
[1992, 1993]
1990-1999
	Sentences: 2343
	Words: 2210
	Process Time:21.223427534103394
Processing document: 476
	Characters: 34369
[1985]
1980-1989
	Sentences: 264
	Words: 1060
	Process Time:3.978470802307129
Processing document: 477
	Characters: 52452
[1987]
1980-1989
	Sentences: 634
	Words: 748
	Process Time:6.556391477584839
Processing document: 478
	Characters: 6145
[1994]
1990-1999
	Sentences: 107
	Words: 611
	Process Time:1.1691513061523438
Processing document: 479
	Characters: 34220
[1985, 1986, 1987, 1988, 1989]
1980-1989
	Sentences: 477
	Words: 963
	Process Time:5.446172475814819
Process

	Process Time:53.95339775085449
Processing document: 605
	Characters: 32638
[1986]
1980-1989
	Sentences: 508
	Words: 1262
	Process Time:4.484005451202393
Processing document: 606
	Characters: 4950
[1987]
1980-1989
	Sentences: 45
	Words: 228
	Process Time:0.5939676761627197
Processing document: 607
	Characters: 7864
[1989]
1980-1989
	Sentences: 82
	Words: 358
	Process Time:0.9533848762512207
Processing document: 608
	Characters: 66677
[1986, 1987]
1980-1989
	Sentences: 868
	Words: 2292
	Process Time:7.922583818435669
Processing document: 609
	Characters: 32108
[1987]
1980-1989
	Sentences: 360
	Words: 1197
	Process Time:3.405209541320801
Processing document: 610
	Characters: 59149
[1989]
1980-1989
	Sentences: 576
	Words: 2038
	Process Time:6.30065131187439
Processing document: 611
	Characters: 19563
[1988, 1989, 1990]

	Words: 0
	Process Time:0.011271476745605469
Processing document: 612
	Characters: 27182
[1989]
1980-1989
	Sentences: 376
	Words: 1064
	Process Time:3.344592571258545
Proc

In [6]:
# Display the current value of `buckets`
buckets

[[1968, 1969, 1970, 1971, 1972],
 [1973, 1974, 1975, 1976, 1977],
 [1978, 1979, 1980, 1981, 1982],
 [1983, 1984, 1985, 1986, 1987],
 [1988, 1989, 1990, 1991, 1992],
 [1993, 1994, 1995, 1996, 1997],
 [1998, 1999, 2000, 2001, 2002],
 [2003, 2004, 2005, 2006, 2007],
 [2008, 2009, 2010, 2011, 2012],
 [2013, 2014, 2015, 2016, 2017]]

In [18]:
from collections import Counter

# Count the rows of sents5year.txt per label (the text before the first tab)
with open("sents5year.txt") as fp:
    rows = fp.read().strip("\n").split("\n")

c = Counter()
for row in rows:
    label = row.partition("\t")[0]
    c[label] += 1

# Print the labels in sorted order with their row counts
for label in sorted(c):
    print(label, c[label])

1968-1972 809
1973-1977 597
1978-1982 6461
1983-1987 85945
1988-1992 178854
1993-1997 72736
1998-2002 14
2003-2007 74


In [17]:
# Count the rows of sents1year.txt per label (the text before the first tab)
with open("sents1year.txt") as fp:
    rows = fp.read().strip("\n").split("\n")

c = Counter()
for row in rows:
    label = row.partition("\t")[0]
    c[label] += 1

# Print the labels in sorted order with their row counts
for label in sorted(c):
    print(label, c[label])

1968 158
1971 651
1974 236
1975 361
1979 421
1980 17
1981 92
1982 5931
1983 664
1984 13813
1985 21638
1986 15317
1987 19948
1988 28163
1989 18129
1990 43465
1991 27265
1992 27283
1993 25265
1994 17480
1995 1934
1996 961
1998 14
2005 74


In [28]:
def flatten(l):
    """Return one list holding the items of every sub-sequence of ``l``, in order."""
    merged = []
    for sublist in l:
        merged.extend(sublist)
    return merged


# Read sents.txt; the first tab-separated field of each row is "year" or "first-...-last"
with open("sents.txt") as fp:
    rows = fp.read().strip("\n").split("\n")

# Expand every row's label into the full list of years it spans
l = []
for row in rows:
    parts = row.split("\t")[0].split("-")
    l.append(list(range(int(parts[0]), int(parts[-1]) + 1)))

# A row counts once towards each year in its span
c = Counter(flatten(l))
for year in sorted(c):
    print(year, c[year])

1968 134
1971 558
1974 206
1975 325
1979 349
1980 17
1981 84
1982 5572
1983 1992
1984 15372
1985 22666
1986 19651
1987 22512
1988 27180
1989 19704
1990 46027
1991 31355
1992 33510
1993 36940
1994 26693
1995 9021
1996 3963
1998 11
2005 75
