In [69]:
import os, sys, time, resource, re, gc, shutil
from multiprocess import Pool
from functools import partial
from urllib.parse import urlparse, parse_qsl
import matplotlib
matplotlib.use('pgf')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import django
from django.db.models import Count, Sum
sys.path.append('/home/galm/software/django/tmv/BasicBrowser/')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()

from scoping.models import *

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize

pgf_with_latex = {
    "text.usetex": True,            # use LaTeX to write all text
    "pgf.rcfonts": False,           # Ignore Matplotlibrc
    "text.latex.unicode": True,
    "pgf.preamble": [
        #r"\usepackage[utf8x]{inputenc}",
        r"\usepackage{xcolor}"
    ],
    "pgf.texsystem" : "xelatex",
    "figure.figsize": [3.5,5]
}
matplotlib.rcParams.update(pgf_with_latex)

In [81]:
import nltk
from nltk.stem import SnowballStemmer
import string

stoplist = set(nltk.corpus.stopwords.words("english"))
stoplist.add('elsevier')
stoplist.add('rights')
stoplist.add('reserved')
stoplist.add('john')
stoplist.add('wiley')
stoplist.add('sons')
stoplist.add('copyright')

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    return tokens

class snowball_stemmer(object):
    def __init__(self):
        self.stemmer = SnowballStemmer("english")
    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in tokenize(doc)]

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   tokenizer=snowball_stemmer(),
                                   stop_words=stoplist)

c_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stoplist, tokenizer=snowball_stemmer())



In [82]:
kws = Doc.objects.filter(
    query=q,
    kw__text__iregex='\W'
).values('kw__text').annotate(
    n = Count('pk')
).filter(n__gt=100).order_by('-n')
print(kws.count())
print(kws)

2861
<QuerySet [{'n': 62921, 'kw__text': 'climate change'}, {'n': 56689, 'kw__text': 'climate-change'}, {'n': 11118, 'kw__text': 'carbon-dioxide'}, {'n': 9116, 'kw__text': 'united-states'}, {'n': 8002, 'kw__text': 'global warming'}, {'n': 7579, 'kw__text': 'carbon dioxide'}, {'n': 7578, 'kw__text': 'atmospheric co2'}, {'n': 7128, 'kw__text': 'carbon sequestration'}, {'n': 7117, 'kw__text': 'co2 capture'}, {'n': 5993, 'kw__text': 'climate variability'}, {'n': 5360, 'kw__text': 'co2 emissions'}, {'n': 5146, 'kw__text': 'elevated co2'}, {'n': 4640, 'kw__text': 'global change'}, {'n': 4451, 'kw__text': 'land-use'}, {'n': 4188, 'kw__text': 'greenhouse gases'}, {'n': 4006, 'kw__text': 'organic-matter'}, {'n': 3884, 'kw__text': 'nitrous oxide'}, {'n': 3802, 'kw__text': 'greenhouse-gas emissions'}, {'n': 3615, 'kw__text': 'el-nino'}, {'n': 3513, 'kw__text': 'renewable energy'}, '...(remaining elements truncated)...']>


In [83]:
d = Doc.objects.filter(query=q).first()
print(d.content)

print('')

print(list(tokenize(d.content)))

For improved exploitation of the energy content present in the organic matter of raw sewage, an innovative concept for treatment of municipal wastewater is tested in pilot trials and assessed in energy balance and operational costs. The concept is based on a maximum extraction of organic matter into the sludge via coagulation, flocculation and microsieving (100 mu m mesh size) to increase the energy recovery in anaerobic sludge digestion and decrease aeration demand for carbon mineralisation. Pilot trials with real wastewater yield an extraction of 70-80% of total chemical oxygen demand into the sludge while dosing 15-20 mg/L Al and 5-7 mg/L polymer with stable operation of the microsieve and effluent limits below 2-3 mg/L total phosphorus. Anaerobic digestion of the microsieve sludge results in high biogas yields of 600 NL/kg organic dry matter input (oDM(in)) compared to 430 NL/kg oDM(in) for mixed sludge from a conventional activated sludge process. The overall energy balance for a 

In [112]:
kw_text = set([x['kw__text'] for x in kws])
kw_ws = set([x['kw__text'].split()[0] for x in kws]) - stopwords
kw_ws 

{'hydrogen-sulfide',
 'o-2',
 'asian',
 'zeolite',
 'net',
 'risk-management',
 'big',
 'puerto-rico',
 'zeolitic',
 'alternative',
 'wild',
 'acid-base-balance',
 'la',
 'treatment',
 'age',
 'c-3',
 'geological',
 'use',
 'geothermal',
 'mars,',
 'yr',
 'enrichment',
 'coralline',
 'fine',
 'mitochondrial-dna',
 'finnish',
 'fram',
 'seed-germination',
 'storage',
 'south',
 'embryonic-development',
 'costa',
 'drinking',
 'southern',
 'remotely-sensed',
 'ice-age',
 'environmental-stress',
 'cement',
 'condensed',
 'hydraulic',
 'transaction',
 'coral-reef',
 'intercomparison',
 'dinoflagellate',
 'erosion',
 'exergy',
 'deep',
 'future',
 'elevated-temperature',
 'geopotential',
 'extratropical',
 'land-surface',
 'rare-earth-elements',
 'evolutionary',
 'uv-radiation',
 'ribosomal-rna',
 'phylogenetic',
 'emissions',
 'environmental',
 'membrane',
 'snowball',
 'younger',
 'techno-economic',
 'error-correction',
 'decadal',
 'h-2',
 'lake-sediments',
 'sitka',
 'marked',
 'remote-

In [110]:
len(kw_ws)

1486

In [137]:
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords as sw
punct = set(string.punctuation)
from nltk.corpus import wordnet as wn
stopwords = set(sw.words('english'))

def lemmatize(token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        return WordNetLemmatizer().lemmatize(token, tag)
    
kws = Doc.objects.filter(
    query=q,
    kw__text__iregex='\W'
).values('kw__text').annotate(
    n = Count('pk')
).filter(n__gt=100).order_by('-n')

kw_text = set([x['kw__text'] for x in kws])
kw_ws = set([x['kw__text'].split()[0] for x in kws]) - stopwords
    
def tokenize(X):
       
    for sent in sent_tokenize(X):
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            token = token.lower().strip()
            if token in stopwords:
                continue
            if all(char in punct for char in token):
                continue
            if len(token) < 3:
                continue
            if all(char in string.digits for char in token):
                continue
            lemma = lemmatize(token,tag)
            yield lemma

def fancy_tokenize(X):
    
    X = d.content
    common_words = set(X.split()) & kw_ws
    for w in list(common_words):
        wpat = "({}\W*\w*)".format(w)
        wn = re.findall(wpat, X)
        kw_matches = set(wn) & kw_text
        if len(kw_matches) > 0:
            for m in kw_matches:
                yield m
                X = X.replace(m," ")
    
    for sent in sent_tokenize(X):
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            token = token.lower().strip()
            if token in stopwords:
                continue
            if all(char in punct for char in token):
                continue
            if len(token) < 3:
                continue
            if all(char in string.digits for char in token):
                continue
            lemma = lemmatize(token,tag)
            yield lemma
            
d = Doc.objects.filter(query=q).first()
print(d.content)

print('')

print(list(tokenize(d.content)))

For improved exploitation of the energy content present in the organic matter of raw sewage, an innovative concept for treatment of municipal wastewater is tested in pilot trials and assessed in energy balance and operational costs. The concept is based on a maximum extraction of organic matter into the sludge via coagulation, flocculation and microsieving (100 mu m mesh size) to increase the energy recovery in anaerobic sludge digestion and decrease aeration demand for carbon mineralisation. Pilot trials with real wastewater yield an extraction of 70-80% of total chemical oxygen demand into the sludge while dosing 15-20 mg/L Al and 5-7 mg/L polymer with stable operation of the microsieve and effluent limits below 2-3 mg/L total phosphorus. Anaerobic digestion of the microsieve sludge results in high biogas yields of 600 NL/kg organic dry matter input (oDM(in)) compared to 430 NL/kg oDM(in) for mixed sludge from a conventional activated sludge process. The overall energy balance for a 

In [60]:
all_ys = range(0,100000)
q = Query.objects.get(pk=2355)
X = []
vecs = []
ars = AR.objects.filter(ar__gt=0).order_by('ar')
for ar in ars:
    c_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stoplist, tokenizer=tokenize)
    abstracts = []
    ys = range(ar.start,ar.end+1)
    ydocs = Doc.objects.filter(query=q,PY__in=ys)
    abdocs = ydocs.filter(content__iregex='\w')
    tidocs = ydocs.exclude(content__iregex='\w')
    abstracts = list(abdocs.values_list('content',flat=True)) + list(tidocs.values_list('title',flat=True))
    X_y = c_vectorizer.fit_transform(abstracts)
    vecs.append(c_vectorizer)
    X.append(X_y)
    print(X_y.shape)

(625, 1380)
(7623, 12411)
(16395, 20482)
(34511, 32665)
(117766, 67100)
(128268, 74252)


In [65]:
all_vocab = set()
for i, vec in enumerate(vecs):
    x = X[i]
    vocab = vec.vocabulary_
    sum_words = x.sum(axis=0).tolist()[0]
    new_vocab = [(x[0],sum_words[x[1]]) for x in vocab.items() if x[0] not in all_vocab]
    new_vocab =sorted(new_vocab, key = lambda x: x[1], reverse=True)
    all_vocab = all_vocab | set(vec.get_feature_names())
    print(new_vocab[:20])
    print('\n')


[('change', 296), ('climate', 262), ('model', 168), ('effect', 160), ('co2', 156), ('atmospheric', 152), ('climatic', 133), ('global', 131), ('greenhouse', 109), ('ocean', 106), ('temperature', 104), ('carbon', 95), ('ice', 84), ('year', 83), ('surface', 82), ('use', 81), ('sea', 80), ('water', 77), ('increase', 74), ('time', 71)]


[('loss', 552), ('efficiency', 515), ('mol', 439), ('ambient', 417), ('coal', 404), ('photosynthetic', 393), ('concern', 381), ('chamber', 353), ('demonstrate', 351), ('consumption', 305), ('product', 299), ('way', 297), ('throughout', 297), ('combustion', 293), ('unit', 287), ('oil', 285), ('positive', 285), ('deltac', 283), ('east', 265), ('demand', 258)]


[('downscaling', 197), ('degreesc', 145), ('ncep', 130), ('otcs', 87), ('inqua', 87), ('nee', 87), ('fco', 80), ('hadcm2', 78), ('dtr', 75), ('annex', 71), ('earthworm', 60), ('boreas', 56), ('oeschger', 54), ('cdm', 51), ('opt', 50), ('reanalyses', 50), ('amip', 49), ('bracken', 49), ('msw', 48), ('ba

<QuerySet [{'n': 62921, 'kw__text': 'climate change'}, {'n': 56689, 'kw__text': 'climate-change'}, {'n': 36222, 'kw__text': ''}, {'n': 21888, 'kw__text': 'temperature'}, {'n': 17378, 'kw__text': 'climate'}, {'n': 16516, 'kw__text': 'model'}, {'n': 14299, 'kw__text': 'variability'}, {'n': 11118, 'kw__text': 'carbon-dioxide'}, {'n': 10961, 'kw__text': 'precipitation'}, {'n': 10240, 'kw__text': 'management'}, {'n': 10151, 'kw__text': 'co2'}, {'n': 10103, 'kw__text': 'growth'}, {'n': 9343, 'kw__text': 'adaptation'}, {'n': 9223, 'kw__text': 'china'}, {'n': 9116, 'kw__text': 'united-states'}, {'n': 8862, 'kw__text': 'carbon'}, {'n': 8591, 'kw__text': 'dynamics'}, {'n': 8422, 'kw__text': 'impacts'}, {'n': 8264, 'kw__text': 'emissions'}, {'n': 8172, 'kw__text': 'biodiversity'}, '...(remaining elements truncated)...']>

In [5]:
## parse the bible, turn it into same format
import codecs
import re
bible = []
i=0
clines=""
corpus = []
lchapter="none"
with codecs.open('/home/galm/projects/big_literature/py/Martin_Luther_Uebersetzung_1912.txt',encoding = "ISO-8859-1") as ml:
    for line in ml:
        chapter = line.split(':')[0]
        try:
            verse = re.search(".*([0-9]+\:[0-9]*)(.*)",line).group(2).strip()
        except:
            corpus.append(str(clines))
            break
        if lchapter!=chapter and i > 0:
            corpus.append(str(clines))
            clines=verse            
        else:
            clines+=" "+verse
            #print('\n')
        lchapter = chapter
        i+=1
        
X_y = c_vectorizer.fit_transform(corpus)
X.append(X_y)

In [20]:
ind = np.arange(1)

def vvrect(X,col_x,m):
    return plt.bar(ind+X.shape[1]/2,X.shape[0],width=X.shape[1],facecolor=col_x,edgecolor='black',alpha=m)#,alpha=m)

means = [x.getnnz(0).mean()/x.shape[0]*100 for x in X]
nmeans = normalize([means])[0]

crange = ['#fbb4ae','#b3cde3','#ccebc5','#decbe4','#fed9a6','#ffffcc']

crange = ["#D53E4F", "#FC8D59", "#FEE08B", "#E6F598", "#99D594", "#3288BD"]

crange = ["#8dd3c7","#ffffb3","#bebada","#fb8072","#80b1d3","#fdb462","grey"]

years = [x.name for x in ars] + ["Bible"]

fig = plt.figure()

plots = []
lvalues = []
for x in [5,4,3,2,1,0,6]:
    a = (nmeans[x]+0.3)/1.3
    #a = nmeans[x]
    vvrect(X[x], "None",1)
    vvrect(X[x], "white",1)
    p = vvrect(X[x], crange[x] ,a)
    plots.append(p[0])
    lvalues.append(years[x] + ": " + str(round(means[x],2)))
    print(crange[5-x])
    print(a)
    
plt.legend(plots,lvalues,title='% of documents mean\nterm appears in')
    
plt.axis('equal')
plt.ylabel('Documents')
plt.xlabel('Words')
fig.patch.set_facecolor('#f0f0f0')    
#plt.tight_layout()

plt.savefig(
    '../plots/literature_size/volume_variety.pdf',
    bbox_inches='tight',
    facecolor=fig.get_facecolor(),
)

plt.show()


#8dd3c7
0.26106256071014744
#ffffb3
0.26469409500636126
#bebada
0.29105686163912314
#fb8072
0.33736081753188196
#80b1d3
0.41035679812605236
#fdb462
0.43180147447054457
grey
0.9392784910599727


In [7]:
tab = pd.DataFrame([(x.shape[0],x.shape[1]) for x in X]).rename({0:'Documents',1:'Terms'},axis="columns")
tab.index = years
tab['Average Document score per Term'] = means
tab

Unnamed: 0,Documents,Terms,Average Document score per Term
AR1,1848,2834,0.580116
AR2,6941,13279,0.518233
AR3,18728,26657,0.30759
AR4,44000,49955,0.173971
AR5,108277,92369,0.097896
AR6,128357,108102,0.087417
Bible,1189,10662,2.044534


In [8]:

shape = [(x+6000)/10000 for x in list(X[5].shape)[::1]]

#plt.rcParams["figure.figsize"] = tuple([round(x) for x in shape[::1]])

#plt.rcParams["figure.figsize"] = (9.5,13)

xs = [5,4,3,2,1,0,6]

for ar in AR.objects.all():
    
    iprs = IPCCRef.objects.filter(ar=ar).count()
    
    plots = []
    lvalues = []

    ar_xs = [x for x in xs if ar.ar > x ]
    print(ar_xs)
    #continue
    for x in ar_xs:
        a = (nmeans[x]+0.1)/1.1
        #a = nmeans[x]
        vvrect(X[x], "None",1)
        vvrect(X[x], "white",1)
        p = vvrect(X[x], crange[x] ,a)
        plots.append(p[0])
        lvalues.append(years[x] + ": " + str(round(means[x],2)))
        ar = AR.objects.get(ar=x+1)
        iprs = IPCCRef.objects.filter(ar=ar).count()
        xtent = shape[1]*10000+5000+1000
        #plt.axhline(iprs,xmin=1/xtent*1000,xmax=1/xtent*(X[x].shape[1]+1000))
    
    plt.legend(plots,lvalues,title='% of documents mean term appears in')
    
    #plt.axis('equal')

    #plt.ylim((-1000,130000))
    #plt.ylim(ymin=-1000)
    #plt.xlim((-1000,100000))
    plt.xlim((-1000,shape[1]*10000+5000))
    plt.ylim((-1000,shape[0]*10000+5000))
    #plt.ylim(ymax=shape[0]*10000)
    #plt.xlim(xmax=shape[1]*10000)
    plt.ylabel('Documents')
    plt.xlabel('Words')
    plt.savefig('../plots/literature_size/volume_variety_{}.pdf'.format(ar.name),bbox_inches='tight')
    
    print(plt.xlim())

    plt.show()


[]
(-1000.0, 119102.0)
[0]
(-1000.0, 119102.0)
[1, 0]
(-1000.0, 119102.0)
[2, 1, 0]
(-1000.0, 119102.0)
[3, 2, 1, 0]
(-1000.0, 119102.0)
[4, 3, 2, 1, 0]
(-1000.0, 119102.0)
[5, 4, 3, 2, 1, 0]
(-1000.0, 119102.0)


In [9]:
for ar in ars:
    plots = []
    lvalues = []

    ar_xs = [x for x in xs if ar.ar > x or x ==6]
    print(ar_xs)
    #continue
    for x in ar_xs:
        a = (nmeans[x]+0.1)/1.1
        #a = nmeans[x]
        vvrect(X[x], "None",1)
        vvrect(X[x], "white",1)
        p = vvrect(X[x], crange[x] ,a)
        plots.append(p[0])
        #lvalues.append(years[x] + ": " + str(round(X[x].mean()*1000000/X[x].sum(),5)))
        lvalues.append(years[x] + ": " + str(round(means[x],2)))
        print(crange[5-x])
        print(a)
    
    plt.legend(plots,lvalues,title='% of documents mean term appears in')
    
    #plt.axis('equal')

    #plt.ylim((-1000,130000))
    #plt.ylim(ymin=-1000)
    #plt.xlim((-1000,100000))
    plt.xlim((-1000,shape[1]*10000+5000))
    plt.ylim((-1000,shape[0]*10000+5000))
    #plt.ylim(ymax=shape[0]*10000)
    #plt.xlim(xmax=shape[1]*10000)
    plt.ylabel('Documents')
    plt.xlabel('Words')
    plt.savefig('../plots/literatures_size/volume_variety_bible_{}.pdf'.format(ar.name),bbox_inches='tight')
    
    print(plt.xlim())

    plt.show()



[0, 6]
#fdb462
0.32849265164700725
grey
0.9282382167072406
(-1000.0, 119102.0)
[1, 0, 6]
#80b1d3
0.30314894323988
#fdb462
0.32849265164700725
grey
0.9282382167072406
(-1000.0, 119102.0)
[2, 1, 0, 6]
#fb8072
0.21688096617404234
#80b1d3
0.30314894323988
#fdb462
0.32849265164700725
grey
0.9282382167072406
(-1000.0, 119102.0)
[3, 2, 1, 0, 6]
#bebada
0.1621581092098728
#fb8072
0.21688096617404234
#80b1d3
0.30314894323988
#fdb462
0.32849265164700725
grey
0.9282382167072406
(-1000.0, 119102.0)
[4, 3, 2, 1, 0, 6]
#ffffb3
0.13100211228024514
#bebada
0.1621581092098728
#fb8072
0.21688096617404234
#80b1d3
0.30314894323988
#fdb462
0.32849265164700725
grey
0.9282382167072406
(-1000.0, 119102.0)
[5, 4, 3, 2, 1, 0, 6]
#8dd3c7
0.12671029902108336
#ffffb3
0.13100211228024514
#bebada
0.1621581092098728
#fb8072
0.21688096617404234
#80b1d3
0.30314894323988
#fdb462
0.32849265164700725
grey
0.9282382167072406
(-1000.0, 119102.0)


In [10]:
means

nmeans

array([0.26134192, 0.23346384, 0.13856906, 0.07837392, 0.04410232,
       0.03938133, 0.92106204])

In [11]:
for x in xs:
    print(X[x].sum(0).mean())
    print(X[x].sum(0).mean()/X[x].shape[0]*100)
    print(X[x].shape)
    print('\n\n')
    
    

169.52702077667388
0.13207462061023076
(128357, 108102)



158.94852169017744
0.14679804731399784
(108277, 92369)



115.27574817335602
0.2619903367576273
(44000, 49955)



87.34006077202986
0.4663608541864046
(18728, 26657)



52.82280292190677
0.7610258308875777
(6941, 13279)



14.361679604798871
0.7771471647618436
(1848, 2834)



53.88051022322266
4.531582020456069
(1189, 10662)





In [12]:
for x in xs:
    print(X[x].getnnz(0).mean())
    print(X[x].getnnz(0).mean()/X[x].shape[0]*100)
    print(X[x].shape)
    print('\n\n')
    

112.20582412906329
0.08741698865590758
(128357, 108102)



105.99933960527883
0.09789645040523734
(108277, 92369)



76.54727254529077
0.17397107396656994
(44000, 49955)



57.605394455490114
0.3075896756487084
(18728, 26657)



35.970555011672566
0.5182330357538188
(6941, 13279)



10.720536344389556
0.580115603051383
(1848, 2834)



24.309510410804727
2.0445340967876136
(1189, 10662)





In [13]:
X_y.sum(0).mean()


53.88051022322266