# Corpus Generator
Generates a corpus for training using the titles of CamHarvestCollection

In [68]:
import pymongo
from pymongo import MongoClient
import nltk
from nltk.corpus import stopwords
import json
import string
import codecs
import gensim 
from gensim import corpora, models
from collections import Counter
import re
# Characters that the cleaning helpers blank out of titles/abstracts: ASCII
# punctuation plus a handful of Unicode dashes, quotes and symbols.
# NOTE(review): the literal u'–' appears to be the same character as u'\u2013'
# (EN DASH), in which case that entry is listed twice — harmless but redundant.
punct_filter = [u'"',u'#',u'$',u'%',u'&',u'\\',u"'",u'(',u')',u'*',u'+',u',',u'.',u'/',
     u'-',u':',u';',u'<',u'=',u'>',u'?',u'@',u'[',u']',u'^',u'_',u'`',u'{',
     u'|',u'}',u'–',u'\u2013',u'\u2010',u'\u2212',u'\u2018',u'\u2019',u'\u2022',u'\u2020',u'\u00B0',u'\u29B9',u'\uFF0D',u'\u2261']
# English stopword list loaded from NLTK's stopwords corpus.
stop = stopwords.words('english')
# MongoDB connection settings; the commented-out URL is an alternative port.
#mongo_url = 'mongodb://localhost:6666/'
mongo_url = 'mongodb://localhost:27017/'
db = 'Cherry'
coll_in = 'Cranberry'
client = MongoClient(mongo_url)
# Collection handles read by the corpus-building functions in this notebook.
ch = client[db][coll_in]
coops = client[db]['raspberry']
#corpusfile = 'corpus2.txt' 

In [73]:
class GensimCorpus(object):
    """Stream a one-document-per-line text file as bag-of-words vectors.

    The file is re-opened on every iteration, so the corpus can be traversed
    several times without being held in memory.

    Args:
        corpus_text_file: Path to a text file; each line is one document.
        diction: Object exposing ``doc2bow(tokens)``; each line is split on
            whitespace and the token list is passed to it.
    """

    def __init__(self,corpus_text_file,diction):
        self.corpus_text_file = corpus_text_file
        self.dictionary = diction

    def __iter__(self):
        """Yield ``self.dictionary.doc2bow(line.split())`` for every line."""
        # The context manager closes the handle when the generator finishes or
        # is discarded; a bare open() here leaked one handle per pass.
        with open(self.corpus_text_file) as corpus_lines:
            for line in corpus_lines:
                yield self.dictionary.doc2bow(line.split())

def dictionary_generator(corpus_file):
    """Build a gensim ``Dictionary`` from a one-document-per-line text file.

    Args:
        corpus_file: Path to the corpus text file; every line is split on
            whitespace to obtain that document's tokens.

    Returns:
        The ``corpora.Dictionary`` built from all lines of the file.
    """
    # Open inside a context manager so the handle is closed as soon as the
    # dictionary has consumed the lines (the bare open() was never closed).
    with open(corpus_file) as corpus_lines:
        return corpora.Dictionary(line.split() for line in corpus_lines)

def create_models(corpus_file):
    """Build the dictionary, streamed corpus, TF-IDF model and TF-IDF corpus.

    A progress message is printed after each stage.

    Args:
        corpus_file: Path to a one-document-per-line text corpus.

    Returns:
        Tuple ``(dictionary, corpus, tfidf_model, tfidf_corpus)``.
    """
    vocab = dictionary_generator(corpus_file)
    print('Created Dictionary')

    bow_stream = GensimCorpus(corpus_file, vocab)
    print('Created Corpus Object')

    weighting = models.TfidfModel(bow_stream)
    print('Created TFIDF Model')

    # Indexing the model with a corpus wraps it; documents are weighted when iterated
    weighted_stream = weighting[bow_stream]
    print('Created TFIDF Corpus')

    return vocab, bow_stream, weighting, weighted_stream

def load_models(dictionary_file,corpus_file,tfidf_file):
    """Load a saved dictionary and TF-IDF model and rebuild the corpus views.

    Args:
        dictionary_file: Path passed to ``corpora.Dictionary.load``.
        corpus_file: Path to the one-document-per-line text corpus.
        tfidf_file: Path passed to ``models.TfidfModel.load``.

    Returns:
        Tuple ``(dictionary, corpus, tfidf_model, tfidf_corpus)``.
    """
    vocab = corpora.Dictionary.load(dictionary_file)
    bow_stream = GensimCorpus(corpus_file, vocab)
    weighting = models.TfidfModel.load(tfidf_file)
    return vocab, bow_stream, weighting, weighting[bow_stream]

def tfidf_filtered_corpus_generator(threshold):
    """Append a TF-IDF-filtered copy of the corpus to a threshold-named file.

    Reads the module-level ``tfidf_corp`` and ``dictionary``. For each document
    only the tokens whose weight is ``>= threshold`` are written, space
    separated, one document per line. The document index is printed every
    500000 documents (including index 0).

    Args:
        threshold: Minimum TF-IDF weight a token needs to be kept.

    NOTE(review): ``str.strip('.')`` only trims leading/trailing dots, so a
    threshold of 0.05 produces 'tfidf_filtered_0.05.txt'; if the dot was meant
    to be dropped from the name this needs ``replace`` instead.
    """
    out_name = 'tfidf_filtered_' + str(threshold).strip('.') + '.txt'
    with codecs.open(out_name, 'a', encoding='utf8') as out:
        for doc_no, weighted_doc in enumerate(tfidf_corp):
            if doc_no % 500000 == 0:
                print(doc_no)
            kept = [dictionary[term_id]
                    for term_id, weight in weighted_doc
                    if weight >= threshold]
            out.write(' '.join(kept))
            out.write('\n')

def raw_corpus_generator(file_name):
    """Append lower-cased, de-punctuated titles from ``ch`` to ``file_name``.

    Every record matching ``{'crossref_doi': True}`` contributes one line. Each
    character listed in ``punct_filter`` is replaced by a space, matching what
    ``remove_unicode_punct`` does for the stopword-filtered corpora. A running
    count is printed every 100000 records.

    Args:
        file_name: Output path, opened in append mode as UTF-8.
    """
    # translate() needs an {ordinal: replacement} mapping. Passing the raw list
    # made it index the list by code point, which rewrote low code points
    # (e.g. spaces) and left most punctuation untouched.
    punct_table = {ord(mark): u' ' for mark in punct_filter}
    with codecs.open(file_name,'a',encoding='utf8') as f:
        for count, rec in enumerate(ch.find({'crossref_doi':True}), 1):
            # Strip last so punctuation at the edges does not leave padding
            cleaned = rec['title'].lower().translate(punct_table).strip()
            f.write(cleaned + u'\n')
            if count%100000==0:
                print(count)

def remove_unicode_punct(subj, chars):
    """Replace every occurrence of the given characters in ``subj`` with a space.

    Args:
        subj: Text to clean.
        chars: Iterable of characters to blank out; they are escaped and placed
            in a single regex character class.

    Returns:
        ``subj`` with each listed character replaced by one space, or ``subj``
        unchanged when ``chars`` is empty.
    """
    char_class = re.escape(u''.join(chars))
    if not char_class:
        # '[]' is not a valid character class; nothing to remove, so return as-is
        return subj
    return re.sub(u'(?u)[' + char_class + u']', u' ', subj)
                
def sanitise(title):
    """Normalise a piece of text for the corpus.

    The text is lower-cased and stripped, every character in ``punct_filter``
    is replaced by a space, and tokens found in ``stop`` are dropped.

    Args:
        title: Text to clean (a title or an abstract).

    Returns:
        The remaining tokens joined by single spaces.
    """
    cleaned = remove_unicode_punct(title.lower().strip(), punct_filter)
    kept_words = []
    for token in cleaned.split():
        if token in stop:
            continue
        kept_words.append(token)
    return u' '.join(kept_words)

def create_stopword_filtered_corpus(file_name):
    """Append the sanitised title of every Crossref-DOI record to ``file_name``.

    Records are read from ``ch`` with the query ``{'crossref_doi': True}``; each
    title goes through ``sanitise`` and is written on its own line. A running
    count is printed every 10000 records.

    Args:
        file_name: Output path, opened in append mode as UTF-8.
    """
    with codecs.open(file_name,'a',encoding='utf8') as f:
        for count, rec in enumerate(ch.find({'crossref_doi':True}), 1):
            # u'\n', not 'u\n': the misplaced prefix appended a literal "u" to
            # the last word of every line.
            f.write(sanitise(rec['title'])+u'\n')
            if count%10000==0:
                print(count)
                
def create_stopword_filtered_raspberry_corpus(file_name):
    """Append sanitised title + abstract lines from ``coops`` to ``file_name``.

    Only records that have a non-empty ``abstract`` field are selected. Each
    output line is the sanitised title, a space, and the sanitised abstract.
    A running count is printed every 10000 records.

    Args:
        file_name: Output path, opened in append mode as UTF-8.
    """
    query = {'abstract': {'$exists': True}, '$where': "this.abstract.length>0"}
    with codecs.open(file_name, 'a', encoding='utf8') as out:
        for seen, rec in enumerate(coops.find(query), 1):
            line = sanitise(rec['title']) + ' ' + sanitise(rec['abstract']) + '\n'
            out.write(line)
            if seen % 10000 == 0:
                print(seen)

In [6]:
# Write one TF-IDF-filtered corpus file per threshold value.
# tfidf_filtered_corpus_generator reads the globals `tfidf_corp` and
# `dictionary`, so both must already be defined when this cell runs.
samples = [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]
for samp in samples:
    tfidf_filtered_corpus_generator(samp)
    print('corpus generated')

0
500000
1000000
corpus generated
0
500000
1000000
corpus generated
0
500000
1000000
corpus generated
0
500000
1000000
corpus generated
0
500000
1000000
corpus generated
0
500000
1000000
corpus generated
0
500000
1000000
corpus generated
0
500000
1000000
corpus generated
0
500000
1000000
corpus generated


In [8]:
# Bind the globals used by the filtering/analysis cells.
# NOTE(review): load_models is defined with three required parameters
# (dictionary_file, corpus_file, tfidf_file); this call passes none and would
# raise TypeError against that definition — supply the saved file paths.
dictionary,corp,tfidf,tfidf_corp = load_models()


In [16]:
# Measure what a TF-IDF cut-off of `threshold` discards: `word_loss` counts the
# dropped token occurrences and `garbage` collects the ids of dropped tokens.
garbage = set()
word_loss = 0
threshold = 0.2
ind=0
for doc in tfidf_corp:
    for i,j in doc:
        # Strict '<' mirrors tfidf_filtered_corpus_generator, which keeps
        # tokens with j >= threshold; a weight equal to the threshold is not lost.
        if j<threshold:
            word_loss+=1
            garbage.add(i)
    ind+=1
    # Progress report once per 100000 documents; this check belongs to the
    # outer loop (it was over-indented, which is an IndentationError).
    if ind%100000==0:
        print(ind)

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000
7100000
7200000
7300000
7400000
7500000
7600000
7700000
7800000
7900000
8000000
8100000
8200000
8300000
8400000
8500000
8600000
8700000
8800000
8900000
9000000
9100000
9200000
9300000
9400000
9500000
9600000
9700000
9800000
9900000
10000000
10100000
10200000
10300000
10400000
10500000
10600000
10700000
10800000
10900000
11000000
11100000
11200000
11300000
11400000
11500000
11600000
11700000
11800000
11900000
12000000


In [17]:
# Show the tokens behind up to 500 of the collected ids; `garbage` is a set,
# so which ids come first is arbitrary.
for word in list(garbage)[0:500]:
    print(dictionary[word])

chemistry
heterocyclic
green
vanadium
pesticide
solution
clinical
⊂v
general
practical
chimie
fundamental
schools
sch3
51771
impact
2011
factor
conservation
art
polish
highlights
medicinal
future
n1
benzothiepino
scientific
committee
fifty
years
food
note
–
quantum
biology
effects
biological
correction
keith
bioinorganic
xv
editorial
6′
calixarene
40
conference
farber
enzyme
rechargeable
bioorganic
defined
supramolecular
iodovinyl
nitration
computational
theoretical
mechanical
anniversary
glossary
renewable
preface
z
great
phytoprostanes
war
congress
european
1st
analytical
space
prof
rosemary
dr
11h
heteroatom
coordination
laser
greece
sweet
education
nuclear
organometallic
light
high
material
temperature
solid
state
industry
journal
plasma
phosphorus
biomolecular
pharmaceutical
boron
solutions
organic
sulfur
polymers
website
thieme
beyond
chlorine
international
survey
carini
india
nature
proceeding
engineers
physical
indeno
initial
organotin
northwestern
interference
automated
presen

In [71]:
def ziphian(corp,di,file_name):
    """Append a frequency-ranked JSON array of ``[token, count]`` pairs to a file.

    Every ``(id, weight)`` pair in every document of ``corp`` counts once for
    its id (the weight is ignored). Ids are translated through ``di`` and the
    pairs are written most-frequent first, separated by ``',\\n'``.

    Args:
        corp: Iterable of documents, each an iterable of ``(token_id, weight)``
            pairs.
        di: Mapping indexed as ``di[token_id]`` that returns the token.
        file_name: Output path. It is opened in append mode, so writing to an
            existing non-empty file does not produce a single valid JSON value.
    """
    counts = Counter()
    for doc in corp:
        # Update per document so the full id stream is never held in one list
        counts.update(token_id for token_id, _ in doc)
    # Same ordering as sorting the items by count, descending
    ranked = counts.most_common()

    # All writes stay inside the `with`: the last entry and the closing bracket
    # used to be written after the file had been closed. Text mode is used
    # because json.dumps returns str. An empty corpus yields '[]'.
    with open(file_name,'a') as f:
        f.write('[')
        for position, (token_id, count) in enumerate(ranked):
            if position:
                f.write(',\n')
            f.write(json.dumps([di[token_id], count]))
        f.write(']')

In [67]:
def create_stopword_filtered_raspberry_corpus(file_name='other_raspberry_corpus.txt'):
    """Append sanitised title + abstract lines from ``coops`` to ``file_name``.

    Only records that have a non-empty ``abstract`` field are selected. Each
    output line is the sanitised title, a space, and the sanitised abstract.
    A running count is printed every 10000 records.

    Args:
        file_name: Output path, opened in append mode as UTF-8. Defaults to
            'other_raspberry_corpus.txt', the name this cell used to hard-code,
            so calling with no argument behaves as before.
    """
    query = {'abstract': {'$exists': True}, '$where': "this.abstract.length>0"}
    with codecs.open(file_name,'a',encoding='utf8') as f:
        for count, rec in enumerate(coops.find(query), 1):
            san_title = sanitise(rec['title'])
            san_abs = sanitise(rec['abstract'])
            f.write(san_title+' '+san_abs+'\n')
            if count%10000==0:
                print(count)

# Run the export; called with no argument, the output is appended to
# 'other_raspberry_corpus.txt'.
create_stopword_filtered_raspberry_corpus()

The history saving thread hit an unexpected error (OperationalError('unable to open database file',)).History will not be written to the database.
10000
20000
30000
40000
50000
60000


KeyboardInterrupt: 

In [14]:
# Example abstract used to try out the cleaning steps by hand.
# NOTE(review): the literal ends with a stray double quote inside the string.
sample = 'Teaching fundamental physical chemistry concepts such as the potential energy surface, transition state, and reaction path is a challenging task. The traditionally used oversimplified 2D representation of potential and free energy surfaces makes this task even more difficult and often confuses students. We show how this 2D representation can be expanded to more realistic potential and free energy surfaces by creating surface models using 3D printing technology. The printed models include potential energy surfaces for the hydrogen exchange reaction and for rotations of methyl groups in 1-fluoro-2-methylpropene calculated using quantum chemical methods. We also present several model surfaces created from analytical functions of two variables. These models include a free energy surface for protein folding, and potential energy surfaces for a linear triatomic molecule and surface adsorption, as well as simple double minimum, quadruple minimum, and parabolic surfaces. We discuss how these 3D models can be used in teaching different chemical kinetics, dynamics, and vibrational spectroscopy concepts including the potential energy surface, transition state, minimum energy reaction path, reaction trajectory, harmonic frequency, and anharmonicity.Keywords:  Upper-Division Undergraduate; Physical Chemistry; Hands-On Learning/Manipulatives; Quantum Chemistry; Kinetics; Molecular Mechanics/Dynamics; Spectroscopy"'

In [32]:
# Manual check of the cleaning steps on `sample`. Unlike sanitise(), stopwords
# are dropped first and punctuation is blanked afterwards, and the result is
# not re-split, so runs of spaces remain where punctuation was removed.
remove_unicode_punct(' '.join([i for i in sample.lower().strip().split() if i not in stop]),punct_filter).strip()

'teaching fundamental physical chemistry concepts potential energy surface  transition state  reaction path challenging task  traditionally used oversimplified 2d representation potential free energy surfaces makes task even difficult often confuses students  show 2d representation expanded realistic potential free energy surfaces creating surface models using 3d printing technology  printed models include potential energy surfaces hydrogen exchange reaction rotations methyl groups 1 fluoro 2 methylpropene calculated using quantum chemical methods  also present several model surfaces created analytical functions two variables  models include free energy surface protein folding  potential energy surfaces linear triatomic molecule surface adsorption  well simple double minimum  quadruple minimum  parabolic surfaces  discuss 3d models used teaching different chemical kinetics  dynamics  vibrational spectroscopy concepts including potential energy surface  transition state  minimum energy 

In [74]:
# Build the dictionary, streamed corpus, TF-IDF model and TF-IDF corpus from
# the text corpus file 'first_raspberry_corpus.txt'.
dictionary, corpus, tfidf_model,tfidf_corpus = create_models('first_raspberry_corpus.txt')

Created Dictionary
Created Corpus Object
Created TFIDF Model
Created TFIDF Corpus


In [78]:
# Persist the fitted TF-IDF model so load_models() can restore it later.
tfidf_model.save('first_raspberry_tfidf_model')

In [None]:
# NOTE(review): tfidf_corpus is the `tfidf[corp]` wrapper returned by
# create_models; confirm that .save() persists the weighted documents and not
# just the wrapper object — gensim's corpus serializers (e.g.
# MmCorpus.serialize) may be what is wanted here.
tfidf_corpus.save('first_tfidf_corpus')