# Corpus Generator
Generates a corpus for training using the titles of CamHarvestCollection

In [84]:
import pymongo
from pymongo import MongoClient
import nltk
from nltk.corpus import stopwords
import json
import string
import codecs
import gensim 
from gensim import corpora, models
from collections import Counter
import re
import csv
import math
import matplotlib.pyplot as plt
import numpy as np
import sys
# Characters treated as punctuation. The sanitise helpers in this file pass
# this list to remove_unicode_punct, which replaces each listed character with
# a space. Note that '!' and '~' are not in the list.
# NOTE(review): the literal u'–' looks like the same character as u'\u2013'
# (en dash), i.e. a harmless duplicate - confirm.
punct_filter = [u'"',u'#',u'$',u'%',u'&',u'\\',u"'",u'(',u')',u'*',u'+',u',',u'.',u'/',
     u'-',u':',u';',u'<',u'=',u'>',u'?',u'@',u'[',u']',u'^',u'_',u'`',u'{',
     u'|',u'}',u'–',u'\u2013',u'\u2010',u'\u2606',u'\u201D',u'\u2248',u'\u223C',u'\u2212',u'\u2014',u'\u2032',u'\u2018',u'\u2019',u'\u2022',u'\u2020',u'\u00B0',u'\u29B9',u'\uFF0D',u'\u2261']
# NLTK's English stopword list (used by sanitise).
stop = stopwords.words('english')
# Extra stopwords read from a JSON file in the working directory.
# presumably a JSON array of strings, since it is concatenated with `stop`
# below - verify against the file.
with open('chemistry_stopwords.json') as f:
    chem_stop = json.load(f)
# Combined stopword list (used by maximal_sanitise).
max_stop = stop+chem_stop
#mongo_url = 'mongodb://localhost:6666/'
mongo_url = 'mongodb://localhost:27017/'
db = 'Cherry'
coll_in = 'Cranberry'
# Module-level MongoDB handles, created as soon as this cell runs.
client = MongoClient(mongo_url)
# `ch`: collection queried with {'crossref_doi': True}; its records' 'title'
# field is read by the title-corpus generators.
ch = client[db][coll_in]
# `coops`: collection whose records' 'title' and 'abstract' fields are read by
# the raspberry-corpus generators.
coops = client[db]['raspberry']
#corpusfile = 'corpus2.txt' 

In [85]:
class GensimCorpus(object):
    """Stream a text corpus from disk as bag-of-words vectors.

    The corpus file holds one document per line. Each line is split on
    whitespace and converted with ``diction.doc2bow``. Nothing is cached, so
    the object can be iterated any number of times and each pass re-reads the
    file.

    Args:
        corpus_text_file: path of the one-document-per-line text file.
        diction: object providing ``doc2bow(tokens)`` (in this file, a gensim
            ``corpora.Dictionary``).
    """

    def __init__(self,corpus_text_file,diction):
        self.corpus_text_file = corpus_text_file
        self.dictionary = diction

    def __iter__(self):
        # Open inside a ``with`` block so the handle is closed as soon as the
        # pass finishes (or the generator is discarded) instead of leaking one
        # open file per iteration of the corpus.
        with open(self.corpus_text_file) as handle:
            for line in handle:
                yield self.dictionary.doc2bow(line.split())

def dictionary_generator(corpus_file):
    """Build a gensim ``corpora.Dictionary`` from a one-document-per-line file.

    Args:
        corpus_file: path of the corpus text file; each line is split on
            whitespace to obtain that document's tokens.

    Returns:
        The ``corpora.Dictionary`` built from every line of the file.
    """
    # The dictionary consumes the generator inside its constructor, so the
    # file can be closed as soon as the call returns (the original left the
    # handle open).
    with open(corpus_file) as handle:
        return corpora.Dictionary(line.split() for line in handle)

def create_models(corpus_file):
    """Build the dictionary, corpus stream and TF-IDF objects for a corpus file.

    A progress message is printed after each stage.

    Args:
        corpus_file: path of the one-document-per-line corpus text file.

    Returns:
        A 4-tuple ``(dictionary, corpus, tfidf_model, tfidf_corpus)`` where
        ``corpus`` is a ``GensimCorpus`` over ``corpus_file`` and
        ``tfidf_corpus`` is that corpus indexed through the TF-IDF model.
    """
    vocab = dictionary_generator(corpus_file)
    print('Created Dictionary')

    bow_stream = GensimCorpus(corpus_file, vocab)
    print('Created Corpus Object')

    weighting = models.TfidfModel(bow_stream)
    print('Created TFIDF Model')

    weighted_stream = weighting[bow_stream]
    print('Created TFIDF Corpus')

    result = (vocab, bow_stream, weighting, weighted_stream)
    return result

def load_models(dictionary_file,corpus_file,tfidf_file):
    """Reload a saved dictionary and TF-IDF model and re-wrap the corpus file.

    Args:
        dictionary_file: path passed to ``corpora.Dictionary.load``.
        corpus_file: path of the one-document-per-line corpus text file.
        tfidf_file: path passed to ``models.TfidfModel.load``.

    Returns:
        A 4-tuple ``(dictionary, corpus, tfidf_model, tfidf_corpus)`` with the
        same layout as the value returned by ``create_models``.
    """
    vocab = corpora.Dictionary.load(dictionary_file)
    bow_stream = GensimCorpus(corpus_file, vocab)
    weighting = models.TfidfModel.load(tfidf_file)
    return vocab, bow_stream, weighting, weighting[bow_stream]

def tfidf_filtered_corpus_generator(corpus_filename,threshold):
    """Append a TF-IDF-filtered copy of the current corpus to a text file.

    For every document in the module-level ``tfidf_corp`` the words whose
    weight is at least ``threshold`` are looked up in the module-level
    ``dictionary`` and written, space separated, as one line. The running
    document index is printed every 500000 documents (starting with 0).

    Args:
        corpus_filename: overwritten before use (see note below).
        threshold: minimum TF-IDF weight a word needs to be kept.
    """
    # NOTE(review): the argument is replaced immediately, so the caller's
    # value is never used; the output name depends on ``threshold`` only.
    # NOTE(review): ``strip('.')`` only removes leading/trailing dots, so a
    # threshold of 0.5 yields 'tfidf_filtered_0.5.txt' - confirm whether the
    # dot was meant to be removed.
    corpus_filename = 'tfidf_filtered_'+str(threshold).strip('.')+'.txt'
    with codecs.open(corpus_filename,'a',encoding='utf8') as out:
        for doc_no, weighted_doc in enumerate(tfidf_corp):
            if doc_no % 500000 == 0:
                print(doc_no)
            kept_words = [
                dictionary[term_id]
                for term_id, weight in weighted_doc
                if weight >= threshold
            ]
            out.write(' '.join(kept_words))
            out.write('\n')

def raw_corpus_generator(file_name):
    """Append the cleaned title of every matching record to ``file_name``.

    Records are read from the module-level ``ch`` collection with the query
    ``{'crossref_doi': True}``. Each title is lower-cased, stripped of
    surrounding whitespace, has the characters in ``punct_filter`` replaced by
    spaces, and is written as one UTF-8 line. A running count is printed every
    100000 records.

    Args:
        file_name: path of the output file; opened in append mode.
    """
    with codecs.open(file_name,'a',encoding='utf8') as f:
        for count, rec in enumerate(ch.find({'crossref_doi':True}), 1):
            title = rec['title'].lower().strip()
            # The original called ``title.translate(punct_filter)``. A list
            # passed to translate() is indexed by code point, so spaces and
            # other low code points were remapped to entries of the list and
            # no punctuation was removed. Use the same helper as the
            # sanitise functions instead.
            f.write(remove_unicode_punct(title, punct_filter) + u'\n')
            if count % 100000 == 0:
                print(count)

def remove_unicode_punct(subj, chars):
    """Replace every occurrence of the given characters in ``subj`` by a space.

    Args:
        subj: the text to clean.
        chars: iterable of characters to replace; they are regex-escaped, so
            characters such as ``]``, ``^``, ``-`` and ``\\`` are safe.

    Returns:
        ``subj`` with each listed character replaced by a single space. If
        ``chars`` is empty, ``subj`` is returned unchanged.
    """
    char_class = re.escape(''.join(chars))
    if not char_class:
        # "[]" is not a valid character class and would raise re.error;
        # with nothing to remove the text is returned as is.
        return subj
    return re.sub(u'(?u)[' + char_class + ']', ' ', subj)
                
def sanitise(title):
    """Normalise a piece of text and drop English stopwords.

    The text is lower-cased, stripped of surrounding whitespace, has the
    ``punct_filter`` characters replaced by spaces, and is then split on
    whitespace; tokens found in the module-level ``stop`` list are discarded.

    Args:
        title: the text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    cleaned = remove_unicode_punct(title.lower().strip(), punct_filter)
    return u' '.join(token for token in cleaned.split() if token not in stop)

def create_stopword_filtered_corpus(file_name):
    """Append the sanitised title of every matching record to ``file_name``.

    Records are read from the module-level ``ch`` collection with the query
    ``{'crossref_doi': True}``; each title is passed through ``sanitise`` and
    written as one UTF-8 line. A running count is printed every 10000 records.

    Args:
        file_name: path of the output file; opened in append mode.
    """
    with codecs.open(file_name,'a',encoding='utf8') as f:
        for count, rec in enumerate(ch.find({'crossref_doi':True}), 1):
            # The original wrote 'u\n' (a misplaced unicode prefix), which
            # appended a literal "u" to the last word of every document.
            f.write(sanitise(rec['title'])+u'\n')
            if count % 10000 == 0:
                print(count)
                
def create_stopword_filtered_raspberry_corpus(file_name):
    """Append sanitised title + abstract lines for the ``coops`` collection.

    Only records that have a non-empty ``abstract`` field are selected. For
    each one the title and abstract are passed through ``sanitise`` and
    written as a single line, separated by a space. A running count is
    printed every 10000 records.

    Args:
        file_name: path of the output file; opened in append mode (UTF-8).
    """
    with_abstract = {'abstract': {'$exists': True}, '$where': "this.abstract.length>0"}
    with codecs.open(file_name,'a',encoding='utf8') as out:
        for n_written, record in enumerate(coops.find(with_abstract), 1):
            pieces = (sanitise(record['title']), sanitise(record['abstract']))
            out.write(' '.join(pieces) + '\n')
            if n_written % 10000 == 0:
                print(n_written)

In [86]:
def get_corpus_stats(in_file,diction,outfile_name):
    """Compute corpus statistics and write plots, CSV tables and a text report.

    The corpus in ``in_file`` is streamed through ``GensimCorpus`` using
    ``diction``. The following files are written, each named by appending the
    given suffix to ``outfile_name``:

    * ``_ziphian_plot.png``: log10(frequency) against log10(rank) in red,
      with a fitted straight line in blue;
    * ``_document_word_lengths.png``: number of documents per document length;
    * ``_ziphian_data.csv`` and ``_document_lengths.csv``: the plotted data;
    * ``stats.txt`` (no underscore): summary figures and the ten most
      frequent words.

    Document "length" and the overall word count are taken from ``len(doc)``
    of each bag-of-words vector, i.e. the number of (term id, count) entries,
    not the sum of the counts.

    Args:
        in_file: path of the one-document-per-line corpus text file.
        diction: dictionary used to vectorise the corpus and to map term ids
            back to words.
        outfile_name: prefix for every output file.

    Raises:
        ZeroDivisionError: if the corpus contains no documents.
    """
    # The original took the largest token id as the unique-word count, which
    # is one short for ids numbered from 0 and silently dropped the last
    # ranked word.
    unique_word_count = len(diction)
    print('Counted Unique Words')
    word_freq = Counter()
    word_count = 0
    document_count = 0
    document_lengths = Counter()
    interim_corp = GensimCorpus(in_file,diction)
    for document_count, doc in enumerate(interim_corp, 1):
        word_count += len(doc)
        document_lengths[len(doc)] += 1
        for w_id, w_freq in doc:
            word_freq[w_id] += w_freq
        if document_count % 10000 == 0:
            # Floor division keeps the bar length an integer.
            sys.stdout.write('\r[{0}] {1}'.format('#'*(document_count//10000), document_count))
            sys.stdout.flush()
    mean_doc_length = float(word_count)/float(document_count)
    mode_doc_length = document_lengths.most_common(1)[0]
    print('\nGenerated Counting Stats')

    # One row per word that actually occurs, most frequent first:
    # (word, rank, log10 rank, frequency, log10 frequency).
    # Words are looked up in the ``diction`` argument; the original read the
    # module-level ``dictionary`` here, ignoring the parameter.
    ranked_word_freq = word_freq.most_common()
    ziphian_table = []
    for r, (w_id, f) in enumerate(ranked_word_freq, 1):
        w = diction[w_id].encode('utf-8')
        ziphian_table.append((w, r, math.log(r,10), f, math.log(f,10)))
    print('Generated Ziphian Data')

    log_ranks = [row[2] for row in ziphian_table]
    log_freqs = [row[4] for row in ziphian_table]
    # Fit on points spaced evenly along the log-rank axis (step = log10(2)):
    # for each target position take the index of the nearest actual log-rank.
    sample = []
    for s in np.arange(ziphian_table[0][2],ziphian_table[-1][2],ziphian_table[1][2]):
        sample.append(log_ranks.index(min(log_ranks,key=lambda x:abs(x-s))))
    z_grad,z_c = np.polyfit([log_ranks[s] for s in sample],[log_freqs[s] for s in sample],1)
    line_freq = [z_grad*x+z_c for x in log_ranks]

    plt.close()
    plt.plot(log_ranks,log_freqs,'r')
    plt.plot(log_ranks,line_freq,'b')
    plt.savefig(outfile_name+'_ziphian_plot.png')
    print('Saved Ziphian Plot')

    # Sort by document length so the line plot and the CSV run left to right
    # instead of following the counter's internal ordering.
    length_rows = sorted(document_lengths.items())
    plt.close()
    plt.plot([length for length, _ in length_rows],[count for _, count in length_rows])
    plt.savefig(outfile_name+'_document_word_lengths.png')
    plt.close()
    print('Saved Document Length Distribution Plot')

    # NOTE(review): binary mode plus UTF-8-encoded words is the Python 2 csv
    # convention; under Python 3 these files would need text mode
    # (newline='') and un-encoded words.
    with open(outfile_name+'_ziphian_data.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['word','rank','log rank','freqency','log_frequncy'])
        writer.writerows(ziphian_table)
    print('Writen Ziphian Data To file')
    with open(outfile_name+'_document_lengths.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['words per document','number of documents'])
        writer.writerows(length_rows)
    print('Writen Document Length Distribution data to file')
    with open(outfile_name+'stats.txt','wb') as f:
        f.write('Word count : '+str(word_count)+'\n')
        f.write('Unique words : ' + str(unique_word_count)+'\n')
        f.write('Mean document word count : ' + str(mean_doc_length)+'\n')
        f.write('Mode document word count : '+ str(mode_doc_length[0])+'\n')
        f.write('Document count : ' + str(document_count)+'\n')
        f.write('Ziphian gradient : '+str(z_grad)+'\n')
        f.write('Ziphian intercept : '+str(z_c)+'\n')
        f.write('most_frequent 10 words : '+'\n')
        for w in ziphian_table[0:10]:
            f.write('"'+w[0]+'" : '+str(w[3])+' occurances\n')
    print('Written stats report to file')

In [43]:
# Steps that build and save the "second_raspberry" corpus, dictionary and
# TF-IDF model. They are commented out here; presumably they were run once and
# the saved files are reused by the load below - confirm before re-enabling.
#create_stopword_filtered_raspberry_corpus('second_raspberry_corpus.txt')
#dictionary, corpus, tfidf_model,tfidf_corpus = create_models('second_raspberry_corpus.txt')
#dictionary.save('second_raspberry_dictionary')
#tfidf_model.save('second_raspberry_tfidf_model')
#tfidf_corpus.save('second_raspberry_tfidf_corpus')
# Reload the saved dictionary and TF-IDF model. The names bound here are
# module-level globals: tfidf_filtered_corpus_generator reads `dictionary` and
# `tfidf_corp` by name, so renaming them would break it.
dictionary,corp,tfidf,tfidf_corp = load_models('second_raspberry_dictionary','second_raspberry_corpus.txt','second_raspberry_tfidf_model')
# Write the plots, CSV tables and report with the 'RASPBERRY' file-name prefix.
get_corpus_stats('second_raspberry_corpus.txt',dictionary,'RASPBERRY')

Counted Unique Words
[############################################] 440000Generated Counting Stats
Generated Ziphian Data
Saved Ziphian Plot
Saved Document Length Distribution Plot
Writen Ziphian Data To file
Writen Document Length Distribution data to file
Written stats report to file


In [87]:
def minimal_sanitise(title):
    """Lower-case the text and blank out punctuation, keeping every word.

    Args:
        title: the text to clean.

    Returns:
        The lower-cased text with the ``punct_filter`` characters replaced by
        spaces and surrounding whitespace stripped. Runs of spaces inside the
        text are not collapsed.
    """
    normalised = title.lower().strip()
    return remove_unicode_punct(normalised, punct_filter).strip()

def maximal_sanitise(title):
    """Normalise the text and drop every word in the combined stopword list.

    The text is lower-cased, stripped, has the ``punct_filter`` characters
    replaced by spaces and is split on whitespace; tokens present in the
    module-level ``max_stop`` list are discarded.

    Args:
        title: the text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    depunctuated = remove_unicode_punct(title.lower().strip(), punct_filter)
    kept = []
    for token in depunctuated.split():
        if token in max_stop:
            continue
        kept.append(token)
    return u' '.join(kept)


def create_custom_raspberry_corpus(file_name,san_fun):
    """Append title + abstract lines for ``coops``, cleaned by ``san_fun``.

    Only records that have a non-empty ``abstract`` field are selected. The
    title and the abstract are each passed through ``san_fun`` and written as
    one line, separated by a space. A running count is printed every 10000
    records.

    Args:
        file_name: path of the output file; opened in append mode (UTF-8).
        san_fun: callable taking a text and returning its cleaned form.
    """
    with_abstract = {'abstract': {'$exists': True}, '$where': "this.abstract.length>0"}
    with codecs.open(file_name,'a',encoding='utf8') as out:
        for n_written, record in enumerate(coops.find(with_abstract), 1):
            pieces = (san_fun(record['title']), san_fun(record['abstract']))
            out.write(' '.join(pieces) + '\n')
            if n_written % 10000 == 0:
                print(n_written)

In [88]:
# Write both corpus files first: one with punctuation removed only, one with
# the combined stopword list removed as well.
create_custom_raspberry_corpus('raw_raspberry_corpus.txt',minimal_sanitise)
create_custom_raspberry_corpus('chem_stop_raspberry_corpus.txt',maximal_sanitise)

# Then, for each corpus in turn: build the models, save them, and write the
# statistics. Each row is
# (corpus file, dictionary file, TF-IDF model file, TF-IDF corpus file, stats prefix).
# To reuse saved models instead of rebuilding, replace the create_models call with
# dictionary,corp,tfidf,tfidf_corp = load_models(dictionary_path, corpus_path, model_path)
for corpus_path, dictionary_path, model_path, tfidf_corpus_path, stats_prefix in (
    ('raw_raspberry_corpus.txt', 'raw_raspberry_dictionary',
     'raw_raspberry_tfidf_model', 'raw_raspberry_tfidf_corpus',
     'raw_raspberry'),
    ('chem_stop_raspberry_corpus.txt', 'chem_stop_raspberry_dictionary',
     'chem_stop_raspberry_tfidf_model', 'chem_stop_raspberry_tfidf_corpus',
     'chem_stop_raspberry'),
):
    dictionary, corpus, tfidf_model,tfidf_corpus = create_models(corpus_path)
    dictionary.save(dictionary_path)
    tfidf_model.save(model_path)
    tfidf_corpus.save(tfidf_corpus_path)
    get_corpus_stats(corpus_path,dictionary,stats_prefix)


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
Created Dictionary
Created Corpus Object
Created TFIDF Model
Created TFIDF Corpus
Counted Unique Words
[############################################] 440000
Generated Counting Stats
Generated Ziphian Data
Saved Ziphian Plot
Saved Document Length Distribution Plot
Writen Ziphian Data To file
Writen Document Length Distribution data to file
Written stats report to file
Created Dictionary
Created Corpu

In [90]:
import nltk.stem

In [98]:
# Compare three NLTK stemmers and the WordNet lemmatizer on the same word.
lancaster = nltk.stem.lancaster.LancasterStemmer()
porter = nltk.stem.porter.PorterStemmer()
snowball = nltk.stem.snowball.EnglishStemmer()
wordnet = nltk.stem.WordNetLemmatizer()

# The stemmers share the .stem() method; the lemmatizer uses .lemmatize().
for stemmer in (lancaster, porter, snowball):
    print(stemmer.stem('cats'))
print(wordnet.lemmatize('cats'))


cat
cat
cat
cat


In [96]:
# Called with no arguments this starts NLTK's downloader; the captured output
# of this cell shows it returned True.
# presumably run to fetch the stopword and WordNet data used above - confirm.
nltk.download()

showing info http://www.nltk.org/nltk_data/


True