In [41]:
%pylab inline
%load_ext autoreload
%autoreload 2
from __future__ import division
import reader
import re
from textblob import TextBlob
import nltk
import string
from collections import Counter
import cPickle

Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


`%matplotlib` prevents importing * from pylab and numpy


In [2]:
import os
from os import listdir
def find_files_folder(path, ext):
    onlyfiles = [ os.path.join(path,f) for f in listdir(path) if os.path.isfile(os.path.join(path,f)) and f[-3:] == ext ]
    return onlyfiles

In [71]:
re_review = re.compile(r'Reviewer\s+(?:\d)', flags=re.IGNORECASE+re.DOTALL+re.UNICODE)
re_remove_standard_text = re.compile(r'Research Project\s*(?:Ground-breaking nature and potential impact of the research project:)?(.*?)Scientific Approach:(.*?)Principal Investigator.*?\)(.*)', flags=re.IGNORECASE+re.DOTALL+re.UNICODE)
re_panel_comment = re.compile(r'in this report.\s*(.*?)\[', flags=re.IGNORECASE+re.DOTALL+re.UNICODE)
re_panel = re.compile(r'(\[.*?\])',flags=re.IGNORECASE+re.DOTALL+re.UNICODE)
re_panel_comment_parts = re.compile(r'(.*?)\s*\n\s*(.*?\n)?\s*(on the basis.*|overall.*)',flags=re.IGNORECASE+re.DOTALL+re.UNICODE)
re_no_comment = re.compile(r'No\s*Comments\s*received*', flags=re.IGNORECASE+re.DOTALL+re.UNICODE)
re_newlines = re.compile(r'((?<![\.!?])\s\n)',flags=re.DOTALL+re.UNICODE)
re_strip_newlines = re.compile(r'([\n\t])')
re_scores = re.compile(r'the ability to propose and conduct ground-(?:\s*breaking research\?)?\s*(Outstanding|Excellent|Very Good|Non-competitive).*?creative independent thinking\?\s*(Outstanding|Excellent|Very Good|Non-competitive).*?beyond the state of(?:\s*the art\?)?\s*(Outstanding|Excellent|Very Good|Non-competitive)', flags=re.IGNORECASE+re.DOTALL+re.UNICODE)


stopwords = set(word for word in nltk.corpus.stopwords.words('english')) - set(['no','not','few','nor','own','very'])
stopwords.add(u'also')

In [72]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'[\w-]+').tokenize
lemmatizer = nltk.stem.WordNetLemmatizer().lemmatize
stemmer = nltk.stem.SnowballStemmer('english').stem
def parse_review(review):
    search = re_remove_standard_text.search(review)
    impact_project, approach, comments = search.groups()
    blob = ' '.join(search.groups())
    blob = re_panel.sub('',blob)
    blob = re_no_comment.sub('',blob)
    
    sentiment = TextBlob(blob).sentiment
    review = tokenizer(blob)
    
    word_list = []
    for word in review:
        lem_word = lemmatizer(word.lower())        
        if not (word in stopwords or lem_word in stopwords or word.isdigit()):
            word_list.append(lem_word)
    bigrams = list(nltk.bigrams(word_list))
    return blob, sentiment, Counter(word_list), Counter(bigrams)

def parse_evaluation_form(text):
    word_counter = Counter()
    sentiments = []
    new_r = []
    reviews = re_review.split(text)
    panel_comment = reviews[0]
    i = 0
    for review in reviews[1:]:
        if not review.strip():
            continue
        ret_r, sentiment, new_c, bigrams = parse_review(review)
        sentiments.append(sentiment)
        word_counter.update(new_c)
        new_r.append(ret_r)
    
    return new_r, sentiments, word_counter, bigrams
        

In [97]:
def n_matches(regex, review):
    result = regex.findall(review)
    print result
    pos = sum([1 for r in result if not r[0]])
    neg = sum([1 for r in result if r[0]])
    return (pos, neg)

In [98]:
re_standout = re.compile(r'(no[rt]?|few)?\s+(amazing|\w*?excellen\w*|exceptional\w*|extraordinar\w*|fabulous\w*|magnificent|most|outstanding|remarkable|superb\w*|suprem\w*|terrific\w*|unique|unmatched|unparalleled|wonderf\w*)',flags=re.IGNORECASE+re.DOTALL+re.UNICODE)
re_ability = re.compile(r'(no[rt]?|few)?\s+(abilit\w*|able|adept\w*|adroit\w*|analy\w*|aptitude|brain\w*|bright\w*|capab\w*|capacit\w*|clever\w*|compet\w*|creati\w*|expert\w*|flair|genius|gift\w*|inherent\w*|innate|insight\w*|instinct\w*|intell\w*|knack|natural\w*|proficien\w*|propensity|skill\w*|smart\w*|talent\w*)',flags=re.IGNORECASE+re.DOTALL+re.UNICODE)

def evaluate_review(reviews):
    for review in reviews:
        
        if review.startswith('Scores'):
            continue
        
        pos_standout, neg_standout = n_matches(re_standout, review)
        print "Standout pos:",pos_standout,"neg:",neg_standout
        pos_ability, neg_ability = n_matches(re_ability, review)
        print "Ability pos:",pos_ability,"neg:",neg_ability
        print '--'
        
        
    

In [99]:
files = find_files_folder('../eval','pdf')

save_file = open('reviews.csv','w+')

if not os.path.exists('cache/'):
    os.makedirs('cache')

if not os.path.exists('vienna/'):
    os.makedirs('vienna')

field_names = '_panel_appl,_panel_person,_panel_verdict,' + str('_r%d_appl,_r%d_meth,_r%d_scores,_r%d_person,' % i for i in range(1,8))
field_names = field_names[:-2].split(',')
save_file.write('identifier \t text \n')

for i,f in enumerate(files):
    #print f
    try:
        text = cPickle.load(open(doc.filepath_blob,'rb'))
    except Exception as e:
        filename_text = 'cache/'+os.path.splitext(os.path.basename(f))[0]
        text = reader.extract_text_pdf(f)
        with open(filename_text,'w') as fh:
            cPickle.dump(text,fh)
    
    panel_reviews = get_panel_reviews(text)
    
    base = os.path.splitext(os.path.basename(f))[0]
    vienna_file = open('vienna/'+base+'.txt','w+')
    for i, name in enumerate(field_names):
        try:
            field = panel_reviews[i].replace(u"\u2018", "'").replace(u"\u2019", "'")
            field = field.strip()
            field = field.replace('\n',' ')
            field = field.replace('\t',' ')
            field = field.replace(u'\xb4',"'")
            field = field.replace(u'\u201c','"')
            field = field.replace(u'\u201d','"')
            field = field.replace(u'\u2013','-')
            field = field.replace(u'\xef','i')
            field = field.encode("ascii",errors='ignore')
            
            save_file.write(base+name + '\t' + field + '\n')
            vienna_file.write(name[1:]+'\n' + field + '\n\n')
            
        except Exception as e:
            #print type(e), e
            #print 'Not here',name
            pass

    
    
    continue
    
print
print 'Done'
    


[]
Standout pos: 0.0 neg: 0.0
[]
Ability pos: 0.0 neg: 0.0
--
[]
Standout pos: 0.0 neg: 0.0
[]
Ability pos: 0.0 neg: 0.0
--
[(u'', u'excellence')]
Standout pos: 1 neg: 0.0
[]
Ability pos: 0.0 neg: 0.0
--
[]
Standout pos: 0.0 neg: 0.0
[]
Ability pos: 0.0 neg: 0.0
--
[]
Standout pos: 0.0 neg: 0.0
[(u'', u'expertise')]
Ability pos: 1 neg: 0.0
--
[]
Standout pos: 0.0 neg: 0.0
[]
Ability pos: 0.0 neg: 0.0
--
[]
Standout pos: 0.0 neg: 0.0
[(u'', u'natural'), (u'', u'innate'), (u'', u'natural'), (u'', u'innate')]
Ability pos: 4 neg: 0.0
--
[]
Standout pos: 0.0 neg: 0.0
[(u'', u'natural'), (u'', u'able'), (u'', u'natural'), (u'', u'able')]
Ability pos: 4 neg: 0.0
--
[]
Standout pos: 0.0 neg: 0.0
[(u'', u'skilled')]
Ability pos: 1 neg: 0.0
--
[]
Standout pos: 0.0 neg: 0.0
[(u'', u'natural'), (u'', u'competitive')]
Ability pos: 2 neg: 0.0
--
[]
Standout pos: 0.0 neg: 0.0
[]
Ability pos: 0.0 neg: 0.0
--
[]
Standout pos: 0.0 neg: 0.0
[]
Ability pos: 0.0 neg: 0.0
--
[]
Standout pos: 0.0 neg: 0.0
[]

In [None]:
import sqlalchemy
review, document = session.query(Review,Document.filepath).filter(Review.polarity == sqlalchemy.func.max(Review.polarity).select()).join(Document).one()
print review.polarity
print document, review.review+1
print review.text

In [None]:
review, document = session.query(Review,Document.filepath).filter(Review.polarity == sqlalchemy.func.min(Review.polarity).select()).join(Document).one()
print review.polarity
print document, review.review+1
print review.text

In [53]:
def get_text(session,id):
    return session.query(Word.text).filter(Word.id == id).one()[0]

In [56]:
import sqlalchemy
sum_wordcount = sqlalchemy.func.sum(WordUse.count)
ordered_wordcounts = [(c, id) for c, id in session.query(sum_wordcount,WordUse.word_id).group_by(WordUse.word_id).order_by(sum_wordcount.desc()).all()]

for c,id in ordered_wordcounts:
    text = get_text(session,id)
    #print "%-19s\t%d" %(text,c)

In [57]:
import sqlalchemy
sum_bigram = sqlalchemy.func.sum(BigramUse.count)
ordered_bigramcounts = [(c,w1,w2) for c,w1,w2 in session.query(sum_bigram, Bigram.word1_id, Bigram.word2_id).join(Bigram).group_by(Bigram.word1_id).group_by(Bigram.word2_id).join(Word,Word.id.label('word1') == Bigram.word1_id).order_by(sum_bigram.desc()).all()]
for c,w1,w2 in ordered_bigramcounts:
    t1 = get_text(session,w1)
    t2 = get_text(session, w2)
    #print "%-20s %-20s %d" % (t1,t2,c)

In [136]:
session.rollback()

In [4]:
import nltk
 
text = """This project is a rather naive prospective clinical study proposal that aims at correlative analysis of such 'traffic accidents', 
 such as invasive bacterial disease, and as such is based on vaguely formulated observational hypothesis on bioavailability 
 of trace metals as predisposing condition. The scientific rational behind this proposal is poorly defined and the approach is 
 unlikely to yield a mechanistic insight from such a superficially conceived study, which represents a 'fishing expedition' 
 rather than a scientific research program. Groundbreaking results are unlikely to be generated as the approach ignores 
 many predisposing conditions and physiological and genetic variations and predispositions of the test and control cohorts."""
 
# Used when tokenizing words
sentence_re = r'''(?x)      # set flag to allow verbose regexps
      ([A-Z])(\.[A-Z])+\.?  # abbreviations, e.g. U.S.A.
    | \w+(-\w+)*            # words with optional internal hyphens
    | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                # ellipsis
    | [][.,;"'?():-_`]      # these are separate tokens
'''
 
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()
 
#Taken from Su Nam Kim Paper...
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)
 
toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)
 
print postoks
 
tree = chunker.parse(postoks)
 
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
 
 
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP'):
        yield subtree.leaves()
 
def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    #word = stemmer.stem_word(word)
    word = lemmatizer.lemmatize(word)
    return word
 
def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stopwords)
    return accepted
 
 
def get_terms(tree):
    for leaf in leaves(tree):
        term = [ normalise(w) for w,t in leaf if acceptable_word(w) ]
        yield term
 
terms = get_terms(tree)
 
for term in terms:
    for word in term:
        print word,
    print

[('This', 'DT'), ('project', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('rather', 'RB'), ('naive', 'JJ'), ('prospective', 'JJ'), ('clinical', 'JJ'), ('study', 'NN'), ('proposal', 'NN'), ('that', 'WDT'), ('aims', 'VBZ'), ('at', 'IN'), ('correlative', 'JJ'), ('analysis', 'NN'), ('of', 'IN'), ('such', 'JJ'), ("'", 'POS'), ('traffic', 'JJ'), ('accidents', 'NNS'), ("'", 'POS'), (',', ','), ('such', 'JJ'), ('as', 'IN'), ('invasive', 'JJ'), ('bacterial', 'JJ'), ('disease', 'NN'), (',', ','), ('and', 'CC'), ('as', 'IN'), ('such', 'JJ'), ('is', 'VBZ'), ('based', 'VBN'), ('on', 'IN'), ('vaguely', 'RB'), ('formulated', 'VBN'), ('observational', 'JJ'), ('hypothesis', 'NN'), ('on', 'IN'), ('bioavailability', 'NN'), ('of', 'IN'), ('trace', 'NN'), ('metals', 'NNS'), ('as', 'IN'), ('predisposing', 'VBG'), ('condition', 'NN'), ('.', '.'), ('The', 'DT'), ('scientific', 'JJ'), ('rational', 'JJ'), ('behind', 'NN'), ('this', 'DT'), ('proposal', 'NN'), ('is', 'VBZ'), ('poorly', 'RB'), ('defined', 'VBN'), ('and', '