
ese multilang; closes #35

1 parent 1ecfa77, commit b1a29d662cb5e7590394a5a956f75e897effd17a, pierrejdlf committed Mar 9, 2012
Binary file not shown.
@@ -24,34 +24,39 @@
############################################################ DOCUMENTS meta_documents.csv
# A) meta_documents.csv : accepted values for COLUMN *category1, and their translation in the view
+# QUALI "RESEARCH PHASE"
DOC_CAT_1={}
-DOC_CAT_1['preparatory'] = 'Preparatory'
-DOC_CAT_1['terrain'] = 'Fieldwork'
-DOC_CAT_1['data'] = 'Data'
-DOC_CAT_1['verbatim'] = 'Verbatim'
-DOC_CAT_1['result'] = 'Result'
-DOC_CAT_1['ese'] = 'ese'
-DOC_CAT_1['misc'] = 'Misc'
+DOC_CAT_1['preparatory'] = 'Preparatory'
+DOC_CAT_1['terrain'] = 'Fieldwork'
+DOC_CAT_1['data'] = 'Data'
+DOC_CAT_1['result'] = 'Result'
+DOC_CAT_1['enrich'] = 'Complement'
+DOC_CAT_1['misc'] = 'Misc'
+DOC_CAT_1['verbatim'] = '?Verbatim?'
# B) meta_documents.csv : accepted values for COLUMN *category2, and their translation in the view
+# QUALI "DOCUMENT TYPE"
DOC_CAT_2={}
-DOC_CAT_2['pv'] = 'Minutes'
-DOC_CAT_2['note'] = 'Notes'
-DOC_CAT_2['transcr'] = 'Transcr'
-DOC_CAT_2['video'] = 'Video'
-DOC_CAT_2['audio'] = 'Audio'
-DOC_CAT_2['resum'] = 'Summary'
-DOC_CAT_2['analyse'] = 'Analysis'
-DOC_CAT_2['quest'] = 'Quest'
-DOC_CAT_2['publi'] = 'Publ'
-DOC_CAT_2['com'] = 'Com'
-DOC_CAT_2['rap'] = 'Rep'
-DOC_CAT_2['recrut'] = 'Recr'
-DOC_CAT_2['guide'] = 'Guide'
-DOC_CAT_2['misc'] = 'Misc'
-
-# C) meta_documents.csv : COLUMN *mimetype
-DOCUMENT_MIMETYPES=['xml','pdf','htm','csv','link','ref']
+DOC_CAT_2['transcr'] = 'Transcr'
+DOC_CAT_2['pv'] = 'Minutes'
+DOC_CAT_2['note'] = 'Notes'
+DOC_CAT_2['video'] = 'Video'
+DOC_CAT_2['audio'] = 'Audio'
+DOC_CAT_2['resum'] = 'Summary'
+DOC_CAT_2['analyse'] = 'Analysis'
+DOC_CAT_2['quest'] = 'Quest'
+DOC_CAT_2['publi'] = 'Publ'
+DOC_CAT_2['com'] = 'Com'
+DOC_CAT_2['rap'] = 'Rep'
+DOC_CAT_2['recrut'] = 'Recr'
+DOC_CAT_2['guide'] = 'Guide'
+DOC_CAT_2['misc'] = 'Misc'
+
+# C) meta_documents.csv : COLUMN *mimetype
+DOCUMENT_MIMETYPES = ['ese','tei'] # special files (parsed)
+DOCUMENT_MIMETYPES += ['pdf','htm','csv'] # normally displayed docs
+DOCUMENT_MIMETYPES += ['link','ref'] # doc without local file
+
# documents are parsed only if they are in A) & B) & C)
# note that ese is also processed, but in a different way; see importexport.py
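
A quick sketch of that gate, using the names defined above (the helper itself is not part of this commit):

def is_accepted(doc_category1, doc_category2, doc_mimetype):
    # a meta_documents.csv row is imported only if all three columns are accepted
    return (doc_category1 in DOC_CAT_1
            and doc_category2 in DOC_CAT_2
            and doc_mimetype in DOCUMENT_MIMETYPES)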
@@ -73,16 +78,19 @@
############################################################ SPEAKERS meta_speakers.csv
# Speaker type translator from meta_speakers.csv
SPEAKER_TYPE_CSV_DICT = {
- 'speaker':'SPK',
- 'investigator':'INV',
- 'protagonist':'PRO',
+ 'investigator' :'INV', # (esBrowse off) researcher / interviewer
+ 'speaker' :'SPK', # (public) main spk(s) interviewed
+ 'protagonist' :'PRO', # (public) not interviewed
+ 'figurant' :'FIG', # (esBrowse off) just mentioned
}
+# only public (i.e. listed on esBrowse) speakers are considered for viz: attributes, ngrams, etc.
# Speaker types for django model
SPEAKER_TYPE_CHOICES = (
('INV', 'Investigator'),
('SPK', 'Speaker'),
('PRO', 'Protagonist'),
+ ('FIG', 'Figurant'),
('OTH', 'Unknown'),
)
@@ -91,6 +99,7 @@
SPK_COLORS['INV']='#EFEDFC'
SPK_COLORS['SPK']='#E3FBE9'
SPK_COLORS['PRO']='#FFDC98'
+SPK_COLORS['FIG']='#FFDC98'
SPK_COLORS['OTH']='#FFFFFF'
# To know if we show/hide spk attributes in the view - based on .startswith("_")
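
For illustration, that underscore convention could read like this (a sketch; the attribute name field is an assumption, not shown in this diff):

# hide speaker attributes whose name starts with "_"
visible = [a for a in speaker.attributes.all() if not a.name.startswith("_")]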
View
@@ -18,7 +18,8 @@
from django.contrib.contenttypes.models import ContentType
from django.core import serializers
-#from xml.etree.ElementTree import ElementTree
+
+from xml.etree.ElementTree import ElementTree
from lxml import etree
# converting parsing rtf
@@ -78,7 +79,7 @@ def doFiestaToEnquete(e):
except:
logger.info("["+str(e.id)+"] EXCEPT making streamtimeline viz: texteid="+str(t.id))
- logger.info("["+str(e.id)+"] all texts were sucessfully parsed")
+ logger.info("["+str(e.id)+"] all TEI files were successfully parsed")
####### UPDATE ALL TFIDF
# i.e. fetch ngrams from solr and store them in the django model (it is then easier to make viz using those objects rather than fetching ngrams every time)
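
For reference, the classic weighting behind those stored values is the following (a sketch; in this commit tf, df and tfidf actually arrive precomputed from Solr term vectors):

import math

def tfidf(tf, df, n_docs):
    # term frequency damped by how many documents contain the term
    return tf * math.log(float(n_docs) / df)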
@@ -193,14 +194,14 @@ def importEnqueteUsingMeta(folderPath):
except:
doc_date = datetime.datetime.today()
- ### special for ese
- if doc_category1=='ese':
- try:
- esedict = getEnqueteSurEnqueteJson(file_location,newEnquete)
- newEnquete.ese = simplejson.dumps(esedict,indent=4,ensure_ascii=False)
- newEnquete.save()
- except:
- logger.info(eidstr+"EXCEPT with ESE")
+ ### special for ese: don't create any Texte() model, just parse ese.xml
+ if doc_mimetype=='ese':
+ #try:
+ esedict = getEnqueteSurEnqueteJson(file_location,newEnquete)
+ newEnquete.ese = simplejson.dumps(esedict,indent=4,ensure_ascii=False)
+ newEnquete.save()
+ #except:
+ #logger.info(eidstr+"EXCEPT with ESE")
### if normal cat create doc
elif doc_category1 in DOC_CAT_1.keys() and doc_category2 in DOC_CAT_2.keys():
if doc_mimetype in DOCUMENT_MIMETYPES:
@@ -218,7 +219,7 @@ def importEnqueteUsingMeta(folderPath):
except:
newDocument.filesize = -1
logger.info(eidstr+"EXCEPT file does not exist: "+doc_mimetype+" | "+doc_category1+" | "+doc_category2+" | "+file_location)
- if doc_mimetype=='xml' and doc_category1=='verbatim' and doc_category2=='transcr':
+ if doc_mimetype=='tei':
newDocument.doctype = 'TEI'
newDocument.status = '5'
newDocument.save()
@@ -279,6 +280,7 @@ def importEnqueteUsingMeta(folderPath):
spk_type = SPEAKER_TYPE_CSV_DICT.get(row['*type'],'OTH')
spk_name = row['*pseudo']
newSpeaker,isnew = Speaker.objects.get_or_create(enquete=newEnquete,ddi_id=spk_id,ddi_type=spk_type,name=spk_name)
+ newSpeaker.public = (spk_type=='SPK' or spk_type=='PRO')
for attype in attributetypes:
attval=row[attype.name]
if attval=='':
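
Note that get_or_create() has already saved the speaker before the public flag is set on the line above, so an explicit save is needed if one does not already follow below this hunk (not visible here):

newSpeaker.save()  # persist the public flag; redundant if a save already follows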
@@ -311,50 +313,65 @@ def importEnqueteUsingMeta(folderPath):
###########################################################################
# return json with all data from ese
def getEnqueteSurEnqueteJson(eseXmlPath,e):
- logger.info("["+str(e.id)+"] Fetching ese infos from xml: "+eseXmlPath)
+ eidstr = "["+str(e.id)+"] "
+ logger.info(eidstr+"=========== PARSING ESE XML: "+eseXmlPath)
res={}
- tree = ElementTree()
- tree.parse(eseXmlPath)
+ tree = etree.parse(eseXmlPath)
root = tree.getroot()
baseEseXmlFolder = '/'+'/'.join(eseXmlPath.split('/')[:-1])+'/'
- # Fetching summary
- summary = root.findall('StudyUnit/Summary')[0]
- res['reportpath'] = baseEseXmlFolder + summary.attrib['report']
- res['html'] = summary.text
- res['audiopaths'] = {}
+ out = {}
+ out['audiopaths'] = {}
apacount = 0
-
- # Fetching chapters
- thechapters = []
- for chapter in root.findall('StudyUnit/Chapters/Chapter'):
- chapt = {}
- chapt['name'] = chapter.attrib['name']
- chapt['html'] = chapter.findall('text')[0].text
- thesubchapters = []
- for subChapter in chapter.findall('SubChapter'):
- subchapt = {}
- subchapt['name'] = subChapter.attrib['name']
- subchapt['audiopath'] = subChapter.attrib['location']
- # as the mp3 files may be located
- # either (good) in the _ese folder
- # either in the REANALYSEESE_FILES folder
- # we need to check availability, mmh..
- if os.path.exists(baseEseXmlFolder+subchapt['audiopath']):
- subchapt['audiopath'] = baseEseXmlFolder + subchapt['audiopath']
- else:
- subchapt['audiopath'] = settings.REANALYSEESE_FILES+'/'+e.ddi_id+'/'+ subchapt['audiopath']
- # rather store an id referencing real path in res['audiopaths']
- res['audiopaths'][str(apacount)] = subchapt['audiopath']
- subchapt['audioid'] = str(apacount)
- apacount+=1
- thesubchapters.append(subchapt)
- chapt['subchapters'] = thesubchapters
- thechapters.append(chapt)
- res['chapters'] = thechapters
- return res
+ for lan in ['fr','en']:
+ res = {}
+
+ # Fetching report
+ rep = root.findall('Report')[0]
+ res['reportpath'] = baseEseXmlFolder + rep.find('file[@lang="'+lan+'"]').attrib['location']
+
+ # Fetching chapters
+ thechapters = []
+ for chapter in root.findall('Chapters/Chapter'):
+ chapt = {}
+ chapt['name'] = chapter.find('./title[@lang="'+lan+'"]').text
+ chapt['html'] = chapter.find('./text[@lang="'+lan+'"]').text
+ thesubchapters = []
+ for subChapter in chapter.findall('SubChapter'):
+ #try:
+ subchapt = {}
+ aud = subChapter.find('audio[@lang="'+lan+'"]')
+ subchapt['name'] = aud.attrib['name']
+ subchapt['audiopath'] = aud.attrib['location']
+ # as the mp3 files may be located
+ # either in the _ese folder (the expected place)
+ # or in the REANALYSEESE_FILES folder,
+ # we need to check availability
+ patharchive = baseEseXmlFolder+subchapt['audiopath']
+ if os.path.exists( patharchive ):
+ subchapt['audiopath'] = patharchive
+ else:
+ pathserver = settings.REANALYSEESE_FILES+'/'+e.ddi_id+'/'+ subchapt['audiopath']
+ if os.path.exists( pathserver ):
+ subchapt['audiopath'] = pathserver
+ else:
+ logger.info("["+str(e.id)+"] EXCEPT no audio file: "+patharchive)
+ logger.info("["+str(e.id)+"] EXCEPT no audio file: "+pathserver)
+
+ # rather store an id referencing real path in out['audiopaths']
+ out['audiopaths'][str(apacount)] = subchapt['audiopath']
+ subchapt['audioid'] = str(apacount)
+ apacount+=1
+ thesubchapters.append(subchapt)
+ #except:
+ #logger.info("["+str(e.id)+"] EXCEPT with subchapter")
+ chapt['subchapters'] = thesubchapters
+ thechapters.append(chapt)
+ res['chapters'] = thechapters
+ out[lan] = res
+ return out
###########################################################################
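
For reference, the dict returned by the rewritten getEnqueteSurEnqueteJson() is shaped roughly like this (paths and ids hypothetical):

out = {
    'audiopaths': {'0': '/archive/_ese/chap1_sub1.mp3'},  # audioid -> resolved path, shared across languages
    'fr': {'reportpath': '/archive/_ese/report_fr.pdf',
           'chapters': [{'name': '...', 'html': '...',
                         'subchapters': [{'name': '...', 'audiopath': '...', 'audioid': '0'}]}]},
    'en': {},  # same structure, with English content
}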
@@ -7,25 +7,28 @@
from reanalyse.reanalyseapp.utils import *
from django.db import models
+
+# For math manipulations in TF, DF, TFIDF
+from django.db.models import Avg, Max, Min, Count
+
from xml.etree.ElementTree import ElementTree
+from lxml import etree
+
from django.conf import settings
# for date manip on parsing
import datetime
import simplejson
-from lxml import etree
from itertools import chain
from string import maketrans
import re, os
# Python memory lookup
-import psutil
+#import psutil
-# For math manipulations in TF,DF,TFIDF
-from django.db.models import Avg, Max, Min, Count
####################
import django_tables2 as tables
from django.utils.safestring import mark_safe
@@ -179,6 +182,7 @@ class Speaker(models.Model):
## USED TO KNOW (investigator/speaker/protagonist)
ddi_type = models.CharField(max_length=3, choices=SPEAKER_TYPE_CHOICES)
#################
+ public = models.BooleanField(default=False) # visible in esbrowse or not
attributes = models.ManyToManyField(Attribute)
color = models.CharField(max_length=7,default=HTML_COLORS[0])
# we may have to put all his text content in a TextField, to index with solr...
@@ -23,7 +23,7 @@ def docPublicCount(e):
return e.texte_set.count()
@register.filter
def spkPublicCount(e):
- return e.speaker_set.filter(ddi_type='SPK').count()
+ return e.speaker_set.filter(public=True).count()
###########################################################################
# to get speaker description ON THE LEFT
@register.filter
@@ -206,6 +206,9 @@ def eDelete(request,eid):
logger.info("["+str(eid)+"] removing graph file: "+v.locationpath)
os.system("rm -R "+v.locationpath)
e.delete()
+ # update the search index to avoid outdated data in Lucene
+ update_index.Command().handle(verbosity=0)
+ logger.info("SOLR INDEX UPDATED")
return render_to_response('bq_e_browse.html', context_instance=RequestContext(request))
###########################################################################
def logoutuser(request):
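
The update_index call above assumes a search-index management command is importable in this module; with django-haystack (an assumption, the import is not shown in this diff) that would be:

from haystack.management.commands import update_index  # assumed, not visible in this diff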
@@ -686,7 +689,8 @@ def eseShow(request,eid):
ese = simplejson.loads(e.ese)
else:
ese = None
- ctx = {'bodyid':'e','pageid':'ese','enquete':e,'ese':ese}
+ lan = request.LANGUAGE_CODE
+ ctx = {'bodyid':'e','pageid':'ese','enquete':e,'ese':ese[lan]}
updateCtxWithPerm(ctx,request,e)
updateCtxWithSearchForm(ctx)
return render_to_response('bq_e_ese.html',ctx ,context_instance=RequestContext(request))
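
One caveat in eseShow: ese can still be None here, and request.LANGUAGE_CODE may be neither 'fr' nor 'en', in which case ese[lan] raises. A hedged guard, not part of the commit:

lan = request.LANGUAGE_CODE
ese_lang = ese.get(lan, ese.get('fr')) if ese else None  # fall back to French, tolerate missing ese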
@@ -1187,7 +1191,8 @@ def ecShow(request,eid,cid):
def getEseReport(request,eid):
e = Enquete.objects.get(id=eid)
ese = simplejson.loads(e.ese)
- filepath = ese['reportpath']
+ lan = request.LANGUAGE_CODE
+ filepath = ese[lan]['reportpath']
logger.info("["+str(eid)+"] Downloading ESE report:"+filepath)
#pdfname = eseid + "_"+ese.report.split('/')[-1]
pdfname = 'enquetesurenquete.pdf'
@@ -99,7 +99,7 @@ def makeViz(e,typ,speakers=[],textes=[],attributetypes=[],count=0):
newVizu = makeVisualizationObject(e,typ,descr)
if speakers==[]:
if textes==[]:
- speakers = e.speaker_set.exclude(ddi_type='INV')
+ speakers = e.speaker_set.filter(public=True)
else:
speakers=[]
for t in textes:
@@ -122,7 +122,7 @@ def makeViz(e,typ,speakers=[],textes=[],attributetypes=[],count=0):
if textes==[]:
textes = Texte.objects.filter(doctype='TEI')
for t in textes:
- spks = t.speaker_set.exclude(ddi_type='INV')
+ spks = t.speaker_set.filter(public=True)
if len(spks)>0:
newVizu = makeVisualizationObject(e,typ,descr)
d = visMakeTagCloudFromTermVectors(e,{'count':count,'who':spks})
@@ -553,7 +553,7 @@ def visMakeOverview(e):
##### TRYOUT C : only speakers
links=[]
- for s in e.speaker_set.exclude(ddi_type='INV'):
+ for s in e.speaker_set.filter(public=True):
links.append({'spk':{'id':s.id,'label':s.name,'weight':0}})
res['links']=links
return res
@@ -862,6 +862,7 @@ def getSolrTermVectorsDict(speakers,field,mintn): # field = 'text'/'ngrams'
####################################################################
# to avoid querying solr everyday, we store ngrams in DB
def makeAllTfidf(e):
+ logger.info("["+str(e.id)+"] now updating tfidf ...")
for s in e.speaker_set.all():
#logger.info("now reseting tfidf ngrams for speaker:"+str(s.id))
s.ngramspeaker_set.all().delete()
@@ -870,6 +871,7 @@ def makeAllTfidf(e):
d=termd[w]
newNgram,isnew = Ngram.objects.get_or_create(enquete=e,content=w,df=d['df'])
newNgramSpeaker,isnew = NgramSpeaker.objects.get_or_create(enquete=e,ngram=newNgram,speaker=s,tf=d['tf'],tn=d['tn'],tfidf=d['tfidf'])
+ logger.info("["+str(e.id)+"] tfidf successfully updated")
####################################################################
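
A minimal sketch of how the stored ngrams could then be queried for a viz (model names from this diff; the ordering is an assumption):

top = NgramSpeaker.objects.filter(speaker=s).order_by('-tfidf')[:50]  # s: a Speaker instance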
