Skip to content
This repository
Browse code

ese multilang closes #35

  • Loading branch information...
commit b1a29d662cb5e7590394a5a956f75e897effd17a 1 parent 1ecfa77
Pierre JdlF authored March 09, 2012
BIN  media/samples/enquete.zip
Binary file not shown
63  reanalyseapp/globalvars.py
@@ -24,34 +24,39 @@
24 24
 
25 25
 ############################################################ DOCUMENTS meta_documents.csv
26 26
 # A) meta_documents.csv : COLUMN *category1 that are accepted, and their translation in the view
  27
+# QUALI "RESEARCH PHASE"
27 28
 DOC_CAT_1={}
28  
-DOC_CAT_1['preparatory'] = 'Preparatory'
29  
-DOC_CAT_1['terrain'] = 'Fieldwork'
30  
-DOC_CAT_1['data'] = 'Data'
31  
-DOC_CAT_1['verbatim'] = 'Verbatim'
32  
-DOC_CAT_1['result'] = 'Result'
33  
-DOC_CAT_1['ese'] = 'ese'
34  
-DOC_CAT_1['misc'] = 'Misc'
  29
+DOC_CAT_1['preparatory']	= 'Preparatory'
  30
+DOC_CAT_1['terrain'] 		= 'Fieldwork'
  31
+DOC_CAT_1['data'] 			= 'Data'
  32
+DOC_CAT_1['result'] 		= 'Result'
  33
+DOC_CAT_1['enrich'] 		= 'Complement'
  34
+DOC_CAT_1['misc'] 			= 'Misc'
  35
+DOC_CAT_1['verbatim'] 		= '?Verbatim?'	
35 36
 
36 37
 # B) meta_documents.csv : COLUMN *category2 that are accepted, and their translation in the view
  38
+# QUALI "DOCUMENT TYPE"
37 39
 DOC_CAT_2={}
38  
-DOC_CAT_2['pv'] = 'Minutes'
39  
-DOC_CAT_2['note'] = 'Notes'
40  
-DOC_CAT_2['transcr'] = 'Transcr'
41  
-DOC_CAT_2['video'] = 'Video'
42  
-DOC_CAT_2['audio'] = 'Audio'
43  
-DOC_CAT_2['resum'] = 'Summary'
44  
-DOC_CAT_2['analyse'] = 'Analysis'
45  
-DOC_CAT_2['quest'] = 'Quest'
46  
-DOC_CAT_2['publi'] = 'Publ'
47  
-DOC_CAT_2['com'] = 'Com'
48  
-DOC_CAT_2['rap'] = 'Rep'
49  
-DOC_CAT_2['recrut'] = 'Recr'
50  
-DOC_CAT_2['guide'] = 'Guide'
51  
-DOC_CAT_2['misc'] = 'Misc'
52  
-
53  
-# C) meta_documents.csv : COLUMN *mimetype
54  
-DOCUMENT_MIMETYPES=['xml','pdf','htm','csv','link','ref']
  40
+DOC_CAT_2['transcr'] 	= 'Transcr'
  41
+DOC_CAT_2['pv'] 		= 'Minutes'
  42
+DOC_CAT_2['note'] 		= 'Notes'
  43
+DOC_CAT_2['video'] 		= 'Video'
  44
+DOC_CAT_2['audio'] 		= 'Audio'
  45
+DOC_CAT_2['resum'] 		= 'Summary'
  46
+DOC_CAT_2['analyse'] 	= 'Analysis'
  47
+DOC_CAT_2['quest'] 		= 'Quest'
  48
+DOC_CAT_2['publi'] 		= 'Publ'
  49
+DOC_CAT_2['com'] 		= 'Com'
  50
+DOC_CAT_2['rap'] 		= 'Rep'
  51
+DOC_CAT_2['recrut'] 	= 'Recr'
  52
+DOC_CAT_2['guide'] 		= 'Guide'
  53
+DOC_CAT_2['misc'] 		= 'Misc'
  54
+
  55
+# C) meta_documents.csv : COLUMN *mimetype . 
  56
+DOCUMENT_MIMETYPES 	=  ['ese','tei']			# special files (parsed)
  57
+DOCUMENT_MIMETYPES	+= ['pdf','htm','csv']		# normally displayed docs
  58
+DOCUMENT_MIMETYPES 	+= ['link','ref']			# doc without local file
  59
+
55 60
 
56 61
 # documents are parsed only if they are in A) & B) & C)
57 62
 # note that ese is also processed, but in a different way. see imexport.py
@@ -73,16 +78,19 @@
73 78
 ############################################################ SPEAKERS meta_speakers.csv
74 79
 # Speaker type translator from meta_speakers.csv
75 80
 SPEAKER_TYPE_CSV_DICT = {
76  
-	'speaker':'SPK',
77  
-	'investigator':'INV',
78  
-	'protagonist':'PRO',
  81
+	'investigator'	:'INV',		# (esBrowse off) researcher / interviewer
  82
+	'speaker'		:'SPK',		# (public) main spk(s) interviewed
  83
+	'protagonist'	:'PRO',		# (public) not interviewed
  84
+	'figurant'		:'FIG',		# (esBrowse off) just mentioned
79 85
 }
  86
+# only public (ie listed on esBrowse) speakers are considered for viz: attributes, ngrams, etc...
80 87
 
81 88
 # Speaker types for django model
82 89
 SPEAKER_TYPE_CHOICES = (
83 90
 	('INV', 'Investigator'),
84 91
 	('SPK', 'Speaker'),
85 92
 	('PRO', 'Protagonist'),
  93
+	('FIG', 'Figurant'),
86 94
 	('OTH', 'Unknown'),
87 95
 )
88 96
 
@@ -91,6 +99,7 @@
91 99
 SPK_COLORS['INV']='#EFEDFC'
92 100
 SPK_COLORS['SPK']='#E3FBE9'
93 101
 SPK_COLORS['PRO']='#FFDC98'
  102
+SPK_COLORS['FIG']='#FFDC98'
94 103
 SPK_COLORS['OTH']='#FFFFFF'
95 104
 
96 105
 # To know if we show/hide spk attributes in the view - based on .startswith("_")
113  reanalyseapp/imexport.py
@@ -18,7 +18,8 @@
18 18
 from django.contrib.contenttypes.models import ContentType
19 19
 
20 20
 from django.core import serializers
21  
-#from xml.etree.ElementTree import ElementTree
  21
+
  22
+from xml.etree.ElementTree import ElementTree
22 23
 from lxml import etree
23 24
 
24 25
 # converting parsing rtf
@@ -78,7 +79,7 @@ def doFiestaToEnquete(e):
78 79
 		except:
79 80
 			logger.info("["+str(e.id)+"] EXCEPT making streamtimeline viz: texteid="+str(t.id))
80 81
 		
81  
-	logger.info("["+str(e.id)+"] all texts were sucessfully parsed")
  82
+	logger.info("["+str(e.id)+"] all TEI files were sucessfully parsed")
82 83
 	
83 84
 	####### UPDATE ALL TFIDF
84 85
 	# i.e. fetch ngrams from solr and store them in the django model (it is then easier to make viz using those objects rather than fetching ngrams every time)
@@ -193,14 +194,14 @@ def importEnqueteUsingMeta(folderPath):
193 194
 				except:
194 195
 					doc_date = datetime.datetime.today()
195 196
 
196  
-				### special for ese
197  
-				if doc_category1=='ese':
198  
-					try:
199  
-						esedict = getEnqueteSurEnqueteJson(file_location,newEnquete)
200  
-						newEnquete.ese = simplejson.dumps(esedict,indent=4,ensure_ascii=False)
201  
-						newEnquete.save()
202  
-					except:
203  
-						logger.info(eidstr+"EXCEPT with ESE")
  197
+				### special for ese, don't create any texte() model, just parse ese.xml
  198
+				if doc_mimetype=='ese':
  199
+					#try:
  200
+					esedict = getEnqueteSurEnqueteJson(file_location,newEnquete)
  201
+					newEnquete.ese = simplejson.dumps(esedict,indent=4,ensure_ascii=False)
  202
+					newEnquete.save()
  203
+					#except:
  204
+						#logger.info(eidstr+"EXCEPT with ESE")
204 205
 				### if normal cat create doc
205 206
 				elif doc_category1 in DOC_CAT_1.keys() and doc_category2 in DOC_CAT_2.keys():
206 207
 					if doc_mimetype in DOCUMENT_MIMETYPES:
@@ -218,7 +219,7 @@ def importEnqueteUsingMeta(folderPath):
218 219
 							except:
219 220
 								newDocument.filesize = -1
220 221
 								logger.info(eidstr+"EXCEPT file does not exist: "+doc_mimetype+" | "+doc_category1+" | "+doc_category2+" | "+file_location)
221  
-							if doc_mimetype=='xml' and doc_category1=='verbatim' and doc_category2=='transcr':
  222
+							if doc_mimetype=='tei':
222 223
 								newDocument.doctype	= 'TEI'
223 224
 								newDocument.status	= '5'
224 225
 								newDocument.save()
@@ -279,6 +280,7 @@ def importEnqueteUsingMeta(folderPath):
279 280
 					spk_type = 	SPEAKER_TYPE_CSV_DICT.get(row['*type'],'OTH')
280 281
 					spk_name = 	row['*pseudo']
281 282
 					newSpeaker,isnew = Speaker.objects.get_or_create(enquete=newEnquete,ddi_id=spk_id,ddi_type=spk_type,name=spk_name)
  283
+					newSpeaker.public = (spk_type=='SPK' or spk_type=='PRO')
282 284
 					for attype in attributetypes:
283 285
 						attval=row[attype.name]
284 286
 						if attval=='':
@@ -311,50 +313,65 @@ def importEnqueteUsingMeta(folderPath):
311 313
 ###########################################################################
312 314
 # return json with all data from ese
313 315
 def getEnqueteSurEnqueteJson(eseXmlPath,e):
314  
-	logger.info("["+str(e.id)+"] Fetching ese infos from xml: "+eseXmlPath)
  316
+	eidstr = "["+str(e.id)+"] "
  317
+	logger.info(eidstr+"=========== PARSING ESE XML: "+eseXmlPath)
315 318
 	res={}
316 319
 	
317  
-	tree = ElementTree()
318  
-	tree.parse(eseXmlPath)
  320
+	tree = etree.parse(eseXmlPath)
319 321
 	root = tree.getroot()
320 322
 	
321 323
 	baseEseXmlFolder = '/'+'/'.join(eseXmlPath.split('/')[:-1])+'/'
322 324
 	
323  
-	# Fetching summary
324  
-	summary = root.findall('StudyUnit/Summary')[0]
325  
-	res['reportpath'] = baseEseXmlFolder + summary.attrib['report']
326  
-	res['html'] = summary.text
327  
-	res['audiopaths'] = {}
  325
+	out = {}
  326
+	out['audiopaths'] = {}
328 327
 	apacount = 0
329  
-	
330  
-	# Fetching chapters
331  
-	thechapters = []
332  
-	for chapter in root.findall('StudyUnit/Chapters/Chapter'):
333  
-		chapt = {}
334  
-		chapt['name'] = chapter.attrib['name']
335  
-		chapt['html'] = chapter.findall('text')[0].text
336  
-		thesubchapters = []
337  
-		for subChapter in chapter.findall('SubChapter'):
338  
-			subchapt = {}
339  
-			subchapt['name'] = subChapter.attrib['name']
340  
-			subchapt['audiopath'] = subChapter.attrib['location']
341  
-			# as the mp3 files may be located
342  
-			# either (good) in the _ese folder
343  
-			# either in the REANALYSEESE_FILES folder
344  
-			# we need to check availability, mmh..
345  
-			if os.path.exists(baseEseXmlFolder+subchapt['audiopath']):
346  
-				subchapt['audiopath'] = baseEseXmlFolder + subchapt['audiopath']
347  
-			else:
348  
-				subchapt['audiopath'] = settings.REANALYSEESE_FILES+'/'+e.ddi_id+'/'+ subchapt['audiopath']
349  
-			# rather store an id referencing real path in res['audiopaths']
350  
-			res['audiopaths'][str(apacount)] = subchapt['audiopath']
351  
-			subchapt['audioid'] = str(apacount)
352  
-			apacount+=1
353  
-			thesubchapters.append(subchapt)
354  
-		chapt['subchapters'] = thesubchapters
355  
-		thechapters.append(chapt)
356  
-	res['chapters'] = thechapters
357  
-	return res
  328
+	for lan in ['fr','en']:
  329
+		res = {}
  330
+		
  331
+		# Fetching report
  332
+		rep = root.findall('Report')[0]
  333
+		res['reportpath'] = baseEseXmlFolder + rep.find('file[@lang="'+lan+'"]').attrib['location']
  334
+		
  335
+		# Fetching chapters
  336
+		thechapters = []
  337
+		for chapter in root.findall('Chapters/Chapter'):
  338
+			chapt = {}
  339
+			chapt['name'] = chapter.find('./title[@lang="'+lan+'"]').text
  340
+			chapt['html'] = chapter.find('./text[@lang="'+lan+'"]').text
  341
+			thesubchapters = []
  342
+			for subChapter in chapter.findall('SubChapter'):
  343
+				#try:
  344
+				subchapt = {}
  345
+				aud = subChapter.find('audio[@lang="'+lan+'"]')
  346
+				subchapt['name'] 		= aud.attrib['name']
  347
+				subchapt['audiopath'] 	= aud.attrib['location']
  348
+				# as the mp3 files may be located
  349
+				# either (good) in the _ese folder
  350
+				# or in the REANALYSEESE_FILES folder
  351
+				# we need to check availability, mmh..
  352
+				patharchive = baseEseXmlFolder+subchapt['audiopath']
  353
+				if os.path.exists( patharchive ):
  354
+					subchapt['audiopath'] = patharchive
  355
+				else:
  356
+					pathserver = settings.REANALYSEESE_FILES+'/'+e.ddi_id+'/'+ subchapt['audiopath']
  357
+					if os.path.exists( pathserver ):
  358
+						subchapt['audiopath'] = pathserver
  359
+					else:
  360
+						logger.info("["+str(e.id)+"] EXCEPT no audio file: "+patharchive)
  361
+						logger.info("["+str(e.id)+"] EXCEPT no audio file: "+pathserver)
  362
+					
  363
+				# rather store an id referencing real path in out['audiopaths']
  364
+				out['audiopaths'][str(apacount)] = subchapt['audiopath']
  365
+				subchapt['audioid'] = str(apacount)
  366
+				apacount+=1
  367
+				thesubchapters.append(subchapt)
  368
+				#except:
  369
+					#logger.info("["+str(e.id)+"] EXCEPT with subchapter")
  370
+			chapt['subchapters'] = thesubchapters
  371
+			thechapters.append(chapt)
  372
+		res['chapters'] = thechapters
  373
+		out[lan] = res
  374
+	return out
358 375
 ###########################################################################
359 376
 
360 377
 
12  reanalyseapp/models.py
@@ -7,7 +7,13 @@
7 7
 from reanalyse.reanalyseapp.utils import *
8 8
 
9 9
 from django.db import models
  10
+
  11
+# For math manipulations in TF,DF,TFIDF
  12
+from django.db.models import Avg, Max, Min, Count
  13
+
10 14
 from xml.etree.ElementTree import ElementTree
  15
+from lxml import etree
  16
+
11 17
 from django.conf import settings
12 18
 
13 19
 # for date manip on parsing
@@ -15,17 +21,14 @@
15 21
 
16 22
 import simplejson
17 23
 
18  
-from lxml import etree
19 24
 from itertools import chain
20 25
 
21 26
 from string import maketrans
22 27
 import re, os
23 28
 
24 29
 # Python memory lookup
25  
-import psutil
  30
+#import psutil
26 31
 
27  
-# For math manipulations in TF,DF,TFIDF
28  
-from django.db.models import Avg, Max, Min, Count
29 32
 ####################
30 33
 import django_tables2 as tables
31 34
 from django.utils.safestring import mark_safe
@@ -179,6 +182,7 @@ class Speaker(models.Model):
179 182
 	## USED TO KNOW (investigator/speaker/protagonist)
180 183
 	ddi_type = models.CharField(max_length=3, choices=SPEAKER_TYPE_CHOICES)
181 184
 	#################
  185
+	public = models.BooleanField(default=False)					# visible in esbrowse or not
182 186
 	attributes = models.ManyToManyField(Attribute)
183 187
 	color = models.CharField(max_length=7,default=HTML_COLORS[0])
184 188
 	# we may have to put all his text content in a TextField, to index with solr...
2  reanalyseapp/templatetags/tags.py
@@ -23,7 +23,7 @@ def docPublicCount(e):
23 23
 	return e.texte_set.count()
24 24
 @register.filter
25 25
 def spkPublicCount(e):
26  
-	return e.speaker_set.filter(ddi_type='SPK').count()
  26
+	return e.speaker_set.filter(public=True).count()
27 27
 ###########################################################################    
28 28
 # to get speaker description ON THE LEFT
29 29
 @register.filter
9  reanalyseapp/views.py
@@ -206,6 +206,9 @@ def eDelete(request,eid):
206 206
 				logger.info("["+str(eid)+"] removing graph file: "+v.locationpath)
207 207
 				os.system("rm -R "+v.locationpath)
208 208
 		e.delete()
  209
+		# update index to avoid outdated data in lucene
  210
+		update_index.Command().handle(verbosity=0)
  211
+		logger.info("SOLR INDEX UPDATED")
209 212
 	return render_to_response('bq_e_browse.html', context_instance=RequestContext(request))
210 213
 ###########################################################################
211 214
 def logoutuser(request):
@@ -686,7 +689,8 @@ def eseShow(request,eid):
686 689
 		ese = simplejson.loads(e.ese)
687 690
 	else:
688 691
 		ese = None
689  
-	ctx = {'bodyid':'e','pageid':'ese','enquete':e,'ese':ese}
  692
+	lan = request.LANGUAGE_CODE
  693
+	ctx = {'bodyid':'e','pageid':'ese','enquete':e,'ese':ese[lan]}
690 694
 	updateCtxWithPerm(ctx,request,e)
691 695
 	updateCtxWithSearchForm(ctx)
692 696
 	return render_to_response('bq_e_ese.html',ctx ,context_instance=RequestContext(request))
@@ -1187,7 +1191,8 @@ def ecShow(request,eid,cid):
1187 1191
 def getEseReport(request,eid):
1188 1192
 	e = Enquete.objects.get(id=eid)
1189 1193
 	ese = simplejson.loads(e.ese)
1190  
-	filepath = ese['reportpath']
  1194
+	lan = request.LANGUAGE_CODE
  1195
+	filepath = ese[lan]['reportpath']
1191 1196
 	logger.info("["+str(eid)+"] Downloading ESE report:"+filepath)
1192 1197
 	#pdfname = eseid + "_"+ese.report.split('/')[-1]
1193 1198
 	pdfname = 'enquetesurenquete.pdf'
8  reanalyseapp/visualization.py
@@ -99,7 +99,7 @@ def makeViz(e,typ,speakers=[],textes=[],attributetypes=[],count=0):
99 99
 		newVizu = makeVisualizationObject(e,typ,descr)
100 100
 		if speakers==[]:
101 101
 			if textes==[]:
102  
-				speakers = e.speaker_set.exclude(ddi_type='INV')
  102
+				speakers = e.speaker_set.filter(public=True)
103 103
 			else:
104 104
 				speakers=[]
105 105
 				for t in textes:
@@ -122,7 +122,7 @@ def makeViz(e,typ,speakers=[],textes=[],attributetypes=[],count=0):
122 122
 			if textes==[]:
123 123
 				textes = Texte.objects.filter(doctype='TEI')
124 124
 			for t in textes:
125  
-				spks = t.speaker_set.exclude(ddi_type='INV')
  125
+				spks = t.speaker_set.filter(public=True)
126 126
 				if len(spks)>0:
127 127
 					newVizu = makeVisualizationObject(e,typ,descr)
128 128
 					d = visMakeTagCloudFromTermVectors(e,{'count':count,'who':spks})
@@ -553,7 +553,7 @@ def visMakeOverview(e):
553 553
 	
554 554
 	##### TRYOUT C : only speakers
555 555
 	links=[]
556  
-	for s in e.speaker_set.exclude(ddi_type='INV'):
  556
+	for s in e.speaker_set.exclude(public=True):
557 557
 		links.append({'spk':{'id':s.id,'label':s.name,'weight':0}})
558 558
 	res['links']=links
559 559
 	return res
@@ -862,6 +862,7 @@ def getSolrTermVectorsDict(speakers,field,mintn): # field = 'text'/'ngrams'
862 862
 ####################################################################
863 863
 # to avoid querying solr everyday, we store ngrams in DB
864 864
 def makeAllTfidf(e):
  865
+	logger.info("["+str(e.id)+"] now updating tfidf ...")
865 866
 	for s in e.speaker_set.all():
866 867
 		#logger.info("now reseting tfidf ngrams for speaker:"+str(s.id))
867 868
 		s.ngramspeaker_set.all().delete()
@@ -870,6 +871,7 @@ def makeAllTfidf(e):
870 871
 			d=termd[w]
871 872
 			newNgram,isnew = Ngram.objects.get_or_create(enquete=e,content=w,df=d['df'])	
872 873
 			newNgramSpeaker,isnew = NgramSpeaker.objects.get_or_create(enquete=e,ngram=newNgram,speaker=s,tf=d['tf'],tn=d['tn'],tfidf=d['tfidf'])
  874
+	logger.info("["+str(e.id)+"] tfidf sucessfully updated")
873 875
 ####################################################################
874 876
 
875 877
 

0 notes on commit b1a29d6

Please sign in to comment.
Something went wrong with that request. Please try again.