In [72]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-


# This file is (was?) in /l/rkarhila/speecon_wsj_phoneme_dnn/data_preprocessing

#
#  1. Divide each data file into single phoneme chunks based on aliged labels
#
#  2. Run the chunks through feature extraction shell script
#
#  3. Store the features and their associated phoneme information in arrays
#
#  4. Pickle for future normalisation (with other corpora) 
#

import io
import os
import numpy as np
from subprocess import Popen, PIPE, STDOUT
import re
import math 
import struct
import time
import sys
import struct
import random

#
# Use some funky structure from tensor flow to store 3d-matrices of variable length more compactly.
#
import tensorflow as tf

#
# A function that will be useful:
#

def mkdir(path):
    try:
        os.makedirs(path)        
    except OSError as exc:  # Python >2.5
        print ("dir %s exists" % path)

#
# Some more output?
#
debug=True
global debug


In [73]:
preprocessing_scripts = [{'script': '../feature_extraction_scripts/preprocess_pfstar.sh', 'name' : 'clean', 'parameters': [[0,0], [0,0]] },
                         {'script': '../feature_extraction_scripts/preprocess_pfstar.sh', 'name' : 'clean', 'parameters': [[0,0], [0,0]] },
                         {'script': '../feature_extraction_scripts/preprocess_pfstar_and_overdrive.sh', 'name' : 'overdrive', 'parameters': [[1,10], [-20,0]] },
                         {'script': '../feature_extraction_scripts/preprocess_pfstar_and_overdrive.sh', 'name' : 'underdrive', 'parameters': [[-40,-20], [0,0]] },
                         {'script': '../feature_extraction_scripts/preprocess_pfstar_and_add_babble.sh', 'name' : 'babbled', 'parameters': [[-40,-10],[-20,0]] },
                         {'script': '../feature_extraction_scripts/preprocess_pfstar_and_add_humming.sh', 'name' : 'volvo', 'parameters': [[-30,-10],[-20,0]] } ]

feature_extraction_script = '../feature_extraction_scripts/extract_with_start_end.sh'

In [74]:
samples_per_class_per_speaker = 20
fs = 16000


vowels = ['a','A','å','Å','ä','Ä','e','E','f','i','I','o','O','ö','u','U']

nonvow = ['b','C','d','D','g','H','j','J','k','l','m','n','N','p','P','Q','r','R','s','S','t','T','v','w','W','Y','z','Z']

combinations = []


used_classes = vowels+nonvow+combinations
classes_name = "mc_en_uk_all"


#
# Settings for feature extraction:
#

datatypelength = 2 # 16 bits = 2 bytes, no?

frame_length = 400
frame_step = 128

frame_leftovers = frame_length-frame_step

padding_array = bytearray()

progress_length = 80

max_num_samples=8000 # 0.5 should be enough for any reasonable phoneme, right?

max_num_classes = 10000
feature_dimension=30

max_num_frames=40
max_num_monoclasses = 200
max_num_monoclasses = 9


#max_num_samples=100160
assigned_num_samples=100



# tmp directory for feature extraction.
# This should reside in memory (tempfs or whatever it's called, often under /dev/shm/)

tmp_dir="/dev/shm/siak-feat-extract-python-"+str(time.time())
try:
    os.makedirs(tmp_dir)
except OSError as exc:  # Python >2.5
    if exc.errno == errno.EEXIST and os.path.isdir(tmp_dir):
        pass
    else:
        raise   


print ('using tmp dir %s' % tmp_dir)

using tmp dir /dev/shm/siak-feat-extract-python-1480952158.3560872


##  Dataset definitions ##
*In a very awkward manner, we'll specify some local files that contain list of audio and transcription files*

In [75]:

#
#   Data collection defitinions - train, dev and eval sets:
#


corpus = "en_uk_kids_align_from_clean"
pickle_dir='../features/work_in_progress/'+corpus+'/pickles'
statistics_dir = '../features/work_in_progress/'+corpus+'/statistics/'

collections = [                                                                                                          
    { 'name' : 'train-0',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.train.00',
      'condition' : 'clean',
      'numlines': 878 },
    { 'name' : 'train-1',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.train.01',
      'condition' : 'clean',
      'numlines': 1083 },
    { 'name' : 'train-2',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.train.02',
      'condition' : 'clean',
      'numlines': 946 },
    { 'name' : 'train-3',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.train.03',
      'condition' : 'clean',
      'numlines': 870 },
    { 'name' : 'train-4',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.train.04',
      'condition' : 'clean',
      'numlines': 651 },
    { 'name' : 'train-5',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.train.05',
      'condition' : 'clean',
      'numlines': 785},
    { 'name' : 'train-6',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.train.06',
      'condition' : 'clean',
      'numlines': 699 },
    { 'name' : 'train-7',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.train.07',
      'condition' : 'clean',
      'numlines': 699 },
    { 'name' : 'test-0',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.test.00',
      'condition' : 'clean',
      'numlines': 852 },
    { 'name' : 'test-1',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.test.01',
      'condition' : 'clean',
      'numlines': 752 },
    { 'name' : 'test-2',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.test.02',
      'condition' : 'clean',
      'numlines': 594 },
    { 'name' : 'test-3',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.test.03',
      'condition' : 'clean',
      'numlines': 758 },
    { 'name' : 'test-4',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.test.04',
      'condition' : 'clean',
      'numlines': 734 },
    { 'name' : 'test-5',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.test.05',
      'condition' : 'clean',
      'numlines': 393},
    { 'name' : 'eval-0',
      'recipe' : '/l/rkarhila/speecon_wsj_phoneme_dnn/kids_en_uk/leave_one_out_recipes/recipe.speakers.eval.00',
      'condition' : 'clean',
      'numlines': 837 }
]


featdim1 = -1;
featdim2 = -1;

means_set = False
means = -1;
stds = -1;
new_pickle_dir = "-1"

classes = {}



## Some helper functions ##
*Label processing etc.*

In [126]:
def process_label( labelfile ):
    global debug
    
    if not os.path.isfile(labelfile):
        print ("Can't find labelfile %s" % labelfile)
        return False
    
    with io.open(labelfile ,'r',encoding='iso-8859-15') as f:

        new_align = []

        current_start = 0
        current_end = 0
        current_model = False
        current_premodel = False
        current_postmodel = False

        skip = False

        phonect = 0
        statect = 0

        lcounter = 0

        # For printing the phoneme sequences into a log:
        skipmark=False

        startmark=-1
        endmark = -1

        discard = False

        phone={}

        for l in  f.readlines():

            # If we have a short pause model:
            #if '+' not in l:
            #    no_skipping = True
            #    skipmark = True

            # We'll process the label line by line with a two-phone delay:

            if '+' in l and not discard:
                #print "Looking at %s"%(l)
                [start, 
                 end, 
                 premodel, 
                 model, 
                 postmodel, 
                 state] = re.split(r'[ .+-]', l.strip() ) #, l.encode('utf-8').strip() )

                if state=='0':

                    phone = {'start':start, 
                             'premodel':premodel, 
                             'model': model,
                             'postmodel':postmodel,
                             'state':state,
                             'triphone': "%s-%s+%s" % (premodel, model, postmodel) }

                if state=='2':
                    phone['end'] = end

                    if (phone['model'] != '__'):

                        if (int(phone['end'])-int(phone['start']))/frame_step == 3:
                            #discard_counter+=1
                            #print "Discarding %i/%i: %s: (Too short! Discards: %0.2f%s)" % (recipefilecounter, collection['numlines'], labelfile, 100.0*discard_counter/collection['numlines'],"%" )

                            discard = True

                        #elif (int(phone['end'])-int(phone['start']))/frame_step > 40 and '_' not in phone['triphone']:
                        #    #print "Discarding %i/%i: %s (Too Long! Discards: %0.2f%s)" % (recipefilecounter, collection['numlines'], labelfile, 100.0*discard_counter/collection['numlines'],"%" )
                        #    discard_counter+=1
                        #    discard = True

                        #if debug:
                        #    print ("saving %s-%s+%s " %  (phone['premodel'], phone['model'],phone['postmodel']))
                        else:
                            new_align.append({'pre' : phone['premodel'],
                                          'model' : phone['model'],
                                          'post' : phone['postmodel'],
                                          'start' : phone['start'],
                                          'end' : phone['end'],
                                          'triphone' : phone['triphone'],
                                          'sortable': "%s--%s++%s" % (phone['model'] , phone['premodel'], phone['postmodel'])
                                      })


    return new_align
    

In [127]:
def get_labelstring( new_align ):
    labelstring = ''
    for phone in new_align:
        labelstring += '.'+phone['model']
    return labelstring    

In [128]:
def chop_features( audiodata, feature_array, new_align ):
    global debug
    
    triphones = []

    startmark = int(new_align[0]['start'])
    endmark= int(new_align[-1]['end'])
    
    for l in new_align:                

        lkey = l['sortable']
        mkey = l['model']
        tp = l['triphone']

        l_start = (int(l['start'])-startmark)/frame_step
        l_end =  (int(l['end'])-startmark)/frame_step
        l_length = l_end - l_start

        if (l_length == 3):
            tooshortcount+=1
            continue

        # For debugging, let's write this stuff to disk:
        if mkey not in quality_control_audio_files.keys():
            qual_file = os.path.join(quality_control_wavdir,  mkey+".wav")
            quality_control_audio_files[mkey] = open( qual_file , 'wb')

        win_i=0
        win_len=128
        max_val=32000
        norm=20000.0/max(abs(audiodata[int(l['start']):int(l['end'])]))
        #print norm

        for val in audiodata[int(l['start']):int(l['start'])+win_len]:
            (quality_control_audio_files[mkey]).write( 
                    struct.pack( 'h', int( min( max_val,norm * val * win_i / win_len ) ) ) )
            win_i+=1

        for val in audiodata[int(l['start'])+win_len:int(l['end'])-win_len]:
            (quality_control_audio_files[mkey]).write(
                    struct.pack( 'h', int(min(max_val,norm * val ) ) ) ) 

        for val in audiodata[int(l['end'])-win_len:int(l['end'])]:
            (quality_control_audio_files[mkey]).write(
                    struct.pack( 'h', int(min(max_val,norm * val * win_i / win_len ) ) ) ) 
            win_i-=1


        if (feature_array.shape[0] < l_end):
            print ("Not enough features: %i < %i" % (feature_array.shape[0], l_end))
            continue

        statistics_handle.write("%i\t%s\n" % (l_length, tp))

        if debug:
            print ("---------------------------")
            print ("Array stats: start %i -> %i length ?? -> %i end %i -> %i" % (
                        int(l['start'])-startmark, 
                        l_start, 
                        l_length, 
                        int(l['end'])-startmark, 
                        l_end ))
            print ("      phone data size: %i x %i" % (feature_array[l_start:l_end, :]).shape)
            print ("Data size: %i x %i" % feature_array.shape)

        triphonedata.append ({ 'data': feature_array[l_start:l_start+max_num_frames, :],
                                'counter': 0,
                                'mono' :l['model'],
                                'triphone' : l['triphone'],
                                'sorting' : l['sortable'] })
    return triphonedata


In [129]:
def get_features( audiofile, new_align):
    global debug
    
    audiodata = np.fromfile( audiofile, 'int16', -1)

    startmark = int(new_align[0]['start'])
    endmark= int(new_align[-1]['end'])

    if debug:
        print ("start feature extraction at %s (%f s) and end at %s (%f s) ==> %i frames"  % (
                startmark, 
                (float(startmark)/16000), 
                endmark, (float(endmark)/16000), 
                (endmark-startmark)/frame_step) )

    # Communication from: 
    # http://stackoverflow.com/questions/163542/python-how-do-i-pass-a-string-into-subprocess-popen-using-the-stdin-argument

    tmp_input=os.path.join(tmp_dir,str(tmpfilecounter)+"_in")
    tmp_output=os.path.join(tmp_dir,str(tmpfilecounter)+"_out")

    audiodata.tofile(tmp_input, "")

    process_progress = Popen([
            feature_extraction_script, 
            tmp_input, 
            tmp_output, 
            str(startmark), 
            str(endmark+frame_leftovers) ], 
                             stdout=PIPE, stdin=PIPE, stderr=STDOUT).communicate()

    feature_list = np.fromfile(tmp_output, dtype='float32', count=-1)
    feature_array = feature_list.reshape([-1,feature_dimension])

    f_end =  (int(new_align[-1]['end'])-startmark)/frame_step

    if debug:
        print ("Utterance data size: %i x %i" % (feature_array).shape)

    if (feature_array.shape[0] < f_end):
            print ("Not enough features for file %s: %i < %i" % ...
                   (audiofile, feature_array.shape[0], f_end))
            print ("panic save to /tmp/this_is_not_good")
            np.savetxt('/tmp/this_is_not_good', feature_array, delimiter='\t')
            raise ValueError("Not enough features for file %s: %i < %i" % (
                    audiofile, 
                    feature_array.shape[0], 
                    f_end) )
    else:

        chop_features( audiodata, feature_array, new_align )

        triphonedata = chop_features( audiodata, feature_array, new_align )
    
    os.remove(tmp_input)
    os.remove(tmp_output)
    
    return triphonedata


## Feature extraction (spectral/vocoder parameters) ##

In [130]:


print ("start!")

for collection in [collections[0]]:
    triphonedata = []

    recipefile = open( collection['recipe'] , 'r')
    recipefilecounter = 0
    too_long_counter = 0
    all_trips_counter = 0

    tmpfilecounter = 0

    progress_interval = math.ceil(collection['numlines']/1000.0)

    statistics_file=statistics_dir+"/"+corpus+"-"+collection['condition']+"-"+collection['name']+".triphone-frame-counts"
    statistics_handle = open(statistics_file, 'w')

    class_file=statistics_dir+"/"+corpus+"-"+collection['condition']+"-"+collection['name']+".triphone-classes"
    class_handle= open(class_file, 'w')

    phone_merge_file=statistics_dir+"/"+corpus+"-"+collection['condition']+"-"+collection['name']+".phone-merge"
    phone_merge_handle = open(phone_merge_file, 'w')

    quality_control_wavdir = os.path.join(pickle_dir, 'control-wav', collection['condition']+"-"+collection['name']+"-classes_"+classes_name)

    mkdir(quality_control_wavdir)

    quality_control_audio_files = {}

    discard_counter=0
    tooshortcount=0

    for r in recipefile.readlines()[0:3]:
        
        recipefilecounter += 1
        if debug:
            print ("Item %i/%i" % (recipefilecounter, collection['numlines']) )

        audiofile = re.sub('audio=', r'',  re.findall('audio=/[^ ]+', r)[0]).strip()
        labelfile = re.sub(r'transcript=', r'', re.findall('transcript=/[^ ]+', r)[0]).strip()
    
        new_align = process_label(labelfile)
        labelstring = get_labelstring( new_align )
        
        phone_merge_handle.write("%s\t%s\n" % (labelfile, labelstring))

        # OK, label file done.
        # Now it's time to process the audio.
        # We'll send to the feature extractor the bits of the file that 
        # match the speech segments.

        if len(new_align) > 0:
            triphonedata = get_features( audiofile, new_align)
        
            all_trips_counter += len( new_align )

        if not debug:
            if (recipefilecounter % int(progress_interval)) == 0:
                sys.stderr.write("\r%0.2f%s %s %s" % (
                        100.0*recipefilecounter/collection['numlines'], 
                        "%",
                        collection['condition'], 
                        collection['name'] ))
                sys.stderr.flush()

        if (recipefilecounter == collection['numlines']):
            print ("That's enough!")
            print ("recipefilecounter %i  == collection['numlines'] %i" % ( 
                    recipefilecounter, 
                    collection['numlines'] ))



start!
dir ../features/work_in_progress/en_uk_kids_align_from_clean/pickles/control-wav/clean-train-0-classes_mc_en_uk_all exists
Item 1/878
start feature extraction at 1792 (0.112000 s) and end at 30720 (1.920000 s) ==> 226 frames
Utterance data size: 229 x 30
---------------------------
Array stats: start 0 -> 0 length ?? -> 32 end 4096 -> 32
      phone data size: 32 x 30
Data size: 229 x 30
---------------------------
Array stats: start 4096 -> 32 length ?? -> 36 end 8704 -> 68
      phone data size: 36 x 30
Data size: 229 x 30




---------------------------
Array stats: start 8832 -> 69 length ?? -> 29 end 12544 -> 98
      phone data size: 29 x 30
Data size: 229 x 30
---------------------------
Array stats: start 12544 -> 98 length ?? -> 10 end 13824 -> 108
      phone data size: 10 x 30
Data size: 229 x 30
---------------------------
Array stats: start 13824 -> 108 length ?? -> 7 end 14720 -> 115
      phone data size: 7 x 30
Data size: 229 x 30
---------------------------
Array stats: start 15872 -> 124 length ?? -> 73 end 25216 -> 197
      phone data size: 73 x 30
Data size: 229 x 30
---------------------------
Array stats: start 25216 -> 197 length ?? -> 29 end 28928 -> 226
      phone data size: 29 x 30
Data size: 229 x 30
---------------------------
Array stats: start 0 -> 0 length ?? -> 32 end 4096 -> 32
      phone data size: 32 x 30
Data size: 229 x 30
---------------------------
Array stats: start 4096 -> 32 length ?? -> 36 end 8704 -> 68
      phone data size: 36 x 30
Data size: 229 x 30
---------

## Pickle/save ##

Next we'll save the audio data into variable length tensor flow thingies:

In [None]:

if 1 == 0:    
    for mono in sorted(triphonedata.keys()):
    
        grande_features =  np.zeros([0, max_num_frames, feature_dimension], dtype='float')
        grande_classes = np.zeros([0, max_num_classes ], dtype='float')

        for tripkey in sorted(triphonedata[mono].keys()):
        
            tripdata = triphonedata[mono][tripkey]
            trip =  tripdata['triphone']

            #print collection['name'][-1]
            if trip in triphoneclasses.keys():
                tripcl = triphoneclasses[trip]
            else:
                tripcl = len(triphoneclasses)
                triphoneclasses[trip] = tripcl                
                class_handle.write("%s\t%i\t%i\n" % (trip, tripcl, tripdata['counter']))
                
            grande_features = np.append(grande_features, tripdata['data'][0:tripdata['counter'],:,:], 0)

            piccolo_classes =  np.zeros([ max_num_classes ], dtype='float')
            piccolo_classes[tripcl] = 1
            grande_classes = np.append(grande_classes, np.tile(piccolo_classes,(tripdata['counter'],1)),0)

            

        modeldir_unicode = mono

        new_path=os.path.join(pickle_dir, collection['condition']+"-"+collection['name']+"-classes_"+classes_name)

        picklefile = os.path.join(new_path,  collection['condition']+"-"+collection['name'] +"."+modeldir_unicode+".pkl")


        mkdir(new_path)
            
        print ("pickling %i items to %s" % ( grande_features.shape[0], picklefile))
                
        outf = open(picklefile, 'wb')
        
        # Pickle the list using the highest protocol available.
        cPickle.dump({'data': grande_features, 'classes': grande_classes}, outf, protocol=cPickle.HIGHEST_PROTOCOL)
                


In [None]:

# Load pickles and create sort of balanced sets:

for collection in collections:

    pickle_path=os.path.join(pickle_dir, collection['condition']+"-"+collection['name']+"-classes_"+classes_name)

    classcount=0
    
    samplecounts=np.zeros([1000])

    all_data_dict = {}


    for picklefile in os.listdir(pickle_path):

        #m = re.search(r'\.([^.]+)\.pkl$', picklefile.encode('utf-8').strip() )
        m = re.search(r'\.([^.]+)\.pkl$', picklefile.strip() )
        mono = m.group(1)

        if mono:
            #print "Loading "+mono
            pickledata = cPickle.load(open(os.path.join(pickle_path,picklefile), 'r'))

            if (featdim1 < 0):
                featdim1 = pickledata['data'].shape[1]
                featdim2 = pickledata['data'].shape[2]
            
            #all_data_dict[mono] = pickledata['data']
            if mono not in classes.keys():
                classes[mono] = classcount

            print ("Phone %s: %i samples" % (mono, pickledata['data'].shape[0]))

            samplecounts[classcount] = pickledata['data'].shape[0]

            #grande_features =  np.zeros([0, max_num_frames, feature_dimension], dtype='float')
            #grande_classes = np.zeros([0, max_num_classes ], dtype='float')


            classcount+=1
    
    samplecounts = samplecounts[0:classcount]

    print "Samplecount %i   Classcount %i   Mean/median samples per class %0.2f / %0.1f    Min samples %i   Max samples %i" % \
        (np.sum(samplecounts), 
         samplecounts.shape[0], 
         np.mean(samplecounts),
         np.median(samplecounts),
         np.min(samplecounts),
         np.max(samplecounts));

    print "capping to min of avg/median"

    clip = math.ceil(min(np.median(samplecounts), np.mean(samplecounts) ))

    samplecounts = np.clip(samplecounts, np.min(samplecounts), clip)
    print "Samplecount %i   Classcount %i   Mean/median samples per class %0.2f / %0.1f    Min samples %i   Max samples %i" % \
        (np.sum(samplecounts), 
         samplecounts.shape[0], 
         np.mean(np.sum(samplecounts)/classcount),
         np.median(samplecounts),
         np.min(samplecounts),
         np.max(samplecounts));

    max_num_monoclasses=len(samplecounts)


    grande_feature_array = np.zeros([np.sum(samplecounts), featdim1, featdim2])
    grande_class_array = np.zeros([np.sum(samplecounts), max_num_monoclasses])

    sample_counter = 0

    for picklefile in os.listdir(pickle_path):

        m = re.search(r'\.([^.]+)\.pkl$', picklefile.strip() )
        mono = m.group(1)

        if mono:

            pickledata = cPickle.load(open(os.path.join(pickle_path,picklefile), 'r'))

            trclass = classes[mono]

            print "Mono: %s Class: %i" % (mono, trclass)

            for i in np.random.permutation(np.arange(0, 
                                                     pickledata['data'].shape[0]))[0:min( clip ,
                                                                                          pickledata['data'].shape[0])]:
                
                grande_feature_array[sample_counter,:,:]=pickledata['data'][i,:,:]
                grande_class_array[sample_counter,trclass] = 1

                sample_counter += 1

    

    if new_pickle_dir == "-1":
        new_pickle_dir = os.path.join(pickle_dir, collection['condition']+"-classes_"+classes_name+"_"+str(sample_counter)+"_pickled")

        try:
            os.makedirs(new_pickle_dir)
            
        except OSError as exc:  # Python >2.5
            if exc.errno == errno.EEXIST and os.path.isdir(new_pickle_dir):
                pass
            else:
                raise   

        classfile = os.path.join(new_pickle_dir,  collection['condition']+".classes")
    
        print "Saving class map to %s" % ( classfile);
        class_handle = open(classfile, 'w')
        for m in classes.keys():
            class_handle.write("%i\t%s\n" % (classes[m], m))
        class_handle.close()

        normalisation_file =  os.path.join(new_pickle_dir,  collection['condition']+".mean_and_std.pkl")


        means = np.mean(np.vstack(grande_feature_array),0)
        stds = np.std(np.vstack(grande_feature_array),0)
        normoutf= open(normalisation_file, 'wb')


        print "pickling normalisation stats to %s" % ( normalisation_file);

        cPickle.dump({'mean': means, 'std': stds}, normoutf, protocol=cPickle.HIGHEST_PROTOCOL)
        
        means_set = True

    shuf = np.random.permutation(np.arange(0,grande_feature_array.shape[0]))

    a = ((grande_feature_array-means)/stds)[shuf]
    b = grande_class_array[shuf]


    targetpicklefile = os.path.join(new_pickle_dir,collection['condition']+"-"+collection['name']+"-classes_"+classes_name+".pkl" )

    print "pickling %i items to %s" % ( grande_feature_array.shape[0], targetpicklefile);

    outf = open(targetpicklefile, 'wb')
    
    # Pickle the list using the highest protocol available.
    #cPickle.dump({'data': grande_feature_array, 'classes': grande_class_array}, outf, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump({'data': a, 'classes': b}, outf, protocol=cPickle.HIGHEST_PROTOCOL)
