# Converting matlab data into python

Last update (ymd): 19.09.11  
Last access (ymd): 19.09.16   

Use this Jupyter notebook to convert Expo data from .mat to .npy files, and to make adjustments to existing .npy files (e.g. renaming fields)

In [15]:
import numpy as np
import math, os
import sys
sys.path.insert(0, 'functions/'); # add this path for makeStimulus
import makeStimulus
import helper_fcns as hf
import autoreload

import pdb

# Directory constants used by the cells below.
# An alternative root that has also been used for base_loc:
#   '/users/plevy/SF_diversity/sfDiv-OriModel/sfDiv-python/'
base_loc = '/arc/2.2/p1/plevy/SF_diversity/sfDiv-OriModel/sfDiv-python/'

# Where the .mat files are read from, and where the .npy files are written to.
loc_matData, loc_pyData = 'V1/structuresTest/', 'V1/structures/'

# Recording-area label attached to every unit gathered in this notebook.
recArea = 'V1'

## The original conversion

### Converting files

First, get the .mat files to convert; then, convert unless already done 

In [5]:
# Names of everything in the .mat folder (unsorted; includes non-.mat entries).
files = os.listdir(loc_matData)

In [6]:
sorted(files)

['m676p3l6_154_sfm.mat', 'm676p3l7_154_sfm.mat']

#### Renaming files
The cell below here is used to rename files so that the unit number is zero-padded (e.g. '01' instead of '1')

In [8]:
# Rename files so that the unit number is zero-padded to two digits
# (e.g. 'm676p3l6_154_sfm.mat' -> 'm676p3l06_154_sfm.mat').
# The unit number is the text between the first 'r' (or, failing that, 'l')
# and the first '_' of the file name.
for i in files:
    # Only m676 .mat structures are renamed here
    # (change to .xml/.exxd if changing names in /recordings).
    if 'm676' not in i or '.mat' not in i:
        continue

    # End of the unit number: '_' in /structures/ (use '#' in /recordings/).
    rEnd_ind = i.find('_')
    if rEnd_ind < 0:
        print('skipping %s: no "_" after the unit number' % i)
        continue

    # Look for the 'r'/'l' marker only before the '_', so that letters in the
    # experiment suffix are never mistaken for it.
    prefix = i[0:rEnd_ind]
    r_ind = prefix.find('r')
    if r_ind < 0:
        r_ind = prefix.find('l')

    substr_to_replace = prefix[r_ind+1:]
    if r_ind < 0 or not substr_to_replace.isdigit():
        # Nothing we know how to pad; leave the file alone instead of
        # aborting the whole loop on int().
        print('skipping %s: could not parse a unit number' % i)
        continue

    new_str = prefix[0:r_ind+1] + '%02d' % int(substr_to_replace) + i[rEnd_ind:]
    if new_str == i:
        continue  # already zero-padded

    # os.rename can silently replace an existing file, so never rename onto
    # a name that is already taken.
    if os.path.exists(loc_matData + new_str):
        print('skipping %s: %s already exists' % (loc_matData + i, loc_matData + new_str))
        continue

    os.rename(loc_matData + i, loc_matData + new_str)
    print('renaming %s to %s' % (loc_matData + i, loc_matData + new_str))

renaming V1/structuresTest/m676p3l6_154_sfm.mat to V1/structuresTest/m676p3l06_154_sfm.mat
renaming V1/structuresTest/m676p3l7_154_sfm.mat to V1/structuresTest/m676p3l07_154_sfm.mat


#### Now re-gather the files

**If loc_matData != loc_pyData**, *then run this first with loc_matData to convert the .mat files, then a second time to gather all of the .npy files for the datalist*

In [21]:
# Re-list, this time from the .npy folder, so the cells below see what is stored there.
files = os.listdir(loc_pyData)

In [22]:
# Sort so that the units are gathered in a reproducible (alphabetical) order.
files = sorted(files)

In [23]:
files

['dataList.npy',
 'dataList_glx.npy',
 'dataList_glx_170.npy',
 'dataList_glx_full.npy',
 'dataList_glx_mr.npy',
 'deprecated_cells',
 'descrFits_190503_poiss_flex.npy',
 'descrFits_190503_poiss_sach.npy',
 'descrFits_190503_sach_flex.npy',
 'descrFits_190503_sach_sach.npy',
 'descrFits_190503_sqrt_flex.npy',
 'descrFits_190503_sqrt_sach.npy',
 'descrFits_poiss_sach.npy',
 'fitList_190131c_flat_chiSq.npy',
 'fitList_190131c_wght_chiSq.npy',
 'fitList_190202c_flat_chiSq.npy',
 'fitList_190202c_wght_chiSq.npy',
 'fitList_190206c_flat_chiSq.npy',
 'fitList_190206c_flat_chiSq_details.npy',
 'fitList_190206c_wght_chiSq.npy',
 'fitList_190206c_wght_chiSq_details.npy',
 'fitList_190226c_flat_chiSq.npy',
 'fitList_190226c_wght_chiSq.npy',
 'fitList_190301c_flat_chiSq.npy',
 'fitList_190301c_flat_chiSq_details.npy',
 'fitList_190301c_wght_chiSq.npy',
 'fitList_190301c_wght_chiSq_details.npy',
 'fitList_190315c_flat_chiSq.npy',
 'fitList_190315c_wght_chiSq.npy',
 'fitList_190321c_flat_chiSq.npy'

In [24]:
# Convert individual files and gather, in parallel lists, the name,
# experiment type and recording area of every unit for the data list.
unitName = []
expType = []
unitArea = []
for i in files:
    # only files with 'sfm' in the name that start with 'm6' are handled
    if i.find('sfm') < 0 or not i.startswith('m6'):
        continue

    # The unit name is everything up to '_sfm', in BOTH branches below:
    # later cells load `unitName + '_sfm.npy'`, so cutting at the first '_'
    # (e.g. 'm676p3l06' for 'm676p3l06_154_sfm.mat') would name the wrong file.
    unit = i[0:i.find('_sfm')]
    saveName = loc_pyData + i.replace('.mat', '.npy')

    # don't convert if it already exists
    if os.path.exists(saveName):
        # only add once (via the .npy entry, not also the .mat one),
        # and leave out the 'fullWave' files
        if i.endswith('.npy') and i.find('fullWave') == -1:
            unitName.append(unit)
            _, expName = hf.get_exp_ind(loc_pyData, i[0:i.find('_')])
            if expName is None:
                # drop into the debugger to inspect a unit without an experiment name
                pdb.set_trace()
            expType.append(expName)
            unitArea.append(recArea)
        continue

    print("loading: " + i)
    matData = makeStimulus.loadmat(loc_matData + i)
    S = matData.get('S')  # the actual data structure
    _, expName = hf.get_exp_ind(loc_pyData, i[0:i.find('_')])

    print("now saving...")
    np.save(saveName, S)

    unitName.append(unit)
    expType.append(expName)
    unitArea.append(recArea)

In [25]:
unitName

['m676l01_glx182',
 'm676p3l06_154',
 'm676p3l06_glx_170',
 'm676p3l07_154',
 'm676p3l07_glx',
 'm676p3l13',
 'm676p3l15_glx',
 'm678p5l06_glx170',
 'm678p5l06_glx174',
 'm678p5l07_glx',
 'm678p6l11_glx',
 'm678p6l12_glx',
 'm678p6l15_glx41',
 'm678p6l16_glx',
 'm678p6l18_glx55',
 'm678p7r03_glx']

### Create/update data list

In [26]:
# File name (inside loc_pyData) of the data list that is created/updated below.
dl_name = 'dataList_glx.npy'

In [27]:
# Load the data list as currently stored on disk, to inspect it before it is updated.
dataList = hf.np_smart_load(loc_pyData + dl_name);

In [28]:
dataList['unitName']

['m676l01_glx182',
 'm676p3l06_glx_170',
 'm676p3l07_154',
 'm676p3l07_glx',
 'm676p3l13',
 'm676p3l15_glx',
 'm678p5l06_glx170',
 'm678p5l06_glx174',
 'm678p5l07_glx',
 'm678p6l11_glx',
 'm678p6l12_glx',
 'm678p6l15_glx41',
 'm678p6l16_glx',
 'm678p6l18_glx55',
 'm678p7r03_glx']

In [29]:
# Start from the stored data list when there is one, otherwise from scratch.
dl_path = loc_pyData + dl_name
is_new = not os.path.exists(dl_path)
dataList = dict() if is_new else hf.np_smart_load(dl_path)

# These three fields always come from the files gathered above.
for key, values in (('unitName', unitName), ('unitArea', unitArea), ('expType', expType)):
    dataList[key] = values

if is_new:
    # unitType, isolation, comment must be filled in by hand at later time
    # NOTE(review): no 'unitType' key is created here even though the note
    # above mentions it -- confirm whether one is expected downstream.
    dataList['isolation'] = []
    dataList['comment'] = []

np.save(dl_path, dataList)

Now, check the saved/updated data list

In [30]:
# The data list is a dict saved through np.save, i.e. a pickled 0-d object array:
# numpy >= 1.16.3 refuses to load it unless allow_pickle=True is passed.
# .item() unwraps the array back into the dict.
dataList = np.load(loc_pyData + dl_name, allow_pickle=True).item()

In [59]:
# Load three of the saved cell structures, picked by their position in the data list
# (positions 1, 3 and 2 of dataList['unitName'], respectively).
gt_6 = hf.np_smart_load(loc_pyData + dataList['unitName'][1] + '_sfm.npy')
gt_7 = hf.np_smart_load(loc_pyData + dataList['unitName'][3] + '_sfm.npy')
test = hf.np_smart_load(loc_pyData + dataList['unitName'][2] + '_sfm.npy')

In [85]:
# Show entries 500-509 of the first 'tf' element of each of the three loaded
# structures side by side, for comparison.
[x['sfm']['exp']['trial']['tf'][0][500:510] for x in [gt_6, gt_7, test]]

[array([5.5, 7. , 5.5, 4.5, 6.5, 7. , 5. , 4.5, 5.5, 5.5]),
 array([4., 7., 4., 2., 6., 7., 3., 2., 4., 4.]),
 array([4., 7., 4., 2., 6., 7., 3., 2., 4., 4.])]

In [32]:
dataList['unitName']

['m676l01_glx182',
 'm676p3l06_154',
 'm676p3l06_glx_170',
 'm676p3l07_154',
 'm676p3l07_glx',
 'm676p3l13',
 'm676p3l15_glx',
 'm678p5l06_glx170',
 'm678p5l06_glx174',
 'm678p5l07_glx',
 'm678p6l11_glx',
 'm678p6l12_glx',
 'm678p6l15_glx41',
 'm678p6l16_glx',
 'm678p6l18_glx55',
 'm678p7r03_glx']

### Other changes
Likely not needed; this section was from converting the previous data set, where normalization responses were already computed in Matlab.

In [None]:
# For every cell in the data list: if its model section has both a
# 'normalization' and a 'normalization_py' field, keep the former as
# 'norm_old', promote the latter to 'normalization', and save the file.
for i in dataList['unitName']:

    print("changing: " + i)
    saveName = loc_pyData + i + '_sfm.npy'
    # The structure is a pickled dict: numpy >= 1.16.3 needs allow_pickle=True
    # to load it; .item() unwraps the 0-d object array.
    S = np.load(saveName, allow_pickle=True).item()  # the actual data structure

    mod = S.get('sfm').get('mod')
    if not mod:
        continue  # nothing to swap for this cell

    # only touch cells where both fields are present (and truthy)
    if mod.get('normalization') and mod.get('normalization_py'):
        mod['norm_old'] = mod['normalization']
        mod['normalization'] = mod.pop('normalization_py')

        print("now saving...")
        np.save(saveName, S)

## General 'update python structs' here

Use this section to update something about each cell in the dataList (change appropriate field(s))

Based on the results of the analysis in sandbox_careful.ipynb, I've determined that the F1 calculation is incorrect as provided in the matlab files which we use to load the expo XML files. Thus, the 'f1' field associated with each cell has a value which is usually half the true F1 power. Why? Given the nature of the spike train as a real signal, the power at non-DC, positive frequencies should be doubled from what is calculated in the FFT (see sandbox_careful.ipynb and helper_fcns.py/spike_fft for more details).

Below, I'll use this template to load each cell, move the 'f1' field to 'f1_expo', and create a new 'f1' field with the correct calculation.

**NOTE:** Now that this move has been completed, do not run the line below "now let's move the 'f1' field", since you will then overwrite the original expo F1 calculation ('f1_expo') with our own calculation (now stored in 'f1').

In [2]:
# One (experiment directory, data list file) pair per experiment to update.
_exp_sources = [
    ('V1_orig/', 'dataList.npy'),
    ('altExp/', 'dataList.npy'),
    ('V1/', 'dataList_glx.npy'),
]
expDirs = [exp_dir for exp_dir, _ in _exp_sources]
expNames = [dl_file for _, dl_file in _exp_sources]

In [4]:
# For every experiment directory, recompute the 'f1' field of every cell in
# its data list from the spike times, and save the cell back in place.
for expDir, dL_nm in zip(expDirs, expNames):

    data_loc = base_loc + expDir + 'structures/'
    dataList = hf.np_smart_load(data_loc + dL_nm)

    print('dir: %s' % expDir)

    # Now, go through each cell in the dataList
    for expName in dataList['unitName']:

        # get experiment index, load cell
        cell_path = data_loc + expName + '_sfm.npy'
        expInd = hf.get_exp_ind(data_loc, expName)[0]
        cell = hf.np_smart_load(cell_path)
        tr_inf = cell['sfm']['exp']['trial']

        # now, let's "move" the 'f1' field
        # (kept disabled on purpose: enabling it would copy the current 'f1'
        #  into 'f1_expo')
#         cell['sfm']['exp']['trial']['f1_expo'] = cell['sfm']['exp']['trial']['f1'];

        # the real stuff: get the correct f1 calculation
        nTrials = len(tr_inf['num'])
        stimDur = hf.get_exp_params(expInd, forceDir=expDir).stimDur
        spike_times = [tr_inf['spikeTimes'][x] for x in range(nTrials)]
        psth, bins = hf.make_psth(spike_times, stimDur=stimDur)

        # V1_orig does not have num_comps, and we cannot properly do f1 by
        # component for mixtures with V1_orig (nor altExp); in that case fall
        # back to the first grating of each trial.
        # `except Exception` keeps that fallback for any lookup failure but,
        # unlike a bare `except`, lets Ctrl-C / SystemExit through.
        try:
            n_comps = tr_inf['num_comps']
            all_tf = [[tr_inf['tf'][c_i][t_i] for c_i in range(n_c)]
                      for t_i, n_c in enumerate(n_comps)]
        except Exception:
            all_tf = tr_inf['tf'][0]
        power, rel_power, full_ft = hf.spike_fft(psth, tfs=all_tf, stimDur=stimDur)

        cell['sfm']['exp']['trial']['f1'] = rel_power

        # then save the update!
        np.save(cell_path, cell)

dir: V1_orig/
dir: altExp/
dir: V1/


I've noticed (19.05.12) that some cells were not fitting/plotting due to an error in summing total_con. It turns out that this was caused by `con.shape` being (nComps, nTrials) rather than (nComps, ); here, we fix that issue.


In [None]:
# The data list is a pickled dict: numpy >= 1.16.3 needs allow_pickle=True
# to load it; .item() unwraps the 0-d object array back into the dict.
dL = np.load(loc_pyData + 'dataList.npy', allow_pickle=True).item()

In [None]:
# Repack the 'con' field of any cell where it is stored as a 2-D array into
# a 1-D object array with one array per component, then save the cell.
for i in dL['unitName']:
    cell_file = loc_pyData + i + '_sfm.npy'
    S = hf.np_smart_load(cell_file)

    cons = S['sfm']['exp']['trial']['con']
    if len(cons.shape) != 2:
        continue  # not 2-D, so nothing to repack

    print('ha! issue with %s' % i)

    # one slot per component; each slot gets that component's row as its own
    # array, which is the default/working layout
    newCons = np.zeros((cons.shape[0], ), dtype='O')
    for ci, row in enumerate(cons):
        newCons[ci] = np.array(row)

    S['sfm']['exp']['trial']['con'] = newCons
    np.save(cell_file, S)