# MED-PC Extracting the Recording Data and Metadata

## Importing the Python Libraries

In [1]:
import sys
import glob
from collections import defaultdict
import os
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from medpc2excel.medpc_read import medpc_read

In [3]:
# setting path
sys.path.append('../src')

In [4]:
# All the libraries that were created for this repository
import extract.dataframe
import processing.tone
import extract.metadata

## Getting the Metadata from all the MED-PC Recording Files

- Getting all the file paths of the recording files(that happen to all end in `.txt`

# NOTE: The following path must be changed to the directory where your MED-PC recording files are located, if they are not in the specied folder

In [5]:
all_med_pc_file = glob.glob("./data/timestamp_dataframes/*.txt")

In [6]:
all_med_pc_file[:10]

['./data/timestamp_dataframes/2022-05-06_12h59m_Subject 3.4 (2).txt',
 './data/timestamp_dataframes/2022-05-06_08h37m_Subject 2.3.txt',
 './data/timestamp_dataframes/2022-05-10_14h40m_Subject 4.3 (3).txt',
 './data/timestamp_dataframes/2022-05-06_12h59m_Subject 4.3 (3).txt',
 './data/timestamp_dataframes/2022-05-04_08h43m_Subject 2.3.txt',
 './data/timestamp_dataframes/2022-05-03_12h52m_Subject 2.1.txt',
 './data/timestamp_dataframes/2022-05-04_10h11m_Subject 1.2.txt',
 './data/timestamp_dataframes/2022-05-12_10h22m_Subject 1.2.txt',
 './data/timestamp_dataframes/2022-05-06_08h37m_Subject 1.1.txt',
 './data/timestamp_dataframes/2022-05-12_10h22m_Subject 1.3.txt']

- Example of what the MED-PC Recording file looks like

In [7]:
with open(all_med_pc_file[0]) as f:
    lines = f.readlines()
    for line in lines[:20]:
        print(line)

File: C:\MED-PC\Data\2022-05-06_12h59m_Subject 3.4 (2).txt







Start Date: 05/06/22

End Date: 05/06/22

Subject: 3.4 (2)

Experiment: Pilot of Pilot

Group: Cage 4

Box: 1

Start Time: 12:59:58

End Time: 14:02:38

MSN: levelNP_CS_reward_laserepochON1st_noshock

A:    4299.000

D:    9000.000

F:    2000.000

G:       0.000

H:       0.000

I:       0.000

L:       0.000



- We will be extracting the first 10 or so lines that look like:

```
File: C:\MED-PC\Data\2022-05-06_12h59m_Subject 3.4 (2).txt

Start Date: 05/06/22

End Date: 05/06/22

Subject: 3.4 (2)

Experiment: Pilot of Pilot

Group: Cage 4

Box: 1

Start Time: 12:59:58

End Time: 14:02:38

MSN: levelNP_CS_reward_laserepochON1st_noshock
```
    
- We will just find all the lines that start with "File", "Start Date", "End Date", "Subject", "Experiment", "Group", "Box", "Start Time", "End Time", or "MSN". And then stop once all the metadata types have been collected

In [8]:
# This makes a nested dictionary of file paths to each individual metadata type
file_path_to_meta_data = extract.metadata.get_all_med_pc_meta_data_from_files(list_of_files=all_med_pc_file)

In [9]:
for key, value in file_path_to_meta_data.items():
    print("File path: {}".format(key))
    print("Metadata types and associated values: {}".format(value))
    break

File path: ./data/timestamp_dataframes/2022-05-06_12h59m_Subject 3.4 (2).txt
Metadata types and associated values: {'File': 'C:\\MED-PC\\Data\\2022-05-06_12h59m_Subject 3.4 (2).txt', 'Start Date': '05/06/22', 'End Date': '05/06/22', 'Subject': '3.4 (2)', 'Experiment': 'Pilot of Pilot', 'Group': 'Cage 4', 'Box': '1', 'Start Time': '12:59:58', 'End Time': '14:02:38', 'MSN': 'levelNP_CS_reward_laserepochON1st_noshock'}


## Making a Dataframe out of the Metadata

In [10]:
# Turning the dictionary into a Pandas Dataframe
metadata_df = pd.DataFrame.from_dict(file_path_to_meta_data, orient="index")
# Resetting the index because currently the file path is the index 
metadata_df = metadata_df.reset_index()

In [11]:
metadata_df.head()

Unnamed: 0,index,File,Start Date,End Date,Subject,Experiment,Group,Box,Start Time,End Time,MSN
0,./data/timestamp_dataframes/2022-05-06_12h59m_...,C:\MED-PC\Data\2022-05-06_12h59m_Subject 3.4 (...,05/06/22,05/06/22,3.4 (2),Pilot of Pilot,Cage 4,1,12:59:58,14:02:38,levelNP_CS_reward_laserepochON1st_noshock
1,./data/timestamp_dataframes/2022-05-06_08h37m_...,C:\MED-PC\Data\2022-05-06_08h37m_Subject 2.3.txt,05/06/22,05/06/22,2.3,Pilot of Pilot,Cage 1,1,08:37:09,09:53:25,levelNP_CS_reward_laserepochON1st_noshock
2,./data/timestamp_dataframes/2022-05-10_14h40m_...,C:\MED-PC\Data\2022-05-10_14h40m_Subject 4.3 (...,05/10/22,05/10/22,4.3 (3),Pilot of Pilot,Cage 4,2,14:40:24,15:43:18,levelNP_CS_reward_laserepochON1st_noshock
3,./data/timestamp_dataframes/2022-05-06_12h59m_...,C:\MED-PC\Data\2022-05-06_12h59m_Subject 4.3 (...,05/06/22,05/06/22,4.3 (3),Pilot of Pilot,Cage 4,2,12:59:58,14:02:38,levelNP_CS_reward_laserepochON1st_noshock
4,./data/timestamp_dataframes/2022-05-04_08h43m_...,C:\MED-PC\Data\2022-05-04_08h43m_Subject 2.3.txt,05/04/22,05/04/22,2.3,Pilot of Pilot,Cage 1,3,08:43:11,09:54:22,levelNP_CS_reward_laserepochON1st_noshock


In [12]:
metadata_df.tail()

Unnamed: 0,index,File,Start Date,End Date,Subject,Experiment,Group,Box,Start Time,End Time,MSN
173,./data/timestamp_dataframes/2022-05-07_13h54m_...,C:\MED-PC\Data\2022-05-07_13h54m_Subject 4.3 (...,05/07/22,05/07/22,4.3 (3),Pilot of Pilot,Cage 4,3,13:54:26,14:55:23,levelNP_CS_reward_laserepochON1st_noshock
174,./data/timestamp_dataframes/2022-05-09_09h48m_...,C:\MED-PC\Data\2022-05-09_09h48m_Subject 1.1.txt,05/09/22,05/09/22,1.1,Pilot of Pilot,Cage 1,2,09:48:49,11:00:54,levelNP_CS_reward_laserepochON1st_noshock
175,./data/timestamp_dataframes/2022-05-16_11h28m_...,C:\MED-PC\Data\2022-05-16_11h28m_Subject 2.1.txt,05/16/22,05/16/22,2.1,Pilot of Pilot,Cage 2,3,11:28:27,12:35:20,levelNP_CS_reward_laserepochON1st_noshock
176,./data/timestamp_dataframes/2022-05-10_10h42m_...,C:\MED-PC\Data\2022-05-10_10h42m_Subject 2.1.txt,05/10/22,05/10/22,2.1,Pilot of Pilot,Cage 2,4,10:42:30,11:43:52,levelNP_CS_reward_laserepochON1st_noshock
177,./data/timestamp_dataframes/2022-05-08_11h14m_...,C:\MED-PC\Data\2022-05-08_11h14m_Subject 1.3.txt,05/08/22,05/08/22,1.3,Pilot of Pilot,Cage 2,4,11:14:36,12:19:20,levelNP_CS_reward_laserepochON1st_noshock


- Just getting the numbers out of the column that contains the cage information

In [13]:
metadata_df["cage"] = metadata_df["Group"].apply(lambda x: x.strip("Cage").strip())

In [14]:
# How many files there are for each subject
metadata_df.groupby("Subject").count()

Unnamed: 0_level_0,index,File,Start Date,End Date,Experiment,Group,Box,Start Time,End Time,MSN,cage
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,1,1,1,1,1,1,1,1,1,1,1
1.1,12,12,12,12,12,12,12,12,12,12,12
1.2,12,12,12,12,12,12,12,12,12,12,12
1.3,12,12,12,12,12,12,12,12,12,12,12
1.4,12,12,12,12,12,12,12,12,12,12,12
2.1,13,13,13,13,13,13,13,13,13,13,13
2.2,12,12,12,12,12,12,12,12,12,12,12
2.3,12,12,12,12,12,12,12,12,12,12,12
2.4,12,12,12,12,12,12,12,12,12,12,12
3.1 (1),10,10,10,10,10,10,10,10,10,10,10


## Inputting all the MED-PC log files

- Example of what the MED-PC Script looks like that was ran when recording the behaviors

In [15]:
with open("./scripts/levelNP_CS_reward_laserepochON1st_noshock.MPC") as f:
    lines = f.readlines()
    for line in lines[:100]:
        print(line)

\v3 stop tone with poke

\v3.2 monitor port entries AND exits



\INPUTS

^port = 8



\OUTPUTS

^fan = 16

^houselight = 11

^tone1 = 2

^tone2 = 3

^tone3 = 4

^tone4 = 5

^pump = 9

^whitenoise = 1

^laser=10

^csout = 5

^peout = 15

^cs1out = 12

^cs2out = 13

^cs3out = 14



\EXP SETTINGS

^ncsNoShock = 0

^initCS1trials = 3



\ARRAYS

DIM P = 20000 \Port entry time stamp array

DIM Q = 2500 \US delivery time stamp array (absolute)

DIM R = 2500 \US time stamp array (relative to last CS)

DIM W = 2500 \ITI values used for CS

DIM S = 2500 \CS presentation values (absolute - every time light turns on)

DIM N = 20000 \Port exit time stamp array

DIM K = 2500 \CS type

DIM B = 2500 \shock intensity



LIST V = 30", 30", 30", 30", 100", 95", 90", 80", 90", 100", 90", 120", 90", 85", 90", 95", 120", 80", 95", 80", 90", 80", 75", 100", 95", 90", 80", 90", 100", 90", 120", 90", 85", 90", 95", 90", 80", 90", 100", 90", 90", 90", 90", 90", 90", 90", 90", 90", 90", 90", 90", 90", 90", 90"

- We will be using the comments in the MED-PC script(which starts with the `\`) to create a name for the variables. By default, MED-PC uses a single letter as the name of the variable.
    - This will use the medpc2excel library found in https://github.com/cyf203/medpc2excel
- Example of the comments in the MED-PC script that we will use the names from:

```
\Variables

\A - Time since last CS

\B - Shock intensity

\C - Counter array

\D - Current ITI value

\E - CS ITI values for first few trials

\F - Shock duration

\G -

\H -

\I -

\J - Shock intensity repo

\K - CS type

\L -

\M - CS type repo

\N - Port exit time stamp array

\O -

\P - Port entry time stamp array

\Q - Sucrose delivery time stamp array (absolute)

\R - Sucrose delivery time stamp array (relative to last CS)

\S - CS presentation time stamp array

\T - Session timer

\U - Time since last CS presentation

\V - List of CS ITI values (tone + houselight)

\W - ITI values used for CS one each trial

\X -

\Y - Beam break monitor variable

\Z -
```

In [16]:
with open(all_med_pc_file[0]) as f:
    lines = f.readlines()
    for line in lines[:20]:
        print(line)

File: C:\MED-PC\Data\2022-05-06_12h59m_Subject 3.4 (2).txt







Start Date: 05/06/22

End Date: 05/06/22

Subject: 3.4 (2)

Experiment: Pilot of Pilot

Group: Cage 4

Box: 1

Start Time: 12:59:58

End Time: 14:02:38

MSN: levelNP_CS_reward_laserepochON1st_noshock

A:    4299.000

D:    9000.000

F:    2000.000

G:       0.000

H:       0.000

I:       0.000

L:       0.000



- **Please make sure that the corresponding `.mpc` file (aka the MED-PC script) that was ran to create the log file, is also in the same folder**

In [17]:
concatted_medpc_df = extract.dataframe.get_medpc_dataframe_from_list_of_files(medpc_files=all_med_pc_file)

Traceback (most recent call last):
  File "/home/riwata/Projects/med_pc_repo/jupyter_notebooks/../src/extract/dataframe.py", line 71, in get_medpc_dataframe_from_list_of_files
    ts_df, medpc_log = medpc_read(file=file_path, override=True, replace=False)
  File "/home/riwata/Projects/med_pc_repo/bin/conda_environments/env/med_pc_env/lib/python3.9/site-packages/medpc2excel/medpc_read.py", line 114, in medpc_read
    temp += re.split('\s+',d.split(':')[1])
IndexError: list index out of range

Invalid Formatting for file: ./data/timestamp_dataframes/2022-05-03_13h19m_Subject 1.2.txt
Traceback (most recent call last):
  File "/home/riwata/Projects/med_pc_repo/jupyter_notebooks/../src/extract/dataframe.py", line 71, in get_medpc_dataframe_from_list_of_files
    ts_df, medpc_log = medpc_read(file=file_path, override=True, replace=False)
  File "/home/riwata/Projects/med_pc_repo/bin/conda_environments/env/med_pc_env/lib/python3.9/site-packages/medpc2excel/medpc_read.py", line 114, in medpc_r

  MSN_dict[thisDate] = MSN_dict[thisDate].append({'ID':subject,'Box':box,'MSN':programname},ignore_index=True)
  MSN_dict[thisDate] = MSN_dict[thisDate].append({'ID':subject,'Box':box,'MSN':programname},ignore_index=True)
  MSN_dict[thisDate] = MSN_dict[thisDate].append({'ID':subject,'Box':box,'MSN':programname},ignore_index=True)


Traceback (most recent call last):
  File "/home/riwata/Projects/med_pc_repo/jupyter_notebooks/../src/extract/dataframe.py", line 71, in get_medpc_dataframe_from_list_of_files
    ts_df, medpc_log = medpc_read(file=file_path, override=True, replace=False)
  File "/home/riwata/Projects/med_pc_repo/bin/conda_environments/env/med_pc_env/lib/python3.9/site-packages/medpc2excel/medpc_read.py", line 134, in medpc_read
    for var, nm in TS_var_name_maps[program_nm].items():
KeyError: 'pumptest'

Invalid Formatting for file: ./data/timestamp_dataframes/2022-05-16_10h57m_Subject .txt
Traceback (most recent call last):
  File "/home/riwata/Projects/med_pc_repo/jupyter_notebooks/../src/extract/dataframe.py", line 71, in get_medpc_dataframe_from_list_of_files
    ts_df, medpc_log = medpc_read(file=file_path, override=True, replace=False)
  File "/home/riwata/Projects/med_pc_repo/bin/conda_environments/env/med_pc_env/lib/python3.9/site-packages/medpc2excel/medpc_read.py", line 114, in medpc_read
 

In [18]:
concatted_medpc_df

Unnamed: 0,(P)Portentry,(Q)USdelivery,(R)UStime,(W)ITIvalues,(S)CSpresentation,(N)Portexit,(K)CStype,(B)shockintensity,date,subject,file_path
0,12.34,64.0,399.0,0.0,60.01,12.39,1.0,0.0,20220506,3.4 (2),./data/timestamp_dataframes/2022-05-06_12h59m_...
1,14.60,144.0,399.0,0.0,140.01,14.79,1.0,0.0,20220506,3.4 (2),./data/timestamp_dataframes/2022-05-06_12h59m_...
2,23.95,234.0,399.0,0.0,230.01,24.88,1.0,0.0,20220506,3.4 (2),./data/timestamp_dataframes/2022-05-06_12h59m_...
3,31.83,314.0,399.0,0.0,310.01,31.90,1.0,0.0,20220506,3.4 (2),./data/timestamp_dataframes/2022-05-06_12h59m_...
4,31.99,389.0,399.0,0.0,385.01,32.09,1.0,0.0,20220506,3.4 (2),./data/timestamp_dataframes/2022-05-06_12h59m_...
...,...,...,...,...,...,...,...,...,...,...,...
2536,,,,,,,1.0,,20220508,1.3,./data/timestamp_dataframes/2022-05-08_11h14m_...
2537,,,,,,,1.0,,20220508,1.3,./data/timestamp_dataframes/2022-05-08_11h14m_...
2538,,,,,,,1.0,,20220508,1.3,./data/timestamp_dataframes/2022-05-08_11h14m_...
2539,,,,,,,1.0,,20220508,1.3,./data/timestamp_dataframes/2022-05-08_11h14m_...


- Getting the cage numbers and the dates so that we can include it in the file name

In [19]:
# removing blank spaces
cage_numbers = [number for number in metadata_df["cage"].unique() if number]
# sorting numbers
cage_numbers = sorted(cage_numbers)
cage_numbers_for_title = "_".join(cage_numbers)

In [20]:
cage_numbers_for_title

'1_2_3_4'

In [21]:
# Getting the first and last recording date to get a range
earliest_date = concatted_medpc_df["date"].min()
latest_date = concatted_medpc_df["date"].max()

In [22]:
earliest_date

'20220503'

In [23]:
latest_date

'20220516'

In [24]:
metadata_df.to_csv("./data/extracted_recording_data_and_metadata/metadata_cage_{}_date_{}_{}.csv".format(cage_numbers_for_title, earliest_date, latest_date))
metadata_df.to_excel("./data/extracted_recording_data_and_metadata/metadata_cage_{}_date_{}_{}.xlsx".format(cage_numbers_for_title, earliest_date, latest_date))

In [25]:
concatted_medpc_df.to_csv("./data/extracted_recording_data_and_metadata/MEDPC_recording_cage_{}_date_{}_{}.csv".format(cage_numbers_for_title, earliest_date, latest_date))
concatted_medpc_df.to_excel("./data/extracted_recording_data_and_metadata/MEDPC_recording_cage_{}_date_{}_{}.xlsx".format(cage_numbers_for_title, earliest_date, latest_date))

- Combining the recording and the metadata into one dataframe

In [26]:
recording_and_metadata_df = concatted_medpc_df.merge(metadata_df, left_on='file_path', right_on='index')

In [27]:
recording_and_metadata_df

Unnamed: 0,(P)Portentry,(Q)USdelivery,(R)UStime,(W)ITIvalues,(S)CSpresentation,(N)Portexit,(K)CStype,(B)shockintensity,date,subject,...,Start Date,End Date,Subject,Experiment,Group,Box,Start Time,End Time,MSN,cage
0,12.34,64.0,399.0,0.0,60.01,12.39,1.0,0.0,20220506,3.4 (2),...,05/06/22,05/06/22,3.4 (2),Pilot of Pilot,Cage 4,1,12:59:58,14:02:38,levelNP_CS_reward_laserepochON1st_noshock,4
1,14.60,144.0,399.0,0.0,140.01,14.79,1.0,0.0,20220506,3.4 (2),...,05/06/22,05/06/22,3.4 (2),Pilot of Pilot,Cage 4,1,12:59:58,14:02:38,levelNP_CS_reward_laserepochON1st_noshock,4
2,23.95,234.0,399.0,0.0,230.01,24.88,1.0,0.0,20220506,3.4 (2),...,05/06/22,05/06/22,3.4 (2),Pilot of Pilot,Cage 4,1,12:59:58,14:02:38,levelNP_CS_reward_laserepochON1st_noshock,4
3,31.83,314.0,399.0,0.0,310.01,31.90,1.0,0.0,20220506,3.4 (2),...,05/06/22,05/06/22,3.4 (2),Pilot of Pilot,Cage 4,1,12:59:58,14:02:38,levelNP_CS_reward_laserepochON1st_noshock,4
4,31.99,389.0,399.0,0.0,385.01,32.09,1.0,0.0,20220506,3.4 (2),...,05/06/22,05/06/22,3.4 (2),Pilot of Pilot,Cage 4,1,12:59:58,14:02:38,levelNP_CS_reward_laserepochON1st_noshock,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446884,,,,,,,1.0,,20220508,1.3,...,05/08/22,05/08/22,1.3,Pilot of Pilot,Cage 2,4,11:14:36,12:19:20,levelNP_CS_reward_laserepochON1st_noshock,2
446885,,,,,,,1.0,,20220508,1.3,...,05/08/22,05/08/22,1.3,Pilot of Pilot,Cage 2,4,11:14:36,12:19:20,levelNP_CS_reward_laserepochON1st_noshock,2
446886,,,,,,,1.0,,20220508,1.3,...,05/08/22,05/08/22,1.3,Pilot of Pilot,Cage 2,4,11:14:36,12:19:20,levelNP_CS_reward_laserepochON1st_noshock,2
446887,,,,,,,1.0,,20220508,1.3,...,05/08/22,05/08/22,1.3,Pilot of Pilot,Cage 2,4,11:14:36,12:19:20,levelNP_CS_reward_laserepochON1st_noshock,2


In [28]:
recording_and_metadata_df.to_csv("./data/extracted_recording_data_and_metadata/recording_and_metadata_cage_{}_date_{}_{}.csv".format(cage_numbers_for_title, earliest_date, latest_date))
recording_and_metadata_df.to_excel("./data/extracted_recording_data_and_metadata/recording_and_metadata_cage_{}_date_{}_{}.xlsx".format(cage_numbers_for_title, earliest_date, latest_date))