# Notebook 1: MED-PC Extracting the Recording Data and Metadata

Brief 1-2 sentence description of notebook.

In [1]:
import os
import sys
import glob
import re
from collections import defaultdict

In [2]:
# Imports of all used packages and libraries
import pandas as pd

In [3]:
# setting path
sys.path.append('../../src')

In [4]:
# All the libraries that were created for this repository
import extract.metadata
import extract.dataframe

## Inputs & Data

Explanation of each input and where it comes from.

In [5]:
# Inputs and Required data loading
# input varaible names are in all caps snake case
# Whenever an input changes or is used for processing 
# the vairables are all lower in snake case
INPUT_VARIABLE = 1
OUTPUT_DIR = r"/root/work/" # where data is saved should always be shown in the inputs
OUTPUT_PREFIX = "example"

In [6]:
# Getting all the file paths of the recording files(that happen to all end in `.txt`)
ALL_MED_PC_FILE = glob.glob("./data/*.txt")

## Outputs

Describe each output that the notebook creates. 

- Is it a plot or is it data?

- How valuable is the output and why is it valuable or useful?

## Processing

- Example of what the MED-PC Recording file looks like

In [7]:
with open(ALL_MED_PC_FILE[0]) as f:
    lines = f.readlines()
    for line in lines[:20]:
        print(line)

File: C:\MED-PC\Data\2022-09-24_16h16m_Subject 2.3.txt







Start Date: 09/24/22

End Date: 09/24/22

Subject: 2.3

Experiment: CD1_vs_C57_Comparison

Group: Cage_2

Box: 1

Start Time: 16:16:04

End Time: 17:19:06

MSN: C57_reward_training

A:    4399.000

D:    9000.000

E:       0.000

L:       0.000

M:       0.000

O:       0.000

T:    3660.000



- We will be extracting the first 10 or so lines that look like:

```
File: C:\MED-PC\Data\2022-05-06_12h59m_Subject 3.4 (2).txt

Start Date: 05/06/22

End Date: 05/06/22

Subject: 3.4 (2)

Experiment: Pilot of Pilot

Group: Cage 4

Box: 1

Start Time: 12:59:58

End Time: 14:02:38

MSN: levelNP_CS_reward_laserepochON1st_noshock
```

- We will just find all the lines that start with `"File", "Start Date", "End Date", "Subject", "Experiment", "Group", "Box", "Start Time", "End Time", or "MSN"`. And get the metadata from those lines. And then stop once all the metadata types have been collected

In [8]:
# This makes a nested dictionary of file paths to each individual metadata type
file_path_to_meta_data = extract.metadata.get_all_med_pc_meta_data_from_files(list_of_files=ALL_MED_PC_FILE)

In [9]:
# The metadata for the first file
for key, value in file_path_to_meta_data.items():
    print("File path: {}".format(key))
    print("Metadata types and associated values: {}".format(value))
    break

File path: ./data/2022-09-24_16h16m_Subject 2.3.txt
Metadata types and associated values: {'File': 'C:\\MED-PC\\Data\\2022-09-24_16h16m_Subject 2.3.txt', 'Start Date': '09/24/22', 'End Date': '09/24/22', 'Subject': '2.3', 'Experiment': 'CD1_vs_C57_Comparison', 'Group': 'Cage_2', 'Box': '1', 'Start Time': '16:16:04', 'End Time': '17:19:06', 'MSN': 'C57_reward_training'}


## Making a Dataframe out of the Metadata

- A Dataframe is essentially a "programmable" spreadsheet. But instead of clicking on cells, you will have to tell Python how you want to interact with the spreadsheet
    - For more information: https://realpython.com/pandas-dataframe/

In [10]:
# Turning the dictionary into a Pandas Dataframe
metadata_df = pd.DataFrame.from_dict(file_path_to_meta_data, orient="index")
# Resetting the index because currently the file path is the index 
metadata_df = metadata_df.reset_index()

- Removing all rows that don't have a cage number (which is called group in here)

In [11]:
metadata_df = metadata_df.dropna(subset=["Group"])

In [12]:
metadata_df = metadata_df[metadata_df["Group"] != ""]

- Getting the numbers out of the column that contains the cage information

In [13]:
metadata_df["cage"] = metadata_df["Group"].apply(lambda x: re.findall(r'\d+', x)[0])

In [14]:
metadata_df["cage"].unique()

array(['2', '1'], dtype=object)

- Labeling the cohort name based on the date

In [15]:
# Turning the string into a datetime object
metadata_df["date"] = pd.to_datetime(metadata_df["Start Date"])

- Grouping the cohorts

In [16]:
metadata_df = metadata_df.sort_values(["date", "Start Time", "Subject"]).reset_index(drop=True)

In [17]:
metadata_df.head()

Unnamed: 0,index,File,Start Date,End Date,Subject,Experiment,Group,Box,Start Time,End Time,MSN,cage,date
0,./data/2022-09-20_10h06m_Subject 1.1.txt,C:\MED-PC\Data\2022-09-20_10h06m_Subject 1.1.txt,09/20/22,09/20/22,1.1,CD1_vs_C57_Comparison,Cage_1,2,10:06:09,11:13:16,C57_reward_training,1,2022-09-20
1,./data/2022-09-20_10h06m_Subject 1.2.txt,C:\MED-PC\Data\2022-09-20_10h06m_Subject 1.2.txt,09/20/22,09/20/22,1.2,CD1_vs_C57_Comparison,Cage_1,4,10:06:09,11:13:16,C57_reward_training,1,2022-09-20
2,./data/2022-09-20_10h06m_Subject 1.3.txt,C:\MED-PC\Data\2022-09-20_10h06m_Subject 1.3.txt,09/20/22,09/20/22,1.3,CD1_vs_C57_Comparison,Cage_1,1,10:06:09,11:13:16,C57_reward_training,1,2022-09-20
3,./data/2022-09-20_10h06m_Subject 1.4.txt,C:\MED-PC\Data\2022-09-20_10h06m_Subject 1.4.txt,09/20/22,09/20/22,1.4,CD1_vs_C57_Comparison,Cage_1,3,10:06:09,11:13:16,C57_reward_training,1,2022-09-20
4,./data/2022-09-20_13h36m_Subject 2.1.txt,C:\MED-PC\Data\2022-09-20_13h36m_Subject 2.1.txt,09/20/22,09/20/22,2.1,CD1_vs_C57_Comparison,Cage_2,2,13:36:03,14:46:38,C57_reward_training,2,2022-09-20


In [18]:
concatted_medpc_df = extract.dataframe.get_medpc_dataframe_from_list_of_files(medpc_files=ALL_MED_PC_FILE)

In [19]:
concatted_medpc_df = concatted_medpc_df.drop(columns=["date"])

- Combining the recording and the metadata into one dataframe

In [20]:
recording_and_metadata_df = concatted_medpc_df.merge(metadata_df, left_on='file_path', right_on='index', how="inner")

In [21]:
recording_and_metadata_df = recording_and_metadata_df.sort_values(["date", "Start Time", "Subject"]).reset_index(drop=True)

In [22]:
recording_and_metadata_df.head()

Unnamed: 0,(P)Portentry,(Q)USdelivery,(R)UStime,(W)ITIvalues,(S)CSpresentation,(N)Portexit,(K)CStype,(G)controlled_stimulus_secondscomputer,(H)controlled_stimulus_minutescomputer,(I)controlled_stimulus_hourscomputer,...,End Date,Subject,Experiment,Group,Box,Start Time,End Time,MSN,cage,date
0,45.37,64.0,399.0,0.0,60.01,45.43,1.0,17.0,13.0,10.0,...,09/20/22,1.1,CD1_vs_C57_Comparison,Cage_1,2,10:06:09,11:13:16,C57_reward_training,1,2022-09-20
1,71.12,144.0,399.0,0.0,140.01,71.21,1.0,37.0,14.0,10.0,...,09/20/22,1.1,CD1_vs_C57_Comparison,Cage_1,2,10:06:09,11:13:16,C57_reward_training,1,2022-09-20
2,71.24,234.0,399.0,0.0,230.01,71.34,1.0,6.0,16.0,10.0,...,09/20/22,1.1,CD1_vs_C57_Comparison,Cage_1,2,10:06:09,11:13:16,C57_reward_training,1,2022-09-20
3,71.56,314.0,399.0,0.0,310.01,71.59,1.0,27.0,17.0,10.0,...,09/20/22,1.1,CD1_vs_C57_Comparison,Cage_1,2,10:06:09,11:13:16,C57_reward_training,1,2022-09-20
4,71.66,389.0,399.0,0.0,385.01,71.71,1.0,41.0,18.0,10.0,...,09/20/22,1.1,CD1_vs_C57_Comparison,Cage_1,2,10:06:09,11:13:16,C57_reward_training,1,2022-09-20


In [23]:
recording_and_metadata_df.tail()

Unnamed: 0,(P)Portentry,(Q)USdelivery,(R)UStime,(W)ITIvalues,(S)CSpresentation,(N)Portexit,(K)CStype,(G)controlled_stimulus_secondscomputer,(H)controlled_stimulus_minutescomputer,(I)controlled_stimulus_hourscomputer,...,End Date,Subject,Experiment,Group,Box,Start Time,End Time,MSN,cage,date
201495,,,,,,,,,,,...,09/27/22,1.4,CD1_vs_C57_Comparison,Cage_1,4,16:16:59,17:26:20,C57_reward_training,1,2022-09-27
201496,,,,,,,,,,,...,09/27/22,1.4,CD1_vs_C57_Comparison,Cage_1,4,16:16:59,17:26:20,C57_reward_training,1,2022-09-27
201497,,,,,,,,,,,...,09/27/22,1.4,CD1_vs_C57_Comparison,Cage_1,4,16:16:59,17:26:20,C57_reward_training,1,2022-09-27
201498,,,,,,,,,,,...,09/27/22,1.4,CD1_vs_C57_Comparison,Cage_1,4,16:16:59,17:26:20,C57_reward_training,1,2022-09-27
201499,,,,,,,,,,,...,09/27/22,1.4,CD1_vs_C57_Comparison,Cage_1,4,16:16:59,17:26:20,C57_reward_training,1,2022-09-27


## Adding other useful information

- Adding the date as the number of subsequent days since the start

In [24]:
recording_and_metadata_df["cage"] = recording_and_metadata_df["cage"].apply(lambda x: str(x).strip().lower()) 
recording_and_metadata_df["Subject"] = recording_and_metadata_df["Subject"].apply(lambda x: str(x).strip().lower()) 

In [25]:
recording_and_metadata_df["cohort_cage_id"] = recording_and_metadata_df.apply(lambda x: "_".join([x["cage"], x["Subject"]]), axis=1)

In [26]:
recording_and_metadata_df["cohort_cage_id"].unique()

array(['1_1.1', '1_1.2', '1_1.3', '1_1.4', '2_2.1', '2_2.2', '2_2.3',
       '2_2.4'], dtype=object)

In [27]:
recording_and_metadata_df["date_str"] = recording_and_metadata_df["date"].dt.strftime('%Y-%m-%d')

In [28]:
cohort_cage_id_to_date_to_session = defaultdict(dict)

for cohort_cage_id in recording_and_metadata_df["cohort_cage_id"].unique():
    cohort_cage_id_df = recording_and_metadata_df[recording_and_metadata_df["cohort_cage_id"] == cohort_cage_id]
    for index, date in enumerate(sorted(cohort_cage_id_df["date_str"].unique())):
        cohort_cage_id_to_date_to_session[cohort_cage_id][date] = index + 1

In [29]:
date

'2022-09-27'

In [30]:
recording_and_metadata_df["session"] = recording_and_metadata_df.apply(lambda x: cohort_cage_id_to_date_to_session[x["cohort_cage_id"]][x["date_str"]], axis=1)

In [31]:
recording_and_metadata_df.head()

Unnamed: 0,(P)Portentry,(Q)USdelivery,(R)UStime,(W)ITIvalues,(S)CSpresentation,(N)Portexit,(K)CStype,(G)controlled_stimulus_secondscomputer,(H)controlled_stimulus_minutescomputer,(I)controlled_stimulus_hourscomputer,...,Group,Box,Start Time,End Time,MSN,cage,date,cohort_cage_id,date_str,session
0,45.37,64.0,399.0,0.0,60.01,45.43,1.0,17.0,13.0,10.0,...,Cage_1,2,10:06:09,11:13:16,C57_reward_training,1,2022-09-20,1_1.1,2022-09-20,1
1,71.12,144.0,399.0,0.0,140.01,71.21,1.0,37.0,14.0,10.0,...,Cage_1,2,10:06:09,11:13:16,C57_reward_training,1,2022-09-20,1_1.1,2022-09-20,1
2,71.24,234.0,399.0,0.0,230.01,71.34,1.0,6.0,16.0,10.0,...,Cage_1,2,10:06:09,11:13:16,C57_reward_training,1,2022-09-20,1_1.1,2022-09-20,1
3,71.56,314.0,399.0,0.0,310.01,71.59,1.0,27.0,17.0,10.0,...,Cage_1,2,10:06:09,11:13:16,C57_reward_training,1,2022-09-20,1_1.1,2022-09-20,1
4,71.66,389.0,399.0,0.0,385.01,71.71,1.0,41.0,18.0,10.0,...,Cage_1,2,10:06:09,11:13:16,C57_reward_training,1,2022-09-20,1_1.1,2022-09-20,1


In [32]:
recording_and_metadata_df.tail()

Unnamed: 0,(P)Portentry,(Q)USdelivery,(R)UStime,(W)ITIvalues,(S)CSpresentation,(N)Portexit,(K)CStype,(G)controlled_stimulus_secondscomputer,(H)controlled_stimulus_minutescomputer,(I)controlled_stimulus_hourscomputer,...,Group,Box,Start Time,End Time,MSN,cage,date,cohort_cage_id,date_str,session
201495,,,,,,,,,,,...,Cage_1,4,16:16:59,17:26:20,C57_reward_training,1,2022-09-27,1_1.4,2022-09-27,8
201496,,,,,,,,,,,...,Cage_1,4,16:16:59,17:26:20,C57_reward_training,1,2022-09-27,1_1.4,2022-09-27,8
201497,,,,,,,,,,,...,Cage_1,4,16:16:59,17:26:20,C57_reward_training,1,2022-09-27,1_1.4,2022-09-27,8
201498,,,,,,,,,,,...,Cage_1,4,16:16:59,17:26:20,C57_reward_training,1,2022-09-27,1_1.4,2022-09-27,8
201499,,,,,,,,,,,...,Cage_1,4,16:16:59,17:26:20,C57_reward_training,1,2022-09-27,1_1.4,2022-09-27,8


In [33]:
recording_and_metadata_df.groupby(["subject", "date", "session"]).count().head(n=50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,(P)Portentry,(Q)USdelivery,(R)UStime,(W)ITIvalues,(S)CSpresentation,(N)Portexit,(K)CStype,(G)controlled_stimulus_secondscomputer,(H)controlled_stimulus_minutescomputer,(I)controlled_stimulus_hourscomputer,...,Subject,Experiment,Group,Box,Start Time,End Time,MSN,cage,cohort_cage_id,date_str
subject,date,session,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1.1,2022-09-20,1,919,40,40,2501,98,919,251,251,251,251,...,2508,2508,2508,2508,2508,2508,2508,2508,2508,2508
1.1,2022-09-21,2,1211,40,40,2501,98,1211,251,251,251,251,...,2508,2508,2508,2508,2508,2508,2508,2508,2508,2508
1.1,2022-09-22,3,2448,40,40,2501,98,2448,251,251,251,251,...,2508,2508,2508,2508,2508,2508,2508,2508,2508,2508
1.1,2022-09-23,4,1402,40,40,2501,98,1402,251,251,251,251,...,2508,2508,2508,2508,2508,2508,2508,2508,2508,2508
1.1,2022-09-24,5,1237,40,40,2501,98,1236,251,251,251,251,...,2508,2508,2508,2508,2508,2508,2508,2508,2508,2508
1.1,2022-09-25,6,435,40,40,2501,98,434,251,251,251,251,...,2508,2508,2508,2508,2508,2508,2508,2508,2508,2508
1.1,2022-09-26,7,2415,40,40,2501,98,2415,251,251,251,251,...,2508,2508,2508,2508,2508,2508,2508,2508,2508,2508
1.1,2022-09-27,8,2269,40,40,2501,98,2269,251,251,251,251,...,6008,6008,6008,6008,6008,6008,6008,6008,6008,6008
1.2,2022-09-20,1,1142,40,40,2501,98,1142,251,251,251,251,...,2508,2508,2508,2508,2508,2508,2508,2508,2508,2508
1.2,2022-09-21,2,1783,40,40,2501,98,1783,251,251,251,251,...,2508,2508,2508,2508,2508,2508,2508,2508,2508,2508


## Saving the dataframes(spreadsheets to files) with the metadata as part of the name

- Making necessary directories
    - If you want to use any of the other metadata as part of the name, you will have to swap out the variables in the `format()` and change the name of the folder to match your new name. The variable names are the word that's in front of the `=` at the last line of each cell. The `{}` are where the metadata variables will be inserted into the file name. For more information on formatting strings: https://www.w3schools.com/python/ref_string_format.asp
    - You can also just manually rename the files by replacing everything in `""` and removing the `.format()` part
- **NOTE: You may get an error that the file does not exist. If this is the case, it could be the file name is too long(an issue that may happen when using Jupyter Notebooks on Windows)**

In [34]:
output_directory = "./proc/extracted_recording_data_and_metadata"

In [35]:
output_directory

'./proc/extracted_recording_data_and_metadata'

In [36]:
os.makedirs(output_directory, exist_ok=True)

In [38]:
metadata_df.to_csv(os.path.join(output_directory, "{}_pilot_reward_training_metadata.csv".format(OUTPUT_PREFIX)))

In [39]:
concatted_medpc_df.to_csv(os.path.join(output_directory, "{}_pilot_reward_training_medpc.csv".format(OUTPUT_PREFIX)))

In [40]:
recording_and_metadata_df.to_csv(os.path.join(output_directory, "{}_pilot_reward_training_medpc_metadata.csv".format(OUTPUT_PREFIX)))

In [41]:
recording_and_metadata_df.groupby(["date_str", "session"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,(P)Portentry,(Q)USdelivery,(R)UStime,(W)ITIvalues,(S)CSpresentation,(N)Portexit,(K)CStype,(G)controlled_stimulus_secondscomputer,(H)controlled_stimulus_minutescomputer,(I)controlled_stimulus_hourscomputer,...,Subject,Experiment,Group,Box,Start Time,End Time,MSN,cage,date,cohort_cage_id
date_str,session,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022-09-20,1,7426,296,296,20008,784,7426,2008,2008,2008,2008,...,20064,20064,20064,20064,20064,20064,20064,20064,20064,20064
2022-09-21,2,9132,312,312,20008,784,9131,2008,2008,2008,2008,...,20064,20064,20064,20064,20064,20064,20064,20064,20064,20064
2022-09-22,3,15101,319,319,20008,784,15101,2008,2008,2008,2008,...,21678,21678,21678,21678,21678,21678,21678,21678,21678,21678
2022-09-23,4,14235,320,320,20008,784,14235,2008,2008,2008,2008,...,20922,20922,20922,20922,20922,20922,20922,20922,20922,20922
2022-09-24,5,11211,320,320,20008,784,11208,2008,2008,2008,2008,...,20064,20064,20064,20064,20064,20064,20064,20064,20064,20064
2022-09-25,6,21602,320,320,20008,784,21600,2008,2008,2008,2008,...,27157,27157,27157,27157,27157,27157,27157,27157,27157,27157
2022-09-26,7,22486,320,320,20008,784,22483,2008,2008,2008,2008,...,23487,23487,23487,23487,23487,23487,23487,23487,23487,23487
2022-09-27,8,26881,320,320,20008,784,26878,2008,2008,2008,2008,...,48064,48064,48064,48064,48064,48064,48064,48064,48064,48064
