<a href="https://colab.research.google.com/github/obarnstedt/LINdoscope2023/blob/main/notebooks/intro_to_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Working with packages and modules

So far, we have been using Python's internal functions. These are very limited, but, thankfully, there is a plethora of packages that can do most tasks you desire. Because many of these packages are quite complex in the functionality they provide and their interdependencies with other packages, it is important to work with virtual environments, e.g., through conda, for specific projects/workflows/complex packages.

In [14]:
abs(-5)

5

In [15]:
#sqrt(4)

In [16]:
# importing packages
import numpy  # comprehensive package for numeric array calculations

In [17]:
numpy.sqrt(4)

2.0

In [18]:
import numpy as np  # if you're lazy...

In [19]:
np.sqrt(4)

2.0

In [20]:
#randint(10)  # this function doesn't exist in vanilla Python

In [21]:
np.random.randint(10)  # random number generator (integers from 0 to 10)

8

In [22]:
from numpy import random  # you can import specific "MODULES" from packages like so
random.randint(10)

5

In [23]:
from numpy import random as rnd  # you can also give them names you like
rnd.randint(10)

0

# Pandas basics
parts modified from https://pandas.pydata.org/docs/user_guide/10min.html

In [24]:
import pandas as pd  # numpy-based dataframe calculations

Pandas provides two types of classes for handling data:

Series: a one-dimensional labeled array holding data of any type
such as integers, strings, Python objects etc.

DataFrame: a two-dimensional data structure that holds data like a two-dimension array or a table with rows and columns.

In [25]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [26]:
array = np.random.rand(20, 4)
array

array([[9.64323036e-01, 1.25428928e-01, 6.16250747e-01, 3.77083957e-01],
       [1.27480965e-01, 9.78857229e-01, 5.93942779e-01, 6.93902303e-01],
       [7.38665192e-01, 7.03421384e-01, 7.46529956e-01, 3.16899538e-01],
       [5.74754556e-01, 3.85434326e-01, 6.71552798e-02, 2.10412946e-01],
       [5.21405779e-02, 5.49016881e-01, 6.98939945e-01, 1.77787312e-01],
       [7.41836792e-01, 8.53553072e-01, 2.39986278e-01, 9.10084300e-01],
       [1.53663447e-01, 6.46837965e-02, 3.61550032e-01, 3.51468103e-01],
       [4.41064090e-01, 3.01225295e-01, 6.04675512e-01, 2.58179752e-01],
       [5.92372355e-02, 9.86442023e-01, 9.63252434e-01, 1.04459499e-01],
       [1.04696003e-01, 7.35780422e-01, 8.85665732e-01, 9.66903987e-01],
       [3.76022332e-01, 1.33096791e-01, 3.06794588e-01, 6.03240740e-01],
       [5.93003009e-01, 7.11613208e-02, 5.33448366e-01, 5.41161358e-01],
       [9.38778609e-01, 6.62504932e-01, 4.17972945e-01, 8.07903346e-01],
       [7.82208890e-01, 6.53467544e-01, 2.35116568e

In [27]:
df = pd.DataFrame(data=array, index=range(20), columns=['col1', 'col2', 'col3', 'col4'])
df

Unnamed: 0,col1,col2,col3,col4
0,0.964323,0.125429,0.616251,0.377084
1,0.127481,0.978857,0.593943,0.693902
2,0.738665,0.703421,0.74653,0.3169
3,0.574755,0.385434,0.067155,0.210413
4,0.052141,0.549017,0.69894,0.177787
5,0.741837,0.853553,0.239986,0.910084
6,0.153663,0.064684,0.36155,0.351468
7,0.441064,0.301225,0.604676,0.25818
8,0.059237,0.986442,0.963252,0.104459
9,0.104696,0.73578,0.885666,0.966904


## Viewing

In [28]:
df.dtypes  # data types

col1    float64
col2    float64
col3    float64
col4    float64
dtype: object

In [29]:
df.head()

Unnamed: 0,col1,col2,col3,col4
0,0.964323,0.125429,0.616251,0.377084
1,0.127481,0.978857,0.593943,0.693902
2,0.738665,0.703421,0.74653,0.3169
3,0.574755,0.385434,0.067155,0.210413
4,0.052141,0.549017,0.69894,0.177787


In [30]:
df.tail(3)

Unnamed: 0,col1,col2,col3,col4
17,0.243039,0.845265,0.636679,0.618127
18,0.492308,0.970717,0.837672,0.333556
19,0.298806,0.120043,0.125621,0.682092


In [31]:
df.index

RangeIndex(start=0, stop=20, step=1)

In [32]:
df.columns

Index(['col1', 'col2', 'col3', 'col4'], dtype='object')

In [33]:
df.describe()

Unnamed: 0,col1,col2,col3,col4
count,20.0,20.0,20.0,20.0
mean,0.455614,0.588461,0.536793,0.490006
std,0.309614,0.341424,0.304428,0.297745
min,0.024783,0.064684,0.000891,0.01431
25%,0.147118,0.259193,0.290093,0.246238
50%,0.466686,0.682963,0.599309,0.459123
75%,0.71414,0.870682,0.769315,0.696715
max,0.964323,0.986442,0.988814,0.966904


In [34]:
df.T  # transpose

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
col1,0.964323,0.127481,0.738665,0.574755,0.052141,0.741837,0.153663,0.441064,0.059237,0.104696,0.376022,0.593003,0.938779,0.782209,0.705965,0.024783,0.699511,0.243039,0.492308,0.298806
col2,0.125429,0.978857,0.703421,0.385434,0.549017,0.853553,0.064684,0.301225,0.986442,0.73578,0.133097,0.071161,0.662505,0.653468,0.761703,0.92207,0.945352,0.845265,0.970717,0.120043
col3,0.616251,0.593943,0.74653,0.067155,0.69894,0.239986,0.36155,0.604676,0.963252,0.885666,0.306795,0.533448,0.417973,0.235117,0.000891,0.874909,0.988814,0.636679,0.837672,0.125621
col4,0.377084,0.693902,0.3169,0.210413,0.177787,0.910084,0.351468,0.25818,0.104459,0.966904,0.603241,0.541161,0.807903,0.705154,0.964474,0.162929,0.01431,0.618127,0.333556,0.682092


In [35]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,col4,col3,col2,col1
0,0.377084,0.616251,0.125429,0.964323
1,0.693902,0.593943,0.978857,0.127481
2,0.3169,0.74653,0.703421,0.738665
3,0.210413,0.067155,0.385434,0.574755
4,0.177787,0.69894,0.549017,0.052141
5,0.910084,0.239986,0.853553,0.741837
6,0.351468,0.36155,0.064684,0.153663
7,0.25818,0.604676,0.301225,0.441064
8,0.104459,0.963252,0.986442,0.059237
9,0.966904,0.885666,0.73578,0.104696


In [36]:
df.sort_values(by="col2")

Unnamed: 0,col1,col2,col3,col4
6,0.153663,0.064684,0.36155,0.351468
11,0.593003,0.071161,0.533448,0.541161
19,0.298806,0.120043,0.125621,0.682092
0,0.964323,0.125429,0.616251,0.377084
10,0.376022,0.133097,0.306795,0.603241
7,0.441064,0.301225,0.604676,0.25818
3,0.574755,0.385434,0.067155,0.210413
4,0.052141,0.549017,0.69894,0.177787
13,0.782209,0.653468,0.235117,0.705154
12,0.938779,0.662505,0.417973,0.807903


## Indexing

In [37]:
df["col3"]

0     0.616251
1     0.593943
2     0.746530
3     0.067155
4     0.698940
5     0.239986
6     0.361550
7     0.604676
8     0.963252
9     0.885666
10    0.306795
11    0.533448
12    0.417973
13    0.235117
14    0.000891
15    0.874909
16    0.988814
17    0.636679
18    0.837672
19    0.125621
Name: col3, dtype: float64

In [38]:
df[17:20]

Unnamed: 0,col1,col2,col3,col4
17,0.243039,0.845265,0.636679,0.618127
18,0.492308,0.970717,0.837672,0.333556
19,0.298806,0.120043,0.125621,0.682092


In [39]:
df.loc[17:20]  # more versatile

Unnamed: 0,col1,col2,col3,col4
17,0.243039,0.845265,0.636679,0.618127
18,0.492308,0.970717,0.837672,0.333556
19,0.298806,0.120043,0.125621,0.682092


In [40]:
df.loc[15:16, ["col2", "col4"]]  # always FIRST ROW, then SECOND COLUMN

Unnamed: 0,col2,col4
15,0.92207,0.162929
16,0.945352,0.01431


In [41]:
df.loc[(df.col1 > 0.5) & (df.col2 < 0.5)]  # select all rows where 'col1' is greater 0.5, 'col2' is smaller 0.5

Unnamed: 0,col1,col2,col3,col4
0,0.964323,0.125429,0.616251,0.377084
3,0.574755,0.385434,0.067155,0.210413
11,0.593003,0.071161,0.533448,0.541161


In [42]:
df.iloc[3:5, 0:2]  # selecting by position: ILOC

Unnamed: 0,col1,col2
3,0.574755,0.385434
4,0.052141,0.549017


## Missing values

In [43]:
df.loc[(df.col1>0.5)&(df.col2<0.5)] = np.nan  # set these rows to NAN
df

Unnamed: 0,col1,col2,col3,col4
0,,,,
1,0.127481,0.978857,0.593943,0.693902
2,0.738665,0.703421,0.74653,0.3169
3,,,,
4,0.052141,0.549017,0.69894,0.177787
5,0.741837,0.853553,0.239986,0.910084
6,0.153663,0.064684,0.36155,0.351468
7,0.441064,0.301225,0.604676,0.25818
8,0.059237,0.986442,0.963252,0.104459
9,0.104696,0.73578,0.885666,0.966904


In [44]:
df.dropna()  # drop all rows with missing values (not saved!)

Unnamed: 0,col1,col2,col3,col4
1,0.127481,0.978857,0.593943,0.693902
2,0.738665,0.703421,0.74653,0.3169
4,0.052141,0.549017,0.69894,0.177787
5,0.741837,0.853553,0.239986,0.910084
6,0.153663,0.064684,0.36155,0.351468
7,0.441064,0.301225,0.604676,0.25818
8,0.059237,0.986442,0.963252,0.104459
9,0.104696,0.73578,0.885666,0.966904
10,0.376022,0.133097,0.306795,0.603241
12,0.938779,0.662505,0.417973,0.807903


In [45]:
df.fillna(value=0)

Unnamed: 0,col1,col2,col3,col4
0,0.0,0.0,0.0,0.0
1,0.127481,0.978857,0.593943,0.693902
2,0.738665,0.703421,0.74653,0.3169
3,0.0,0.0,0.0,0.0
4,0.052141,0.549017,0.69894,0.177787
5,0.741837,0.853553,0.239986,0.910084
6,0.153663,0.064684,0.36155,0.351468
7,0.441064,0.301225,0.604676,0.25818
8,0.059237,0.986442,0.963252,0.104459
9,0.104696,0.73578,0.885666,0.966904


In [46]:
df.isna()

Unnamed: 0,col1,col2,col3,col4
0,True,True,True,True
1,False,False,False,False
2,False,False,False,False
3,True,True,True,True
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [47]:
df.interpolate(method='linear')  # interpolate missing values with 'linear' method

Unnamed: 0,col1,col2,col3,col4
0,,,,
1,0.127481,0.978857,0.593943,0.693902
2,0.738665,0.703421,0.74653,0.3169
3,0.395403,0.626219,0.722735,0.247343
4,0.052141,0.549017,0.69894,0.177787
5,0.741837,0.853553,0.239986,0.910084
6,0.153663,0.064684,0.36155,0.351468
7,0.441064,0.301225,0.604676,0.25818
8,0.059237,0.986442,0.963252,0.104459
9,0.104696,0.73578,0.885666,0.966904


## Operations

In [48]:
df.mean()

col1    0.410600
col2    0.658070
col3    0.559942
col4    0.510086
dtype: float64

In [49]:
df.mean(axis=1)

0          NaN
1     0.598546
2     0.626379
3          NaN
4     0.369471
5     0.686365
6     0.232841
7     0.401286
8     0.528348
9     0.673262
10    0.354789
11         NaN
12    0.706790
13    0.593987
14    0.608258
15    0.496172
16    0.661997
17    0.585777
18    0.658563
19    0.306640
dtype: float64

In [50]:
df.rolling(window=3).mean()[::3] # 3x downsampling; using pandas dataframe METHOD (note: this is not saved!)

Unnamed: 0,col1,col2,col3,col4
0,,,,
3,,,,
6,0.31588,0.489085,0.433492,0.47978
9,0.201666,0.674483,0.817865,0.443181
12,,,,
15,0.504319,0.77908,0.370305,0.610852
18,0.478286,0.920445,0.821055,0.321998


In [51]:
df.agg(lambda x: np.mean(x) * 5.6)  # apply for each column

col1    2.299362
col2    3.685195
col3    3.135674
col4    2.856484
dtype: float64

In [52]:
df.transform(lambda x: x * 101.2)  # apply for each cell

Unnamed: 0,col1,col2,col3,col4
0,,,,
1,12.901074,99.060352,60.107009,70.222913
2,74.752917,71.186244,75.548832,32.070233
3,,,,
4,5.276626,55.560508,70.732722,17.992076
5,75.073883,86.379571,24.286611,92.100531
6,15.550741,6.546,36.588863,35.568572
7,44.635686,30.484,61.193162,26.127791
8,5.994808,99.827933,97.481146,10.571301
9,10.595235,74.460979,89.629372,97.850684


In [53]:
df.fillna(0).multiply(10).astype(int)  # you can concatenate methods!

Unnamed: 0,col1,col2,col3,col4
0,0,0,0,0
1,1,9,5,6
2,7,7,7,3
3,0,0,0,0
4,0,5,6,1
5,7,8,2,9
6,1,0,3,3
7,4,3,6,2
8,0,9,9,1
9,1,7,8,9


## Merging

In [54]:
pieces = [df[:3], df[3:7], df[7:11]]
pieces

[       col1      col2      col3      col4
 0       NaN       NaN       NaN       NaN
 1  0.127481  0.978857  0.593943  0.693902
 2  0.738665  0.703421  0.746530  0.316900,
        col1      col2      col3      col4
 3       NaN       NaN       NaN       NaN
 4  0.052141  0.549017  0.698940  0.177787
 5  0.741837  0.853553  0.239986  0.910084
 6  0.153663  0.064684  0.361550  0.351468,
         col1      col2      col3      col4
 7   0.441064  0.301225  0.604676  0.258180
 8   0.059237  0.986442  0.963252  0.104459
 9   0.104696  0.735780  0.885666  0.966904
 10  0.376022  0.133097  0.306795  0.603241]

In [55]:
pd.concat(pieces)

Unnamed: 0,col1,col2,col3,col4
0,,,,
1,0.127481,0.978857,0.593943,0.693902
2,0.738665,0.703421,0.74653,0.3169
3,,,,
4,0.052141,0.549017,0.69894,0.177787
5,0.741837,0.853553,0.239986,0.910084
6,0.153663,0.064684,0.36155,0.351468
7,0.441064,0.301225,0.604676,0.25818
8,0.059237,0.986442,0.963252,0.104459
9,0.104696,0.73578,0.885666,0.966904


## Pivot tables

In [81]:
df2 = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)
df2

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-0.360846,0.906995
1,one,B,foo,0.798604,-0.363912
2,two,C,foo,-0.431161,0.746849
3,three,A,bar,1.581721,0.12528
4,one,B,bar,-2.049706,1.547749
5,one,C,bar,-0.623813,-0.531619
6,two,A,foo,-2.386376,0.510941
7,three,B,foo,-0.406205,-1.033913
8,one,C,foo,-0.623031,0.430885
9,one,A,bar,0.735712,-0.79319


In [82]:
df_pivot = pd.pivot_table(df2, values="D", index=["A", "B"], columns=["C"])
df_pivot

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.735712,-0.360846
one,B,-2.049706,0.798604
one,C,-0.623813,-0.623031
three,A,1.581721,
three,B,,-0.406205
three,C,-0.743877,
two,A,,-2.386376
two,B,-0.23108,
two,C,,-0.431161


# Importing and exporting data

## Export formats

In [58]:
df.to_csv("../files/foo.csv")

In [60]:
pd.read_csv("../files/foo.csv")

Unnamed: 0.1,Unnamed: 0,col1,col2,col3,col4
0,0,,,,
1,1,0.127481,0.978857,0.593943,0.693902
2,2,0.738665,0.703421,0.74653,0.3169
3,3,,,,
4,4,0.052141,0.549017,0.69894,0.177787
5,5,0.741837,0.853553,0.239986,0.910084
6,6,0.153663,0.064684,0.36155,0.351468
7,7,0.441064,0.301225,0.604676,0.25818
8,8,0.059237,0.986442,0.963252,0.104459
9,9,0.104696,0.73578,0.885666,0.966904


In [61]:
df.to_clipboard()

In [62]:
df.to_excel("../files/foo.xlsx")

In [64]:
df.to_hdf("../files/foo.h5", key='df')

In [65]:
df.to_feather("../files/foo.feather")

# Playaround dataset

There's a "playaround.feather" file in the "files" folder. Try the following:
1. Open it
2. What is the 'likelihood' value at index 17?
3. What are the median values for the three columns?
4. Are there NAN values?
5. Fill the NAN values with -1
6. Find all values with -1 and set them to NAN
7. Interpolate missing values with a cubic spline function
8. Save the result as "playaround_edit.h5"

# Experimental dataset

The file "files/20230428_VGC_0647_LaserOFFDLC_testdataset.csv" contains the DeepLabCut output file of a mouse traversing 5 minutes in an elevated plus maze (EPM). Your task is to find out how much time in percentage the mouse spent in the open arms and how much time it spent in the closed arms.

Tip: In this case, the closed arm was positioned vertically, the open arms horizontally. Vertical walls are within 480 < x < 555.

For pros: Calculate the number of open arm entries

## Importing data

In [75]:
dlc = pd.read_csv("../files/20230428_VGC_0647_LaserOFFDLC_testdataset.csv")
dlc

Unnamed: 0,scorer,DLC_resnet50_EPMApr27shuffle1_850000,DLC_resnet50_EPMApr27shuffle1_850000.1,DLC_resnet50_EPMApr27shuffle1_850000.2,DLC_resnet50_EPMApr27shuffle1_850000.3,DLC_resnet50_EPMApr27shuffle1_850000.4,DLC_resnet50_EPMApr27shuffle1_850000.5,DLC_resnet50_EPMApr27shuffle1_850000.6,DLC_resnet50_EPMApr27shuffle1_850000.7,DLC_resnet50_EPMApr27shuffle1_850000.8,DLC_resnet50_EPMApr27shuffle1_850000.9,DLC_resnet50_EPMApr27shuffle1_850000.10,DLC_resnet50_EPMApr27shuffle1_850000.11,DLC_resnet50_EPMApr27shuffle1_850000.12,DLC_resnet50_EPMApr27shuffle1_850000.13,DLC_resnet50_EPMApr27shuffle1_850000.14,DLC_resnet50_EPMApr27shuffle1_850000.15,DLC_resnet50_EPMApr27shuffle1_850000.16,DLC_resnet50_EPMApr27shuffle1_850000.17
0,bodyparts,nose,nose,nose,left paw,left paw,left paw,right paw,right paw,right paw,left hindpaw,left hindpaw,left hindpaw,right hindpaw,right hindpaw,right hindpaw,tail,tail,tail
1,coords,x,y,likelihood,x,y,likelihood,x,y,likelihood,x,y,likelihood,x,y,likelihood,x,y,likelihood
2,0,484.6491394042969,201.5142364501953,0.9965528249740601,497.260009765625,184.5144500732422,0.9917073845863342,532.584716796875,198.62220764160156,0.9999322891235352,499.3367614746094,170.40240478515625,0.9999860525131226,540.806640625,163.20863342285156,0.9999994039535522,522.0670166015625,135.1444549560547,0.9999971389770508
3,1,484.1271667480469,202.01803588867188,0.9954521059989929,495.7225341796875,183.4581756591797,0.9817245006561279,532.56494140625,198.4882354736328,0.9999420642852783,499.2420349121094,170.74761962890625,0.9999852180480957,540.7224731445312,163.1923828125,0.9999994039535522,522.0958251953125,135.4122314453125,0.9999970197677612
4,2,484.54620361328125,201.7998809814453,0.9969022870063782,495.71978759765625,183.4489288330078,0.9802581071853638,532.5015869140625,198.50601196289062,0.9999428987503052,499.0804443359375,170.94410705566406,0.9999837875366211,540.7445068359375,163.1587371826172,0.9999994039535522,522.20166015625,135.3200225830078,0.9999972581863403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11997,11995,505.4471130371094,492.70587158203125,0.9998606443405151,496.9927978515625,447.9296569824219,0.9999939203262329,513.1210327148438,463.7508239746094,0.9999961853027344,488.94268798828125,389.22479248046875,0.999998927116394,541.878173828125,393.3909606933594,0.9999945163726807,516.3087158203125,373.951416015625,1.0
11998,11996,504.7735290527344,492.8782958984375,0.9997372031211853,496.9559631347656,447.8119812011719,0.9999938011169434,513.05322265625,463.7399597167969,0.9999960660934448,488.94091796875,389.2264099121094,0.9999990463256836,541.8779296875,393.4159851074219,0.9999945163726807,516.4034423828125,373.970458984375,1.0
11999,11997,504.75067138671875,493.0707092285156,0.9997734427452087,496.9162292480469,447.7670593261719,0.9999938011169434,512.94921875,463.75726318359375,0.9999960660934448,488.99041748046875,389.1574401855469,0.9999990463256836,542.2305908203125,393.5039367675781,0.9999945163726807,516.4120483398438,373.9803466796875,1.0
12000,11998,503.94317626953125,492.6711730957031,0.9997828602790833,496.9664611816406,447.7694396972656,0.9999940395355225,512.8672485351562,463.7599182128906,0.9999957084655762,488.9565124511719,389.1590270996094,0.9999990463256836,541.8826293945312,393.4291076660156,0.9999945163726807,516.3770141601562,373.92620849609375,1.0


## Clean up dataframe

In [76]:
bodyparts = list(dlc.iloc[0, 1:].unique()) + ['centroid']
dlc.columns = [dlc.iloc[0], dlc.iloc[1]]
dlc = dlc.drop('bodyparts', axis=1).drop([0, 1], axis=0).reset_index(drop=True)
dlc = dlc.astype(float)
dlc

  dlc = dlc.drop('bodyparts', axis=1).drop([0, 1], axis=0).reset_index(drop=True)


Unnamed: 0_level_0,nose,nose,nose,left paw,left paw,left paw,right paw,right paw,right paw,left hindpaw,left hindpaw,left hindpaw,right hindpaw,right hindpaw,right hindpaw,tail,tail,tail
1,x,y,likelihood,x,y,likelihood,x,y,likelihood,x,y,likelihood,x,y,likelihood,x,y,likelihood
0,484.649139,201.514236,0.996553,497.260010,184.514450,0.991707,532.584717,198.622208,0.999932,499.336761,170.402405,0.999986,540.806641,163.208633,0.999999,522.067017,135.144455,0.999997
1,484.127167,202.018036,0.995452,495.722534,183.458176,0.981725,532.564941,198.488235,0.999942,499.242035,170.747620,0.999985,540.722473,163.192383,0.999999,522.095825,135.412231,0.999997
2,484.546204,201.799881,0.996902,495.719788,183.448929,0.980258,532.501587,198.506012,0.999943,499.080444,170.944107,0.999984,540.744507,163.158737,0.999999,522.201660,135.320023,0.999997
3,483.503876,202.272873,0.997923,497.498322,183.699356,0.993420,532.487976,198.398743,0.999966,498.371246,171.108597,0.999988,540.778931,163.290070,0.999999,522.532532,135.116318,0.999998
4,484.679871,203.999573,0.999774,494.976318,184.692993,0.999932,532.068176,198.287735,0.999982,499.239136,161.985825,0.999999,540.774597,163.335007,1.000000,522.674927,135.367294,0.999998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,505.447113,492.705872,0.999861,496.992798,447.929657,0.999994,513.121033,463.750824,0.999996,488.942688,389.224792,0.999999,541.878174,393.390961,0.999995,516.308716,373.951416,1.000000
11996,504.773529,492.878296,0.999737,496.955963,447.811981,0.999994,513.053223,463.739960,0.999996,488.940918,389.226410,0.999999,541.877930,393.415985,0.999995,516.403442,373.970459,1.000000
11997,504.750671,493.070709,0.999773,496.916229,447.767059,0.999994,512.949219,463.757263,0.999996,488.990417,389.157440,0.999999,542.230591,393.503937,0.999995,516.412048,373.980347,1.000000
11998,503.943176,492.671173,0.999783,496.966461,447.769440,0.999994,512.867249,463.759918,0.999996,488.956512,389.159027,0.999999,541.882629,393.429108,0.999995,516.377014,373.926208,1.000000


## Clean up low confidence data

In [77]:
# confidence thresholding
conf_thresh = 0.9
dlc_clean = dlc.copy().drop('likelihood', axis=1, level=1)
for marker_name in bodyparts[:-1]:
    dlc_clean.loc[(dlc.loc[:, (marker_name, 'likelihood')]<conf_thresh), (marker_name, ['x', 'y'])] = np.nan
dlc_clean = dlc_clean.interpolate(method='linear')

## Calculate centroid

In [78]:
#calculate centroid
for x in ['x', 'y']:
    dlc_clean['centroid', x] = dlc_clean.loc[:,(dlc_clean.columns.get_level_values(0).unique(), x)].mean(axis=1)

## Median-filter traces


In [80]:
# smoothed traces
dlc_smoothed = dlc_clean.rolling(200, center=True).median()
dlc_smoothed1s = dlc_clean.rolling(40, center=True).median()

## Calculate relative time in open arm

In [None]:
xwalls = [480, 555]

In [None]:
# Time in open arm
dlc_clean['InOpen'] = False
dlc_clean.loc[((dlc_smoothed.centroid.x<xwalls[0])|(dlc_smoothed.centroid.x>xwalls[1])), 'InOpen'] = True

## (Calculate number of open arm entries)

In [None]:
dlc_clean['EnterOpen'] = dlc_clean.InOpen.astype(int).diff()==1
first_open_entry = dlc_clean.loc[(dlc_clean.EnterOpen)].groupby(['Group', 'Mouse', 'Opto']).min()['Time_s']
entrytimes = pd.Series(dlc_clean.loc[(dlc_clean.EnterOpen), 'Time_s'].diff())
dlc_clean.loc[entrytimes.loc[(entrytimes>0)&(entrytimes<5)].index, 'EnterOpen'] = False # set minimum of 5 seconds between entries