# Working with packages and modules

So far, we have been using Python's built-in functions. These are quite limited, but, thankfully, there is a plethora of packages that can do most tasks you desire. Because many of these packages are complex, both in the functionality they provide and in their interdependencies with other packages, it is important to work with virtual environments (e.g., through conda) for specific projects, workflows, or complex packages.

In [None]:
abs(-5)  # a built-in function: available without importing anything

In [None]:
sqrt(4)  # won't work (sqrt is not a built-in name) – can you think of a workaround though?

In [None]:
# importing packages
import numpy  # comprehensive package for numeric array calculations

In [None]:
numpy.sqrt(4)  # call the function through the package name

In [None]:
import numpy as np  # if you're lazy...

In [None]:
np.sqrt(4)  # same function, accessed through the short alias

In [None]:
randint(10)  # this function doesn't exist in vanilla Python

In [None]:
np.random.randint(10)  # random number generator (integers from 0 to 9; the upper bound 10 is excluded)

In [None]:
from numpy import random  # you can import specific "MODULES" from packages like so
random.randint(10)  # the module is now available directly, without the package prefix

In [None]:
from numpy import random as rnd  # you can also give them names you like
rnd.randint(10)  # the module is reached through the chosen alias

# Pandas basics
Parts adapted from https://pandas.pydata.org/docs/user_guide/10min.html

In [None]:
import pandas as pd  # numpy-based dataframe calculations

Pandas provides two types of classes for handling data:

Series: a one-dimensional labeled array holding data of any type
such as integers, strings, Python objects etc.

DataFrame: a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns.

In [None]:
# A Series built from a plain Python list; np.nan marks a missing value
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

In [None]:
array = np.random.rand(20, 4)  # 20 x 4 array of random floats between 0 and 1
array

In [None]:
# Wrap the random array in a DataFrame with explicit row labels (0-19) and column names
df = pd.DataFrame(
    array,
    index=range(20),
    columns=[f'col{number}' for number in range(1, 5)],  # 'col1' ... 'col4'
)
df

## Viewing

In [None]:
df.dtypes  # data types – do you know which data types exist?

In [None]:
df.head()  # first rows of the data frame (5 by default)

In [None]:
df.tail(3)  # last 3 rows

In [None]:
df.index  # row labels

In [None]:
df.columns  # column labels

In [None]:
df.describe()  # summary statistics for each column

In [None]:
df.T  # transpose

In [None]:
df.sort_index(axis=1, ascending=False)  # sort by the column labels (axis=1), in descending order

In [None]:
df.sort_values(by="col2")  # sort the rows by the values in 'col2'

## Indexing

In [None]:
df["col3"]  # select a single column by its name

In [None]:
df[17:20]  # slice rows by position (the end position is excluded)

In [None]:
df.loc[17:20]  # more versatile: slices by row LABEL, and the end label is included if it exists

In [None]:
df.loc[15:16, ["col2", "col4"]]  # always ROWS first, COLUMNS second

In [None]:
df.loc[(df.col1 > 0.5) & (df.col2 < 0.5)]  # select all rows where 'col1' is greater than 0.5 AND 'col2' is smaller than 0.5

In [None]:
df.iloc[3:5, 0:2]  # selecting by position: ILOC (rows 3-4 and the first two columns; end positions are excluded)

## Missing values

In [None]:
# Set every row where 'col1' is greater than 0.5 and 'col2' is smaller than 0.5 to NaN
rows_to_blank = (df.col1 > 0.5) & (df.col2 < 0.5)
df.loc[rows_to_blank] = np.nan
df

In [None]:
df.dropna()  # drop all rows with missing values (not saved: df itself is unchanged)

In [None]:
df.fillna(value=0)  # replace missing values with 0 (not saved: df itself is unchanged)

In [None]:
df.isna()  # Boolean table: True wherever a value is missing

In [None]:
df.interpolate(method='linear', inplace=True)  # fill missing values by linear interpolation; inplace=True saves the result in df
df

## Operations

In [None]:
df.mean()  # mean of each column

In [None]:
df.mean(axis=1)  # mean of each row

In [None]:
# Rolling mean over 3 consecutive rows, then keep every 3rd row.
# NOTE: the first kept row (index 0) is NaN, because its 3-row window is not complete yet.
df.rolling(window=3).mean()[::3]  # 3x downsampling; using pandas dataframe METHOD (note: this is not saved!)

In [None]:
df.agg(lambda x: np.mean(x) * 5.6)  # applied to each column: returns one value (mean * 5.6) per column

In [None]:
df.transform(lambda x: x * 101.2)  # apply for each cell

In [None]:
df.fillna(0).multiply(10).astype(int)  # you can chain methods!

## Merging

In [None]:
# Cut the first 11 rows into three consecutive chunks (rows 0-2, 3-6 and 7-10)
pieces = [df[start:stop] for start, stop in [(0, 3), (3, 7), (7, 11)]]
pieces

In [None]:
pd.concat(pieces)  # stack the pieces on top of each other again

## Pivot tables

In [None]:
# Example table for pivoting: three categorical columns (A, B, C), each a short
# pattern repeated to 12 rows, plus two columns (D, E) of random numbers
df2 = pd.DataFrame(
    dict(
        A=["one", "one", "two", "three"] * 3,
        B=["A", "B", "C"] * 4,
        C=["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        D=np.random.randn(12),
        E=np.random.randn(12),
    )
)
df2

In [None]:
# rows: every (A, B) combination; columns: one per value of C; cells: the values of D,
# aggregated with pivot_table's default aggregation function (the mean)
df_pivot = pd.pivot_table(df2, values="D", index=["A", "B"], columns=["C"])
df_pivot

# Importing and exporting data

## Export formats

In [None]:
df.to_csv("../files/foo.csv")  # comma-separated text file; the row index is written as the first column

In [None]:
# index_col=0: use the first CSV column (the row index that to_csv wrote) as the index again,
# instead of getting it back as an extra "Unnamed: 0" data column
pd.read_csv("../files/foo.csv", index_col=0)

In [None]:
df.to_clipboard()

In [None]:
df.to_excel("../files/foo.xlsx")

In [None]:
df.to_hdf("../files/foo.h5", key='df')  # HDF5 file; 'key' is the name under which the data frame is stored in the file

In [None]:
df.to_feather("../files/foo.feather")  # Feather file (binary format)

# Playaround dataset

There's a "playaround.feather" file in the "files" folder. Try the following:
1. Open it
2. What is the 'likelihood' value at index 17?
3. What are the median values for the three columns?
4. Are there NAN values?
5. Fill the NAN values with -1
6. Find all values with -1 and set them to NAN
7. Interpolate missing values with a cubic spline function – would you say this is the optimal function for this case?
8. Save the result as "playaround_edit.h5"

In [None]:
# 1. Open
playaround = pd.read_feather('../files/playaround.feather')
playaround

In [None]:
# 2
playaround.loc[17, 'likelihood']

In [None]:
# 3
playaround.median()

In [None]:
# 4 – yes there are, in 'x' and 'y'
playaround.isna().sum()  # number of NaN values per column

In [None]:
# 5
playaround.fillna(-1, inplace=True)  # inplace=True modifies playaround itself
playaround

In [None]:
# 6: wherever a value equals -1, replace it with NaN (in place)
playaround.mask(playaround == -1, np.nan, inplace=True)
playaround

In [None]:
# 7
playaround.interpolate(method='cubicspline', inplace=True)  # cubic-spline interpolation of the missing values, saved in place
playaround

In [None]:
playaround.plot()  # cubic spline leads to extreme values; other methods would be more suitable

In [None]:
# 8
playaround.to_hdf('../files/playaround_edit.h5', key='playaround')  # 'key' is the name under which the data frame is stored in the file

# Experimental dataset

The file "files/20230428_VGC_0647_LaserOFFDLC_testdataset.csv" contains the DeepLabCut output for a mouse exploring an elevated plus maze (EPM) for 5 minutes. Your task is to find out what percentage of the time the mouse spent in the open arms and what percentage it spent in the closed arms.

Tip: In this case, the closed arms were positioned vertically and the open arms horizontally. The walls of the vertical (closed) arms are located at x = 480 and x = 555, so the closed arms lie within 480 < x < 555.

For pros: Calculate the number of open arm entries

## Importing data

In [None]:
# load the DeepLabCut output (CSV file) into a data frame
dlc = pd.read_csv("../files/20230428_VGC_0647_LaserOFFDLC_testdataset.csv")
dlc

## Clean up dataframe

In [None]:
# The first two data rows hold the real header information: row 0 the bodypart names, row 1 the coordinate names
bodypart_row = dlc.iloc[0]
coordinate_row = dlc.iloc[1]
# list of the bodyparts provided (first column skipped), with "centroid" appended at the end
bodyparts = list(bodypart_row.iloc[1:].unique()) + ['centroid']
# build a two-level (multi-index) column header from rows 0 and 1
dlc.columns = [bodypart_row, coordinate_row]
# drop the now unnecessary 'bodyparts' column and the two header rows, then renumber the rows from 0
dlc = dlc.drop(columns='bodyparts').drop(index=[0, 1]).reset_index(drop=True)
# make sure all remaining data are of float data type
dlc = dlc.astype(float)
dlc

## Clean up low confidence data

In [None]:
# Confidence thresholding: discard positions whose likelihood is below the threshold
conf_thresh = 0.9
# new "clean" data frame: a copy of dlc without the "likelihood" columns
dlc_clean = dlc.copy().drop(columns='likelihood', level=1)
for marker_name in bodyparts[:-1]:  # iterate over bodyparts (except the last, "centroid")
    # rows in which this marker's likelihood is below the set threshold ...
    low_confidence = dlc[(marker_name, 'likelihood')] < conf_thresh
    # ... get the marker's x and y values set to NaN
    dlc_clean.loc[low_confidence, (marker_name, ['x', 'y'])] = np.nan
# fill the missing values by linear interpolation
dlc_clean = dlc_clean.interpolate(method='linear')

## Calculate centroid

In [None]:
# Calculate centroid: for every row, the mean x and the mean y over all tracked bodyparts.
# Only the original bodyparts are averaged (bodyparts[:-1], i.e. without the trailing "centroid"
# entry), so re-running this cell does not fold an already existing "centroid" column into the mean.
for coord in ['x', 'y']:  # first x values, then y values
    # mean of this coordinate over all bodyparts, stored in the "centroid" column
    dlc_clean['centroid', coord] = dlc_clean.loc[:, (bodyparts[:-1], coord)].mean(axis=1)

## Median-filter traces


In [None]:
# Smoothed traces: centred rolling median over 200 rows and over 40 rows.
# min_periods=1 lets the window shrink at the start and end of the recording. With the default
# (min_periods equal to the window size) the first and last rows would become NaN, and NaN
# positions evaluate to False in any later comparison against a threshold.
dlc_smoothed = dlc_clean.rolling(200, center=True, min_periods=1).median()
dlc_smoothed1s = dlc_clean.rolling(40, center=True, min_periods=1).median()

## Calculate relative time in open arm

In [None]:
xwalls = [480, 555]  # these are the boundaries: x positions of the left and the right wall

In [None]:
# Time in open arm: flag every row whose smoothed centroid lies outside the x range of the walls
centroid_x = dlc_smoothed.centroid.x
left_of_walls = centroid_x < xwalls[0]   # x < 480
right_of_walls = centroid_x > xwalls[1]  # x > 555
# new Boolean column, initialised with "not in the open arm" ...
dlc_clean['InOpen'] = False
# ... and set to True wherever the mouse is either left or right of the walls
dlc_clean.loc[left_of_walls | right_of_walls, 'InOpen'] = True

In [None]:
# mean of a Boolean column = fraction (0 to 1) of rows flagged as InOpen; multiply by 100 for a percentage
dlc_clean['InOpen'].mean()

## (Calculate number of open arm entries)

In [None]:
fps = 40  # frames per second
# seconds-based time stamp for every row of the data frame
dlc_clean['Time_s'] = dlc_clean.index / fps
# an entry is a row at which InOpen switches from 0 to 1, i.e. where its ".diff()" equals 1
in_open_numeric = dlc_clean.InOpen.astype(int)
dlc_clean['EnterOpen'] = in_open_numeric.diff() == 1

# this can be noisy if mice are on the border, so an entry that follows the previously
# detected entry by less than 5 seconds is not counted
entrytimes = pd.Series(dlc_clean.loc[(dlc_clean.EnterOpen), 'Time_s'].diff())
too_soon = (entrytimes > 0) & (entrytimes < 5)
dlc_clean.loc[entrytimes.loc[too_soon].index, 'EnterOpen'] = False  # minimum of 5 seconds between entries

In [None]:
# number of rows still flagged as an open-arm entry
dlc_clean['EnterOpen'].sum()