#setup:
- connect to google drive in this step

In [1]:
import pandas as pd
import math
import numpy as np
import os
import glob
import sklearn
import pickle
import glob
import scipy.io
import cv2
from google.colab.patches import cv2_imshow

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
project_path = '/content/drive/MyDrive/demo-me-2021-07-14'
output_dir = os.path.join(project_path, "behaviors/")
if not os.path.exists(output_dir):
  os.mkdir(output_dir)

In [4]:
def get_data(individual, bodypart, h5_file):
  mouse_data = h5_file.xs(individual,level='individuals',axis=1)
  out_data = mouse_data.xs(bodypart,level='bodyparts',axis=1)
  out_data.columns = out_data.columns.droplevel("scorer")
  output = out_data.copy()
  return output

In [5]:
def within_area(area_vector, input_coor):
  area_startx = area_vector[0]
  area_starty = area_vector[1]
  area_distx = area_vector[2]
  area_disty = area_vector[3]
  x = input_coor["x"].iloc[0]
  y = input_coor["y"].iloc[0]
  if (area_startx <= x <= (area_startx+area_distx)) and (area_starty <= y <= (area_starty+area_disty)):
    result = 1
  else:
    result = 0
  return result

In [6]:
def euclid_dist(point1_coor, point2_coor):
  point1 = np.array((point1_coor["x"].iloc[0], point1_coor["y"].iloc[0]))
  point2 = np.array((point2_coor["x"].iloc[0], point2_coor["y"].iloc[0]))
  output_dist = np.linalg.norm(point1 - point2)
  return output_dist

In [7]:
def euclid_angle(pointa_coor, pointb_coor, pointc_coor):
  #angle_{pointa, pointb, pointc}
  a = np.array((pointa_coor["x"].iloc[0], pointa_coor["y"].iloc[0]))
  b = np.array((pointb_coor["x"].iloc[0], pointb_coor["y"].iloc[0]))
  c = np.array((pointc_coor["x"].iloc[0], pointc_coor["y"].iloc[0]))

  ba = a - b
  bc = c - b

  cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
  angle = np.arccos(cosine_angle)

  result = np.degrees(angle)
  return result

#Open some corrected_behavior data and mount_events data:

In [104]:
video_name = 'PZ71_1'

corrected_behavior_file = open(output_dir + video_name + '_corrected_behavior.pickle', 'rb')
corrected_behavior = pickle.load(corrected_behavior_file)
corrected_behavior_file.close()

# area_vec = [area_startx, area_starty, area_distx, area_disty]
female_side_mat = scipy.io.loadmat(project_path + "/videos/" + video_name + "_female_side.mat")
female_side_vec = female_side_mat['croprect'][0]
male_side_mat = scipy.io.loadmat(project_path + "/videos/" + video_name + "_male_side.mat")
male_side_vec = male_side_mat['croprect'][0]

h5_file = pd.read_hdf(project_path +'/videos/'+ video_name + 'DLC_dlcrnetms5_demoJul14shuffle0_20000_el_filtered.h5')
[nframes, ncols] = h5_file.shape

#Open previously calculated training_mount_examples or training_nonmount_examples

In [None]:
training_mount_examples_file = open(output_dir + video_name + '_training_mount_examples.pickle', 'rb')
training_mount_examples = pickle.load(training_mount_examples_file)
training_mount_examples_file.close()

FileNotFoundError: ignored

In [None]:
training_nonmount_examples_file = open(output_dir + video_name + '_training_nonmount_examples.pickle', 'rb')
training_nonmount_examples = pickle.load(training_nonmount_examples_file)
training_nonmount_examples_file.close()

#features
- 7 points on the male, whether or not exist
- 7 points on the female, whether or not exist
- male points to every female point: 7 * 7 = 49 distances
- angle_1 between snout_1, shoulder_1, and snout_2
- angle_2 between snout_2, shoulder_2, and snout_1

_Note_: Euclidean distance doesn't care about which mouse is which, the distance/difference will still be the same

#analysis setup

In [122]:
# get all data ready for accessing as needed later on
mouse1_feature_points = {}
mouse1_feature_points['snout'] = get_data('mus1', 'snout', h5_file)
mouse1_feature_points['shoulder'] = get_data('mus1', 'shoulder', h5_file)
mouse1_feature_points['spine1'] = get_data('mus1', 'spine1', h5_file)
mouse1_feature_points['spine2'] = get_data('mus1', 'spine2', h5_file)
mouse1_feature_points['spine3'] = get_data('mus1', 'spine3', h5_file)
mouse1_feature_points['spine4'] = get_data('mus1', 'spine4', h5_file)
mouse1_feature_points['tailbase'] = get_data('mus1', 'tailbase', h5_file)

mouse2_feature_points = {}
mouse2_feature_points['snout'] = get_data('mus2', 'snout', h5_file)
mouse2_feature_points['shoulder'] = get_data('mus2', 'shoulder', h5_file)
mouse2_feature_points['spine1'] = get_data('mus2', 'spine1', h5_file)
mouse2_feature_points['spine2'] = get_data('mus2', 'spine2', h5_file)
mouse2_feature_points['spine3'] = get_data('mus2', 'spine3', h5_file)
mouse2_feature_points['spine4'] = get_data('mus2', 'spine4', h5_file)
mouse2_feature_points['tailbase'] = get_data('mus2', 'tailbase', h5_file)

mouse3_feature_points = {}
mouse3_feature_points['snout'] = get_data('mus3', 'snout', h5_file)
mouse3_feature_points['shoulder'] = get_data('mus3', 'shoulder', h5_file)
mouse3_feature_points['spine1'] = get_data('mus3', 'spine1', h5_file)
mouse3_feature_points['spine2'] = get_data('mus3', 'spine2', h5_file)
mouse3_feature_points['spine3'] = get_data('mus3', 'spine3', h5_file)
mouse3_feature_points['spine4'] = get_data('mus3', 'spine4', h5_file)
mouse3_feature_points['tailbase'] = get_data('mus3', 'tailbase', h5_file)


In [131]:
# male_side_vec = [x, y, width, height]; female_side_vec = [x, y, width, height]
relevant_area = female_side_vec
# relevant area is female_side_vec + male_side_vec size exactly right next to each other + 50 buffer
relevant_area[2] = female_side_vec[2] + male_side_vec[2] + 50
relevant_area[3] = female_side_vec[3] + male_side_vec[3] + 50

In [130]:
def torso_in_area(area_vector, mouse_num, i):
  # finds if any part of torso at all is in area
  if mouse_num == 1:
    feature_points = mouse1_feature_points
    name = 'mus1'
  elif mouse_num == 2:
    feature_points = mouse2_feature_points
    name = 'mus2'
  elif mouse_num == 3:
    feature_points = mouse3_feature_points
    name = 'mus3'

  if within_area(area_vector, feature_points['snout'].loc[[i]]):
    return 1
  elif within_area(area_vector, feature_points['shoulder'].loc[[i]]):
    return 1
  elif within_area(area_vector, feature_points['spine1'].loc[[i]]):
    return 1
  elif within_area(area_vector, feature_points['spine2'].loc[[i]]):
    return 1
  elif within_area(area_vector, feature_points['spine3'].loc[[i]]):
    return 1
  elif within_area(area_vector, feature_points['spine4'].loc[[i]]):
    return 1
  elif within_area(area_vector, feature_points['tailbase'].loc[[i]]):
    return 1
  else:
    return 0

In [129]:
def snout_in_area(area_vector, mouse_num, i):
  # finds if any part of torso at all is in area
  if mouse_num == 1:
    feature_points = mouse1_feature_points
    name = 'mus1'
  elif mouse_num == 2:
    feature_points = mouse2_feature_points
    name = 'mus2'
  elif mouse_num == 3:
    feature_points = mouse3_feature_points
    name = 'mus3'

  if within_area(area_vector, feature_points['snout'].loc[[i]]):
    result = 1
  else:
    result = 0
  return result

In [128]:
def spine2_in_area(area_vector, mouse_num, i):
  # finds if any part of torso at all is in area
  if mouse_num == 1:
    feature_points = mouse1_feature_points
    name = 'mus1'
  elif mouse_num == 2:
    feature_points = mouse2_feature_points
    name = 'mus2'
  elif mouse_num == 3:
    feature_points = mouse3_feature_points
    name = 'mus3'

  if within_area(area_vector, feature_points['spine2'].loc[[i]]):
    result = 1
  else:
    result = 0
  return result

In [132]:
def tailbase_in_area(area_vector, mouse_num, i):
  # finds if any part of torso at all is in area
  if mouse_num == 1:
    feature_points = mouse1_feature_points
    name = 'mus1'
  elif mouse_num == 2:
    feature_points = mouse2_feature_points
    name = 'mus2'
  elif mouse_num == 3:
    feature_points = mouse3_feature_points
    name = 'mus3'

  if within_area(area_vector, feature_points['tailbase'].loc[[i]]):
    result = 1
  else:
    result = 0
  return result

In [181]:
def get_i_features(i, input_features, first_mouse_feature_points, second_mouse_feature_points):
  col_num = 0

  if within_area(relevant_area, first_mouse_feature_points["tailbase"].loc[[i]]) and within_area(relevant_area, second_mouse_feature_points["tailbase"].loc[[i]]):
    input_features[i, col_num] = 1
  else:
    input_features[i, col_num] = 0
  col_num = col_num + 1
  
  if within_area(relevant_area, first_mouse_feature_points["spine2"].loc[[i]]) and within_area(relevant_area, second_mouse_feature_points["spine2"].loc[[i]]):
    input_features[i, col_num] = 1
  else:
    input_features[i, col_num] = 0
  col_num = col_num + 1

  if within_area(relevant_area, first_mouse_feature_points["snout"].loc[[i]]) and within_area(relevant_area, second_mouse_feature_points["snout"].loc[[i]]):
    input_features[i, col_num] = 1
  else:
    input_features[i, col_num] = 0
  col_num = col_num + 1

  input_features[i, col_num] = euclid_dist(first_mouse_feature_points['snout'].loc[[i]], second_mouse_feature_points['snout'].loc[[i]])
  col_num = col_num + 1
  input_features[i, col_num] = euclid_dist(first_mouse_feature_points['spine2'].loc[[i]], second_mouse_feature_points['spine2'].loc[[i]])
  col_num = col_num + 1
  input_features[i, col_num] = euclid_dist(first_mouse_feature_points['tailbase'].loc[[i]], second_mouse_feature_points['tailbase'].loc[[i]])
  col_num = col_num + 1

  cross_dist1 = euclid_dist(first_mouse_feature_points['snout'].loc[[i]], second_mouse_feature_points['tailbase'].loc[[i]])
  cross_dist2 = euclid_dist(first_mouse_feature_points['tailbase'].loc[[i]], second_mouse_feature_points['snout'].loc[[i]])
  input_features[i, col_num] = abs(cross_dist1 - cross_dist2)
  col_num = col_num + 1

  return input_features

#mount example features

In [111]:
forest_example_behavior = corrected_behavior.copy()
for i in range(0, 36001):
  beh = corrected_behavior["behavior"][i]
  if beh == 'mount':
    # this is probably redundant but oh well it takes 3s
    # 3 frames = 0.1 second surround
    forest_example_behavior["behavior"][i-1:i] = 'putative_mount'
    forest_example_behavior["behavior"][i:i+1] = 'putative_mount'

In [112]:
# find all frames with two individuals
# takes a little bit but not too long (e.g. 54s)
training_mount_examples = pd.DataFrame(index=range(36000), columns=['first_mouse', 'second_mouse', 'index'])

example_n = 0
for i in range(0, 36001):
  if forest_example_behavior["behavior"][i] == 'putative_mount':

    if torso_in_area(relevant_area, 1, i) and torso_in_area(relevant_area, 2, i):
      training_mount_examples['first_mouse'][example_n] = 1
      training_mount_examples['second_mouse'][example_n] = 2
      training_mount_examples['index'][example_n] = i
      example_n = example_n + 1

    elif torso_in_area(relevant_area, 1, i) and torso_in_area(relevant_area, 3, i):
      training_mount_examples['first_mouse'][example_n] = 1
      training_mount_examples['second_mouse'][example_n] = 3
      training_mount_examples['index'][example_n] = i
      example_n = example_n + 1

    elif torso_in_area(relevant_area, 2, i) and torso_in_area(relevant_area, 3, i):
      training_mount_examples['first_mouse'][example_n] = 2
      training_mount_examples['second_mouse'][example_n] = 3
      training_mount_examples['index'][example_n] = i
      example_n = example_n + 1

training_mount_examples = training_mount_examples.dropna(how='all')

In [113]:
num_mount_examples = training_mount_examples.shape[0]
print(num_mount_examples)

1804


In [182]:
# loop takes 20s
num_mount_examples = training_mount_examples.shape[0]
input_features = np.empty([num_mount_examples, 7])

#for i in range(188, 189):
for i in range(0, num_mount_examples):
  if training_mount_examples['first_mouse'][i] == 1:
    first_mouse_feature_points = mouse1_feature_points
  elif training_mount_examples['first_mouse'][i] == 2:
    first_mouse_feature_points = mouse2_feature_points
  elif training_mount_examples['first_mouse'][i] == 3:
    first_mouse_feature_points = mouse3_feature_points
    
  if training_mount_examples['second_mouse'][i] == 1:
    second_mouse_feature_points = mouse1_feature_points
  elif training_mount_examples['second_mouse'][i] == 2:
    second_mouse_feature_points = mouse2_feature_points
  elif training_mount_examples['second_mouse'][i] == 3:
    second_mouse_feature_points = mouse3_feature_points
  
  input_features = get_i_features(i, input_features, first_mouse_feature_points, second_mouse_feature_points)
  
training_mount_features = np.nan_to_num(input_features, nan=-1)

#nonmount example features
- requires previous section (mount example features) to have already run

In [49]:
forest_example_behavior = corrected_behavior.copy()
for i in range(0, 36001):
  beh = corrected_behavior["behavior"][i]
  if beh == 'mount':
    # this is probably redundant but oh well it takes 3s
    # 30 frames = 1 second surround
    forest_example_behavior["behavior"][i-30:i] = 'ignore'
    forest_example_behavior["behavior"][i:i+30] = 'ignore'

In [50]:
# find all frames with two individuals that are not putative mounts
# takes a little bit but not too long (e.g. 5 min)
training_nonmount_examples = pd.DataFrame(index=range(36000), columns=['first_mouse', 'second_mouse', 'index'])

example_n = 0
for i in range(0, 36001):
  # go through the rest of the (nonmount) frames
  if forest_example_behavior["behavior"][i] != 'ignore':

    if torso_in_area(relevant_area, 1, i) and torso_in_area(relevant_area, 2, i):
      training_nonmount_examples['first_mouse'][example_n] = 1
      training_nonmount_examples['second_mouse'][example_n] = 2
      training_nonmount_examples['index'][example_n] = i
      example_n = example_n + 1

    elif torso_in_area(relevant_area, 1, i) and torso_in_area(relevant_area, 3, i):
      training_nonmount_examples['first_mouse'][example_n] = 1
      training_nonmount_examples['second_mouse'][example_n] = 3
      training_nonmount_examples['index'][example_n] = i
      example_n = example_n + 1

    elif torso_in_area(relevant_area, 2, i) and torso_in_area(relevant_area, 3, i):
      training_nonmount_examples['first_mouse'][example_n] = 2
      training_nonmount_examples['second_mouse'][example_n] = 3
      training_nonmount_examples['index'][example_n] = i
      example_n = example_n + 1

training_nonmount_examples = training_nonmount_examples.dropna(how='all')

In [183]:
# takes 3 minutes
num_nonmount_examples = training_nonmount_examples.shape[0]
input_features = np.empty([num_nonmount_examples, 7])

for i in range(0, num_nonmount_examples):
#for i in range(0, 1000):
  if training_nonmount_examples['first_mouse'][i] == 1:
    first_mouse_feature_points = mouse1_feature_points
  elif training_nonmount_examples['first_mouse'][i] == 2:
    first_mouse_feature_points = mouse2_feature_points
  elif training_nonmount_examples['first_mouse'][i] == 3:
    first_mouse_feature_points = mouse3_feature_points
    
  if training_nonmount_examples['second_mouse'][i] == 1:
    second_mouse_feature_points = mouse1_feature_points
  elif training_nonmount_examples['second_mouse'][i] == 2:
    second_mouse_feature_points = mouse2_feature_points
  elif training_nonmount_examples['second_mouse'][i] == 3:
    second_mouse_feature_points = mouse3_feature_points
  
  input_features = get_i_features(i, input_features, first_mouse_feature_points, second_mouse_feature_points)
  
training_nonmount_features = np.nan_to_num(input_features, nan=-1)

#all features together

In [199]:
random_indices = np.random.choice(training_nonmount_features.shape[0], num_mount_examples, replace=False)
final_training_nonmount_features = training_nonmount_features[random_indices]

random_indices = np.random.choice(training_nonmount_features.shape[0], 10*num_mount_examples, replace=False)
final_testing_nonmount_features = training_nonmount_features[random_indices]

In [186]:
print("existences")
print(training_mount_features.mean(axis=0)[0:3])
print(training_nonmount_features.mean(axis=0)[0:3])

print("distances")
print(training_mount_features.mean(axis=0)[3:6])
print(training_nonmount_features.mean(axis=0)[3:6])

print("crossdist_diff")
print(training_mount_features.mean(axis=0)[6:7])
print(training_nonmount_features.mean(axis=0)[6:7])

existences
[0.37694013 0.4018847  0.28991131]
[0.52957993 0.55424259 0.31146604]
distances
[24.22572786 33.68712866 34.31726258]
[28.34916746 54.22111244 56.42825262]
crossdist_diff
[22.21653273]
[11.9683451]


In [200]:
training_features = np.vstack((training_mount_features, final_training_nonmount_features))
testing_features = np.vstack((training_mount_features, final_testing_nonmount_features))

In [201]:
training_labels = np.zeros(training_features.shape[0], dtype=int)
training_labels[0:num_mount_examples] = 1

testing_labels = np.zeros(testing_features.shape[0], dtype=int)
testing_labels[0:num_mount_examples] = 1

In [202]:
print(num_mount_examples)
print(num_nonmount_examples)

1804
26234


In [189]:
# save features and labels
with open(output_dir + video_name + '_training_features.pickle', 'wb') as f:
    pickle.dump(training_features, f)
with open(output_dir + video_name + '_training_labels.pickle', 'wb') as f:
    pickle.dump(training_labels, f)

#features from multiple videos

In [None]:
training_features = np.array([])
training_labels = np.array([])

In [None]:
video_name = 'PZ71_1'
video_training_features_file = open(output_dir + video_name + '_training_features.pickle', 'rb')
video_training_features = pickle.load(video_training_features_file)
video_training_features_file.close()

video_training_labels_file = open(output_dir + video_name + '_training_labels.pickle', 'rb')
video_training_labels = pickle.load(video_training_labels_file)
video_training_labels_file.close()

In [None]:
training_features = np.vstack([training_features, video_training_features]) if training_features.size else video_training_features
training_labels = np.vstack([training_labels, video_training_labels]) if training_labels.size else video_training_labels

#sklearn stuff

In [203]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import confusion_matrix

In [204]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(training_features, training_labels, test_size = 0.05, random_state = 42, stratify=training_labels)
print(X_train.shape)
print(X_test.shape)

(3427, 7)
(181, 7)


In [205]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [206]:
# Fitting Random Forest Classification to the Training set
classifier = BalancedRandomForestClassifier(n_estimators = 100, random_state = 42)
classifier.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=42)

In [207]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']))

Predicted   0   1
Actual           
0          51  40
1          11  79


In [208]:
# Feature Scaling
testing_test = scaler.transform(testing_features)
# Predicting the Test set results
y_pred = classifier.predict(testing_test)
# Making the Confusion Matrix
print(pd.crosstab(testing_labels, y_pred, rownames=['Actual'], colnames=['Predicted']))

Predicted     0     1
Actual               
0          8708  9332
1            26  1778


#optimizing ntrees

In [None]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']))

KeyboardInterrupt: ignored

In [None]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 5000, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']))

Predicted    0    1
Actual             
0          466  107
1           19  568


# save classifer

In [85]:
# save the model
with open(output_dir + 'classifer_1202v3.pickle', 'wb') as f:
  pickle.dump(classifier, f)
with open(output_dir + 'classifer_1202v3_scaler.pickle', 'wb') as f:
  pickle.dump(scaler, f)