# Preprocessing notebook

The goal of this notebook is to to process the data obtained from the ResNets and the extracted feature vectors. The main steps performed are creating positive and negative samples. While there was some experimentation with dimensionality reduction, there was no performance gain from it.

## Imports

In [369]:
import pandas as pd
import numpy as np
import json

### Data imports

In [370]:
df = pd.read_json('../data/tracks.json')
with open('../data/labels.json') as f:
    labels = json.load(f)

### Visualizating the data

In [371]:
df

Unnamed: 0,track1,track2,person1,person2,version,episode,segment,features,video
0,412,411,phoebe,gunther,0,episode01,"[1, 16]","[0.305748283863067, 0.403462648391723, 0.37505...",video412_411_0.mp4
1,412,411,phoebe,gunther,0,episode01,"[17, 32]","[0.9845066666603081, 0.00869304779917, 0.28160...",video412_411_0.mp4
2,412,411,phoebe,gunther,0,episode01,"[33, 48]","[0.601864755153656, 0.16546161472797302, 1.026...",video412_411_0.mp4
3,412,411,phoebe,gunther,0,episode01,"[49, 64]","[0.8926258087158201, 0.007743219844996, 1.2013...",video412_411_0.mp4
4,73,74,phoebe,rachel,0,episode01,"[1, 16]","[0.28641876578330905, 0.00654274970293, 1.8379...",video073_074_0.mp4
...,...,...,...,...,...,...,...,...,...
19343,558,557,monica,phoebe,0,episode14,"[49, 64]","[0.022156653925776003, 0.604685246944427, 0.00...",video558_557_0.mp4
19344,549,547,monica,joey,0,episode14,"[1, 16]","[1.074834704399108, 0.797054231166839, 2.67562...",video549_547_0.mp4
19345,549,547,monica,joey,0,episode14,"[17, 32]","[0.794210314750671, 0.7135680913925171, 1.6077...",video549_547_0.mp4
19346,549,547,monica,joey,0,episode14,"[33, 48]","[1.175412058830261, 1.063856244087219, 1.45035...",video549_547_0.mp4


The _version_ column is superfluous so we drop it.

In [234]:
df.drop(columns=['version'], inplace=True)

### Track example

Let us look at the track 16/17 in episode 1. This corresponds to the segments [1, 16] to [97, 112] of Chandler and Phoebe. We see that the associated label is False, i.e. there is no interaction.

In [237]:
df[(df['track1'] == 16) & (df['track2'] == 17) & (df['episode'] == 'episode01')]

Unnamed: 0,track1,track2,person1,person2,episode,segment,features,video
587,16,17,chandler,phoebe,episode01,"[1, 16]","[0.35464870929718, 1.900528669357299, 1.825586...",video016_017_0.mp4
588,16,17,chandler,phoebe,episode01,"[17, 32]","[0.009576153010129, 0.39936581254005404, 1.336...",video016_017_0.mp4
589,16,17,chandler,phoebe,episode01,"[33, 48]","[0.06511735171079601, 0.9418035745620721, 0.05...",video016_017_0.mp4
590,16,17,chandler,phoebe,episode01,"[49, 64]","[0.24777355790138203, 2.288475275039673, 1.437...",video016_017_0.mp4
591,16,17,chandler,phoebe,episode01,"[65, 80]","[0.33924889564514105, 1.943073034286499, 1.422...",video016_017_0.mp4
592,16,17,chandler,phoebe,episode01,"[81, 96]","[0.19470424950122803, 2.206629037857055, 0.656...",video016_017_0.mp4
593,16,17,chandler,phoebe,episode01,"[97, 112]","[0.23017062246799402, 1.040428400039672, 1.093...",video016_017_0.mp4


In [322]:
df[(df['track1'] == 17) & (df['track2'] == 16) & (df['episode'] == 'episode01')]

Unnamed: 0,track1,track2,person1,person2,episode,segment,features,video
979,17,16,phoebe,chandler,episode01,"[1, 16]","[0.019588408991694003, 0.625551521778106, 3.55...",video017_016_0.mp4
980,17,16,phoebe,chandler,episode01,"[17, 32]","[0.006818289868533001, 0.7460855245590211, 3.5...",video017_016_0.mp4
981,17,16,phoebe,chandler,episode01,"[33, 48]","[0.300980627536773, 0.442733347415924, 2.75530...",video017_016_0.mp4
982,17,16,phoebe,chandler,episode01,"[49, 64]","[0.111174792051315, 1.04053258895874, 3.037196...",video017_016_0.mp4
983,17,16,phoebe,chandler,episode01,"[65, 80]","[0.087654069066047, 1.101483702659607, 3.49819...",video017_016_0.mp4
984,17,16,phoebe,chandler,episode01,"[81, 96]","[0.022392775863409, 0.8720298409461971, 3.5221...",video017_016_0.mp4
985,17,16,phoebe,chandler,episode01,"[97, 112]","[0.442525416612625, 0.9762797951698301, 2.9311...",video017_016_0.mp4


In [323]:
for label in labels['episode01']:
    if (label[0], label[1]) in {(16, 17), (17, 16)}:
        print(label)

[16, 17, False]


This is an example of a track from which we could create type 1 negatives. 

## Creating positive examples

We loop over the labels and fetch the corresponding tracks.

In [240]:
def create_positive_instances(data, labels):
    """
    Create positive instances from the data.
    """
    positives = []
    for episode in labels:
        for track1, track2, label in labels[episode]:
            if label is True:
                person1_features = data[(data['track1'] == track1) & (data['track2'] == track2) & (data['episode'] == episode)].sort_values(by='segment', key=lambda col: col.map(lambda seg: seg[0]))
                person2_features = data[(data['track1'] == track2) & (data['track2'] == track1) & (data['episode'] == episode)].sort_values(by='segment', key=lambda col: col.map(lambda seg: seg[0]))
                assert len(person1_features) == len(person2_features)
                for person1_feature, person2_feature in zip(person1_features.itertuples(), person2_features.itertuples()):
                    positives.append({
                        # 'episode': episode,
                        # 'track1': track1,
                        # 'track2': track2,
                        # 'segment': person1_feature.segment,
                        'person1_features': person1_feature.features,
                        'person2_features': person2_feature.features
                    })
                
    return pd.DataFrame(positives)

In [241]:
df_pos = create_positive_instances(df, labels)
print(f'Number of positive instances: {len(df_pos)}')

Number of positive instances: 7514


## Creating negatives

At this stage, seeing that we have 7514 samples, we will only create "type 4" negative samples as they will be the most powerful and we can essentially create an infinite amount of them. The code for creating type 1 negatives is similar to the code for creating positive samples.

### Type 1 negatives

These are negatives from matching spatio-temporal windows but where we know the pair not be interacting.

In [243]:
def create_type1_negatives(data, labels):
    """
    Create type 1 negatives from matching spatio-temporal features.
    """
    negatives = []
    for episode in labels:
        for track1, track2, label in labels[episode]:
            if label is False:
                person1_features = data[(data['track1'] == track1) & (data['track2'] == track2) & (data['episode'] == episode)].sort_values(by='segment', key=lambda col: col.map(lambda seg: seg[0]))
                person2_features = data[(data['track1'] == track2) & (data['track2'] == track1) & (data['episode'] == episode)].sort_values(by='segment', key=lambda col: col.map(lambda seg: seg[0]))
                assert len(person1_features) == len(person2_features)
                for person1_feature, person2_feature in zip(person1_features.itertuples(), person2_features.itertuples()):
                    negatives.append({
                        # 'episode': episode,
                        # 'track1': track1,
                        # 'track2': track2,
                        # 'segment': person1_feature.segment,
                        'person1_features': person1_feature.features,
                        'person2_features': person2_feature.features
                    })
                
    return pd.DataFrame(negatives)

In [244]:
df_neg1 = create_type1_negatives(df, labels)
print(f'Number of type 1 negative instances: {len(df_neg1)}')

Number of type 1 negative instances: 498


### Type 2 negatives

These are negatives from different spatio-temporal windows taken from the same track. The implementation is not complete as in the end we did  need them.

In [245]:
# FIXME:
# def create_type2_negatives(data, labels):

### Type 4 negatives

These are negatives from different spatio-temporal windows and from different episodes and different characters. This creates a large number of negatives. We fix the number of negatives to be created as an argument in the function call. Given that we have 7514 samples, we will create a total of 100_000 for good measure as this seems reasonable. We can always reduce the number of negatives when training the model.

In [372]:
def create_type4_negatives(df: pd.DataFrame, labels, negatives_number=10_000):
    """Create type 4 negatives from different episodes and characters"""
    negatives = []
    while len(negatives) < negatives_number:
        episode1, episode2 = np.random.choice(list(labels.keys()), size=2, replace=False)
        tracki, trackj = df[df['episode'] == episode1].sample().iloc[0], df[df['episode'] == episode2].sample().iloc[0]
        if tracki['person1'] != trackj['person1']:
            negatives.append({
                'episode_1': tracki['episode'],
                'episode_2': trackj['episode'],
                'person_1': tracki['person1'],
                'person_2': trackj['person1'],
                'version_1': tracki['version'],
                'version_2': trackj['version'],
                'tracki1': tracki['track1'],
                'tracki2': tracki['track2'],
                'trackj1': trackj['track1'],
                'trackj2': trackj['track2'],
                'segmenti': tracki['segment'],
                'segmentj': trackj['segment'],
                'person1_features': tracki['features'],
                'person2_features': trackj['features']
            })
        if len(negatives) % 10_000 == 0:
            print(f'Negatives created: {len(negatives)}')
    return pd.DataFrame(negatives)

In [247]:
df_neg4 = create_type4_negatives(df, labels, negatives_number=100_000)
print(f'Number of type 4 negative instances: {len(df_neg4)}')
df_neg4

Negatives created: 0
Negatives created: 10000
Negatives created: 20000
Negatives created: 30000
Negatives created: 30000
Negatives created: 40000
Negatives created: 50000
Negatives created: 60000
Negatives created: 70000
Negatives created: 80000
Negatives created: 90000
Negatives created: 100000
Number of type 4 negative instances: 100000


Unnamed: 0,person1_features,person2_features
0,"[0.5935591459274291, 2.136274099349975, 0.2862...","[0.33274304866790705, 0.5014960169792171, 0.07..."
1,"[0.0018671104917300002, 0.9743065834045411, 2....","[0.016910068690776003, 1.055215001106262, 0.14..."
2,"[1.045803308486938, 0.0, 0.20327922701835602, ...","[1.455870866775512, 0.8921209573745721, 0.0077..."
3,"[0.38271686434745705, 1.103181362152099, 0.056...","[0.0, 0.111097387969493, 0.21661548316478701, ..."
4,"[0.053021796047687, 1.330795764923095, 1.25090...","[0.214696615934371, 0.5652663707733151, 0.0, 0..."
...,...,...
99995,"[0.042366411536931006, 0.6765475869178771, 0.7...","[0.20562626421451502, 0.9625936746597291, 1.12..."
99996,"[0.319629520177841, 2.245369434356689, 0.40146...","[0.150232255458831, 1.719058871269226, 0.0, 0...."
99997,"[0.09444165974855401, 0.9858267903327941, 0.08...","[1.3766940832138062, 0.44322270154953003, 0.03..."
99998,"[0.0, 0.0, 0.0, 0.029769795015454, 0.048692673...","[2.134280681610107, 0.49204528331756503, 0.036..."


## Concatenation of the positive and negative examples

In [316]:
df_pos['label'] = 1
df_neg4['label'] = 0
df_out = pd.concat([df_pos[['person1_features', 'person2_features', 'label']], df_neg4[['person1_features', 'person2_features', 'label']]], axis=0, ignore_index=True)

In [317]:
df_out

Unnamed: 0,person1_features,person2_features,label
0,"[0.073240458965301, 0.108803056180477, 1.90936...","[1.012074708938598, 0.40622571110725403, 0.160...",1
1,"[0.21179485321044902, 0.012390901334583001, 1....","[0.771273374557495, 0.16172519326210003, 0.277...",1
2,"[0.010071376338601001, 0.11898501217365201, 1....","[0.579812049865722, 0.25832888484001104, 0.559...",1
3,"[0.371864974498748, 0.019903786480426, 0.49261...","[0.13664688169956202, 0.9641482234001161, 0.31...",1
4,"[0.24769486486911702, 0.0, 0.562934577465057, ...","[0.6391886472702021, 0.071292459964752, 2.6756...",1
...,...,...,...
107509,"[0.042366411536931006, 0.6765475869178771, 0.7...","[0.20562626421451502, 0.9625936746597291, 1.12...",0
107510,"[0.319629520177841, 2.245369434356689, 0.40146...","[0.150232255458831, 1.719058871269226, 0.0, 0....",0
107511,"[0.09444165974855401, 0.9858267903327941, 0.08...","[1.3766940832138062, 0.44322270154953003, 0.03...",0
107512,"[0.0, 0.0, 0.0, 0.029769795015454, 0.048692673...","[2.134280681610107, 0.49204528331756503, 0.036...",0


## Normalization

We apply L2 normalization to the data.

In [318]:
# df_out[['person1_features', 'person2_features']] = df_out[['person1_features', 'person2_features']].applymap(lambda x: np.array(x) / (np.linalg.norm(np.array(x))))

## Saving data

In [319]:
from sklearn.utils import shuffle

In [320]:
df_out.to_json('../data/features.json', orient='records')

## Dimensionality reduction

This section of code has been commented out as it is no longer used.

In [None]:
# df_new = pd.DataFrame(index=np.arange(len(df_out)), columns=np.arange(1025))

In [None]:
# for row in df_out.itertuples(index=True):
#     df_new.loc[row[0]] = row[1] + row[2] + [row[3]]

In [None]:
# df_new

In [None]:
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler
# from sklearn.manifold import TSNE

In [None]:
# df_out
# x = []
# for row in df_out.itertuples():
#     x.append(row[1])
#     x.append(row[2])
# x = np.array(x)
# x.shape
# # x = np.array([[np.array(row[0]), np.array(row[1])] for row in df_out.itertuples()])
# # x.shape

In [None]:
# pca = PCA(n_components=100)

# # x = df_new.drop(columns=[1024])
# x = StandardScaler().fit_transform(x)
# # x = pca.fit_transform(x)
# x = TSNE(n_components=2).fit_transform(x)
# x.shape

In [None]:
# y = []
# for i in range(len(x) // 2):
#     y.append([x[2*i], x[2*i+1]])
# principal_df = pd.DataFrame(data=y, columns=['principal component 1', 'principal component 2'])
# final_df = pd.concat([principal_df, df_out[['label']]], axis=1)

In [None]:
# final_df.rename(columns={1024: 'label', 'principal component 1': 'person1_features', 'principal component 2': 'person2_features'}, inplace=True)
# final_df

In [376]:
df_inf = create_type4_negatives(df, labels, negatives_number=300)
print(f'Number of type 4 negative instances: {len(df_neg4)}')
df_inf

Number of type 4 negative instances: 100000


Unnamed: 0,episode_1,episode_2,person_1,person_2,version_1,version_2,tracki1,tracki2,trackj1,trackj2,segmenti,segmentj,person1_features,person2_features
0,episode05,episode13,phoebe,richard,0,0,454,455,48,50,"[49, 64]","[33, 48]","[0.43811732530593805, 0.9078449010848991, 0.85...","[1.522959351539611, 0.040202382951974, 1.57066..."
1,episode04,episode09,joey,ross,0,0,588,590,973,972,"[65, 80]","[49, 64]","[0.634814739227294, 1.243534088134765, 0.00273...","[1.757545471191406, 0.529473304748535, 1.14027..."
2,episode01,episode10,ross,rachel,0,0,77,78,1,0,"[17, 32]","[1, 16]","[0.294130891561508, 0.43723428249359103, 1.059...","[1.5743505954742432, 0.36301702260971, 1.01847..."
3,episode11,episode12,monica,workmate-rachel,0,0,46,45,74,72,"[97, 112]","[1, 16]","[0.134424820542335, 0.7555460333824151, 0.0, 0...","[0.327753096818923, 0.018381457775831, 1.14847..."
4,episode01,episode09,ross,rachel,0,0,82,81,435,436,"[17, 32]","[17, 32]","[0.327776193618774, 0.09699752181768401, 1.769...","[1.405845999717712, 0.7483466863632201, 0.1800..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,episode08,episode04,rachel,chandler,0,0,253,254,119,121,"[17, 32]","[1, 16]","[0.151677206158638, 0.24853734672069502, 0.935...","[0.5304523110389711, 0.12579086422920202, 0.39..."
296,episode02,episode12,joey,rachel,0,0,86,88,141,139,"[33, 48]","[1, 16]","[0.6403729319572441, 0.015634702518582, 3.0212...","[1.069051265716552, 0.0, 1.046332120895385, 0...."
297,episode12,episode08,phoebe,joey,0,0,149,148,320,322,"[161, 176]","[17, 32]","[1.216413140296936, 0.645316004753112, 0.01418...","[0.12728774547576902, 0.017002783715724, 2.724..."
298,episode11,episode07,chandler,ross,0,0,457,456,329,330,"[145, 160]","[1, 16]","[0.15722808241844102, 0.6819443106651301, 0.83...","[0.083166994154453, 0.317490935325622, 0.32072..."


In [377]:
import pandas as pd
import torch
from model import MLP

In [378]:
model = MLP([1024, 256, 1])
model.load('../models/model.pt')

In [379]:
df_false = pd.DataFrame(columns=df_inf.columns)

for row in df_inf.iterrows():
    f1, f2 = torch.tensor(row[1]['person1_features']), torch.tensor(row[1]['person2_features'])
    f1 = f1.reshape(1, -1)
    f2 = f2.reshape(1, -1)
    pred = model.forward(f1, f2)
    if pred > 0.5:
        df_false = df_false.append(row[1])

In [380]:
df_false

Unnamed: 0,episode_1,episode_2,person_1,person_2,version_1,version_2,tracki1,tracki2,trackj1,trackj2,segmenti,segmentj,person1_features,person2_features
108,episode04,episode08,chandler,rachel,0,0,199,201,400,398,"[17, 32]","[33, 48]","[0.429418236017227, 0.400749713182449, 0.02102...","[1.223331570625305, 0.208455681800842, 0.72956..."
110,episode11,episode04,chandler,phoebe,0,0,50,51,425,424,"[129, 144]","[97, 112]","[0.007038089446723001, 0.35684964060783303, 0....","[0.0, 0.202293902635574, 0.073096469044685, 0...."
185,episode14,episode13,rachel,chandler,0,0,526,525,24,23,"[49, 64]","[49, 64]","[0.433014869689941, 0.8793554902076721, 0.1635...","[0.001958217471837, 0.605260014533996, 0.0, 0...."
202,episode07,episode14,rachel,monica,0,0,466,467,228,227,"[49, 64]","[49, 64]","[0.26379138231277405, 0.39802846312522805, 0.5...","[0.265668958425521, 0.336497515439987, 0.60457..."
207,episode03,episode11,ross,rachel,0,0,209,211,230,229,"[129, 144]","[65, 80]","[0.9285734295845031, 0.7978246212005611, 0.0, ...","[0.0, 0.18642924726009302, 0.08208198100328401..."
208,episode03,episode04,rachel,phoebe,0,0,83,84,425,424,"[33, 48]","[17, 32]","[0.0, 0.32737752795219405, 0.00729865022003600...","[0.0, 0.064904153347015, 0.007927645929157, 0...."
228,episode06,episode11,Janice,monica,0,0,474,475,529,528,"[49, 64]","[65, 80]","[0.121960654854774, 1.95333707332611, 0.373495...","[0.0, 1.986918330192566, 0.15072381496429402, ..."


In [381]:
df_false.to_json('../data/features_false.json', orient='records')

In [385]:
df[(df['episode'] == 'episode04') & (df['track1'] == 564)]

Unnamed: 0,track1,track2,person1,person2,version,episode,segment,features,video
4365,564,565,chandler,monica,0,episode04,"[1, 16]","[0.318325847387313, 1.070584774017334, 0.10275...",video564_565_0.mp4
4366,564,565,chandler,monica,0,episode04,"[17, 32]","[1.059605598449707, 0.8226431012153621, 0.1067...",video564_565_0.mp4
4367,564,565,chandler,monica,0,episode04,"[33, 48]","[0.7463333606719971, 0.6440268158912651, 0.0, ...",video564_565_0.mp4
5105,564,563,chandler,joey,0,episode04,"[1, 16]","[0.318325847387313, 1.070584774017334, 0.10275...",video564_563_0.mp4
5106,564,563,chandler,joey,0,episode04,"[17, 32]","[1.092632412910461, 0.8223764896392821, 0.0980...",video564_563_0.mp4
5107,564,563,chandler,joey,0,episode04,"[33, 48]","[0.8497989773750301, 0.675683498382568, 0.0, 1...",video564_563_0.mp4
5108,564,563,chandler,joey,0,episode04,"[49, 64]","[0.8051264286041261, 0.167950615286827, 0.4708...",video564_563_0.mp4
5109,564,563,chandler,joey,0,episode04,"[65, 80]","[0.5299862027168271, 0.180139511823654, 0.0343...",video564_563_0.mp4
5110,564,563,chandler,joey,0,episode04,"[81, 96]","[0.928510427474975, 0.190083384513854, 0.00466...",video564_563_0.mp4
5111,564,563,chandler,joey,0,episode04,"[97, 112]","[0.07098451256752, 0.464162319898605, 0.053874...",video564_563_0.mp4
