# Augmented Feature Summary Statistics

In [1]:
import numpy as np
import pandas as pd

import plotly.express as px

import math
import random

In [2]:
# Load the per-photo feature table from JSON.
data = pd.read_json(path_or_buf="data/data_with_features.json")

In [3]:
# Three independent working copies of the raw table: two boolean views and one count view.
bool_data, bool_data2, count_data = (data.copy() for _ in range(3))

In [4]:
# Score cut-off: a place or a detected object only counts when its score is strictly above this value.
threshold = 0.4

## Sort class lists

In [5]:
# Read the COCO label file, drop the placeholder entries and prefix the rest with "object_".
with open("class_files/coco_classes.txt", "r") as f:
    coco_objects = [
        f"object_{label}"
        for label in (line.strip() for line in f)
        if label not in ("__background__", "N/A")
    ]

In [6]:
# One expression label per line, prefixed with "expression_" to form the column name.
with open("class_files/expression_classes.txt", "r") as f:
    expressions = ["expression_" + line.strip() for line in f]

In [7]:
with open("class_files/places365_classes.txt", "r") as f:
    # Per stripped line: keep the text before the first space, drop its first
    # three characters, then prefix with "place_" to form the column name.
    places = ["place_" + line.strip().partition(" ")[0][3:] for line in f]

In [8]:
# Attribute ids come from the `attribute_id` column of the tab-separated attributes file.
attribute_table = pd.read_csv("external/vpa-master/attributes.tsv", sep="\t")
attributes = list(attribute_table["attribute_id"])
# Every attribute except the "a0_safe" one.
unsafe_attributes = [attribute for attribute in attributes if attribute != "a0_safe"]

In [9]:
# Collect every known feature-column name once. A set gives constant-time
# membership tests, instead of rebuilding and scanning a flattened list for
# each column of `data`.
feature_columns = {*coco_objects, *expressions, *places, *attributes}
# Whatever is left over is a non-feature column, kept in `data`'s column order.
other_columns = [column for column in data.columns if column not in feature_columns]

## Places

For each photo, we mark the single most likely place as true and every other place as false.

In [10]:
# One-hot flag per photo: True only in the column holding the row's highest place score.
place_scores = data[places]
top_place = np.zeros(place_scores.shape, dtype=bool)
top_place[np.arange(len(place_scores)), place_scores.values.argmax(axis=1)] = True
# Reuse `data`'s index: column assignment aligns rows by index label, so a frame
# with a default 0..n-1 index would land on the wrong rows whenever `data` is
# indexed differently.
bool_data[places] = pd.DataFrame(
    top_place,
    index=place_scores.index,
    columns=place_scores.columns,
)

Gather together the data about places

In [11]:
# Photos per place (column sums of the boolean flags), most frequent first.
place_data = (
    bool_data[places]
    .astype(int)
    .sum()
    .to_frame()
    .sort_values(0, ascending=False)
    .reset_index()
    .rename(columns={"index": "place", 0: "occurences"})
    # Drop the six-character "place_" prefix from the labels.
    .assign(place=lambda frame: frame["place"].str[6:])
)

In [64]:
# Summary statistics of the per-place photo counts, to two decimals.
place_data.describe().round(decimals=2)

Unnamed: 0,occurences
count,365.0
mean,60.73
std,120.56
min,0.0
25%,11.0
50%,25.0
75%,58.0
max,1286.0


In [58]:
# Bar chart of how many photos have each place as their single most likely place.
fig = px.bar(
    place_data,
    y="occurences",
    x="place",
    labels={"occurences": "Occurrences", "place": "Location"},
)
# Too many categories for readable tick labels, so hide them.
fig.update_xaxes(showticklabels=False)
# Save under its own file name: the thresholded chart later in the notebook writes
# "images/basic_stats_places_thresh.pdf", which previously overwrote this figure.
fig.write_image("images/basic_stats_places.pdf")
fig.show()

Assign places when the probability is above the threshold. Each photo may have zero, one or many places.

In [14]:
# Flag every place whose score is strictly above the threshold (zero, one or many per photo).
bool_data2[places] = data[places].gt(threshold)

In [15]:
# Photos per place under the threshold rule, most frequent first.
place_data2 = (
    bool_data2[places]
    .astype(int)
    .sum()
    .to_frame()
    .sort_values(0, ascending=False)
    .reset_index()
    .rename(columns={"index": "place", 0: "occurences"})
    # Drop the six-character "place_" prefix from the labels.
    .assign(place=lambda frame: frame["place"].str[6:])
)

In [63]:
# Summary statistics of the thresholded per-place photo counts, to two decimals.
place_data2.describe().round(decimals=2)

Unnamed: 0,occurences
count,365.0
mean,25.98
std,61.45
min,0.0
25%,3.0
50%,9.0
75%,21.0
max,531.0


In [59]:
# Bar chart of the thresholded place counts; tick labels are hidden on the x axis.
fig = px.bar(
    data_frame=place_data2,
    x="place",
    y="occurences",
    labels=dict(occurences="Occurrences", place="Location"),
)
fig.update_xaxes(showticklabels=False)
fig.show()
fig.write_image("images/basic_stats_places_thresh.pdf")

What we see is that the distribution looks fairly similar. In fact, the top 4 places are the same, with approximately half as many photos.

## Objects

In [18]:
def list_to_count(lst, threshold=threshold):
    """Count the scores in ``lst`` that are strictly greater than ``threshold``.

    Parameters
    ----------
    lst : iterable of numbers, or a non-iterable placeholder
        Scores for one object class in one photo. A non-iterable value
        (for example ``None`` or a float ``NaN``) counts as zero.
    threshold : float, optional
        Cut-off a score must exceed. Defaults to the value the module-level
        ``threshold`` had when this function was defined.

    Returns
    -------
    int
        Number of scores above ``threshold``, or 0 when ``lst`` cannot be
        iterated or holds values that cannot be compared with ``threshold``.
    """
    try:
        return sum(1 for score in lst if score > threshold)
    except TypeError:
        # Only the expected failures are handled here: a non-iterable cell, or
        # an element that does not support ``>``. Anything else propagates
        # instead of being silently turned into a zero count.
        return 0

We will count the number of objects with a probability greater than the threshold. `count_data` holds the number of times each object appears in each photo; `bool_data` only marks whether that object appears in the photo at all.

In [19]:
# Per photo and object class, count the detections scoring above the threshold.
# Mapping each column with `Series.map` is element-wise like `DataFrame.applymap`,
# but also works on pandas releases where `applymap` is deprecated.
count_data[coco_objects] = data[coco_objects].apply(
    lambda column: column.map(list_to_count)
)

In [20]:
# An object is present in a photo when at least one detection was counted.
bool_data[coco_objects] = count_data[coco_objects].gt(0)

In [21]:
# Total detections per object class across all photos, largest first.
coco_data = (
    count_data[coco_objects]
    .astype(int)
    .sum()
    .to_frame()
    .sort_values(0, ascending=False)
    .reset_index()
    .rename(columns={"index": "object", 0: "total_occurences"})
    # Drop the seven-character "object_" prefix from the labels.
    .assign(object=lambda frame: frame["object"].str[7:])
)

In [22]:
# Number of photos containing each object class at least once (left unsorted).
coco_data2 = (
    bool_data[coco_objects]
    .astype(int)
    .sum()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "object", 0: "photos_containing"})
    # Drop the seven-character "object_" prefix from the labels.
    .assign(object=lambda frame: frame["object"].str[7:])
)

In [23]:
# Join the two per-object summaries on the object name, then drop the helper frame.
coco_data = coco_data.merge(coco_data2, on="object")
del coco_data2

In [24]:
# Summary statistics of the per-object totals, rounded to whole numbers.
coco_data.describe().round(decimals=0).astype(int)

Unnamed: 0,total_occurences,photos_containing
count,80,80
mean,2638,705
std,14218,1572
min,5,4
25%,140,108
50%,320,248
75%,1154,750
max,127247,13057


We plot a bar chart; `person` dominates.

In [25]:
# Total detections per object class, all classes included.
px.bar(data_frame=coco_data, x="total_occurences", y="object")

The same graph but omitting `person`. This is the version to be included, so it is also saved to file.

In [26]:
# Total detections per object class with `person` left out; tick labels hidden on the x axis.
fig = px.bar(
    data_frame=coco_data.loc[coco_data["object"].ne("person")],
    x="object",
    y="total_occurences",
    labels=dict(total_occurences="Total Occurrences", object="Object"),
)
fig.update_xaxes(showticklabels=False)
fig.show()
fig.write_image("images/object_occurrence.pdf")

In [27]:
# Number of photos containing each object class, all classes included.
px.bar(data_frame=coco_data, x="photos_containing", y="object")

In [28]:
# Number of photos containing each object class, with `person` left out.
px.bar(
    data_frame=coco_data.loc[coco_data["object"].ne("person")],
    x="photos_containing",
    y="object",
)

So we can see from the fact that the order is different that some objects occur more grouped together than others. We can put a number on that:

In [29]:
# Average number of detections per photo that contains the object at all.
coco_data["occurence_ratio"] = coco_data["total_occurences"].div(
    coco_data["photos_containing"]
)

In [30]:
# First five rows of the object summary.
coco_data.head(n=5)

Unnamed: 0,object,total_occurences,photos_containing,occurence_ratio
0,person,127247,13057,9.7455
1,chair,9474,3152,3.005711
2,car,8682,2487,3.490953
3,handbag,6029,3019,1.997019
4,book,5448,2867,1.900244


In [31]:
# Summary statistics of the object summary, including the ratio, to two decimals.
coco_data.describe().round(decimals=2)

Unnamed: 0,total_occurences,photos_containing,occurence_ratio
count,80.0,80.0,80.0
mean,2638.46,705.34,1.68
std,14218.41,1571.66,1.03
min,5.0,4.0,1.0
25%,140.25,107.75,1.26
50%,320.0,248.5,1.44
75%,1153.5,749.75,1.8
max,127247.0,13057.0,9.75


So `person` is the object that most often occurs in groups.

Look at the raw `object_person` scores for one example photo, selected by its id.

In [32]:
# Raw person-detection scores for the photo with this id.
data.loc[data["id"] == 201774734227, "object_person"]

2307    [0.9985787868, 0.1362681091, 0.0999732316]
Name: object_person, dtype: object

A few sample rows for a hand-picked set of objects, formatted as a LaTeX table (currently commented out).

In [33]:
# sample_data = coco_data[coco_data.object.isin(["person", "chair", "backpack", "skis", "sandwich"] )]
# sample_data["occurence_ratio"] = sample_data.occurence_ratio.round(2)
# sample_data.to_latex(index=False)

Write the object summary data to file for later use

In [34]:
# Persist the object summary without the row index.
coco_data.to_csv(path_or_buf="data/summary/object.csv", index=False)

## Expression
`expression_likely` suggests the most likely facial expression for each detected face.

In [35]:
def count_expressions(lst):
    """Count how often each known expression label occurs in ``lst``.

    Parameters
    ----------
    lst : list of str, or a placeholder without a usable ``count`` method
        Expression labels for the faces in one photo. A value that has no
        ``count`` method (for example ``None`` or a float ``NaN``) yields
        all-zero counts.

    Returns
    -------
    dict
        Maps every known label to its number of occurrences, always with the
        same keys in the same order.
    """
    # The spelling "disguest" is deliberate: keys are matched literally against
    # the labels in the data, so correcting it here would change the counts.
    labels = ['neutral', 'happiness', 'surprise', 'sadness', 'anger', 'disguest', 'fear']
    try:
        return {label: lst.count(label) for label in labels}
    except (AttributeError, TypeError):
        # Only "this value cannot be counted" is handled; other errors propagate.
        return {label: 0 for label in labels}

In [36]:
# One row of per-expression counts per photo.
# - `index=data.index` keeps the rows lined up with `count_data`, since column
#   assignment aligns rows by index label.
# - Selecting `[expressions]` orders the columns by name: assigning a DataFrame to
#   a list of columns pairs them by position, so a class file in a different order
#   than the labels in `count_expressions` would mislabel the counts.
expression_counts = pd.DataFrame(
    list(data.expression_likely.apply(count_expressions)),
    index=data.index,
).rename(columns=lambda x: f"expression_{x}")
count_data[expressions] = expression_counts[expressions]

In [37]:
# An expression is present in a photo when it was counted at least once.
bool_data[expressions] = count_data[expressions].gt(0)

In [38]:
# Total count per expression, largest first.
# `Series.sort_values` takes no sort key, and its arguments are keyword-only in
# pandas 2, so the former positional `0` (an `axis` value) is dropped.
expression_data = (
    count_data[expressions]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={"index": "expression", 0: "total_count"})
)
# Number of photos in which each expression occurs at least once.
expression_data2 = pd.DataFrame(
    bool_data[expressions]
    .astype(int)
    .sum()
)
# Attach the photo counts by matching the expression column to the helper's index.
expression_data = pd.merge(
    left=expression_data,
    right=expression_data2,
    left_on="expression",
    right_index=True,
).rename(columns={0: "photos_occuring"})
del expression_data2
# Drop the eleven-character "expression_" prefix from the labels.
expression_data["expression"] = [i[11:] for i in expression_data.expression]

We'll also calculate the ratio

In [39]:
# Average number of faces with this expression per photo that contains it at all.
expression_data["ratio"] = expression_data["total_count"].div(
    expression_data["photos_occuring"]
)

In [40]:
# Round the ratio to two decimals, then render the table as LaTeX without the row index.
expression_data = expression_data.assign(ratio=expression_data["ratio"].round(2))
expression_data.to_latex(index=False)

'\\begin{tabular}{lrrr}\n\\toprule\nexpression &  total\\_count &  photos\\_occuring &  ratio \\\\\n\\midrule\n   neutral &        64645 &            11039 &   5.86 \\\\\n happiness &        16174 &             5694 &   2.84 \\\\\n     anger &        14719 &             5526 &   2.66 \\\\\n   sadness &         1554 &             1225 &   1.27 \\\\\n  surprise &          172 &              160 &   1.08 \\\\\n      fear &           28 &               27 &   1.04 \\\\\n  disguest &            2 &                2 &   1.00 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [41]:
# Correct the misspelled label for display in the remaining cells.
expression_data = expression_data.replace("disguest", "disgust")

In [42]:
# Summary statistics of the numeric columns of the expression summary.
expression_data.describe()

Unnamed: 0,total_count,photos_occuring,ratio
count,7.0,7.0,7.0
mean,13899.142857,3381.857143,2.25
std,23478.235825,4208.841623,1.775021
min,2.0,2.0,1.0
25%,100.0,93.5,1.06
50%,1554.0,1225.0,1.27
75%,15446.5,5610.0,2.75
max,64645.0,11039.0,5.86


In [43]:
# Total count per expression, all expressions included.
px.bar(data_frame=expression_data, x="total_count", y="expression")

Neutral dominates. Plot the same graph without neutral.

In [44]:
# Total count per expression, with "neutral" left out.
px.bar(
    data_frame=expression_data.loc[expression_data["expression"].ne("neutral")],
    x="total_count",
    y="expression",
)

Now happiness and anger dominate

Now plot the `photos_occuring` metric, i.e. the number of photos each expression occurs in.

In [45]:
# Bar chart of the number of photos per expression, with the category order fixed explicitly.
fig = px.bar(
    data_frame=expression_data,
    x="photos_occuring",
    y="expression",
    labels=dict(expression="Expression", photos_occuring="Images containing"),
    category_orders=dict(
        expression=["neutral", "happiness", "anger", "sadness", "surprise", "fear", "disgust"]
    ),
)
fig.write_image("images/basic_stats_expression.pdf")
fig.show()

Export in case we want to use it later

In [46]:
# Persist the expression summary as JSON.
expression_data.to_json(path_or_buf="data/summary/expression.json")

## Write datasets
Write the count_data and bool_data to file

In [47]:
# Persist the count-based and boolean augmented tables as JSON.
count_data.to_json(path_or_buf="data/augmented/count_data.json")
bool_data.to_json(path_or_buf="data/augmented/bool_data.json")