In [1]:
import torch, io, json, random
import pandas as pd

In [2]:
print(torch.__version__)

1.8.1+cu102


## Load in the raw data

In [3]:
expression_dict = {}
for data_set in ["test", "train", "val"]:
    with open(f"generated_features/{data_set}/{data_set}_expression.pt", 'rb') as f:
        buffer = io.BytesIO(f.read())
        expression_dict[data_set] = torch.load(buffer,map_location=torch.device('cpu'));

expression_results = [pic for inner_list in expression_dict.values() for pic in inner_list]

In [4]:
object_dict = {}
for data_set in ["test", "train", "val"]:
    with open(f"generated_features/{data_set}/{data_set}_object_detection.pt", 'rb') as f:
        buffer = io.BytesIO(f.read())
        object_dict[data_set] = torch.load(buffer,map_location=torch.device('cpu'));
                       
object_results = [obj for inner_list in object_dict.values() for obj in inner_list]

In [5]:
places_dict = {}
for data_set in ["test", "train", "val"]:
    with open(f"generated_features/{data_set}/{data_set}_places365.json", "r") as json_file: 
        places_dict[data_set]=json.load(json_file)
        
places_result = {a: b for inner_dict in places_dict.values() for (a, b) in inner_dict.items()}        

In [6]:
expression_results[0]

{'2017_64540760': [{'0': {'probs': tensor([0.6970, 0.1748, 0.1163]),
    'classes': tensor([0, 1, 4])}}]}

**Get classes**

In [7]:
# Get expression classes
with open("class_files/expression_classes.txt", "r") as f:
    expression_classes = [s.strip() for s in f.readlines()]

In [8]:
# Get coco classes
with open("class_files/coco_classes.txt", "r") as f:
    coco_classes = [s.strip() for s in f.readlines()]

In [9]:
# Get places365 classes
file_name = 'class_files/places365_classes.txt'                                                                                                                                       
places365_classes = list()                                                                                         
with open(file_name) as class_file:                                                                      
    for line in class_file:                                                                              
        places365_classes.append(line.strip().split(' ')[0][3:])  

**Test output**

In [10]:
# Places
places_result['2017_10735550']

{'prob': [0.5115284323692322,
  0.1368655413389206,
  0.07560905069112778,
  0.05295965448021889,
  0.03759666159749031],
 'class': ['museum/indoor',
  'burial_chamber',
  'cemetery',
  'archaelogical_excavation',
  'kindergarden_classroom']}

In [11]:
# Coco
detections=object_results[15]['output']['labels']
for d in detections[:10]:
    print(coco_classes[d])

person
bottle
refrigerator
chair
person
dining table
person
refrigerator
couch
person


In [12]:
# Expressions
face_expression=expression_results[0]['2017_64540760'][0]['0']['classes']
for face in face_expression:
    print(expression_classes[int(face)])

neutral
happiness
anger


## Wrangling
### Expression
Load the existing data

In [13]:
main_data = pd.read_csv("data/data.csv")

Wrangle the expression data

In [14]:
dict_res = {}
for mini_dict in expression_results:
    key = list(mini_dict.keys())[0]
    faces = mini_dict[key]
    faces = list(mini_dict.values())[0]
    res = {class_name: [] for class_name in expression_classes}
    res["likely"] = []
    for face in faces:
        res.update({key: value + [0] for key, value in res.items()})
        face = list(face.values())[0]
        face_expression = face["classes"]
        probabiliites = face["probs"]
        for expression, probability in zip(face_expression, probabiliites):
            res[expression_classes[int(expression)]][-1] = float(probability.numpy())
        res["likely"][-1] = expression_classes[int(face_expression[probabiliites.argmax()])]

    dict_res[key] = res

Convert to a dataframe and rename columns

In [15]:
expression_data = pd.DataFrame.from_dict(dict_res, orient="index")
expression_data.rename(columns=lambda x: f"expression_{x}", inplace=True)

Join the two dataframes

In [16]:
data_with_expression = pd.merge(
    main_data,
    expression_data,
    left_on="id",
    right_index=True,
    how="left"
)

Some quick tests

In [17]:
assert(data_with_expression.shape[0] == max(expression_data.shape[0], main_data.shape[0]))
assert(data_with_expression.shape[1] == expression_data.shape[1] + main_data.shape[1])
for i in range(100):
    key = random.choice(expression_data.index)
    col = random.choice(expression_data.columns)
    assert(data_with_expression[data_with_expression.id == key][col].values[0] == expression_data.loc[key, col])

### Objects

In [18]:
object_class_list = [item for item in coco_classes if item != "N/A"]

Wrangle the object data

In [19]:
object_dictionary = {}
for photo in object_results:
    photo_id = photo["image"].split(".")[0]
    photo_dictionary = {obj: [] for obj in object_class_list}
    x = photo["output"]
    for obj, prob in zip(x["labels"], x["scores"]):
        photo_dictionary[coco_classes[obj]].append(float(prob))
    object_dictionary[photo_id] = photo_dictionary

Convert to DataFrame, drop `__background__` column and rename rest

In [20]:
object_data = pd.DataFrame.from_dict(object_dictionary, orient="index")
object_data.drop("__background__", axis=1, inplace=True)
object_data.rename(columns=lambda col: f"object_{col}", inplace=True)

Join the two dataframes

In [21]:
data_with_object = pd.merge(
    data_with_expression, 
    object_data,
    left_on="id",
    right_index=True,
    how="left"
) 

### Places

In [22]:
places_dictionary = {}
for photo_id, photo in places_result.items():
    photo_dictionary = {place: 0 for place in places365_classes}
    for place, prob in zip(photo["class"], photo["prob"]):
        photo_dictionary[place] = prob
    places_dictionary[photo_id] = photo_dictionary

Convert to DataFrame and rename columns

In [23]:
places_data = pd.DataFrame.from_dict(places_dictionary, orient="index")
places_data.rename(columns=lambda col: f"place_{col}", inplace=True)

In [24]:
all_data = pd.merge(
    data_with_object,
    places_data,
    left_on="id",
    right_index=True,
    how="left",
)

## Write data to file

In [25]:
all_data.to_json("data/data_with_features.json")
all_data.to_csv("data/data_with_features.csv", index=False)

In [26]:
all_data.dtypes

id                     object
image_path             object
openimages_id          object
source_url             object
a16_race                 bool
                       ...   
place_wind_farm       float64
place_windmill        float64
place_yard            float64
place_youth_hostel    float64
place_zen_garden      float64
Length: 526, dtype: object

In [27]:
t = pd.read_json("data/data_with_features.json")
t.sample()

Unnamed: 0,id,image_path,openimages_id,source_url,a16_race,a17_color,a1_age_approx,a2_weight_approx,a4_gender,a6_hair_color,...,place_waterfall,place_watering_hole,place_wave,place_wet_bar,place_wheat_field,place_wind_farm,place_windmill,place_yard,place_youth_hostel,place_zen_garden
18269,201733681453,images/train2017/2017_33681453.jpg,2184b0908737e8a7,https://farm8.staticflickr.com/5550/1083754945...,False,False,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
all_data.sample()

Unnamed: 0,id,image_path,openimages_id,source_url,a16_race,a17_color,a1_age_approx,a2_weight_approx,a4_gender,a6_hair_color,...,place_waterfall,place_watering_hole,place_wave,place_wet_bar,place_wheat_field,place_wind_farm,place_windmill,place_yard,place_youth_hostel,place_zen_garden
12280,2017_47440862,images/train2017/2017_47440862.jpg,da8f2d10b2839c14,https://farm5.staticflickr.com/3020/2753031663...,True,True,True,True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
