In [1]:
import numpy as np
from scipy.io import loadmat  # this is the SciPy module that loads mat-files
import matplotlib.pyplot as plt
import pandas as pd

### Getting the data from .mat file to a dataframe

In [2]:
# Improved loadmat for complex and deeply nested mat files.
# source : https://stackoverflow.com/a/60364102

from scipy.io import loadmat, matlab
def load_mat(filename):
    """
    This function should be called instead of direct scipy.io.loadmat
    as it cures the problem of not properly recovering python dictionaries
    from mat files. It calls the function check keys to cure all entries
    which are still mat-objects
    """

    def _check_vars(d):
        """
        Checks if entries in dictionary are mat-objects. If yes
        todict is called to change them to nested dictionaries
        """
        for key in d:
            if isinstance(d[key], matlab.mio5_params.mat_struct):
                d[key] = _todict(d[key])
            elif isinstance(d[key], np.ndarray):
                d[key] = _toarray(d[key])
        return d

    def _todict(matobj):
        """
        A recursive function which constructs from matobjects nested dictionaries
        """
        d = {}
        for strg in matobj._fieldnames:
            elem = matobj.__dict__[strg]
            if isinstance(elem, matlab.mio5_params.mat_struct):
                d[strg] = _todict(elem)
            elif isinstance(elem, np.ndarray):
                d[strg] = _toarray(elem)
            else:
                d[strg] = elem
        return d

    def _toarray(ndarray):
        """
        A recursive function which constructs ndarray from cellarrays
        (which are loaded as numpy ndarrays), recursing into the elements
        if they contain matobjects.
        """
        if ndarray.dtype != 'float64':
            elem_list = []
            for sub_elem in ndarray:
                if isinstance(sub_elem, matlab.mio5_params.mat_struct):
                    elem_list.append(_todict(sub_elem))
                elif isinstance(sub_elem, np.ndarray):
                    elem_list.append(_toarray(sub_elem))
                else:
                    elem_list.append(sub_elem)
            return np.array(elem_list)
        else:
            return ndarray

    data = loadmat(filename, struct_as_record=False, squeeze_me=True)
    return _check_vars(data)

In [3]:
# Custom function for extracting only the following data from mpi pose dataset :
# .x1, .y1, .x2, .y2 - coordinates of the head rectangle
# .scale - person scale w.r.t. 200 px height
# x_id, y_id - keypoint coordinates of 16 different joints
# vis_id - visibility bool of 16 different joints
# act_name - activity name
# cat_name - category name

# structure of the .mat dataset is given here : http://human-pose.mpi-inf.mpg.de/#download


def extract_data(mat_data, idx):

    anncols = {}
    coldict = {}
    data_available = True
    names = {'x1':'rx1', 'y1':'ry1', 'x2':'rx2', 'y2':'ry2', 'scale':'scale'}
    

    annorect = mat_data["annolist"][idx]["annorect"]

    # getting .x1, .y1, .x2, .y2, .scale
    if (type(annorect) != dict):
        data_available = False
    else:
        for k in names.keys():
            # print(type(annorect), len(annorect))
            if k in annorect.keys() :
                anncols[names[k]] = annorect[k]
            else:
                data_available = False
                break

    if data_available and ('annopoints' in annorect.keys()) :
        points = annorect["annopoints"]["point"]

        # getting keypoints and visibility of joints.
        for point in points:
            anncols[f"x_{point['id']}"] = point['x']
            anncols[f"y_{point['id']}"] = point['y']

            if(type(point['is_visible']) == int):
                anncols[f"vis_{point['id']}"] = point['is_visible']
            else :
                anncols[f"vis_{point['id']}"] = 0
    else:
        data_available = False

    # print(data_available)

    # get the act_name and cat_name values for this record
    if (data_available) : 
        # print("hello")
        actcols = mat_data["act"][idx]
        if( type(actcols['cat_name']) != str or type(actcols['act_name']) != str ):
            data_available = False
        else :
            coldict = { **anncols, **actcols }
        

    return ( coldict , data_available)

In [4]:
# load mat-file
mat2 = load_mat('mpii_human_pose_v1_u12_1.mat')  

# get actual data from the RELEASE variable
mat2data = mat2["RELEASE"]


  if isinstance(d[key], matlab.mio5_params.mat_struct):
  if isinstance(elem, matlab.mio5_params.mat_struct):
  if isinstance(sub_elem, matlab.mio5_params.mat_struct):
  return np.array(elem_list)


In [5]:
# Extract required data and store to dataframe

records = []
for i in range(len(mat2data["annolist"])):
    record, is_available = extract_data(mat2data, i)
    if(is_available):
        records.append(record)

df = pd.DataFrame(records)

In [6]:
df.head()

Unnamed: 0,rx1,ry1,rx2,ry2,scale,x_6,y_6,vis_6,x_7,y_7,...,vis_13,x_14,y_14,vis_14,x_15,y_15,vis_15,cat_name,act_name,act_id
0,806,56,901,183,3.806403,904.0,237.0,1.0,858.0,135.0,...,1.0,995.0,163.0,0.0,961.0,223.0,0.0,sports,curling,1
1,595,79,798,358,8.28087,846.0,351.0,1.0,738.0,259.0,...,1.0,1112.0,384.0,1.0,1012.0,489.0,1.0,sports,curling,1
2,302,122,355,197,2.204083,332.0,346.0,1.0,325.0,217.0,...,1.0,396.0,309.0,1.0,393.0,290.0,1.0,occupation,"truck driving, loading and unloading truck, ty...",3
3,439,143,540,283,4.143112,,,,533.0,322.0,...,1.0,,,,,,,occupation,"truck driving, loading and unloading truck, ty...",3
4,200,127,318,269,4.431105,159.0,370.0,1.0,189.0,228.0,...,1.0,319.0,123.0,1.0,376.0,39.0,1.0,occupation,"truck driving, loading and unloading truck, ty...",3


### Filtering the data to include only few actions/categories

In [7]:
# len(df["cat_name"].unique())
df["cat_name"].unique()

array(['sports', 'occupation', 'water activities', 'home activities',
       'lawn and garden', 'miscellaneous', 'religious activities',
       'winter activities', 'bicycling', 'conditioning exercise',
       'fishing and hunting', 'walking', 'running', 'self care',
       'music playing', 'home repair', 'transportation',
       'inactivity quiet/light', 'dancing', 'volunteer activities'],
      dtype=object)

In [8]:
# filtering data based on category.
useful = ['sports','bicycling', 'conditionaing exercise', 'walking', 'running', 'dancing']
df_useful = df.query("cat_name in ['sports','bicycling', 'conditionaing exercise', 'walking', 'running', 'dancing']")

print(f"total records : {len(df_useful)}")
print(f"total unique acts : {len(df_useful.act_name.unique())}" )

total records : 2778
total unique acts : 105


In [9]:
acts = df_useful.act_name.unique()
acts.sort()

# acts
# acts[:40]
# acts[40:]

In [10]:
# def merge_duplicates(df, column, pattern, replacement):
#     df.loc[df[column].str.contains(pattern), column] = replacement

## some acts have multiple forms, 
# for example - 'bicycling, BMX', 'bicycling, general', 'bicycling, mountain','bicycling, racing and road'
# merging them into one - like bicycling.
replace = ['walking', 'bicycling', 'hockey', 'tennis' 'badminton', 'aerobic', 'basketball', 'boxing', 'football', 'trampoline', 'volleyball', ]
for i in replace:
    df_useful.loc[df_useful['act_name'].str.contains(i), 'act_name'] = i

df_useful.loc[df_useful['act_name'].str.contains('running|jogging'), 'act_name'] = 'running'

In [11]:
print(f"total records : {len(df_useful)}")
print(f"total unique acts : {len(df_useful.act_name.unique())}" )

acts = df_useful.act_name.unique()
acts.sort()

total records : 2778
total unique acts : 82


In [12]:
## dropping the activities that have 15 or less records.
for act in acts:
    a = len(df_useful.loc[df_useful['act_name'] == act])
    if a <= 15:
        df_useful.drop(df_useful[df_useful['act_name'] == act].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_useful.drop(df_useful[df_useful['act_name'] == act].index, inplace=True)


In [13]:
print(f"total records : {len(df_useful)}")
print(f"total unique acts : {len(df_useful.act_name.unique())}" )

total records : 2492
total unique acts : 45


In [14]:
df_useful.to_csv("poseDataset.csv")