# **1.1. Load packages**


In [0]:
import pandas as pd
import numpy as np
import pandas_profiling as pdp
pd.set_option('display.max_columns', None)
import folium
from folium.plugins import HeatMap, HeatMapWithTime

In [0]:
!pip install pandas-profiling==2.8.0

# **1.2. Load Data**

In [0]:
# Location of the FER2013 CSV that carries the extra age / gender / race columns.
# Kept in one named constant so the path can be changed in a single place.
# NOTE(review): assumes Google Drive is already mounted at /content/drive; no
# mount call is visible in this part of the notebook - confirm.
DATA_PATH = "/content/drive/My Drive/datasets/facial-expression/fer2013/fer2013_gender_age_race.csv"

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
data_df = pd.read_csv(DATA_PATH, sep=",", low_memory=False)

# **1.3. Glimpse the data**

In [0]:
# Report the dataset dimensions and the column names; sep="\n" keeps each
# message on its own line.
print(
    f"data shape: rows: {data_df.shape[0]}, cols: {data_df.shape[1]}",
    f"data columns: {list(data_df.columns)}",
    sep="\n",
)

data shape: rows: 44149, cols: 6
data columns: ['emotion', 'pixels', 'usage', 'age', 'gender', 'race']


Data statistics for numerical features

In [0]:
# With default arguments describe() summarizes only the numeric columns;
# pass include="all" to cover the object-typed columns as well.
data_df.describe()

Missing data and data types.

In [0]:
def missing_data_and_data_types(data):
    """Summarize missing values and dtypes for every column of ``data``.

    Returns a DataFrame with one column per input column and three rows:
    ``Total`` (number of nulls), ``Percent`` (nulls as a percentage of all
    rows) and ``Types`` (the column dtype rendered as a string).
    """
    null_mask = data.isnull()
    null_total = null_mask.sum()
    # count() on the boolean mask gives the number of rows per column,
    # because the mask itself never contains nulls.
    null_percent = null_total / null_mask.count() * 100

    summary = pd.concat([null_total, null_percent], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[col].dtype) for col in data.columns]

    # Transpose so that the original columns run left-to-right.
    return summary.T

# Show null counts, null percentages and dtypes for the loaded dataset.
missing_data_and_data_types(data_df)

Unnamed: 0,emotion,pixels,usage,age,gender,race
Total,0,0,0,0,0,0
Percent,0,0,0,0,0,0
Types,int64,object,object,object,object,int64


Unique values.

In [0]:
def unique_values(data):
    """Summarize non-null counts and distinct-value counts per column.

    Returns a DataFrame with one column per input column and two rows:
    ``Total`` (number of non-null entries) and ``Uniques`` (number of
    distinct non-null values).
    """
    summary = data.count().to_frame('Total')
    summary['Uniques'] = [data[col].nunique() for col in data.columns]

    # Transpose so that the original columns run left-to-right.
    return summary.T
# Show, per column, how many values are present and how many are distinct.
unique_values(data_df)

Unnamed: 0,emotion,pixels,usage,age,gender,race
Total,44149,44149,44149,44149,44149,44149
Uniques,7,41230,3,8,2,6


# **1.4. Data profiling report**

In [0]:
# Leaving the report as the cell's last expression renders it inline.
# NOTE(review): the report covers every column, including the object-typed
# `pixels` column; on a large frame this can be slow - consider profiling a
# subset of columns if it becomes a bottleneck.
pdp.ProfileReport(data_df)

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=20.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…






# **1.5. Sankey Diagram**

In [0]:
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

def genSankey(df, cat_cols=None, value_cols='', title='Sankey Diagram'):
    """Build a Plotly Sankey figure (as a plain dict) from an aggregated DataFrame.

    Every distinct value found in the ``cat_cols`` columns becomes a node. Each
    pair of adjacent columns contributes links whose value is the sum of
    ``value_cols`` over the rows sharing that (source, target) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the category columns and the value column.
    cat_cols : sequence of str, optional
        Column names ordered from the left-most to the right-most level.
        At least two are required.
    value_cols : str
        Name of the column that is summed into the link values.
    title : str
        Figure title.

    Returns
    -------
    dict
        ``{'data': [sankey_trace], 'layout': layout}``.

    Raises
    ------
    ValueError
        If fewer than two category columns are supplied, since no
        source-target link can be formed.

    Notes
    -----
    Nodes are keyed by label, so a value that appears in several columns is
    drawn as one node; it takes the colour of the first column it appears in.
    """
    cat_cols = list(cat_cols) if cat_cols is not None else []
    if len(cat_cols) < 2:
        raise ValueError(
            "genSankey needs at least two columns in cat_cols to build source-target links"
        )

    # One colour per level; the palette is reused cyclically when there are
    # more levels than colours.
    colorPalette = ['#4B8BBE', '#306998', '#FFE873', '#FFD43B', '#646464']

    # Collect labels in order of first appearance (deterministic between runs)
    # and record each label's colour at the same time, so that the label list
    # and the colour list always have the same length and alignment.
    labelColors = {}
    for level, catCol in enumerate(cat_cols):
        levelColor = colorPalette[level % len(colorPalette)]
        for label in df[catCol].unique():
            labelColors.setdefault(label, levelColor)
    labelList = list(labelColors)
    colorList = list(labelColors.values())

    # Turn every pair of adjacent category columns into source/target/count rows.
    pairFrames = []
    for sourceCol, targetCol in zip(cat_cols[:-1], cat_cols[1:]):
        pairDf = df[[sourceCol, targetCol, value_cols]].copy()
        pairDf.columns = ['source', 'target', 'count']
        pairFrames.append(pairDf)

    # Aggregate once over all pairs: one row per distinct (source, target).
    sourceTargetDf = (
        pd.concat(pairFrames)
        .groupby(['source', 'target'])
        .agg({'count': 'sum'})
        .reset_index()
    )

    # Replace labels by their node index using a dict (constant-time lookup).
    labelIndex = {label: idx for idx, label in enumerate(labelList)}
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].map(lambda label: labelIndex[label])
    sourceTargetDf['targetID'] = sourceTargetDf['target'].map(lambda label: labelIndex[label])

    # Sankey trace definition.
    data = dict(
        type='sankey',
        node=dict(
            pad=15,
            thickness=20,
            line=dict(
                color="black",
                width=0.5
            ),
            label=labelList,
            color=colorList
        ),
        link=dict(
            source=sourceTargetDf['sourceID'],
            target=sourceTargetDf['targetID'],
            value=sourceTargetDf['count']
        )
    )

    layout = dict(
        title=title,
        font=dict(
            size=10
        )
    )

    fig = dict(data=[data], layout=layout)
    return fig


In [0]:
# Count rows for every (emotion, age, usage) combination. count() tallies the
# non-null `gender` entries in each group, so the resulting `gender` column
# holds group sizes, not gender labels.
data_agg = data_df.groupby(['emotion','age','usage'])[['gender']].count().reset_index()

In [0]:
# Inspect the aggregate; the `gender` column here is the per-group row count.
data_agg

Unnamed: 0,emotion,age,usage,gender
0,0,(0-2),PrivateTest,46
1,0,(0-2),PublicTest,35
2,0,(0-2),Training,389
3,0,(15-20),PrivateTest,23
4,0,(15-20),PublicTest,22
...,...,...,...,...
163,6,(60-100),PublicTest,15
164,6,(60-100),Training,122
165,6,(8-12),PrivateTest,88
166,6,(8-12),PublicTest,72


In [0]:
# Flow from emotion through age bracket to dataset split; link values are the
# per-group counts stored in the `gender` column of data_agg.
fig = genSankey(
    data_agg,
    cat_cols=['emotion', 'age', 'usage'],
    value_cols='gender',
    title='Sankey Diagram for {emotion | age | usage}',
)
# validate=False: the figure is passed as a plain dict without Plotly's validation step.
iplot(fig, validate=False)

In [0]:
# Count rows for every (emotion, gender, usage) combination. count() tallies
# the non-null `age` entries in each group, so the resulting `age` column holds
# group sizes, not age brackets.
# NOTE: this rebinds `data_agg`, replacing the emotion/age/usage aggregate
# assigned to the same name earlier in the notebook.
data_agg = data_df.groupby(['emotion','gender','usage'])[['age']].count().reset_index()

In [0]:
# Inspect the aggregate; the `age` column here is the per-group row count.
data_agg

Unnamed: 0,emotion,gender,usage,age
0,0,Female,PrivateTest,150
1,0,Female,PublicTest,141
2,0,Female,Training,1145
3,0,Male,PrivateTest,341
4,0,Male,PublicTest,326
5,0,Male,Training,2850
6,1,Female,PrivateTest,200
7,1,Female,PublicTest,242
8,1,Female,Training,2115
9,1,Male,PrivateTest,402


In [0]:
# Flow from emotion through gender to dataset split; link values are the
# per-group counts stored in the `age` column of data_agg.
fig = genSankey(
    data_agg,
    cat_cols=['emotion', 'gender', 'usage'],
    value_cols='age',
    title='Sankey Diagram for {emotion | gender | usage}',
)
# validate=False: the figure is passed as a plain dict without Plotly's validation step.
iplot(fig, validate=False)