### Importing main libraries

In [53]:
%matplotlib notebook
import matplotlib as mpl
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import loadmat
import pandas as pd
import datetime as dt

# Question 1: Finding the IMDB-WIKI Dataset

It can be found on this [link](https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/). This jupyter uses the metadata found on that page, which can also be found in [drive folder](https://drive.google.com/drive/folders/1dg-T8gAxzBX8lGuYc-w_zB-Mqu2nC1t_?usp=sharing). It was uploaded here as apparently, there isn't a link to download only the metadata

In [56]:
data = loadmat("Datasets\imdb\imdb.mat") # loading the database metadata

In [3]:
data # displaying the raw dataset

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Jan 17 11:30:27 2016',
 '__version__': '1.0',
 '__globals__': [],
 'imdb': array([[(array([[693726, 693726, 693726, ..., 726831, 726831, 726831]]), array([[1968, 1970, 1968, ..., 2011, 2011, 2011]], dtype=uint16), array([[array(['01/nm0000001_rm124825600_1899-5-10_1968.jpg'], dtype='<U43'),
         array(['01/nm0000001_rm3343756032_1899-5-10_1970.jpg'], dtype='<U44'),
         array(['01/nm0000001_rm577153792_1899-5-10_1968.jpg'], dtype='<U43'),
         ...,
         array(['08/nm3994408_rm926592512_1989-12-29_2011.jpg'], dtype='<U44'),
         array(['08/nm3994408_rm943369728_1989-12-29_2011.jpg'], dtype='<U44'),
         array(['08/nm3994408_rm976924160_1989-12-29_2011.jpg'], dtype='<U44')]],
       dtype=object), array([[1., 1., 1., ..., 0., 0., 0.]]), array([[array(['Fred Astaire'], dtype='<U12'),
         array(['Fred Astaire'], dtype='<U12'),
         array(['Fred Astaire'], dtype='<U12'), ...,
         a

# Reading our dataset 

As this dataset is in a `.mat` format, it's required to convert it into a format Python can understand

## Obtaining an identifier of each observation

In [4]:
identifier = data['imdb']
id_type = identifier.dtype

## Turning the dataset into a Panda's dataframe

In [5]:
ndata = {n: identifier[n][0, 0] for n in id_type.names} # creating a dictionary with the dataset information in original format
new_ndata = {} # dictionary for storing the column values as an array per column

for key, value in ndata.items():
    new_ndata[key] = value[0] # assign to each column key its respective values

new_ndata.pop('celeb_names', None) # erasing column as it's the only case with a different number of observations than the rest
df = pd.DataFrame(new_ndata)

In [6]:
df # displaying the dataframe

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,693726,1968,[01/nm0000001_rm124825600_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488
1,693726,1970,[01/nm0000001_rm3343756032_1899-5-10_1970.jpg],1.0,[Fred Astaire],"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488
2,693726,1968,[01/nm0000001_rm577153792_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.985660,6488
3,693726,1968,[01/nm0000001_rm946909184_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488
4,693726,1968,[01/nm0000001_rm980463616_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488
...,...,...,...,...,...,...,...,...,...
460718,726831,2011,[08/nm3994408_rm761245696_1989-12-29_2011.jpg],0.0,[Jane Levy],"[[453.8981431333457, 77.96623712908011, 539.79...",3.845884,,8410
460719,726831,2011,[08/nm3994408_rm784182528_1989-12-29_2011.jpg],0.0,[Jane Levy],"[[1, 1, 426, 640]]",-inf,,8410
460720,726831,2011,[08/nm3994408_rm926592512_1989-12-29_2011.jpg],0.0,[Jane Levy],"[[1, 1, 453, 640]]",-inf,,8410
460721,726831,2011,[08/nm3994408_rm943369728_1989-12-29_2011.jpg],0.0,[Jane Levy],"[[144.75225471724875, 126.76472287759263, 305....",4.450725,,8410


# Question 1:

In [7]:
age_gender_df = df[['dob', 'photo_taken', 'gender', 'celeb_id']]

In [8]:
def matlab2datetime(matlab_datenum):
    day = dt.datetime.fromordinal(int(matlab_datenum))
    dayfrac = dt.timedelta(days=matlab_datenum%1) - dt.timedelta(days = 366)
    return day + dayfrac

In [9]:
age_gender_df['birth'] = age_gender_df['dob'].apply(lambda matlab_datenum: dt.datetime.fromordinal(int(matlab_datenum)) + dt.timedelta(days=matlab_datenum%1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
age_gender_df["date_of_pic"] = age_gender_df["photo_taken"].apply(lambda row: dt.datetime.strptime("1-7-"+str(row), '%d-%m-%Y'))
age_gender_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,dob,photo_taken,gender,celeb_id,birth,date_of_pic
0,693726,1968,1.0,6488,1900-05-11 00:00:00,1968-07-01
1,693726,1970,1.0,6488,1900-05-11 00:00:00,1970-07-01
2,693726,1968,1.0,6488,1900-05-11 00:00:00,1968-07-01
3,693726,1968,1.0,6488,1900-05-11 00:00:00,1968-07-01
4,693726,1968,1.0,6488,1900-05-11 00:00:00,1968-07-01
...,...,...,...,...,...,...
460718,726831,2011,0.0,8410,1990-12-30 00:00:00,2011-07-01
460719,726831,2011,0.0,8410,1990-12-30 00:00:00,2011-07-01
460720,726831,2011,0.0,8410,1990-12-30 00:00:00,2011-07-01
460721,726831,2011,0.0,8410,1990-12-30 00:00:00,2011-07-01


In [52]:
age_gender_df['age'] = round((age_gender_df['date_of_pic'] - age_gender_df['birth']) / dt.timedelta(days=365), 1)
age_gender_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,dob,photo_taken,gender,celeb_id,birth,date_of_pic,age
0,693726,1968,1.0,6488,1900-05-11,1968-07-01,68.2
1,693726,1970,1.0,6488,1900-05-11,1970-07-01,70.2
2,693726,1968,1.0,6488,1900-05-11,1968-07-01,68.2
3,693726,1968,1.0,6488,1900-05-11,1968-07-01,68.2
4,693726,1968,1.0,6488,1900-05-11,1968-07-01,68.2
...,...,...,...,...,...,...,...
460718,726831,2011,0.0,8410,1990-12-30,2011-07-01,20.5
460719,726831,2011,0.0,8410,1990-12-30,2011-07-01,20.5
460720,726831,2011,0.0,8410,1990-12-30,2011-07-01,20.5
460721,726831,2011,0.0,8410,1990-12-30,2011-07-01,20.5
