In [None]:
import pandas as pd
import numpy as np
# Five integers labelled row1..row5 instead of the default 0..4 positions
my_series = pd.Series(
    data=list(range(1, 6)),
    index=['row%d' % n for n in range(1, 6)],
)
my_series

row1    1
row2    2
row3    3
row4    4
row5    5
dtype: int64

In [None]:
# Show the values alone, without the index labels (displayed as a NumPy array)
my_series.values

array([1, 2, 3, 4, 5])

In [None]:
# Show the index, i.e. the row labels of the Series
my_series.index

Index(['row1', 'row2', 'row3', 'row4', 'row5'], dtype='object')

In [None]:
# Pick one element by its label, once via attribute access and once via bracket access
via_attribute = my_series.row2
via_key = my_series['row2']
print(via_attribute, ',', via_key)

2 , 2


In [None]:
# Boolean indexing: keep only the elements whose value is greater than 2
my_series.loc[my_series.gt(2)]

row3    3
row4    4
row5    5
dtype: int64

In [None]:
# Replace the row labels with letters (the labels used are a, b, c, d, f)
my_series.index = list('abcdf')
my_series

a    1
b    2
c    3
d    4
f    5
dtype: int64

In [None]:
# Build a 4x4 frame from a NumPy array holding 1..16, with labelled rows and columns
my_array = np.arange(1, 17).reshape(4, 4)
my_df = pd.DataFrame(
    data=my_array,
    index=['row%d' % n for n in range(1, 5)],
    columns=['col%d' % n for n in range(1, 5)],
)
my_df

Unnamed: 0,col1,col2,col3,col4
row1,1,2,3,4
row2,5,6,7,8
row3,9,10,11,12
row4,13,14,15,16


In [None]:
#create dataframe with dictionary
# Each key becomes a column label and each list holds that column's values from
# top to bottom. The values are chosen so the frame is the same 1..16 table as
# the array-based example; only the row labels still need to be passed.
my_dict = {
    'col1': [1, 5, 9, 13],
    'col2': [2, 6, 10, 14],
    'col3': [3, 7, 11, 15],
    'col4': [4, 8, 12, 16],
}
my_df = pd.DataFrame(my_dict, index=['row1', 'row2', 'row3', 'row4'])
my_df

Unnamed: 0,col1,col2,col3,col4
row1,1,2,3,4
row2,5,6,7,8
row3,9,10,11,12
row4,13,14,15,16


In [None]:
# Row labels of the frame
my_df.index

Index(['row1', 'row2', 'row3', 'row4'], dtype='object')

In [None]:
# Column labels of the frame
my_df.columns

Index(['col1', 'col2', 'col3', 'col4'], dtype='object')

In [None]:
# The underlying data as a 2-D NumPy array, without row or column labels
my_df.values

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16]])

- selecting

In [None]:
# Display the full frame as a reference for the selections that follow
my_df

Unnamed: 0,col1,col2,col3,col4
row1,1,2,3,4
row2,5,6,7,8
row3,9,10,11,12
row4,13,14,15,16


In [None]:
# Look up a single cell by row label and column label in one .loc call
my_df.loc['row1', 'col2']

2

In [None]:
# Single cell by position (row 1, column 3). Passing both positions to .iloc
# avoids an integer lookup on the string-labelled row Series.
# Only the last expression of a cell is displayed, so this value is not shown.
my_df.iloc[1, 3]
# Entire row at position 1, returned as a Series
my_df.iloc[1, :]

col1    5
col2    6
col3    7
col4    8
Name: row2, dtype: int64

- Edit a dataframe

In [None]:
# Add a new column by assigning one value per row (2, 4, 6, 8)
my_df['col5'] = [2 * n for n in range(1, 5)]
my_df

Unnamed: 0,col1,col2,col3,col4,col5
row1,1,2,3,4,2
row2,5,6,7,8,4
row3,9,10,11,12,6
row4,13,14,15,16,8


In [None]:
# Select several rows and one column in a single .loc call:
# a list of row labels plus one column label gives back a Series
my_df.loc[['row1','row2'], 'col5']

row1    2
row2    4
Name: col5, dtype: int64

In [None]:
# Reset the index to the default 0..n-1 integers; drop=True discards the old row
# labels instead of keeping them as a column. The result is only displayed here,
# it is not assigned back to my_df.
my_df.reset_index(drop=True)

Unnamed: 0,col1,col2,col3,col4,col5
0,1,2,3,4,2
1,5,6,7,8,4
2,9,10,11,12,6
3,13,14,15,16,8


In [None]:
# Show the frame without col5 (the result is displayed, not assigned back)
my_df.drop(columns='col5')

Unnamed: 0,col1,col2,col3,col4
row1,1,2,3,4
row2,5,6,7,8
row3,9,10,11,12
row4,13,14,15,16


In [None]:
# Rename a column via an old-name -> new-name mapping applied to the column axis
my_df.rename({'col4': 'columns4'}, axis='columns')

Unnamed: 0,col1,col2,col3,columns4
row1,1,2,3,4
row2,5,6,7,8
row3,9,10,11,12
row4,13,14,15,16


In [None]:
# Replace every occurrence of the value 5 with 15
my_df.replace(to_replace=5, value=15)

Unnamed: 0,col1,col2,col3,col4
row1,1,2,3,4
row2,15,6,7,8
row3,9,10,11,12
row4,13,14,15,16


- Apply a function to a column

In [None]:
# Display the current frame; col1 is the column converted in the next cells
my_df

Unnamed: 0,col1,col2,col3,col4
row1,1.0,2,3,4
row2,5.0,6,7,8
row3,9.0,10,11,12
row4,13.0,14,15,16


In [None]:
# Convert col1 to integers. astype does the conversion in one vectorised step,
# and bracket assignment is the unambiguous way to write to a column.
my_df['col1'] = my_df['col1'].astype(int)
my_df

Unnamed: 0,col1,col2,col3,col4
row1,1,2,3,4
row2,5,6,7,8
row3,9,10,11,12
row4,13,14,15,16


In [None]:
# Convert col1 to floats; astype replaces the element-by-element apply/lambda
my_df['col1'] = my_df['col1'].astype(float)
my_df

Unnamed: 0,col1,col2,col3,col4
row1,1.0,2,3,4
row2,5.0,6,7,8
row3,9.0,10,11,12
row4,13.0,14,15,16


- Sorting

In [None]:
# Sort by index labels in descending order. axis=0 targets the row index;
# axis=1 would sort the column labels instead.
# NOTE(review): the saved output below shows the columns reversed and the rows
# unchanged, which looks like an axis=1 run — re-execute to confirm.
my_df.sort_index(axis=0,ascending=False)

Unnamed: 0,col4,col3,col2,col1
row1,4,3,2,1.0
row2,8,7,6,5.0
row3,12,11,10,9.0
row4,16,15,14,13.0


In [None]:
# Sort the rows by the values in col1, largest first
my_df.sort_values('col1', ascending=False)

Unnamed: 0,col1,col2,col3,col4
row4,13.0,14,15,16
row3,9.0,10,11,12
row2,5.0,6,7,8
row1,1.0,2,3,4


- Head and tail

In [None]:
# First three rows
my_df.head(3)


Unnamed: 0,col1,col2,col3,col4
row1,1.0,2,3,4
row2,5.0,6,7,8
row3,9.0,10,11,12


In [None]:
# Last row only
my_df.tail(1)

Unnamed: 0,col1,col2,col3,col4
row4,13.0,14,15,16


- Read data from a CSV file

In [None]:
# NOTE(review): the path is an empty placeholder — fill in the CSV file to load before running
data = pd.read_csv('')

# Preprocessing
by: pandas, numpy, sklearn

- My dataset

In [None]:
#install kaggle
!pip install -q kaggle
from google.colab import files
files.upload()
#create a kaggle folder
! mkdir ~/.kaggle
# copy the kaggle.json to folder created
! cp kaggle.json ~/.kaggle/
# permission for the json to act
! chmod 600 ~/.kaggle/kaggle.json
# to list all dataset in kaggle
! kaggle datasets list
! kaggle datasets download -d jangedoo/utkface-new
! unzip utkface-new

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: utkface_aligned_cropped/crop_part1/34_1_0_20170109004755204.jpg.chip.jpg  
  inflating: utkface_aligned_cropped/crop_part1/34_1_0_20170111182452832.jpg.chip.jpg  
  inflating: utkface_aligned_cropped/crop_part1/34_1_1_20170103230340961.jpg.chip.jpg  
  inflating: utkface_aligned_cropped/crop_part1/34_1_1_20170104011329697.jpg.chip.jpg  
  inflating: utkface_aligned_cropped/crop_part1/34_1_1_20170104165020320.jpg.chip.jpg  
  inflating: utkface_aligned_cropped/crop_part1/34_1_1_20170108230211421.jpg.chip.jpg  
  inflating: utkface_aligned_cropped/crop_part1/34_1_2_20170104022134829.jpg.chip.jpg  
  inflating: utkface_aligned_cropped/crop_part1/34_1_2_20170104023010725.jpg.chip.jpg  
  inflating: utkface_aligned_cropped/crop_part1/34_1_2_20170104172537171.jpg.chip.jpg  
  inflating: utkface_aligned_cropped/crop_part1/34_1_2_20170104201443273.jpg.chip.jpg  
  inflating: utkface_aligned_cropped/crop_part1/34_1_2_

In [None]:
import numpy as np 
import pandas as pd
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Dataset folder name and numeric settings used by this notebook
dataset_folder_name = 'UTKFace'
TRAIN_TEST_SPLIT = 0.7
IM_WIDTH = 198
IM_HEIGHT = 198

# Integer codes mapped to their readable labels, numbered from 0 in list order
dataset_dict = {
    'race_id': dict(enumerate(['white', 'black', 'asian', 'indian', 'others'])),
    'gender_id': dict(enumerate(['male', 'female'])),
}

In [None]:
# Inspect the id -> label mappings
dataset_dict

{'gender_id': {0: 'male', 1: 'female'},
 'race_id': {0: 'white', 1: 'black', 2: 'asian', 3: 'indian', 4: 'others'}}

In [None]:
# Reverse lookups: readable label -> integer code
dataset_dict['gender_alias'] = {label: code for code, label in dataset_dict['gender_id'].items()}
dataset_dict['race_alias'] = {label: code for code, label in dataset_dict['race_id'].items()}

In [None]:
# Inspect again: the reverse (label -> id) mappings are now included
dataset_dict

{'gender_alias': {'female': 1, 'male': 0},
 'gender_id': {0: 'male', 1: 'female'},
 'race_alias': {'asian': 2, 'black': 1, 'indian': 3, 'others': 4, 'white': 0},
 'race_id': {0: 'white', 1: 'black', 2: 'asian', 3: 'indian', 4: 'others'}}

In [None]:
# Keep only the file name, dropping the directory part of the sample path
string = os.path.basename('/content/crop_part1/100_1_0_20170110183726390.jpg.chip.jpg')
string

'100_1_0_20170110183726390.jpg.chip.jpg'

In [None]:
# Strip only the final extension; earlier dots in the name are left in place
string=os.path.splitext(string)[0]

In [None]:
 # Split on underscores into four fields; the last one is discarded via `_`
 age, gender, race, _ =string.split('_')

'100'

Unnamed: 0,0,1,2
0,9,male,white
1,4,male,asian
2,8,female,asian


In [None]:
# Sample (age, gender, race) records. Square brackets make this a list, so the
# rows keep the order written here; a set literal would have no defined order.
my_list = [
    (79, 'male', 'white'),
    (4, 'female', 'asian'),
    (32, 'female', 'white'),
    (29, 'female', 'asian'),
    (7, 'male', 'white'),
    (29, 'female', 'black'),
    (8, 'female', 'white'),
    (36, 'female', 'indian'),
]

In [None]:
# Build a frame from the sample records and discard any row with a missing value
df = pd.DataFrame(my_list).dropna()
df

Unnamed: 0,0,1,2
0,36,female,indian
1,8,female,white
2,32,female,white
3,29,female,black
4,29,female,asian
5,7,male,white
6,4,female,asian
7,79,male,white


In [None]:
def parse_dataset(dataset_path, ext='jpg'):
    """
    Build a DataFrame describing every image found directly inside ``dataset_path``.

    File names are expected to look like ``<age>_<gender>_<race>_<suffix>.<ext>``,
    where gender and race are the integer codes listed in ``dataset_dict``.

    Parameters
    ----------
    dataset_path : str
        Folder that is searched (non-recursively) for ``*.<ext>`` files.
    ext : str, default 'jpg'
        File extension to match, without the leading dot.

    Returns
    -------
    pandas.DataFrame
        Columns ``age``, ``gender``, ``race`` and ``file``; one row per file whose
        name could be parsed. Files with unparseable names are dropped.
    """
    def parse_info_from_file(path):
        """
        Return (age, gender, race) parsed from one file name, or
        (None, None, None) when the name does not follow the expected layout.
        """
        try:
            filename = os.path.splitext(os.path.basename(path))[0]
            age, gender, race, _ = filename.split('_')
            return int(age), dataset_dict['gender_id'][int(gender)], dataset_dict['race_id'][int(race)]
        except (ValueError, KeyError):
            # ValueError: wrong number of fields or a non-numeric field.
            # KeyError: a gender/race code that is missing from dataset_dict.
            return None, None, None

    files = glob.glob(os.path.join(dataset_path, "*.%s" % ext))
    records = [parse_info_from_file(file) for file in files]

    # Naming the columns up front keeps the frame well-formed even when no file matches.
    df = pd.DataFrame(records, columns=['age', 'gender', 'race'])
    df['file'] = files
    # Rows for unparseable file names hold None and are removed here.
    df = df.dropna()

    return df

# Pass the folder explicitly: this is the directory the images are read from.
face_df = parse_dataset('/content/crop_part1')
face_df.head(5)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(26, 'male', 'indian')
(6, 'male', 'others')
(25, 'female', 'white')
(58, 'female', 'white')
(52, 'female', 'white')
(6, 'male', 'asian')
(72, 'female', 'white')
(3, 'female', 'indian')
(5, 'male', 'black')
(8, 'male', 'asian')
(75, 'female', 'white')
(29, 'female', 'white')
(24, 'female', 'indian')
(2, 'female', 'white')
(52, 'female', 'white')
(27, 'male', 'others')
(47, 'female', 'white')
(25, 'female', 'indian')
(65, 'female', 'white')
(57, 'male', 'white')
(5, 'female', 'others')
(26, 'female', 'indian')
(31, 'female', 'white')
(73, 'female', 'white')
(34, 'male', 'white')
(38, 'female', 'black')
(7, 'female', 'white')
(70, 'male', 'white')
(10, 'male', 'white')
(7, 'male', 'others')
(4, 'female', 'others')
(2, 'male', 'asian')
(16, 'female', 'black')
(19, 'female', 'white')
(49, 'female', 'white')
(11, 'male', 'white')
(83, 'male', 'white')
(60, 'female', 'white')
(46, 'male', 'white')
(25, 'female', 'indian')
(1, '

In [None]:
# Preview the frame with 'age' shown as 'people_age'; the result is not assigned back to face_df
face_df.rename(columns={'age':'people_age'})

Unnamed: 0,people_age,gender,race,file
0,45.0,female,white,/content/crop_part1/45_1_0_20170104205614347.j...
1,58.0,female,white,/content/crop_part1/58_1_0_20170109142427954.j...
2,69.0,female,white,/content/crop_part1/69_1_0_20170110131527801.j...
3,6.0,male,white,/content/crop_part1/6_0_0_20170110215629811.jp...
4,18.0,female,white,/content/crop_part1/18_1_0_20170109205411880.j...
...,...,...,...,...
9775,4.0,female,others,/content/crop_part1/4_1_4_20161223230002625.jp...
9776,11.0,male,white,/content/crop_part1/11_0_0_20170110225459361.j...
9777,39.0,female,indian,/content/crop_part1/39_1_3_20170110173815028.j...
9778,1.0,male,others,/content/crop_part1/1_0_4_20170103210731970.jp...


In [None]:
# (rows, columns) of the parsed frame
face_df.shape

(9778, 4)

In [None]:
# Drop the file-path column. Re-binding (instead of inplace=True) together with
# errors='ignore' lets this cell be re-run without raising once 'file' is gone.
face_df = face_df.drop(columns='file', errors='ignore')


In [None]:
# Display the frame after the 'file' column has been removed
face_df

Unnamed: 0,age,gender,race
0,45.0,female,white
1,58.0,female,white
2,69.0,female,white
3,6.0,male,white
4,18.0,female,white
...,...,...,...
9775,4.0,female,others
9776,11.0,male,white
9777,39.0,female,indian
9778,1.0,male,others


In [None]:
# Number of distinct ages (the first column), counted with the built-in pandas method
face_df['age'].nunique()

99

In [None]:
# Number of distinct gender labels (the second column)
face_df['gender'].nunique()

2

In [None]:
# Number of distinct race labels (the third column)
face_df['race'].nunique()

5

In [None]:
# Summary of the frame: column dtypes, non-null counts and memory usage
face_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9778 entries, 0 to 9779
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     9778 non-null   float64
 1   gender  9778 non-null   object 
 2   race    9778 non-null   object 
dtypes: float64(1), object(2)
memory usage: 625.6+ KB
