# Pandas Intro: Gender Prediction using Voice
We are using the following dataset from Kaggle: https://www.kaggle.com/primaryobjects/voicegender

It is about 1 MB of data and is included in this repo.

## Imports

In [1]:
import pandas as pd

## Loading Data
Loading CSV or most other common data formats is a one-liner:

In [2]:
# Read the CSV file into a DataFrame
df_data = pd.read_csv("voice.csv")
# Preview only the first rows instead of rendering the whole frame
df_data.head(10)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.000000,0.000000,male
1,0.066009,0.067310,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.250000,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.007990,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.250000,0.201497,0.007812,0.562500,0.554688,0.247119,male
4,0.135120,0.079146,0.124656,0.078720,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.135120,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male
5,0.132786,0.079557,0.119090,0.067958,0.209592,0.141634,1.932562,8.308895,0.963181,0.738307,...,0.132786,0.110132,0.017112,0.253968,0.298222,0.007812,2.726562,2.718750,0.125160,male
6,0.150762,0.074463,0.160106,0.092899,0.205718,0.112819,1.530643,5.987498,0.967573,0.762638,...,0.150762,0.105945,0.026230,0.266667,0.479620,0.007812,5.312500,5.304688,0.123992,male
7,0.160514,0.076767,0.144337,0.110532,0.231962,0.121430,1.397156,4.766611,0.959255,0.719858,...,0.160514,0.093052,0.017758,0.144144,0.301339,0.007812,0.539062,0.531250,0.283937,male
8,0.142239,0.078018,0.138587,0.088206,0.208587,0.120381,1.099746,4.070284,0.970723,0.770992,...,0.142239,0.096729,0.017957,0.250000,0.336476,0.007812,2.164062,2.156250,0.148272,male
9,0.134329,0.080350,0.121451,0.075580,0.201957,0.126377,1.190368,4.787310,0.975246,0.804505,...,0.134329,0.105881,0.019300,0.262295,0.340365,0.015625,4.695312,4.679688,0.089920,male


## Analyzing Data
After loading our data we want to get an overview of its structure and its size:

In [3]:
# Column labels of the loaded CSV (returned as a pandas Index)
print(df_data.columns)

Index(['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt',
       'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun', 'minfun', 'maxfun',
       'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx', 'label'],
      dtype='object')


### Pitfall: Size
**Pitfall:** the `size` property of a dataframe is not the number of rows the dataset contains but the total number of items (rows × columns).

In [62]:
# `size` counts every cell of the frame, i.e. rows * columns
print("Size:", df_data.size)
# The size of each axis index gives the per-axis counts
print("Columns:", df_data.columns.size)
print("Row Count:", df_data.index.size)
# `shape` reports both counts at once as a (rows, columns) tuple
df_data.shape

Size: 66528
Colums: 21
Row Count: 3168


(3168, 21)

### Selecting rows


In [5]:
# `head` returns the leading rows; here just the very first one
df_data.head(1)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male


In [6]:
# `tail` is the counterpart of `head` and returns the trailing rows
df_data.tail(2)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
3166,0.143659,0.090628,0.184976,0.043508,0.219943,0.176435,1.591065,5.388298,0.950436,0.67547,...,0.143659,0.172375,0.034483,0.25,0.79136,0.007812,3.59375,3.585938,0.311002,female
3167,0.165509,0.092884,0.183044,0.070072,0.250827,0.180756,1.705029,5.769115,0.938829,0.601529,...,0.165509,0.185607,0.062257,0.271186,0.227022,0.007812,0.554688,0.546875,0.35,female


In [7]:
# Select a single row by its index label; passing a list keeps the result a DataFrame
df_data.loc[[1337]]

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
1337,0.178696,0.059246,0.192232,0.121158,0.220375,0.099216,1.322067,4.430811,0.923825,0.474675,...,0.178696,0.136461,0.049383,0.27907,0.925465,0.023438,4.5,4.476562,0.127681,male


In [8]:
# Select several rows at once by passing a list of index labels to `.loc`
df_data.loc[[23, 42, 1337]]

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
23,0.170213,0.075105,0.146053,0.123989,0.250126,0.126137,2.816793,13.764582,0.913832,0.487966,...,0.170213,0.077698,0.015702,0.192771,0.101562,0.007812,0.5625,0.554688,0.161791,male
42,0.166764,0.064489,0.160124,0.116847,0.229583,0.112736,2.117203,8.939176,0.937849,0.592123,...,0.166764,0.101681,0.01564,0.253968,0.419531,0.007812,2.820312,2.8125,0.099573,male
1337,0.178696,0.059246,0.192232,0.121158,0.220375,0.099216,1.322067,4.430811,0.923825,0.474675,...,0.178696,0.136461,0.049383,0.27907,0.925465,0.023438,4.5,4.476562,0.127681,male


### Selecting columns

In [9]:
# Pick a single column by its name, which yields a Series (head limits the output to 5 entries)
df_data["meanfreq"].head(5)

0    0.059781
1    0.066009
2    0.077316
3    0.151228
4    0.135120
Name: meanfreq, dtype: float64

In [10]:
# Pick several columns by passing a list of names (head limits the output to 5 entries)
df_data.loc[:, ["meanfreq", "maxfun", "label"]].head(5)

Unnamed: 0,meanfreq,maxfun,label
0,0.059781,0.275862,male
1,0.066009,0.25,male
2,0.077316,0.271186,male
3,0.151228,0.25,male
4,0.13512,0.266667,male


### Analyzing value ranges


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for one column
df_data["meanfreq"].describe()

count    3168.000000
mean        0.180907
std         0.029918
min         0.039363
25%         0.163662
50%         0.184838
75%         0.199146
max         0.251124
Name: meanfreq, dtype: float64

In [12]:
# `describe` also works on the whole DataFrame and summarises every numeric column
# (the non-numeric `label` column is left out of the result)
df_data.describe()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
count,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0
mean,0.180907,0.057126,0.185621,0.140456,0.224765,0.084309,3.140168,36.568461,0.895127,0.408216,0.165282,0.180907,0.142807,0.036802,0.258842,0.829211,0.052647,5.047277,4.99463,0.173752
std,0.029918,0.016652,0.03636,0.04868,0.023639,0.042783,4.240529,134.928661,0.04498,0.177521,0.077203,0.029918,0.032304,0.01922,0.030077,0.525205,0.063299,3.521157,3.520039,0.119454
min,0.039363,0.018363,0.010975,0.000229,0.042946,0.014558,0.141735,2.068455,0.738651,0.036876,0.0,0.039363,0.055565,0.009775,0.103093,0.007812,0.004883,0.007812,0.0,0.0
25%,0.163662,0.041954,0.169593,0.111087,0.208747,0.04256,1.649569,5.669547,0.861811,0.258041,0.118016,0.163662,0.116998,0.018223,0.253968,0.419828,0.007812,2.070312,2.044922,0.099766
50%,0.184838,0.059155,0.190032,0.140286,0.225684,0.09428,2.197101,8.318463,0.901767,0.396335,0.186599,0.184838,0.140519,0.04611,0.271186,0.765795,0.023438,4.992188,4.945312,0.139357
75%,0.199146,0.06702,0.210618,0.175939,0.24366,0.114175,2.931694,13.648905,0.928713,0.533676,0.221104,0.199146,0.169581,0.047904,0.277457,1.177166,0.070312,7.007812,6.992188,0.209183
max,0.251124,0.115273,0.261224,0.247347,0.273469,0.252225,34.725453,1309.612887,0.981997,0.842936,0.28,0.251124,0.237636,0.204082,0.279114,2.957682,0.458984,21.867188,21.84375,0.932374


In [13]:
# Compute just the individual statistics we care about
meanfreq_col = df_data["meanfreq"]
print("Max:", meanfreq_col.max())
print("Mean:", meanfreq_col.mean())
print("Min:", meanfreq_col.min())

Max: 0.25112375872
Mean: 0.18090661037084196
Min: 0.0393633425836


In [14]:
# For a non-numeric column, list the distinct values it contains
label_col = df_data["label"]
label_col.unique()

array(['male', 'female'], dtype=object)

### Selecting with a condition

In [15]:
# Keep only the rows whose label equals "male" (head limits the output to 5 rows)
df_data.loc[df_data["label"] == "male"].head(5)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male


In [16]:
# Select all rows whose mean frequency lies above the column average.
# The comparison produces a boolean Series (one True/False per row) which is used as the row mask.
df_data.loc[df_data["meanfreq"] > df_data["meanfreq"].mean()].head(5)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
13,0.181225,0.060042,0.190953,0.128839,0.229532,0.100693,1.36943,5.4756,0.937446,0.53708,...,0.181225,0.131504,0.025,0.275862,1.277114,0.007812,2.804688,2.796875,0.41655,male
14,0.183115,0.066982,0.191233,0.129149,0.240152,0.111004,3.568104,35.384748,0.940333,0.571394,...,0.183115,0.102799,0.020833,0.275862,1.245739,0.203125,6.742188,6.539062,0.139332,male
16,0.190846,0.06579,0.207951,0.13228,0.244357,0.112076,1.562304,7.83435,0.938546,0.53881,...,0.190846,0.113323,0.017544,0.275862,1.434115,0.007812,6.320312,6.3125,0.25478,male
21,0.181015,0.074369,0.169299,0.128673,0.254175,0.125502,2.587325,12.281432,0.915284,0.475317,...,0.181015,0.098643,0.016145,0.275862,0.209844,0.007812,3.695312,3.6875,0.05994,male
69,0.185098,0.054289,0.176772,0.14642,0.216629,0.070208,1.601739,5.223786,0.910321,0.358023,...,0.185098,0.130983,0.016162,0.266667,0.568287,0.007812,5.140625,5.132812,0.095827,male


### Selecting with multiple conditions

In [17]:
# Combine two conditions: mean frequency above the column average AND the label is female.
# ** The parentheses around each condition are mandatory: `&` binds tighter than `>` and `==`.
df_data.loc[
    (df_data["meanfreq"] > df_data["meanfreq"].mean()) & (df_data["label"] == "female")
].head(5)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
1585,0.182855,0.067789,0.200639,0.175489,0.226068,0.050579,3.00189,19.865482,0.910458,0.506099,...,0.182855,0.15959,0.018713,0.266667,0.25897,0.054688,0.804688,0.75,0.269231,female
1586,0.199807,0.061974,0.211358,0.184422,0.235687,0.051265,2.543841,14.921964,0.904432,0.425289,...,0.199807,0.156465,0.016194,0.266667,0.250446,0.054688,0.898438,0.84375,0.329521,female
1587,0.19528,0.072087,0.204656,0.180611,0.255954,0.075344,2.392326,10.061489,0.907115,0.524209,...,0.19528,0.182629,0.024922,0.275862,0.269531,0.054688,0.703125,0.648438,0.294717,female
1588,0.208504,0.05755,0.220229,0.190343,0.249759,0.059416,1.707786,5.670912,0.879674,0.343548,...,0.208504,0.162043,0.016807,0.262295,0.260789,0.054688,0.8125,0.757812,0.251546,female
1591,0.204518,0.068569,0.216026,0.19223,0.255276,0.063046,1.918543,7.537126,0.896919,0.409583,...,0.204518,0.163077,0.016754,0.275862,0.205,0.046875,0.757812,0.710938,0.235348,female


## First ML attempts
### Data Description from Kaggle
* **meanfreq**: mean frequency (in kHz)
* **sd**: standard deviation of frequency
* **median**: median frequency (in kHz)
* **Q25**: first quartile (in kHz)
* **Q75**: third quartile (in kHz)
* **IQR**: interquartile range (in kHz)
* **skew**: skewness (see note in specprop description)
* **kurt**: kurtosis (see note in specprop description)
* **sp.ent**: spectral entropy
* **sfm**: spectral flatness
* **mode**: mode frequency
* **centroid**: frequency centroid (see specprop)
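* **peakf**: peak frequency (frequency with highest energy) — note: this column is listed in the description but is not among the columns of the CSV loaded above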
* **meanfun**: average of fundamental frequency measured across acoustic signal
* **minfun**: minimum fundamental frequency measured across acoustic signal
* **maxfun**: maximum fundamental frequency measured across acoustic signal
* **meandom**: average of dominant frequency measured across acoustic signal
* **mindom**: minimum of dominant frequency measured across acoustic signal
* **maxdom**: maximum of dominant frequency measured across acoustic signal
* **dfrange**: range of dominant frequency measured across acoustic signal
* **modindx**: modulation index. Calculated as the accumulated absolute difference between adjacent measurements of fundamental frequencies divided by the frequency range
* **label**: male or female

The interesting part is the label which describes whether the set of measurements belongs to a male or female. Our aim is to predict this label.

The Kaggle description of the dataset states that the baseline accuracy is $50\%$ when always predicting male.

### Splitting the data sets

In [18]:
# Draw a random half of the rows as the training set and restore the original row order.
# The fixed seed makes the split (and every result derived from it) reproducible on re-run.
df_train = df_data.sample(n=df_data.index.size // 2, random_state=42).sort_index()
# Preview only the first rows instead of rendering the whole frame
df_train.head(10)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.000000,0.000000,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.250000,0.201497,0.007812,0.562500,0.554688,0.247119,male
5,0.132786,0.079557,0.119090,0.067958,0.209592,0.141634,1.932562,8.308895,0.963181,0.738307,...,0.132786,0.110132,0.017112,0.253968,0.298222,0.007812,2.726562,2.718750,0.125160,male
7,0.160514,0.076767,0.144337,0.110532,0.231962,0.121430,1.397156,4.766611,0.959255,0.719858,...,0.160514,0.093052,0.017758,0.144144,0.301339,0.007812,0.539062,0.531250,0.283937,male
9,0.134329,0.080350,0.121451,0.075580,0.201957,0.126377,1.190368,4.787310,0.975246,0.804505,...,0.134329,0.105881,0.019300,0.262295,0.340365,0.015625,4.695312,4.679688,0.089920,male
12,0.137343,0.080877,0.124263,0.083145,0.209227,0.126082,1.378728,5.008952,0.963514,0.736150,...,0.137343,0.092644,0.016789,0.213333,0.481671,0.015625,5.015625,5.000000,0.088500,male
13,0.181225,0.060042,0.190953,0.128839,0.229532,0.100693,1.369430,5.475600,0.937446,0.537080,...,0.181225,0.131504,0.025000,0.275862,1.277114,0.007812,2.804688,2.796875,0.416550,male
14,0.183115,0.066982,0.191233,0.129149,0.240152,0.111004,3.568104,35.384748,0.940333,0.571394,...,0.183115,0.102799,0.020833,0.275862,1.245739,0.203125,6.742188,6.539062,0.139332,male
16,0.190846,0.065790,0.207951,0.132280,0.244357,0.112076,1.562304,7.834350,0.938546,0.538810,...,0.190846,0.113323,0.017544,0.275862,1.434115,0.007812,6.320312,6.312500,0.254780,male
18,0.168346,0.074121,0.145618,0.115756,0.239824,0.124068,2.704335,18.484703,0.934523,0.559742,...,0.168346,0.083484,0.015717,0.231884,0.146563,0.007812,3.125000,3.117188,0.059537,male


The validation set is every entry from our dataset that is not contained in the training set:

In [19]:
# Every row whose index label is not part of the training set goes into the validation set.
# `.copy()` makes the result an independent frame, so modifying it later neither touches
# `df_data` nor triggers pandas' SettingWithCopyWarning.
df_validate = df_data.loc[~df_data.index.isin(df_train.index)].copy()
# Preview only the first rows instead of rendering the whole frame
df_validate.head(10)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
1,0.066009,0.067310,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.250000,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.007990,0.007812,0.015625,0.007812,0.046512,male
4,0.135120,0.079146,0.124656,0.078720,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.135120,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male
6,0.150762,0.074463,0.160106,0.092899,0.205718,0.112819,1.530643,5.987498,0.967573,0.762638,...,0.150762,0.105945,0.026230,0.266667,0.479620,0.007812,5.312500,5.304688,0.123992,male
8,0.142239,0.078018,0.138587,0.088206,0.208587,0.120381,1.099746,4.070284,0.970723,0.770992,...,0.142239,0.096729,0.017957,0.250000,0.336476,0.007812,2.164062,2.156250,0.148272,male
10,0.157021,0.071943,0.168160,0.101430,0.216740,0.115310,0.979442,3.974223,0.965249,0.733693,...,0.157021,0.088894,0.022069,0.117647,0.460227,0.007812,2.812500,2.804688,0.200000,male
11,0.138551,0.077054,0.127527,0.087314,0.202739,0.115426,1.626770,6.291365,0.966004,0.752042,...,0.138551,0.104199,0.019139,0.262295,0.246094,0.007812,2.718750,2.710938,0.132351,male
15,0.174272,0.069411,0.190874,0.115602,0.228279,0.112677,4.485038,61.764908,0.950972,0.635199,...,0.174272,0.102046,0.018328,0.246154,1.621299,0.007812,7.000000,6.992188,0.209311,male
17,0.171247,0.074872,0.152807,0.122391,0.243617,0.121227,3.207170,25.765565,0.936954,0.586420,...,0.171247,0.079718,0.015671,0.262295,0.106279,0.007812,0.570312,0.562500,0.138355,male
19,0.173631,0.073352,0.153569,0.123680,0.244234,0.120554,2.804975,20.857543,0.930917,0.518269,...,0.173631,0.090130,0.015702,0.210526,0.193044,0.007812,2.820312,2.812500,0.068124,male


### Setup

In [20]:
from sklearn.ensemble import RandomForestClassifier
# n_jobs=-1 uses all available CPU cores instead of a machine-specific count;
# the fixed random_state makes the trained forest reproducible across re-runs.
forest = RandomForestClassifier(n_jobs=-1, random_state=42)

### Prepare the datasets

In [26]:
# Split each set into the target (`label`) and the feature columns.
# `label` is the last column, so the positional slice `[:, :-1]` selects all features.
# `.iloc` replaces the deprecated `.ix` indexer, which was removed in pandas 1.0.
# Note: the string labels are used as-is; encoding them (e.g. as categories) is a possible follow-up.
train_labels = df_train.label
train_cols = df_train.iloc[:, :-1]
validate_labels = df_validate.label
validate_cols = df_validate.iloc[:, :-1]

### Fit & Predict

In [30]:
# Train on the prepared training features/labels ...
forest.fit(train_cols, train_labels)
# ... and predict a label for every row of the validation features
predictions = forest.predict(validate_cols)

### Check results

In [59]:
# Score the fitted forest on the held-out validation rows
forest.score(validate_cols, validate_labels)

0.97411616161616166

In [61]:
# Confusion matrix: actual labels as rows, predicted labels as columns
pd.crosstab(
    validate_labels,
    predictions,
    rownames=["actual"],
    colnames=["predictions"],
)

predictions,female,male
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
female,743,18
male,23,800


### Analyzing the Forest

In [51]:
# Importance score of every feature, taken from the fitted forest
features = forest.feature_importances_
# Positions that would sort the scores in ascending order (least important first)
sorted_indices = features.argsort()
sorted_indices

array([16, 14, 18, 19, 15,  6,  8,  2, 13,  7, 17,  0,  4, 10, 11,  9,  5,
        1,  3, 12])

In [56]:
# Map the importance ranking back to feature names, most important first.
# The indices refer to the columns the forest was trained on, so look them up in
# `train_cols` rather than in the full frame (which also contains `label`).
train_cols.columns[sorted_indices[::-1]]

Index(['meanfun', 'Q25', 'sd', 'IQR', 'sfm', 'centroid', 'mode', 'Q75',
       'meanfreq', 'maxdom', 'kurt', 'minfun', 'median', 'sp.ent', 'skew',
       'meandom', 'modindx', 'dfrange', 'maxfun', 'mindom'],
      dtype='object')