# Imports, helper functions, and context initiation

In [1]:
from h2o.automl import H2OAutoML
from pyspark.sql import SparkSession
from pysparkling import *
from pyspark import SparkContext
from pyspark.sql.functions import *
from h2o.estimators.gbm import H2OGradientBoostingEstimator


def toDoubleSafe(v):
    try:
        return float(v)
    except ValueError:
        return str(v)
    
# define a udf that extracts the speaker id out of the filename
def get_speaker(filename):
    i = filename.index('sp')
    return int(filename[i+2:i+6])
speaker = udf(get_speaker)

In [2]:
ss = SparkSession.builder.getOrCreate()
hc = H2OContext.getOrCreate(ss)

Connecting to H2O server at http://192.168.1.240:54323 ... successful.


0,1
H2O cluster uptime:,14 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,29 days
H2O cluster name:,sparkling-water-kathleenyi_local-1579064143879
H2O cluster total nodes:,1
H2O cluster free memory:,807 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4



Sparkling Water Context:
 * Sparkling Water Version: 3.28.0.1-1-2.4
 * H2O name: sparkling-water-kathleenyi_local-1579064143879
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,192.168.1.240,54323)
  ------------------------

  Open H2O Flow in browser: http://192.168.1.240:54323 (CMD + click in Mac OSX)

    


In [3]:
# initialize h2o context
hc


Sparkling Water Context:
 * Sparkling Water Version: 3.28.0.1-1-2.4
 * H2O name: sparkling-water-kathleenyi_local-1579064143879
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,192.168.1.240,54323)
  ------------------------

  Open H2O Flow in browser: http://192.168.1.240:54323 (CMD + click in Mac OSX)

    




In [4]:
# initialize spark context
sc = SparkContext.getOrCreate()

# Create joined data set from each chunk to labels

In [5]:
# read the file with features
features = ss.read.csv('final_chunk.csv', header=True)
# read the file of speaker id and gender
gender = ss.read.csv('Lab41-SRI-VOiCES-speaker-gender-dataset.csv', header=True)

In [6]:
# process speaker id
features_df = features.withColumn('Speaker', speaker('FileName')).drop('FileName', '_c0')

In [7]:
# join the dataframe
results = features_df.join(gender, on='Speaker', how='leftOuter')

In [9]:
results.summary()

DataFrame[summary: string, Speaker: string, Centroid: string, variance: string, skewness: string, kurtosis: string, mfcc1: string, mfcc2: string, mfcc3: string, mfcc4: string, mfcc5: string, mfcc6: string, mfcc7: string, mfcc8: string, mfcc9: string, mfcc10: string, mfcc11: string, mfcc12: string, roll_off_max: string, roll_off_min: string, flatness: string, zeroCrossingRate: string, rms: string, Gender: string, DataSet: string]

In [10]:
audio = results.rdd

# Data cleaning (and import data, if not from above)

In [11]:
# load and convert the data into an RDD
# audio = sc.textFile("INPUT_FILE_PATH", 8).map(lambda x:  x.split(", "))
audio_raw = audio.map(lambda row: [toDoubleSafe(x) for x in row])

# OR straight to H2O DF

# import h2o
# audio = h2o.import_file("INPUT_FILE_PATH")
# audio_df = hc.as_h2o_frame(audio)

In [12]:
# convert data into H2O dataframe
audio_h2o = hc.as_h2o_frame(audio_raw, "audio") # not sure what the second argument does. it's name arg


In [13]:
# rename columns
audio_h2o = audio_h2o.set_names(['Speaker', 'Centroid', 'variance', 'skewness', 'kurtosis', 'mfcc1', 
                     'mfcc2', 'mfcc3', 'mfcc4', 'mfcc5', 'mfcc6', 'mfcc7', 'mfcc8', 'mfcc9',
                     'mfcc10', 'mfcc11', 'mfcc12', 'roll_off_max', 'roll_off_min', 'flatness',
                     'zeroCrossingRate', 'rms', 'Gender', 'DataSet'])

In [14]:
# data cleaning
# set our label column
audio_h2o.set_name(col='Gender',   # switch here for gender <-> speaker (1/2)
                   name='label')

# convert categoricals columns into enum
audio_h2o["label"] = audio_h2o["label"].asfactor()

# drop columns filename and dataset
drop_dataset_audio_h2o = audio_h2o.drop('DataSet')
drop_dataset_audio_h2o = drop_dataset_audio_h2o.drop('Speaker') # switch here for gender <-> speaker (2/2)

In [None]:
# isolate the predictors
predictors = drop_dataset_audio_h2o.names[:]
predictors.remove("label")

# Create train test split

In [15]:
# train, test, validation
audio_train, audio_test, audio_valid = drop_dataset_audio_h2o.split_frame(ratios=[.7, .15])
combo_audio_valid = audio_test.rbind(audio_valid)

In [16]:
audio_train.show(5)

Unnamed: 0,Centroid,variance,skewness,kurtosis,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,...,mfcc9,mfcc10,mfcc11,mfcc12,roll_off_max,roll_off_min,flatness,zeroCrossingRate,rms,label
0,1620.217469,1710.432894,2096.841439,2460.711352,-238.100723,130.859833,-27.928782,58.97028,-11.979715,28.727245,...,8.469316,13.741711,9.363866,9.446972,2817.271115,68.069193,6.6e-05,0.089968,0.058334,F
1,1932.105996,1727.463475,2108.337624,2459.150892,-260.091949,133.04216,-19.429264,47.11861,-30.591085,19.131094,...,-7.774847,-0.327605,-6.296566,-10.721835,3191.474035,253.632633,0.000115,0.120056,0.040678,F
2,1712.288849,1585.585767,1992.015119,2377.491518,-281.300995,140.942673,-42.971718,33.146664,-19.934807,12.305672,...,-3.374846,3.32733,-10.326621,-6.615748,2746.514075,291.03123,0.000175,0.10649,0.032049,F
3,2560.154639,1589.336933,1877.269055,2139.736103,-312.810242,88.289192,-67.447784,47.281433,-33.727982,5.027608,...,4.045847,2.695038,-5.203551,-3.167777,3874.915506,672.975013,0.000427,0.190121,0.03444,F
4,2227.661263,1865.665805,2214.606968,2530.486454,-299.553894,114.573692,-28.94388,62.334595,-27.747252,36.546997,...,3.095205,7.515024,-2.482076,-4.185434,3694.131829,234.822747,0.000168,0.133278,0.028046,F
5,1886.239479,1782.559158,2166.916012,2521.468574,-275.523651,133.671417,-32.199677,58.330833,-22.530445,32.114281,...,0.663532,9.81482,0.497106,0.156162,3175.136147,117.996974,0.00016,0.121262,0.031045,F
6,1793.655303,1648.212417,2054.888205,2438.854661,-284.854919,136.400864,-41.888577,38.036636,-16.637712,11.839247,...,-7.210217,4.500036,-9.382683,-6.278342,2901.591193,271.638662,0.000214,0.112091,0.028365,F
7,1659.965813,1615.046762,2027.373722,2408.13555,-319.357513,144.574005,-20.675859,36.205379,-33.971565,5.824019,...,-3.010233,-5.909896,-7.696308,-8.388947,2676.745182,257.527152,0.000156,0.102339,0.022672,F
8,1794.412761,1738.854666,2126.038623,2481.770375,-261.557892,136.104553,-14.372876,53.265083,-23.043184,31.313229,...,7.65252,10.53751,3.230154,-1.667815,3041.913273,109.566004,9.7e-05,0.109053,0.041799,F
9,2456.467127,1933.562752,2294.32365,2614.119384,-321.092682,106.196213,-36.997097,54.811203,-26.295233,30.729445,...,-5.434083,3.693404,-6.954719,-1.466971,3974.276257,381.031896,0.000368,0.168109,0.021945,F


In [25]:
audio_train.columns[-1]

'label'

# Specific model attempt

In [26]:
# not automl
# n_folds = 3
# fold_assignment = "Modulo"
# keep_cross_validation_predictions = True

# model_gbm = H2OGradientBoostingEstimator(ntrees=538,
#                                          max_depth=6,
#                                          learn_rate=0.1,
#                                          nfolds=n_folds,
#                                          fold_assignment=fold_assignment,
#                                          keep_cross_validation_predictions=keep_cross_validation_predictions)
# model_gbm.train(x=predictors,
#                 y="label",
#                 training_frame=audio_train)

In [None]:
# model_gbm

In [None]:
# print(model_gbm.auc(train=True))
# print(model_gbm.auc(valid=True))

# Running auto-ml

In [19]:
# initiate automl
model_automl = H2OAutoML(max_models = 10, seed = 1, nfolds = 3)
model_automl.train(x=predictors,
                y='label',
                training_frame=audio_train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [20]:
# see which models are the top performing (binary gender)
model_automl.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_AutoML_20200114_210418,0.96117,0.25908,0.948206,0.10103,0.276404,0.0763993
StackedEnsemble_BestOfFamily_AutoML_20200114_210418,0.959588,0.263717,0.944643,0.102824,0.279132,0.0779146
GBM_4_AutoML_20200114_210418,0.958933,0.260629,0.935377,0.104782,0.279976,0.0783864
GBM_3_AutoML_20200114_210418,0.958043,0.261772,0.938065,0.10502,0.280682,0.0787821
GBM_2_AutoML_20200114_210418,0.957341,0.264053,0.946848,0.112454,0.283254,0.0802326
XGBoost_1_AutoML_20200114_210418,0.955522,0.273057,0.947143,0.112541,0.287234,0.0825036
GBM_1_AutoML_20200114_210418,0.954087,0.274159,0.939163,0.114922,0.288959,0.0834974
GBM_5_AutoML_20200114_210418,0.954002,0.276894,0.942907,0.113191,0.289858,0.0840178
XGBoost_3_AutoML_20200114_210418,0.951695,0.289539,0.936016,0.123388,0.295496,0.0873181
XGBoost_2_AutoML_20200114_210418,0.949164,0.293891,0.935282,0.124429,0.298655,0.0891949




In [22]:
# checking out the top models (binary gender)
import h2o
h2o.get_model("StackedEnsemble_AllModels_AutoML_20200114_210418")

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_AllModels_AutoML_20200114_210418

No model summary for this model

ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.0028651479421049
RMSE: 0.0535270767192166
LogLoss: 0.046807488597233216
Null degrees of freedom: 6577
Residual degrees of freedom: 6571
Null deviance: 9116.474970219499
Residual deviance: 615.7993199852004
AIC: 629.7993199852004
AUC: 0.999999815042518
AUCPR: 0.980918128790587
Gini: 0.999999630085036

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6834988261878747: 


Unnamed: 0,Unnamed: 1,F,M,Error,Rate
0,F,3224.0,0.0,0.0,(0.0/3224.0)
1,M,1.0,3353.0,0.0003,(1.0/3354.0)
2,Total,3225.0,3353.0,0.0002,(1.0/6578.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.683499,0.999851,200.0
1,max f2,0.334736,0.999881,203.0
2,max f0point5,0.683499,0.99994,200.0
3,max accuracy,0.683499,0.999848,200.0
4,max precision,0.975705,1.0,0.0
5,max recall,0.334736,1.0,203.0
6,max specificity,0.975705,1.0,0.0
7,max absolute_mcc,0.683499,0.999696,200.0
8,max min_per_class_accuracy,0.683499,0.999702,200.0
9,max mean_per_class_accuracy,0.683499,0.999851,200.0



Gains/Lift Table: Avg response rate: 50.99 %, avg score: 51.09 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010033,0.975561,1.96124,1.96124,1.0,0.975706,1.0,0.975706,0.019678,0.019678,96.124031,96.124031
1,,2,0.020067,0.975356,1.96124,1.96124,1.0,0.975451,1.0,0.975579,0.019678,0.039356,96.124031,96.124031
2,,3,0.0301,0.97514,1.96124,1.96124,1.0,0.975248,1.0,0.975468,0.019678,0.059034,96.124031,96.124031
3,,4,0.040134,0.974925,1.96124,1.96124,1.0,0.975032,1.0,0.975359,0.019678,0.078712,96.124031,96.124031
4,,5,0.050015,0.974642,1.96124,1.96124,1.0,0.974774,1.0,0.975244,0.01938,0.098092,96.124031,96.124031
5,,6,0.10003,0.973222,1.96124,1.96124,1.0,0.973938,1.0,0.974591,0.098092,0.196184,96.124031,96.124031
6,,7,0.150046,0.971675,1.96124,1.96124,1.0,0.972515,1.0,0.973899,0.098092,0.294275,96.124031,96.124031
7,,8,0.200061,0.969531,1.96124,1.96124,1.0,0.970644,1.0,0.973085,0.098092,0.392367,96.124031,96.124031
8,,9,0.300091,0.962779,1.96124,1.96124,1.0,0.966419,1.0,0.970863,0.196184,0.588551,96.124031,96.124031
9,,10,0.39997,0.948854,1.96124,1.96124,1.0,0.95698,1.0,0.967396,0.195886,0.784436,96.124031,96.124031




ModelMetricsBinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.07639933663409613
RMSE: 0.2764042992322951
LogLoss: 0.25908032762291733
Null degrees of freedom: 6577
Residual degrees of freedom: 6571
Null deviance: 9119.292468638847
Residual deviance: 3408.460790207101
AIC: 3422.460790207101
AUC: 0.9611701649524808
AUCPR: 0.9482061890806117
Gini: 0.9223403299049615

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4742025235469069: 


Unnamed: 0,Unnamed: 1,F,M,Error,Rate
0,F,2884.0,340.0,0.1055,(340.0/3224.0)
1,M,324.0,3030.0,0.0966,(324.0/3354.0)
2,Total,3208.0,3370.0,0.1009,(664.0/6578.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.474203,0.901249,203.0
1,max f2,0.152086,0.925713,305.0
2,max f0point5,0.828278,0.918972,97.0
3,max accuracy,0.525536,0.899514,190.0
4,max precision,0.9761,1.0,0.0
5,max recall,0.027395,1.0,397.0
6,max specificity,0.9761,1.0,0.0
7,max absolute_mcc,0.525536,0.799069,190.0
8,max min_per_class_accuracy,0.496052,0.898573,197.0
9,max mean_per_class_accuracy,0.525536,0.899604,190.0



Gains/Lift Table: Avg response rate: 50.99 %, avg score: 50.99 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010033,0.975647,1.96124,1.96124,1.0,0.976036,1.0,0.976036,0.019678,0.019678,96.124031,96.124031
1,,2,0.020067,0.97506,1.96124,1.96124,1.0,0.97534,1.0,0.975688,0.019678,0.039356,96.124031,96.124031
2,,3,0.0301,0.974523,1.96124,1.96124,1.0,0.97481,1.0,0.975395,0.019678,0.059034,96.124031,96.124031
3,,4,0.040134,0.974149,1.96124,1.96124,1.0,0.974317,1.0,0.975126,0.019678,0.078712,96.124031,96.124031
4,,5,0.050015,0.973883,1.96124,1.96124,1.0,0.97402,1.0,0.974907,0.01938,0.098092,96.124031,96.124031
5,,6,0.10003,0.972093,1.955279,1.95826,0.99696,0.973091,0.99848,0.973999,0.097794,0.195886,95.527909,95.82597
6,,7,0.150046,0.969345,1.949318,1.955279,0.993921,0.970784,0.99696,0.972927,0.097496,0.293381,94.931788,95.527909
7,,8,0.200061,0.965258,1.937395,1.950808,0.987842,0.967531,0.994681,0.971578,0.096899,0.39028,93.739544,95.080818
8,,9,0.300091,0.949015,1.892686,1.931434,0.965046,0.958558,0.984802,0.967238,0.189326,0.579606,89.268632,93.143423
9,,10,0.39997,0.884934,1.74034,1.883715,0.887367,0.924534,0.960471,0.956574,0.173822,0.753429,74.033958,88.371504







In [21]:
predictions = model_automl.predict(combo_audio_valid)
predictions

stackedensemble prediction progress: |████████████████████████████████████| 100%


predict,F,M
F,0.971652,0.0283485
F,0.903522,0.0964781
F,0.81969,0.18031
F,0.894212,0.105788
M,0.0650669,0.934933
F,0.664082,0.335918
M,0.221296,0.778704
M,0.0325569,0.967443
M,0.0470458,0.952954
M,0.335021,0.664979




In [None]:
sc.stop()