In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
ss = SparkSession.builder.getOrCreate()
sc = SparkContext.getOrCreate()

In [2]:
# read the file with features
features = ss.read.csv('final_chunk.csv', header=True)
# read the file of speaker id and gender
gender = ss.read.csv('Lab41-SRI-VOiCES-speaker-gender-dataset.csv', header=True)

In [4]:
# define a udf that extracts the speaker id out of the filename
def get_speaker(filename):
    i = filename.index('sp')
    return int(filename[i+2:i+6])
speaker = udf(get_speaker)

In [5]:
# process speaker id
features_df = features.withColumn('Speaker', speaker('FileName')).drop('FileName', '_c0')

In [6]:
# join the dataframe
results = features_df.join(gender, on='Speaker', how='leftOuter')

In [7]:
# check the number of observations
print(features.count())
print(results.count())

9381
9381


## Five Speaker Recognition in H2O

In [66]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pysparkling import *
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
import h2o

In [67]:
speaker_count = results.groupBy("Speaker").count().orderBy("count", ascending=False)
speaker_id = speaker_count.filter("count==160")
speaker5 = results.join(speaker_id, "Speaker",'left_semi')
speaker5_rdd = speaker5.rdd

In [68]:
def double_safe(value):
    try:
        return float(value)
    except ValueError:
        return str(value)

In [70]:
speaker5_rdd = speaker5_rdd.map(lambda row: [double_safe(x) for x in row])

In [74]:
splits = speaker5_rdd.randomSplit([0.8,0.2])
train_rdd = splits[0].cache()
test_rdd = splits[0].cache()

In [75]:
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()
hc = H2OContext.getOrCreate(ss)


Sparkling Water Context:
 * Sparkling Water Version: 3.28.0.1-1-2.4
 * H2O name: sparkling-water-niche_local-1579048523155
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,10.1.129.239,54323)
  ------------------------

  Open H2O Flow in browser: http://10.1.129.239:54323 (CMD + click in Mac OSX)

    


In [93]:
train_h2o = hc.as_h2o_frame(train_rdd)
train_h2o["_1"] = train_h2o["_1"].asfactor()
test_h2o = hc.as_h2o_frame(test_rdd)
test_h2o["_1"] = test_h2o["_1"].asfactor()

In [102]:
test_h2o.summary()

Unnamed: 0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12,_13,_14,_15,_16,_17,_18,_19,_20,_21,_22,_23,_24
type,enum,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,real,string,string
mins,,715.2059285279704,1013.5593618488223,1513.1846178536314,1958.853494352578,-412.8054504394531,81.93749237060547,-55.19367218017578,-16.482170104980472,-44.55499649047852,-17.448844909667972,-41.35249328613281,-46.75634384155274,-26.22501945495605,-21.78336334228516,-36.863731384277344,-17.73868179321289,810.1042498289232,37.28347178629912,1.6020314433262687e-05,0.037275461906934315,0.011590708047151566,,
mean,,1558.8070289828577,1645.4631085427504,2077.656494133379,2471.495041330062,-302.8285923957825,134.53789825439452,-9.15292090633302,39.14161804774776,-13.593262318102642,17.51758128463407,-15.108535551548993,3.045485135971103,-6.536512133499491,1.229099655910068,-4.30368997583064,-2.2737829620717087,2603.0817451942044,181.96094557401526,0.00016070035927100436,0.08701105275437562,0.03861259625118692,,
maxs,,2559.7953603088044,2343.098270017536,2717.0924958863393,3026.5569576826,-230.35374450683597,178.95611572265625,42.11531066894531,85.78621673583984,36.77939224243164,45.564231872558594,6.370003700256348,26.140810012817386,13.211542129516602,22.85942840576172,18.8350830078125,12.855154037475586,4639.6875,433.8485502860916,0.0006329945754259825,0.17609428703871202,0.09664590656757356,,
sigma,,358.6698156362024,235.16936069339627,221.09896114976755,204.9879312940437,30.20506190014551,16.563647712618533,18.268424928874218,14.553886596876195,16.241692506073235,12.519665162510204,9.52584166590934,15.447535027762957,8.093034657290888,8.061533117652063,9.574708228295796,5.881301348499163,685.2094283753347,73.07315482587877,0.00011041697625960841,0.027090399693344966,0.013511326327634557,,
zeros,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,112,1671.1562898873133,1701.8863855572667,2028.1135336328884,2342.5140970890006,-298.9597473144531,126.72098541259766,-32.87347412109375,71.2174301147461,-16.876304626464847,21.444076538085934,4.175195217132568,20.78470802307129,12.976224899291992,9.11994171142578,2.8190600872039795,9.46920108795166,3028.5780204187768,73.15368210700072,9.235877951141448e-05,0.08256404893270622,0.03318674489855766,F,train-clean-360
1,112,1347.7045618032653,1595.7500042688212,2095.285844216399,2541.932148963671,-289.67327880859375,137.33297729492188,1.120954990386963,26.13593292236328,-5.648991584777832,25.780946731567386,-6.543709754943848,20.5148868560791,10.645992279052734,7.391483783721924,2.699304342269897,3.668611764907837,2135.1101345486113,99.32153018904322,5.569871791522018e-05,0.06941296403463648,0.05940774083137512,F,train-clean-360
2,112,2069.631624077236,1818.2872267408914,2244.7122419652487,2622.212001148156,-304.095458984375,125.94711303710938,-11.63916015625,35.59788513183594,-25.28520393371582,26.934654235839844,-22.743227005004886,13.402088165283205,-6.572965621948242,3.3161582946777344,-3.719818830490112,-6.300156116485597,3335.6736269864173,289.1536019065163,0.00023887501447461543,0.1395693692294034,0.03208668157458305,F,train-clean-360


In [106]:
test_h2o.names[:]

['_1',
 '_2',
 '_3',
 '_4',
 '_5',
 '_6',
 '_7',
 '_8',
 '_9',
 '_10',
 '_11',
 '_12',
 '_13',
 '_14',
 '_15',
 '_16',
 '_17',
 '_18',
 '_19',
 '_20',
 '_21',
 '_22',
 '_23',
 '_24']

In [108]:
predictors = test_h2o.names
response = "_1"
predictors.remove(response)

In [109]:
from h2o.automl import H2OAutoML
model_automl = H2OAutoML(max_models = 10, seed = 1, nfolds = 3) # seed : Set a seed for reproducibility
model_automl.train(x=predictors,
                y=response,
                training_frame=train_h2o)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [110]:
predictions = model_automl.predict(test_h2o)
predictions

stackedensemble prediction progress: |████████████████████████████████████| 100%


predict,p112,p1272,p2532,p5338,p6895
112,0.96642,0.00108516,0.0235947,0.00149757,0.00740262
112,0.996078,0.000208362,0.00148143,0.000263091,0.00196911
112,0.994823,0.000289055,0.00238505,0.00036427,0.00213827
112,0.993905,0.000418329,0.00247721,0.000517753,0.00268126
112,0.886907,0.00239747,0.0907933,0.00331238,0.0165903
112,0.994378,0.000286551,0.0027324,0.000383104,0.00221997
112,0.995819,0.000223064,0.00161364,0.000281206,0.00206356
112,0.983417,0.000712249,0.00792675,0.000947528,0.00699651
112,0.994714,0.000308207,0.00216421,0.000389916,0.00242383
112,0.995679,0.000230703,0.00166088,0.000301959,0.00212702




In [112]:
model_automl.leaderboard

model_id,mean_per_class_error,logloss,rmse,mse
StackedEnsemble_BestOfFamily_AutoML_20200114_175341,0.0303852,0.100782,0.161907,0.0262138
StackedEnsemble_AllModels_AutoML_20200114_175341,0.0380916,0.359848,0.315479,0.0995273
GLM_1_AutoML_20200114_175341,0.0383693,0.116435,0.169213,0.0286329
GBM_3_AutoML_20200114_175341,0.0873317,0.25743,0.276639,0.0765289
GBM_4_AutoML_20200114_175341,0.102606,0.265297,0.285629,0.0815838
GBM_2_AutoML_20200114_175341,0.11099,0.28118,0.29679,0.0880843
GBM_1_AutoML_20200114_175341,0.115231,0.305624,0.304796,0.0929009
XGBoost_1_AutoML_20200114_175341,0.126251,0.373277,0.349027,0.12182
XGBoost_3_AutoML_20200114_175341,0.12807,0.422733,0.371722,0.138177
DRF_1_AutoML_20200114_175341,0.133538,0.510788,0.41083,0.168782




In [113]:
h2o.get_model("StackedEnsemble_BestOfFamily_AutoML_20200114_174321")

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_BestOfFamily_AutoML_20200114_174321

No model summary for this model

ModelMetricsMultinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.0012586324212841323
RMSE: 0.035477209885842666

ModelMetricsMultinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.026213809560013755
RMSE: 0.16190679281615628




In [45]:
sc.stop()