In [1]:
# All imports first, then connect to the H2O cluster used by every cell below.
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.word2vec import H2OWord2vecEstimator

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,6 days 20 hours 11 mins
H2O cluster timezone:,Europe/Vienna
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.5
H2O cluster version age:,"14 days, 11 hours and 31 minutes"
H2O cluster name:,H2O_from_python_mackenzie_dm3nzl
H2O cluster total nodes:,1
H2O cluster free memory:,1.375 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [2]:
# Source CSV: one row per Craigslist posting, a category label and a job title.
job_titles_path = "https://raw.githubusercontent.com/h2oai/sparkling-water/rel-1.6/examples/smalldata/craigslistJobTitles.csv"

# The category is parsed as a categorical ("enum") so it can serve as the
# classification target; the title stays a plain string for tokenization.
job_titles = h2o.import_file(
    job_titles_path,
    destination_frame="jobtitles",
    col_names=["category", "jobtitle"],
    col_types=["enum", "string"],
    header=1,
)
print(job_titles)

Parse progress: |█████████████████████████████████████████████████████████| 100%


category,jobtitle
education,After School Supervisor
education,"*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGES*****"
education,Bay Area Family Recruiter
education,Adult Day Programs/Community Access/Job Coaches
education,General Counselor - Non Tenure track
education,Part-Time Summer Math Teachers/Tutors
education,Preschool Teacher (temp-to-hire)
education,"*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGES*****"
education,Private Teachers and Tutors Needed in the South Bay
education,Art Therapist at Esther B. Clark School





In [3]:
# Words removed from the token stream before training word2vec.
# Each word is listed exactly once (first-occurrence order preserved); the
# earlier version repeated "in", "a", "by", "not" and "from".
STOP_WORDS = [
    "ax", "i", "you", "edu", "s", "t", "m", "subject", "can", "lines", "re", "what",
    "there", "all", "we", "one", "the", "a", "an", "of", "or", "in", "for", "by", "on",
    "but", "is", "not", "with", "as", "was", "if", "they", "are", "this", "and", "it", "have",
    "from", "at", "my", "be", "that", "to", "com", "org", "like", "likes", "so",
]

In [4]:
def tokenize(sentences, stop_word = STOP_WORDS):
    """Split a string column into a single column of cleaned, lower-case words.

    Args:
        sentences: H2O frame column of strings to tokenize.
        stop_word: collection of words to drop from the result. Defaults to
            the notebook-level ``STOP_WORDS`` list.

    Returns:
        The tokenized frame with short tokens, tokens containing digits and
        stop words removed. NA rows are always retained.
    """
    # Split on runs of non-word characters and normalise case.
    tokens = sentences.tokenize("\\W+").tolower()

    # NA rows are kept at every filtering step; per the H2O docs, tokenize()
    # uses NA rows to separate the original input strings, so dropping them
    # would merge neighbouring titles.
    long_enough = (tokens.nchar() >= 2) | (tokens.isna())
    tokens = tokens[long_enough, :]

    # Discard any token that contains a digit.
    digit_free = tokens.grep("[0-9]", invert=True, output_logical=True)
    tokens = tokens[digit_free, :]

    # Filter against the caller-supplied list (previously the argument was
    # ignored and the global STOP_WORDS was always used).
    not_stop_word = (tokens.isna()) | (~ tokens.isin(stop_word))
    return tokens[not_stop_word, :]

In [5]:
def predict(job_title,w2v, gbm):
    """Classify job titles with a trained word2vec + GBM pipeline.

    Args:
        job_title: list of job-title strings (passed to ``h2o.H2OFrame``).
        w2v: trained word2vec model used to embed the tokenized titles.
        gbm: trained GBM model used to classify the averaged embeddings.

    Returns:
        The frame produced by ``gbm.predict``. The function used to print
        this frame and return ``None``, which made ``print(predict(...))``
        emit a stray "None" after every prediction.
    """
    words = tokenize(h2o.H2OFrame(job_title).ascharacter())
    # One averaged word vector per input title.
    job_title_vec = w2v.transform(words, aggregate_method="AVERAGE")
    return gbm.predict(test_data=job_title_vec)

In [6]:
print("Break job titles into sequence of words")
# Only the free-text title column is tokenized; the category is the label.
words = tokenize(sentences=job_titles["jobtitle"])

Break job titles into sequence of words


In [7]:
print("Build word2vec model")
# Train word embeddings on the tokenized job titles.
w2v_model = H2OWord2vecEstimator(
    epochs=10,
    sent_sample_rate=0.0,
)
w2v_model.train(training_frame=words)

Build word2vec model
word2vec Model Build progress: |██████████████████████████████████████████| 100%


In [8]:
print("Sanity check - find synonyms for the word 'teacher'")
# The five nearest words should be education-related if training worked.
w2v_model.find_synonyms(word="teacher", count=5)

Sanity check - find synonyms for the word 'teacher'


OrderedDict([('preschool', 0.6674632430076599),
             ('infant', 0.666761577129364),
             ('teaching', 0.6467991471290588),
             ('toddler', 0.6421365141868591),
             ('aide', 0.637873649597168)])

In [14]:
print("Calculate a vector for each job title")
# Average the word vectors of each title into a single vector per title.
job_title_vecs = w2v_model.transform(
    words=words,
    aggregate_method="AVERAGE",
)
print(job_title_vecs)

Calculate a vector for each job title


C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C30,C31,C32,C33,C34,C35,C36,C37,C38,C39,C40,C41,C42,C43,C44,C45,C46,C47,C48,C49,C50,C51,C52,C53,C54,C55,C56,C57,C58,C59,C60,C61,C62,C63,C64,C65,C66,C67,C68,C69,C70,C71,C72,C73,C74,C75,C76,C77,C78,C79,C80,C81,C82,C83,C84,C85,C86,C87,C88,C89,C90,C91,C92,C93,C94,C95,C96,C97,C98,C99,C100
-0.251584,0.136924,-0.255622,0.312074,-0.0118113,-0.0419435,0.0466443,-0.22886,-0.303602,-0.0472739,-0.0213124,-0.0703419,0.0531935,-0.408683,-0.0789673,-0.094132,-0.13789,-0.163174,-0.3242,0.276175,0.304799,-0.309087,-0.131799,-0.187872,0.371162,-0.158022,0.101008,-0.062833,0.300701,-0.372577,0.041656,-0.233302,-0.0720808,-0.320965,0.0344183,-0.0503962,0.0385672,0.137535,0.10059,-0.0508965,-0.105906,0.0940593,0.335523,0.0797369,0.124125,0.0151864,0.196927,0.214675,-0.263531,-0.135715,-0.178507,0.107794,-0.162345,-0.12102,-0.158771,-0.127631,0.42478,-0.00291389,0.566371,-0.266624,0.124889,-0.0726887,-0.0131716,-0.102916,0.139192,0.183915,0.116067,0.0347701,-0.0838,0.0197136,0.169457,-0.0313154,0.00486292,-0.106337,-0.185604,-0.457041,-0.0265876,0.202438,-0.268457,-0.346918,0.38533,0.00512209,0.253923,0.413862,0.0142401,0.172131,-0.36137,0.114691,-0.470486,-0.0347269,-0.0808012,-0.160032,0.0217447,-0.143188,0.0275136,-0.147386,-0.41983,-0.133007,0.112142,-0.420146
-0.385594,-0.223537,-0.589687,-0.214223,-0.362832,-0.430404,0.537216,0.737671,-0.118072,0.194709,0.184864,-0.0461948,0.307238,-0.153701,-0.0568609,-0.195524,-0.449524,0.00160749,-0.291577,0.295948,0.545097,0.590229,-0.332481,0.0696192,-0.415839,-0.27066,-0.574071,-0.167856,-0.130402,0.682819,-0.0527046,0.275407,-0.453048,-0.0699872,0.0170623,0.365132,-0.386299,0.160154,-0.0564246,-0.107434,-0.413609,0.300056,0.850056,-0.04601,0.222707,-0.0114921,0.164838,0.415203,-0.294713,0.187536,-0.0324484,0.0921537,-0.23483,-0.152381,0.312211,0.307891,0.011294,-0.0851305,0.300332,-0.0556199,0.342629,0.0971193,0.142128,-0.650731,0.296569,-0.0170683,0.00161196,0.101912,-0.214938,-0.243685,0.0224212,0.266922,-0.00427749,0.000947144,-0.155636,0.373267,0.317644,0.0355001,0.265651,0.276324,-0.0219808,0.0672442,0.200542,0.471807,0.0344986,-0.0202369,-0.303343,0.335097,0.0966534,-0.313922,-0.337812,-0.0620166,-0.185626,0.0933087,0.0755286,-0.0661411,-0.815489,-0.170913,0.235029,-0.180461
-0.0954117,-0.0702975,-0.186445,-0.00935915,0.0431085,0.090021,0.119256,0.0611851,-0.295891,0.231734,-0.170702,0.13069,0.277911,0.222596,-0.302772,0.0669213,0.158227,0.118516,0.0563594,0.154408,0.0242654,0.308068,0.0789735,0.0111396,0.00101944,-0.0425005,0.110333,0.080065,0.255814,-0.0254293,0.36138,0.276812,-0.0964162,0.129942,0.130713,-0.184794,0.0156939,-0.0654218,0.0128636,0.47527,-0.226724,0.314848,-0.0730507,-0.216765,-0.0707427,-0.00603081,0.0699789,-0.14366,-0.364797,-0.0559318,-0.129343,0.197252,-0.107991,-0.11598,0.249636,0.127396,0.0745447,-0.207772,0.0444609,-0.092851,0.116533,-0.00733192,0.208556,-0.187216,0.120188,0.0866311,0.244091,0.146689,0.111703,-0.334957,0.17488,-0.0725898,-0.223959,-0.012875,0.229429,-0.140747,0.0467826,-0.20798,-0.0667032,0.207997,-0.106177,-0.101408,0.0615601,0.151133,-0.254149,0.15218,-0.208574,0.152721,-0.188204,-0.0652586,-0.0742955,-0.019335,0.0973487,0.0386987,0.196887,-0.0808491,-0.013056,-0.107551,-0.0197343,-0.0897458
-0.0633724,0.0697162,0.045874,-0.132308,-0.0125692,-0.0697526,0.349191,0.0143475,-0.14405,0.0573575,0.0704745,-0.0583423,0.134514,-0.171774,-0.268729,-0.192195,-0.174414,-0.150645,-0.19068,0.0681866,0.024791,-0.0256565,-0.0828449,-0.116476,0.224361,-0.0730649,-0.283766,-0.201799,0.165288,-0.30559,0.0220558,0.117959,-0.125571,-0.084034,-0.0111401,0.0747539,0.101458,0.0188188,-0.0222562,0.0422242,-0.0116395,0.0998284,-0.00727211,0.000483806,-0.000156656,-0.207639,0.0446618,0.266029,-0.24315,-0.0656087,-0.34511,0.207761,0.0320722,-0.0974651,-0.0964021,-0.014266,0.228001,0.0399858,0.1847,-0.0735165,0.00450312,0.331971,0.0470354,0.0704103,0.117919,-0.189733,0.38288,0.1083,-0.0254835,-0.173487,0.221591,-0.208055,0.188909,-0.140672,0.0955351,-0.0785088,-0.00146296,0.00292708,-0.182073,-0.235815,0.0806149,-0.247833,0.144545,0.0438801,-0.143635,0.0534744,-0.232459,0.278619,-0.26587,-0.121142,0.0164885,-0.324465,-0.278849,0.0126896,0.300586,-0.0577259,-0.00727834,-0.273371,0.168192,-0.361804
-0.112773,-0.182389,0.0100643,0.0710946,-0.039366,-0.180349,-0.0201027,-0.221207,-0.227281,0.041469,0.197577,-0.116743,0.0628668,0.094383,0.257026,-0.257134,-0.134032,-0.425734,-0.247737,0.470331,-0.156085,0.0105153,0.0913668,0.0748032,0.152459,0.103797,-0.260778,0.115222,0.174064,-0.396085,0.195302,0.132188,-0.189136,-0.0700651,0.0590237,-0.016153,0.121573,0.296275,-0.168379,0.313464,-0.205596,0.409273,-0.299189,0.0624417,-0.0812687,0.26428,0.255658,0.272845,-0.39518,-0.153141,-0.409095,0.359629,0.0407324,-0.0912451,0.166877,0.342884,0.214173,0.0410174,0.139492,-0.0422536,-0.0781162,0.205297,0.517775,0.0300943,-0.195492,0.000359154,-0.211017,0.308914,0.160424,-0.111471,0.305549,-0.0624078,-0.28398,-0.100969,-0.183113,-0.271719,0.128746,-0.0920872,-0.0428856,-0.0149382,0.219198,0.0721919,-0.0817904,-0.403299,0.154554,0.125135,-0.181857,-0.0483945,-0.272828,-0.143825,0.0640546,-0.0417439,-0.104711,-0.184328,0.159294,-0.055308,0.0281683,-0.214438,-0.0390676,-0.239378
-0.225038,-0.149809,-0.202244,0.0158687,-0.139796,-0.159031,0.257268,0.329105,-0.0722085,0.0896662,0.225198,-0.146263,0.128615,-0.02673,-0.111814,-0.0123075,-0.127377,-0.0223825,-0.323315,0.452286,0.146016,0.050317,-0.2737,-0.0108207,0.132617,-0.256691,-0.141408,0.0949571,0.289396,0.0974802,-0.0715597,-0.0724999,-0.209684,0.0195725,0.0818927,0.198791,0.0968093,0.21436,0.135775,-0.00699857,-0.169886,0.39752,0.405653,-0.264566,-0.0582955,-0.400167,0.0584577,0.0935691,-0.290428,0.0326468,-0.0560669,0.232453,0.0726725,-0.306895,0.0350108,0.0875827,0.242698,-0.146891,0.434011,-0.0932608,0.364029,0.183641,-0.0302338,-0.182101,0.26551,-0.0928361,0.307962,0.195147,0.00524677,-0.239445,0.373809,0.0591009,0.0751178,0.034926,0.142297,0.156667,0.0939898,0.0226789,-0.0983542,-0.130695,0.132935,-0.178569,0.194214,0.249998,-0.0908646,-0.00436264,-0.258383,0.327662,-0.14531,-7.83528e-05,-0.228543,-0.193906,-0.0498019,0.142063,0.173491,0.153775,-0.303565,0.151187,0.306185,-0.20999
-0.194405,-0.0873078,-0.318988,0.220969,-0.04137,-0.0567277,0.0961553,0.0441488,-0.123549,-0.405169,0.305185,-0.00519951,-0.239108,-0.113781,-0.0267297,-0.357478,-0.0631812,0.00559124,-0.225076,0.420643,0.376881,-0.158626,0.0121334,0.141184,-0.0490602,-0.0944844,-0.0205291,-0.0106926,0.481377,0.0702205,-0.0292656,-0.00835895,0.134326,-0.301414,0.206103,-0.130232,-0.126412,0.293709,0.216839,0.0745586,-0.279773,0.131582,0.0466678,-0.468251,0.192131,0.0472466,0.020625,-0.24649,-0.245106,0.427514,-0.31829,0.026442,-0.069378,-0.153357,-0.0746361,0.148266,0.397788,-0.129178,0.501626,0.0759109,-0.0985774,-0.00365255,-0.0853543,-0.0110664,0.235757,0.32345,0.0371921,0.0762525,0.0262926,-0.466391,0.108028,0.0720107,-0.150743,-0.0387895,-0.0841909,-0.00303625,0.0914285,-0.0565793,-0.459669,-0.0912211,0.251085,-0.0520789,-0.150275,0.113711,-0.186609,-0.20283,-0.156378,0.340299,-0.345365,0.0747955,0.210271,-0.0352411,0.0801873,0.0465502,-0.0564684,0.19045,-0.3369,0.254681,0.105039,-0.1469
-0.385594,-0.223537,-0.589687,-0.214223,-0.362832,-0.430404,0.537216,0.737671,-0.118072,0.194709,0.184864,-0.0461948,0.307238,-0.153701,-0.0568609,-0.195524,-0.449524,0.00160749,-0.291577,0.295948,0.545097,0.590229,-0.332481,0.0696192,-0.415839,-0.27066,-0.574071,-0.167856,-0.130402,0.682819,-0.0527046,0.275407,-0.453048,-0.0699872,0.0170623,0.365132,-0.386299,0.160154,-0.0564246,-0.107434,-0.413609,0.300056,0.850056,-0.04601,0.222707,-0.0114921,0.164838,0.415203,-0.294713,0.187536,-0.0324484,0.0921537,-0.23483,-0.152381,0.312211,0.307891,0.011294,-0.0851305,0.300332,-0.0556199,0.342629,0.0971193,0.142128,-0.650731,0.296569,-0.0170683,0.00161196,0.101912,-0.214938,-0.243685,0.0224212,0.266922,-0.00427749,0.000947144,-0.155636,0.373267,0.317644,0.0355001,0.265651,0.276324,-0.0219808,0.0672442,0.200542,0.471807,0.0344986,-0.0202369,-0.303343,0.335097,0.0966534,-0.313922,-0.337812,-0.0620166,-0.185626,0.0933087,0.0755286,-0.0661411,-0.815489,-0.170913,0.235029,-0.180461
-0.182342,-0.0579974,-0.449991,0.117976,-0.0954062,0.00742612,0.332724,0.425953,-0.163354,0.123913,-0.0544571,0.0784447,0.105416,0.0775618,-0.0651937,0.00915051,-0.119415,0.0895815,-0.26892,0.382744,0.206684,0.29138,-0.0691396,0.0323274,-0.154046,-0.20856,-0.129703,0.133819,0.266863,0.231505,0.128949,0.276318,-0.0884779,0.0599333,0.180709,0.157847,-0.122973,0.0389607,0.0951961,0.285854,-0.283286,0.278327,0.385646,-0.219923,-0.066476,-0.00975863,0.104856,0.22222,-0.468177,0.0473239,-0.0093125,0.147029,-0.17623,-0.27729,0.0827692,0.132084,0.0399378,-0.178384,0.231157,0.00213214,0.302596,0.0787009,0.119145,-0.369904,0.239195,-0.0149917,0.133751,0.0908859,-0.135389,-0.337356,0.157955,0.0512333,-0.102853,0.0455449,0.0940668,0.129102,0.0995499,-0.0253139,-0.0365975,0.129782,0.058924,-0.0745609,0.0810987,0.326139,-0.0231943,-0.0428666,-0.114714,0.173235,-0.143313,-0.0596502,-0.0618323,-0.131662,0.170804,0.141908,0.147841,0.0422823,-0.4879,-0.0626723,0.227223,-0.327797
-0.438469,-0.0413095,-0.271875,0.179808,0.352608,-0.189415,0.573536,0.172951,-0.508561,0.471962,0.0999073,0.0470175,0.240633,0.0495515,-0.108321,0.0472531,0.033767,-0.0620741,-0.588417,0.203363,0.164913,-0.126611,-0.0199955,-0.0232984,0.236297,-0.160943,0.0141518,-0.0763086,0.330094,-0.466812,-0.00316487,0.0573588,-0.281565,-0.194379,0.0241378,-0.0114897,0.0894352,0.180837,0.175978,0.0595403,-0.125062,0.424445,0.178516,-0.0419205,-0.147356,0.137639,0.266266,0.339838,-0.365407,-0.0789338,-0.348932,0.264597,-0.0168497,-0.276664,-0.00222578,0.297551,0.42243,-0.206162,0.495235,0.0781713,0.177026,0.167616,-0.00894832,-0.0428548,-0.0290473,-0.0146862,-0.0208031,0.320328,0.11384,0.0979903,0.102106,-0.127981,-0.206419,-0.0498477,-0.0838004,-0.160797,-0.106502,0.127224,-0.0957317,-0.323394,-0.0134163,-0.122785,-0.0181023,0.155277,-0.0428896,-0.00266042,-0.161857,0.210285,-0.415539,-0.293838,-0.125177,-0.364229,-0.146247,-0.238523,0.240613,0.153216,-0.286534,-0.0328544,0.0645675,-0.278274





In [15]:
print("Prepare training&validation data (keep only job titles made of known words)")
# A title with an NA first component has no usable vector, so it is excluded
# from both the labels and the features before they are joined column-wise.
valid_job_titles = ~ job_title_vecs["C1"].isna()
data = job_titles[valid_job_titles,:].cbind(job_title_vecs[valid_job_titles,:])
# Fixed seed: without it the 80/20 split (and every metric derived from it)
# changes on each run, so results cannot be reproduced.
data_split = data.split_frame(ratios=[0.8], seed=1234)
print(data)

Prepare training&validation data (keep only job titles made of known words)


category,jobtitle,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C30,C31,C32,C33,C34,C35,C36,C37,C38,C39,C40,C41,C42,C43,C44,C45,C46,C47,C48,C49,C50,C51,C52,C53,C54,C55,C56,C57,C58,C59,C60,C61,C62,C63,C64,C65,C66,C67,C68,C69,C70,C71,C72,C73,C74,C75,C76,C77,C78,C79,C80,C81,C82,C83,C84,C85,C86,C87,C88,C89,C90,C91,C92,C93,C94,C95,C96,C97,C98,C99,C100
education,After School Supervisor,-0.251584,0.136924,-0.255622,0.312074,-0.0118113,-0.0419435,0.0466443,-0.22886,-0.303602,-0.0472739,-0.0213124,-0.0703419,0.0531935,-0.408683,-0.0789673,-0.094132,-0.13789,-0.163174,-0.3242,0.276175,0.304799,-0.309087,-0.131799,-0.187872,0.371162,-0.158022,0.101008,-0.062833,0.300701,-0.372577,0.041656,-0.233302,-0.0720808,-0.320965,0.0344183,-0.0503962,0.0385672,0.137535,0.10059,-0.0508965,-0.105906,0.0940593,0.335523,0.0797369,0.124125,0.0151864,0.196927,0.214675,-0.263531,-0.135715,-0.178507,0.107794,-0.162345,-0.12102,-0.158771,-0.127631,0.42478,-0.00291389,0.566371,-0.266624,0.124889,-0.0726887,-0.0131716,-0.102916,0.139192,0.183915,0.116067,0.0347701,-0.0838,0.0197136,0.169457,-0.0313154,0.00486292,-0.106337,-0.185604,-0.457041,-0.0265876,0.202438,-0.268457,-0.346918,0.38533,0.00512209,0.253923,0.413862,0.0142401,0.172131,-0.36137,0.114691,-0.470486,-0.0347269,-0.0808012,-0.160032,0.0217447,-0.143188,0.0275136,-0.147386,-0.41983,-0.133007,0.112142,-0.420146
education,"*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGES*****",-0.385594,-0.223537,-0.589687,-0.214223,-0.362832,-0.430404,0.537216,0.737671,-0.118072,0.194709,0.184864,-0.0461948,0.307238,-0.153701,-0.0568609,-0.195524,-0.449524,0.00160749,-0.291577,0.295948,0.545097,0.590229,-0.332481,0.0696192,-0.415839,-0.27066,-0.574071,-0.167856,-0.130402,0.682819,-0.0527046,0.275407,-0.453048,-0.0699872,0.0170623,0.365132,-0.386299,0.160154,-0.0564246,-0.107434,-0.413609,0.300056,0.850056,-0.04601,0.222707,-0.0114921,0.164838,0.415203,-0.294713,0.187536,-0.0324484,0.0921537,-0.23483,-0.152381,0.312211,0.307891,0.011294,-0.0851305,0.300332,-0.0556199,0.342629,0.0971193,0.142128,-0.650731,0.296569,-0.0170683,0.00161196,0.101912,-0.214938,-0.243685,0.0224212,0.266922,-0.00427749,0.000947144,-0.155636,0.373267,0.317644,0.0355001,0.265651,0.276324,-0.0219808,0.0672442,0.200542,0.471807,0.0344986,-0.0202369,-0.303343,0.335097,0.0966534,-0.313922,-0.337812,-0.0620166,-0.185626,0.0933087,0.0755286,-0.0661411,-0.815489,-0.170913,0.235029,-0.180461
education,Bay Area Family Recruiter,-0.0954117,-0.0702975,-0.186445,-0.00935915,0.0431085,0.090021,0.119256,0.0611851,-0.295891,0.231734,-0.170702,0.13069,0.277911,0.222596,-0.302772,0.0669213,0.158227,0.118516,0.0563594,0.154408,0.0242654,0.308068,0.0789735,0.0111396,0.00101944,-0.0425005,0.110333,0.080065,0.255814,-0.0254293,0.36138,0.276812,-0.0964162,0.129942,0.130713,-0.184794,0.0156939,-0.0654218,0.0128636,0.47527,-0.226724,0.314848,-0.0730507,-0.216765,-0.0707427,-0.00603081,0.0699789,-0.14366,-0.364797,-0.0559318,-0.129343,0.197252,-0.107991,-0.11598,0.249636,0.127396,0.0745447,-0.207772,0.0444609,-0.092851,0.116533,-0.00733192,0.208556,-0.187216,0.120188,0.0866311,0.244091,0.146689,0.111703,-0.334957,0.17488,-0.0725898,-0.223959,-0.012875,0.229429,-0.140747,0.0467826,-0.20798,-0.0667032,0.207997,-0.106177,-0.101408,0.0615601,0.151133,-0.254149,0.15218,-0.208574,0.152721,-0.188204,-0.0652586,-0.0742955,-0.019335,0.0973487,0.0386987,0.196887,-0.0808491,-0.013056,-0.107551,-0.0197343,-0.0897458
education,Adult Day Programs/Community Access/Job Coaches,-0.0633724,0.0697162,0.045874,-0.132308,-0.0125692,-0.0697526,0.349191,0.0143475,-0.14405,0.0573575,0.0704745,-0.0583423,0.134514,-0.171774,-0.268729,-0.192195,-0.174414,-0.150645,-0.19068,0.0681866,0.024791,-0.0256565,-0.0828449,-0.116476,0.224361,-0.0730649,-0.283766,-0.201799,0.165288,-0.30559,0.0220558,0.117959,-0.125571,-0.084034,-0.0111401,0.0747539,0.101458,0.0188188,-0.0222562,0.0422242,-0.0116395,0.0998284,-0.00727211,0.000483806,-0.000156656,-0.207639,0.0446618,0.266029,-0.24315,-0.0656087,-0.34511,0.207761,0.0320722,-0.0974651,-0.0964021,-0.014266,0.228001,0.0399858,0.1847,-0.0735165,0.00450312,0.331971,0.0470354,0.0704103,0.117919,-0.189733,0.38288,0.1083,-0.0254835,-0.173487,0.221591,-0.208055,0.188909,-0.140672,0.0955351,-0.0785088,-0.00146296,0.00292708,-0.182073,-0.235815,0.0806149,-0.247833,0.144545,0.0438801,-0.143635,0.0534744,-0.232459,0.278619,-0.26587,-0.121142,0.0164885,-0.324465,-0.278849,0.0126896,0.300586,-0.0577259,-0.00727834,-0.273371,0.168192,-0.361804
education,General Counselor - Non Tenure track,-0.112773,-0.182389,0.0100643,0.0710946,-0.039366,-0.180349,-0.0201027,-0.221207,-0.227281,0.041469,0.197577,-0.116743,0.0628668,0.094383,0.257026,-0.257134,-0.134032,-0.425734,-0.247737,0.470331,-0.156085,0.0105153,0.0913668,0.0748032,0.152459,0.103797,-0.260778,0.115222,0.174064,-0.396085,0.195302,0.132188,-0.189136,-0.0700651,0.0590237,-0.016153,0.121573,0.296275,-0.168379,0.313464,-0.205596,0.409273,-0.299189,0.0624417,-0.0812687,0.26428,0.255658,0.272845,-0.39518,-0.153141,-0.409095,0.359629,0.0407324,-0.0912451,0.166877,0.342884,0.214173,0.0410174,0.139492,-0.0422536,-0.0781162,0.205297,0.517775,0.0300943,-0.195492,0.000359154,-0.211017,0.308914,0.160424,-0.111471,0.305549,-0.0624078,-0.28398,-0.100969,-0.183113,-0.271719,0.128746,-0.0920872,-0.0428856,-0.0149382,0.219198,0.0721919,-0.0817904,-0.403299,0.154554,0.125135,-0.181857,-0.0483945,-0.272828,-0.143825,0.0640546,-0.0417439,-0.104711,-0.184328,0.159294,-0.055308,0.0281683,-0.214438,-0.0390676,-0.239378
education,Part-Time Summer Math Teachers/Tutors,-0.225038,-0.149809,-0.202244,0.0158687,-0.139796,-0.159031,0.257268,0.329105,-0.0722085,0.0896662,0.225198,-0.146263,0.128615,-0.02673,-0.111814,-0.0123075,-0.127377,-0.0223825,-0.323315,0.452286,0.146016,0.050317,-0.2737,-0.0108207,0.132617,-0.256691,-0.141408,0.0949571,0.289396,0.0974802,-0.0715597,-0.0724999,-0.209684,0.0195725,0.0818927,0.198791,0.0968093,0.21436,0.135775,-0.00699857,-0.169886,0.39752,0.405653,-0.264566,-0.0582955,-0.400167,0.0584577,0.0935691,-0.290428,0.0326468,-0.0560669,0.232453,0.0726725,-0.306895,0.0350108,0.0875827,0.242698,-0.146891,0.434011,-0.0932608,0.364029,0.183641,-0.0302338,-0.182101,0.26551,-0.0928361,0.307962,0.195147,0.00524677,-0.239445,0.373809,0.0591009,0.0751178,0.034926,0.142297,0.156667,0.0939898,0.0226789,-0.0983542,-0.130695,0.132935,-0.178569,0.194214,0.249998,-0.0908646,-0.00436264,-0.258383,0.327662,-0.14531,-7.83528e-05,-0.228543,-0.193906,-0.0498019,0.142063,0.173491,0.153775,-0.303565,0.151187,0.306185,-0.20999
education,Preschool Teacher (temp-to-hire),-0.194405,-0.0873078,-0.318988,0.220969,-0.04137,-0.0567277,0.0961553,0.0441488,-0.123549,-0.405169,0.305185,-0.00519951,-0.239108,-0.113781,-0.0267297,-0.357478,-0.0631812,0.00559124,-0.225076,0.420643,0.376881,-0.158626,0.0121334,0.141184,-0.0490602,-0.0944844,-0.0205291,-0.0106926,0.481377,0.0702205,-0.0292656,-0.00835895,0.134326,-0.301414,0.206103,-0.130232,-0.126412,0.293709,0.216839,0.0745586,-0.279773,0.131582,0.0466678,-0.468251,0.192131,0.0472466,0.020625,-0.24649,-0.245106,0.427514,-0.31829,0.026442,-0.069378,-0.153357,-0.0746361,0.148266,0.397788,-0.129178,0.501626,0.0759109,-0.0985774,-0.00365255,-0.0853543,-0.0110664,0.235757,0.32345,0.0371921,0.0762525,0.0262926,-0.466391,0.108028,0.0720107,-0.150743,-0.0387895,-0.0841909,-0.00303625,0.0914285,-0.0565793,-0.459669,-0.0912211,0.251085,-0.0520789,-0.150275,0.113711,-0.186609,-0.20283,-0.156378,0.340299,-0.345365,0.0747955,0.210271,-0.0352411,0.0801873,0.0465502,-0.0564684,0.19045,-0.3369,0.254681,0.105039,-0.1469
education,"*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGES*****",-0.385594,-0.223537,-0.589687,-0.214223,-0.362832,-0.430404,0.537216,0.737671,-0.118072,0.194709,0.184864,-0.0461948,0.307238,-0.153701,-0.0568609,-0.195524,-0.449524,0.00160749,-0.291577,0.295948,0.545097,0.590229,-0.332481,0.0696192,-0.415839,-0.27066,-0.574071,-0.167856,-0.130402,0.682819,-0.0527046,0.275407,-0.453048,-0.0699872,0.0170623,0.365132,-0.386299,0.160154,-0.0564246,-0.107434,-0.413609,0.300056,0.850056,-0.04601,0.222707,-0.0114921,0.164838,0.415203,-0.294713,0.187536,-0.0324484,0.0921537,-0.23483,-0.152381,0.312211,0.307891,0.011294,-0.0851305,0.300332,-0.0556199,0.342629,0.0971193,0.142128,-0.650731,0.296569,-0.0170683,0.00161196,0.101912,-0.214938,-0.243685,0.0224212,0.266922,-0.00427749,0.000947144,-0.155636,0.373267,0.317644,0.0355001,0.265651,0.276324,-0.0219808,0.0672442,0.200542,0.471807,0.0344986,-0.0202369,-0.303343,0.335097,0.0966534,-0.313922,-0.337812,-0.0620166,-0.185626,0.0933087,0.0755286,-0.0661411,-0.815489,-0.170913,0.235029,-0.180461
education,Private Teachers and Tutors Needed in the South Bay,-0.182342,-0.0579974,-0.449991,0.117976,-0.0954062,0.00742612,0.332724,0.425953,-0.163354,0.123913,-0.0544571,0.0784447,0.105416,0.0775618,-0.0651937,0.00915051,-0.119415,0.0895815,-0.26892,0.382744,0.206684,0.29138,-0.0691396,0.0323274,-0.154046,-0.20856,-0.129703,0.133819,0.266863,0.231505,0.128949,0.276318,-0.0884779,0.0599333,0.180709,0.157847,-0.122973,0.0389607,0.0951961,0.285854,-0.283286,0.278327,0.385646,-0.219923,-0.066476,-0.00975863,0.104856,0.22222,-0.468177,0.0473239,-0.0093125,0.147029,-0.17623,-0.27729,0.0827692,0.132084,0.0399378,-0.178384,0.231157,0.00213214,0.302596,0.0787009,0.119145,-0.369904,0.239195,-0.0149917,0.133751,0.0908859,-0.135389,-0.337356,0.157955,0.0512333,-0.102853,0.0455449,0.0940668,0.129102,0.0995499,-0.0253139,-0.0365975,0.129782,0.058924,-0.0745609,0.0810987,0.326139,-0.0231943,-0.0428666,-0.114714,0.173235,-0.143313,-0.0596502,-0.0618323,-0.131662,0.170804,0.141908,0.147841,0.0422823,-0.4879,-0.0626723,0.227223,-0.327797
education,Art Therapist at Esther B. Clark School,-0.438469,-0.0413095,-0.271875,0.179808,0.352608,-0.189415,0.573536,0.172951,-0.508561,0.471962,0.0999073,0.0470175,0.240633,0.0495515,-0.108321,0.0472531,0.033767,-0.0620741,-0.588417,0.203363,0.164913,-0.126611,-0.0199955,-0.0232984,0.236297,-0.160943,0.0141518,-0.0763086,0.330094,-0.466812,-0.00316487,0.0573588,-0.281565,-0.194379,0.0241378,-0.0114897,0.0894352,0.180837,0.175978,0.0595403,-0.125062,0.424445,0.178516,-0.0419205,-0.147356,0.137639,0.266266,0.339838,-0.365407,-0.0789338,-0.348932,0.264597,-0.0168497,-0.276664,-0.00222578,0.297551,0.42243,-0.206162,0.495235,0.0781713,0.177026,0.167616,-0.00894832,-0.0428548,-0.0290473,-0.0146862,-0.0208031,0.320328,0.11384,0.0979903,0.102106,-0.127981,-0.206419,-0.0498477,-0.0838004,-0.160797,-0.106502,0.127224,-0.0957317,-0.323394,-0.0134163,-0.122785,-0.0181023,0.155277,-0.0428896,-0.00266042,-0.161857,0.210285,-0.415539,-0.293838,-0.125177,-0.364229,-0.146247,-0.238523,0.240613,0.153216,-0.286534,-0.0328544,0.0645675,-0.278274





In [11]:
print("Build a basic GBM model")
# Default-parameter GBM: the word2vec vector columns are the predictors and
# the posting category is the response.
gbm_model = H2OGradientBoostingEstimator()
gbm_model.train(
    x=job_title_vecs.names,
    y="category",
    training_frame=data_split[0],
    validation_frame=data_split[1],
)

Build a basic GBM model
gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [12]:
print("Predict!")
# Score a few hand-written titles, one at a time, in the listed order.
for example_title in (
    "school teacher having holidays every month",
    "developer with 3+ Java experience, jumping",
    "Financial accountant CPA preferred",
):
    print(predict([example_title], w2v_model, gbm_model))

Predict!
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


predict,accounting,administrative,customerservice,education,foodbeverage,labor
education,0.00183002,0.0057537,0.0109317,0.968453,0.00442612,0.00860562



None
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


predict,accounting,administrative,customerservice,education,foodbeverage,labor
labor,0.0304531,0.219319,0.260151,0.0279077,0.108545,0.353624



None
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


predict,accounting,administrative,customerservice,education,foodbeverage,labor
accounting,0.898479,0.0769654,0.0102377,0.00387916,0.00472997,0.00570894



None
