In [1]:
import time
import applicants
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import gensim
from numpy.random import rand
from itertools import combinations
import time
import operator
import re
import os
import topic_model, dataio, d2v_utils

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
#df_users = applicants.get_applicant_data()

In [2]:
import jobs

# Load every job posting, then keep only the rows from position 15000 onward
# as the working subset. reset_index() is called with its defaults, so the
# rows are renumbered from 0 and the previous index is kept as a column.
df_jobs = jobs.get_job_posting_data()[15000:].reset_index()

print('df_jobs shape: %s' % (df_jobs.shape,))

# The posting descriptions are the corpus used by the cells below.
docs = df_jobs.description

Getting job posting data...
- Time: 0.601379871368

df_jobs shape: (9015, 6)


In [3]:
# Hyper-parameters for building the vocabulary and the topic model.
min_df = 50              # forwarded to tm.fit(min_df=...)
max_df = 0.1             # NOTE(review): not passed to tm.fit in the visible cells
k_topics = 18            # forwarded to tm.fit(k_topics=...)
max_vocab_size = 10000   # forwarded to tm.fit(max_vocab_size=...)
ngram_range = [1, 1]     # forwarded to tm.fit and used as the myvec default

In [4]:
# Build an NMF topic model over the job descriptions using the parameters
# defined in the configuration cell.
tm = topic_model.TopicModeller(
    model_type='NMF',
    vectorizer_type='localwise',
)
tm.fit(
    docs,
    max_vocab_size=max_vocab_size,
    min_df=min_df,
    k_topics=k_topics,
    ngram_range=ngram_range,
)

# Doc2vec helper built around the vectorizer the topic model just fitted.
du = d2v_utils.D2V_Utils(tm.vectorizer)

Number of documents to process: 9015

Extracting Vectorizer features...
- Time: 11.074s.

Fitting NMF model with LocalwiseVectorizer features, n_samples=9015 and n_features=10000...
- Time: 3.047s.

Reconstruction mse: 0.000321
Topic 0: program, organization, community, support, development, project, report, relationship, meeting, degree, social, director, resource, partner, management

Topic 1: restaurant, food, dining, wine, chef, guest, hospitality, cuisine, fast, fine, year, beer, menu, paced, dinner

Topic 2: child, teacher, school, teaching, age, preschool, classroom, early, class, student, childhood, curriculum, program, parent, center

Topic 3: customer, sale, retail, store, product, associate, service, business, brand, weekend, motivated, industry, growing, grow, fashion

Topic 4: standard, lift, equipment, procedure, guest, clean, safety, area, cleaning, stand, ensure, item, assist, perform, product

Topic 5: sitter, babysitter, babysitting, hunt, decide, nanny, tap, connecte

In [5]:
# Ask the fitted model for its top topics and their weights, then store both
# results as new columns on the postings table.
top_topics, top_topic_weights = tm.get_top_topics_and_topic_probs()
df_jobs['top_topic'], df_jobs['top_topic_weight'] = top_topics, top_topic_weights

  probs = (topic_weights / topic_weights.sum())


In [7]:
# Pick one posting by its row label and show how the helper tokenizes it.
doc_idx = 100
tokens = du.get_tokenized_docs([docs.loc[doc_idx]])
tokens

Tokenizing docs...
- Time: 0.000716924667358


[[u'madison',
  u'opened',
  u'family',
  u'restaurant',
  u'sacramento',
  u'hard',
  u'miracle',
  u'dining',
  u'kabab',
  u'cook',
  u'ca95841',
  u'line',
  u'bigger',
  u'kitchen']]

In [8]:
# Raw text of the sample posting, for comparison with its tokens.
docs.loc[doc_idx]

"Essy's miracle kabab\n\nOur restaurants are a high energy, fun and family-friendly dining environment for everyone to enjoy. We will make bigger team now\n\nWe are opened restaurant 7 years\n\nWe looking for worked in kitchen\n\nat least 1 year full time work\n\nAnd reliable and hard working as line cook\n\n**Please only text me:**\n\nEssy\n\n(916) 7280314\n\n5207 Madison ave Sacramento CA95841"

In [9]:
# Terms that preProcess() looks for in a document and boosts when found.
titles = [
    'fast',
    'cook',
    'dishwasher',
    'runner',
    'bringing',
]

In [10]:
def preProcess(s, boost_terms=None, repeat=3):
    '''Boost selected terms by appending extra copies of them to a document.

    For every term that occurs in ``s`` (plain substring match), the text
    ``' <term> '`` is appended ``repeat`` times to the end of ``s``.

    Parameters
    ----------
    s : str
        Document text.
    boost_terms : iterable of str, optional
        Terms to look for. Defaults to the notebook-level ``titles`` list,
        which is read at call time.
    repeat : int, optional
        Number of copies appended for each matching term (default 3).

    Returns
    -------
    str
        ``s`` followed by the appended copies; ``s`` unchanged if no term
        matches.
    '''
    if boost_terms is None:
        boost_terms = titles
    for term in boost_terms:
        # The membership test runs against the growing string, so text
        # appended for earlier terms is also searched.
        if term in s:
            s = s + (' ' + term + ' ') * repeat
    return s

class myvec(TfidfVectorizer):
    '''TF-IDF vectorizer whose analyzer drops stop words and lemmatizes or stems terms.

    Only ``max_features``, ``min_df`` and ``ngram_range`` are forwarded to
    ``TfidfVectorizer``; every other option keeps that class's default.
    Stop words and the token pattern come from ``topic_model.get_stop_words()``
    and ``topic_model.get_token_pattern()`` and are applied inside the
    analyzer (see ``build_analyzer``).

    Parameters
    ----------
    inflection_form : str
        ``'lemmer'`` (case-insensitive) normalizes terms with NLTK's
        ``WordNetLemmatizer``; any other value uses ``PorterStemmer``.
    max_features, min_df, ngram_range
        Passed unchanged to ``TfidfVectorizer``. The default for
        ``ngram_range`` is the notebook-level ``ngram_range`` as it was when
        this class definition was executed.

    Reference:
    http://scikit-learn.org/dev/modules/feature_extraction.html#vectorizing-a-large-text-corpus-with-the-hashing-trick
    '''
    def __init__(self, inflection_form='lemmer', max_features=5000,
                                                 min_df=10,
                                                 ngram_range=ngram_range):
        super(myvec, self).__init__(max_features=max_features,
                                    min_df=min_df,
                                    ngram_range=ngram_range)
        # Stored exactly as passed: scikit-learn estimators must not modify
        # constructor parameters, otherwise clone() can reject the object.
        # Case-folding happens in build_analyzer instead.
        self.inflection_form = inflection_form

    def build_analyzer(self):
        '''Return a callable mapping a document to an iterable of normalized terms.

        The parent analyzer produces the candidate terms. Terms are
        de-duplicated per document, exact stop words are removed, and a term is
        kept only if it matches the token pattern at its start and none of its
        whitespace-separated words is a stop word. Surviving terms are then
        lemmatized or stemmed according to ``inflection_form``.
        '''
        analyzer = super(myvec, self).build_analyzer()
        # Built once per analyzer rather than once per document.
        stop_words = frozenset(topic_model.get_stop_words())
        pattern = re.compile(topic_model.get_token_pattern())

        if self.inflection_form.lower() == 'lemmer':
            normalize = WordNetLemmatizer().lemmatize
        else:
            normalize = PorterStemmer().stem

        def analyze(doc):
            # NOTE(review): the set() collapses repeated terms, so in-document
            # term counts are discarded before TF-IDF weighting and term order
            # is arbitrary - confirm this is intended.
            candidates = set(analyzer(doc)) - stop_words
            return (normalize(term) for term in candidates
                    if pattern.match(term)
                    and stop_words.isdisjoint(term.split()))

        return analyze

In [11]:
# Fit the custom vectorizer on the first 1000 descriptions only and report
# how long the fit took (seconds).
vectorizer = myvec()
t1 = time.time()
tfidf_matrix = vectorizer.fit_transform(docs[:1000])
print('- Time: %s' % (time.time() - t1))

- Time: 1.03492379189


In [12]:
# Show the non-zero TF-IDF weights of one document. The matrix was fitted on
# docs[:1000], so the row index must stay below 1000.
doc = doc_idx
feature_names = vectorizer.get_feature_names()
# Column indices of the terms present in this document's row.
feature_index = tfidf_matrix[doc, :].nonzero()[1]
# list() keeps the pairs reusable after the loop on Python 3 as well.
tfidf_scores = list(zip(feature_index, [tfidf_matrix[doc, x] for x in feature_index]))

for i, s in tfidf_scores:
    w = feature_names[i]
    print('%s %s' % (w, s))

# Vocabulary column index of a known term; raises KeyError if the term did
# not make it into the fitted vocabulary.
vectorizer.vocabulary_['busser']

kitchen 0.28550024922102973
dining 0.36363840744357084
cook 0.3074904194290135
restaurant 0.4945824161470926
line 0.27200367901314
year 0.19868883744810542
family 0.2791851077368607
opened 0.4103634959386376
hard 0.2954468026903001


209

In [13]:
# Shape of the fitted IDF vector.
np.shape(vectorizer.idf_)

(1844,)