In [9]:
%matplotlib inline

In [305]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import re

from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 
import string

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

#from sklearn.model_selection import test_tr

In [2]:
divcorpus = pd.read_csv('diversity_corpus.csv')

In [86]:
divcorpus.head()

masc_bow = divcorpus['genderSpecific']
neutral_bow = divcorpus['neutralEquality']

# format bow to list
masc_bow = masc_bow[:-2]
masc_bow = [i.lower() for i in list(masc_bow)]
neutral_bow = [i.lower() for i in list(neutral_bow)]

In [None]:
# Train, test, split


In [36]:
# stem words from job descript to determine whether they include words in neutral/masc words

porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()
porter.stem('having')

'have'

In [76]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.tokens = None
    def __call__(self, articles):
        self.tokens = [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
        self.tokens = [i.lower() for i in self.tokens if i not in string.punctuation]
        self.tokens = [i for i in self.tokens if i not in string.ascii_letters]
        return self.tokens

In [77]:
tf = TfidfVectorizer(stop_words='english', 
                     tokenizer=LemmaTokenizer(), 
                     strip_accents = 'unicode',
                     lowercase = True
                    )
#replace punctuation
# df['TEXT'] = df['TEXT'].str.replace('[^\w\s]', '')

tf_mat = tf.fit_transform(test)
vocab = tf.get_feature_names()

In [93]:
vocab[20:25]

['adding', 'additional', 'aid', 'aim', 'algorithm']

### Identify freq that a word appears in a document, categorize at masc/fem


In [159]:
# download job descriptions

jds = pd.read_csv('jds.csv')
jds.head()

Unnamed: 0,title,company,description
0,Software Engineer,SeatGeek,"The Overview\n\nWe're looking for smart, curio..."
1,Software Development Engineer,Amazon,Job Description\nAre you looking for an exciti...
2,Software Engineer,CALA,"CALA is building apparel manufacturing, invent..."
3,Software Engineer,Greenhouse Software,Greenhouse is looking for a Software Engineer ...
4,Software Engineer,Macquarie Group,Bring your experience in Software Engineering ...


In [176]:
jds.iloc[75]['description']

'Learn and work on meaningful initiatives with some of the best and brightest in the market research industry. The NPD Group provides the world’s most successful brands with leading market research, combining consumer and retail point-of-sale data with analytic solutions to interpret today’s market trends while anticipating tomorrow’s. In addition, we offer a career filled with innovation and growth to the forward-thinking problem solvers who join our team. Position Overview Work on the latest technologies influencing the innovation of products in a highly collaborative environment. Checkout is a revolutionary marketing research platform that captures consumer purchases from smartphones and transforms the data into information products for our many clients. From millions of consumers we mine hundreds of millions of receipts - yielding transaction-level detail across all retailers, all channels, and over time. Join this startup business with the financial backing of one of the marketing

In [160]:
jds = jds.astype({'description': 'str'})

In [143]:
jds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 3 columns):
title          145 non-null object
company        145 non-null object
description    145 non-null object
dtypes: object(3)
memory usage: 3.5+ KB


In [161]:
for i in jds['description']:
    i = i.splitlines()
    print(i)
    i = ' '.join(i)
    print(type(i))
    break

['The Overview', '', "We're looking for smart, curious engineers who want to help millions of people experience the thrill of live entertainment. The ticketing industry is messy, complex and ultimately makes attending live events more difficult than it should be; we believe a thoughtful application of software can fix that.", '', 'SeatGeek is a product-centric organization that deploys code into production multiple times per day. Our application is built in the microservice architectural style and we strive to use the best language for the task at hand. As we grow, the scale of our technical challenges - and their impact on our customers - continues to increase. We want to make sure we continue to build the best product we can.', '', "What We're Looking For", "Experience building web applications, especially within a microservices architecture. We'll be most interested in hearing about what you've built", "Experience solving complex technical challenges. SeatGeek engineers have had to 

In [339]:
# create article matrix
equality_mat = pd.DataFrame(columns =['articleID', 'num_neutrWords', 'num_specWords', 'neutroSpecRatio', 'score', 'label', 'neutrWords', 'specWords'])

corpus = jds['description'][:]


In [340]:
# identify freq of terms that are masc and fem

for i, article in enumerate(corpus):
    article_bow = lt.__call__(article)
    count_n = 0
    count_m = 0
    neutrWords = []
    specWords = []
    for j in article_bow:
        if j in neutral_bow:
            count_n += 1
            neutrWords.append(j)
        if j in masc_bow:
            count_m += 1
            specWords.append(j)
    df = pd.DataFrame({'articleID': [int(i)], 
              'num_neutrWords': [count_n], 
              'num_specWords': [count_m], 
              'neutrSpecRatio': [(1 + count_n)/(1 + count_m)], 
              'score': [(1 + count_n)/(1 + count_m) - 1],
              'label': 0,
              'neutrWords': [neutrWords],
              'specWords': [specWords]} # = equal/neutral
             )
    
    equality_mat = pd.concat([equality_mat, df])
    #equality_mat.astype({'articleID': 'int'})
    #equality_mat.set_index('articleID', inplace=True)

equality_mat['label'][equality_mat['score'] < 0] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [341]:
equality_mat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145 entries, 0 to 0
Data columns (total 9 columns):
articleID          145 non-null object
label              145 non-null object
neutrSpecRatio     145 non-null float64
neutrWords         145 non-null object
neutroSpecRatio    0 non-null object
num_neutrWords     145 non-null object
num_specWords      145 non-null object
score              145 non-null float64
specWords          145 non-null object
dtypes: float64(2), object(7)
memory usage: 11.3+ KB


In [198]:
jds_bow = []
for article in jds['description']:
    article_bow = lt.__call__(article)
    jds_bow.append(article_bow)

jds_bow = np.array(jds_bow)

In [None]:
# identify most related 

In [342]:
equality_mat.head()

Unnamed: 0,articleID,label,neutrSpecRatio,neutrWords,neutroSpecRatio,num_neutrWords,num_specWords,score,specWords
0,0,1,0.5,[],,0,1,-0.5,[opinion]
0,1,0,4.0,"[support, sexual, orientation]",,3,0,3.0,[]
0,2,0,1.5,"[support, together]",,2,1,0.5,[decision]
0,3,0,2.0,[connect],,1,0,1.0,[]
0,4,0,1.75,"[understand, together, support, sexual, orient...",,6,3,0.75,"[principle, individual, individual]"


In [271]:
map(lambda x: int(x), equality_mat['articleID'])
#equality_mat['articleID']

<map at 0x1219ea860>

In [353]:
labels = pd.DataFrame(data=equality_mat['label'])
labels.reset_index(inplace=True)
labels.drop('index', axis=1, inplace=True)
labels = np.array(labels).ravel()
labels = labels.astype('int')

#Create train, test labels
labels_train = labels[:100]
labels_test = labels[100:]

X = np.array(jds['description'][:100])
#X_train, X_test, y_train, y_test

In [354]:
labels_train

array([1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1])

In [355]:
# Change job descriptions to bag of words
tfidf = TfidfVectorizer(stop_words='english', tokenizer=LemmaTokenizer())
tfidf_mat = tfidf.fit_transform(X, labels_train)

In [356]:
X_test = jds[100:]['description']

X_test = [article for article in X_test]

In [357]:
#Transform new documents

count_vect = CountVectorizer()
#X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test)

In [359]:
# Predict whether article is masc or neutral
# Naive Bayes
clf = BernoulliNB().fit(tfidf_mat.todense(), labels_train)
nb_preds = clf.predict(X_test_tfidf)
nb_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [362]:
print('Accuracy: ', (nb_preds == labels_test).sum()/len(nb_preds))

Accuracy:  0.644444444444


In [372]:
# SVM model

clf_svm = SGDClassifier()

clf_svm.fit(tfidf_mat, labels_train)

svm_preds = clf_svm.predict(X_test_tfidf)
svm_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [373]:
print('Accuracy: ', (svm_preds == labels_test).sum()/len(svm_preds))

Accuracy:  0.644444444444


In [337]:
# identify trigger words

equality_mat[['neutrWords', 'specWords']]

Unnamed: 0,neutrWords,specWords
0,[],[opinion]
0,"[support, sexual, orientation]",[]
0,"[support, together]",[decision]
0,[connect],[]
0,"[understand, together, support, sexual, orient...","[principle, individual, individual]"
0,"[support, support]","[lead, lead, lead, decision, active]"
0,"[understand, support, understand, sexual, orie...",[]
0,"[support, together, support]","[principle, objective, logic, individual]"
0,"[connect, connect, connect, sexual, orientation]",[]
0,"[support, support, trust]",[active]


In [336]:
# identify most related keywords for each group using SVD/NMF



# Django app

In [None]:
import django