In [None]:
%matplotlib inline
import matplotlib.pyplot as plt


import os
import itertools
import json
import numpy as np
import pandas as pd
import pickle
import requests
import seaborn as sns
import collections
from collections import Counter
import scipy


from sklearn.ensemble import RandomForestClassifier
from sklearn import clone
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


import nltk
import gensim


In [None]:
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial import distance
from scipy.cluster.hierarchy import cophenet

In [None]:
#Functions for fitting random forest model
def dropcol_importances(rf, X_train, y_train):
    """Drop-column feature importances based on the out-of-bag score.

    A clone of ``rf`` (with ``random_state`` forced to 999) is fitted on the
    full frame to get a baseline ``oob_score_``; then one more clone is fitted
    per column with that column removed. The importance of a column is the
    baseline score minus the score obtained without it.

    Parameters
    ----------
    rf : estimator that ``sklearn.clone`` can copy and that exposes
        ``oob_score_`` after fitting (i.e. it must be configured to compute it).
    X_train : pandas.DataFrame of training features.
    y_train : training labels passed straight to ``fit``.

    Returns
    -------
    pandas.DataFrame indexed by 'Feature' with a single 'Importance' column,
    sorted from least to most important.
    """
    def _oob_for(frame):
        # Fresh, identically seeded copy for every fit so scores are comparable.
        model = clone(rf)
        model.random_state = 999
        model.fit(frame, y_train)
        return model.oob_score_

    reference = _oob_for(X_train)
    drops = np.array([reference - _oob_for(X_train.drop(name, axis=1))
                      for name in X_train.columns])

    table = pd.DataFrame(data={'Feature': X_train.columns,
                               'Importance': drops})
    return table.set_index('Feature').sort_values('Importance', ascending=True)


def get_predictive_terms(df, bin_data, features, route, n):
    """Find the terms that best separate one route from all the others.

    For each of ``n`` rounds the rows of ``route`` (positives) are paired with
    a random sample of rows from other routes (negatives), a shallow random
    forest is fitted, and drop-column importances are computed for the features
    the forest actually used. Importances are then averaged over the rounds
    (a feature that was unused in a round counts as 0 for that round).

    Parameters
    ----------
    df : pandas.DataFrame with a ``Route`` column, row-aligned with ``bin_data``.
    bin_data : 2-D numpy array of term indicators, one row per row of ``df``.
    features : sequence of column names for ``bin_data``.
    route : value of ``df.Route`` to treat as the positive class.
    n : number of resampling rounds.

    Returns
    -------
    (important, unimportant) : two dicts mapping term -> mean importance.
        ``important`` holds terms with mean > 0, ``unimportant`` those with
        mean exactly 0. Terms with a negative mean appear in neither.
        Both are empty when ``n <= 0``.

    Raises
    ------
    ValueError
        If ``route`` has no rows, or if there are no rows of any other route.
    """
    is_route = np.asarray(df.Route == route)
    positive = bin_data[is_route]
    _negative = bin_data[~is_route]
    size = positive.shape[0]
    print(route, "-->", size)

    if size == 0 or _negative.shape[0] == 0:
        raise ValueError(
            "route %r needs at least one positive and one negative row "
            "(got %d positive, %d negative)" % (route, size, _negative.shape[0]))
    if n <= 0:
        # Nothing to average; pd.concat would fail on an empty list.
        return {}, {}

    # Sampling without replacement cannot draw more rows than exist, so when
    # the route outnumbers all other routes we fall back to using every negative.
    n_negative = min(size, _negative.shape[0])
    y = [1]*size + [0]*n_negative

    importances = []
    for _ in range(n):
        random_idx = np.random.choice(_negative.shape[0], size=n_negative, replace=False)
        negative = _negative[random_idx]
        X = np.vstack([positive, negative])
        rf = RandomForestClassifier(max_depth=3, n_estimators=100, oob_score=True)
        rf.fit(X, y)

        # Only re-fit on columns the forest actually split on; the rest cannot
        # change the score and would make the drop-column loop far slower.
        X_train = pd.DataFrame(X, columns=features)
        X_used = X_train.loc[:, rf.feature_importances_ != 0.0]

        importances.append(dropcol_importances(rf, X_used, y))

    # Rounds use different feature subsets; missing entries mean "unused" -> 0.
    df_imp = pd.concat(importances, axis=1, sort=False).fillna(0.0)

    keywords_mean = df_imp.mean(axis=1).sort_values(ascending=False)
    return dict(keywords_mean[keywords_mean > 0]), dict(keywords_mean[keywords_mean == 0])

In [None]:
#Functions for identifying terms with highest tf-idf
def top_tfidf_feats(row, features, top_n=25):
    ''' Get the top n tf-idf values in row and return them with their
        corresponding feature names.

        row      -- 1-D array of tf-idf scores, one per feature.
        features -- sequence of feature names, indexable by position in row.
        top_n    -- maximum number of features to return.

        Returns a DataFrame with columns 'feature' and 'tfidf', ordered from
        the highest score down. An empty row (or top_n=0) yields an empty
        DataFrame that still has both columns. '''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor (instead of assigning df.columns
    # afterwards) keeps this working when there are no rows at all.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important
        amongst the documents in the rows identified by indices in grp_ids.

        Xtr       -- sparse document-term matrix (must support .toarray()).
        features  -- feature names, one per column of Xtr.
        grp_ids   -- row index / indices to restrict to; None or an empty
                     selection means "use every row". May be a list or a
                     numpy array.
        min_tfidf -- scores below this are treated as 0 before averaging.
        top_n     -- number of features to return.

        Returns whatever top_tfidf_feats returns for the column means. '''
    # np.size works for lists, arrays and scalars alike; a plain truth test
    # raises on numpy arrays and wrongly treats row index 0 as "no selection".
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # D is a fresh dense copy, so thresholding in place does not touch Xtr.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def get_top_words(desc, feature_names, vect, n = 25):
    ''' Vectorise the documents in desc with the fitted vectoriser vect and
        return only the names of the n features with the highest mean score
        (the 'feature' column of top_mean_feats, as an array). '''
    doc_term = vect.transform(desc)
    ranked = top_mean_feats(doc_term, feature_names, grp_ids = None, top_n = n)
    return ranked['feature'].values

In [None]:
def get_top_words_weights(desc, feature_names, vect, n = 25):
    ''' Vectorise the documents in desc with the fitted vectoriser vect and
        return the full result of top_mean_feats for the n best features
        (names together with their scores). '''
    return top_mean_feats(vect.transform(desc), feature_names,
                          grp_ids = None, top_n = n)

In [None]:
# Directory holding the notebook's input/output CSV files. The default is the
# original author's local folder; set the ESCOE_OUTPUT_DIR environment variable
# to run the notebook on another machine without editing this cell.
output_dir = os.environ.get('ESCOE_OUTPUT_DIR', '/Users/jdjumalieva/ESCoE/outputs/')

In [None]:
# Directory holding lookup resources (the word2vec file is loaded from here).
# The default is the original author's local folder; set the ESCOE_LOOKUP_DIR
# environment variable to point somewhere else without editing this cell.
lookup_dir = os.environ.get('ESCOE_LOOKUP_DIR', '/Users/jdjumalieva/ESCoE/lookups/')

In [None]:
# Load the word vectors stored in word2vec text format under lookup_dir.
model = gensim.models.KeyedVectors.load_word2vec_format(
    os.path.join(lookup_dir, 'word2vec_output.txt')
)

In [None]:
# Read the table this notebook works on from output_dir.
df_api = pd.read_csv(
    os.path.join(output_dir, 'df_api.csv'),
    encoding='utf-8',
)

In [None]:
# Preview the first rows of the loaded table.
df_api.head()

In [None]:
# List the column names available in df_api.
df_api.columns

In [None]:
# Show the first 50 characters of 'clean_desc' for the second row.
df_api.iloc[1]['clean_desc'][:50]

## Extracting important terms that describe routes

In [None]:
# Corpus-specific noise tokens (single letters, stray punctuation, URL
# fragments, ...) that are filtered out in addition to the English stop words.
new_stopwords = [
    'g', 'e g', 'uk', 'org', '-', '–', 'le', 'kpis', 'anti', 'client ’',
    'l', '’ need', 'b', 'k', 'd', '”', '“', 'customer ’', "'s",
    '‘', 'v', 'h', 'ass', 'http', 'http www', 'www', 'c', 'ac',
    'skill –', 'h s', 'nh', 'customers ’', 'process e',
    's requirement', 's degree', "'", "organisation 's", 'level',
    'degree', 'de', '·', 'companies ’', 'e', '•', '’',
]

# NLTK's English stop-word list followed by the custom additions.
stopwords = nltk.corpus.stopwords.words('english')
stopwords += new_stopwords

In [None]:
# Text to tokenise: the 'clean_desc' column of df_api.
textfortoken= df_api['clean_desc']

In [None]:
# Split every description into word tokens.
tokens = [nltk.word_tokenize(elem) for elem in textfortoken]
# Part-of-speech tag all token lists in one batch. pos_tag_sents returns the
# same list of [(word, tag), ...] lists as calling nltk.pos_tag on each
# document, but sets the tagger up once instead of once per document.
tags = nltk.pos_tag_sents(tokens)

In [None]:
# Keep only the words tagged NN or NNP (nouns); everything else is dropped.
select = [
    [word for word, pos in elem if pos in ('NN', 'NNP')]
    for elem in tags
]

In [None]:
# Sanity check: collect the noun lists that still contain the word 'delivers'.
test = list(filter(lambda nouns: 'delivers' in nouns, select))

In [None]:
# Spot-check the tag NLTK assigns to a single word given without context.
nltk.pos_tag(['efficient'])

In [None]:
# Inspect the noun lists kept for the first five descriptions.
select[:5]

In [None]:
# Number of noun lists (one per tagged description).
len(select)

In [None]:
# Store the noun-only text of every description as a space-separated string.
df_api['pruned'] = list(map(' '.join, select))

In [None]:
# Preview the noun-only text for the first five rows.
df_api['pruned'][:5]

In [None]:
# From here on the noun-only text is the corpus.
textfortoken = df_api['pruned']

# NOTE(review): the original cell passed `tokenizer=tokenize`, but no `tokenize`
# helper is defined or imported anywhere in this notebook, so it failed with a
# NameError on a fresh kernel. A previously defined helper is still honoured;
# otherwise we fall back to nltk.word_tokenize, the tokenizer used earlier in
# this notebook to split the descriptions -- TODO confirm this matches the
# behaviour of the lost helper.
tfidf_tokenizer = globals().get('tokenize', nltk.word_tokenize)

# Unigrams and bigrams; terms in more than 40% of documents or in fewer than
# two documents are discarded. Only the custom stop words are removed here.
tfidf = TfidfVectorizer(tokenizer=tfidf_tokenizer,
                        stop_words=new_stopwords,
                        ngram_range=(1,2),
                        max_df = 0.4,
                        min_df = 2)
tfidfm = tfidf.fit_transform(textfortoken)

# scikit-learn renamed get_feature_names() to get_feature_names_out() and
# removed the old name in 1.2; use whichever this installation provides.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()


In [None]:
# For every route, collect the 100 terms with the highest mean tf-idf score
# across that route's descriptions, printing them as we go.
top_terms_dict = {}
for name, group in df_api.groupby('Route'):
    top_terms = get_top_words(group['pruned'], feature_names, tfidf, n=100)
    top_terms_dict[name] = top_terms
    print(name, top_terms)
    print('**************************************')

In [None]:
# One row per route (the dict keys), one column per rank position of its terms.
top_terms_df = pd.DataFrame.from_dict(top_terms_dict, orient = 'index')

In [None]:
# Preview the per-route top-terms table.
top_terms_df.head()

In [None]:
# Save the per-route top terms as 'top_terms_routes.csv' in output_dir.
top_terms_df.to_csv(os.path.join(output_dir, 'top_terms_routes.csv'), encoding = 'utf-8')

In [None]:
# Top 50 terms, with their mean tf-idf weights, for the 'Digital' route only.
for name, group in df_api.groupby('Route'):
    if name != 'Digital':
        continue
    top_terms_weights = get_top_words_weights(group['pruned'], feature_names, tfidf, n=50)

In [None]:
# Show the five highest-weighted terms.
top_terms_weights.sort_values(by = 'tfidf', ascending = False).head()

In [None]:
# Map each top term to its tf-idf weight.
top_words = {
    row['feature']: row['tfidf']
    for _, row in top_terms_weights.iterrows()
}

In [None]:
# Look up the weight of one term; raises KeyError if 'vulnerability' is not
# among the terms stored in top_words.
top_words['vulnerability']

In [None]:
# Number of distinct terms stored in top_words.
len(top_words)

### Fitting the model and calculating, for every standard, the weighted sum of the terms predictive of a given route

In [None]:
# Binary bag-of-words: each entry only records whether a term occurs in a
# document, after removing the combined stop-word list.
bin_vec = CountVectorizer(binary=True, stop_words = stopwords)
# Dense array with one row per document in textfortoken.
bin_data = bin_vec.fit_transform(textfortoken).toarray()


In [None]:
# For every route: find its predictive terms, score every description by the
# importance-weighted (log-damped) count of those terms, then plot and save
# how that score is distributed across routes for approved standards.

# scikit-learn renamed get_feature_names() to get_feature_names_out() and
# removed the old name in 1.2; use whichever this installation provides.
if hasattr(bin_vec, 'get_feature_names_out'):
    features = bin_vec.get_feature_names_out()
else:
    features = bin_vec.get_feature_names()

# NOTE(review): `stop_pc` is used in the output file names below but is never
# defined in this notebook, so the cell raised NameError on a fresh kernel.
# A value defined earlier in the session is kept; otherwise fall back to 0 so
# the cell runs -- TODO confirm the intended value.
stop_pc = globals().get('stop_pc', 0)

importances = {}
scores = {}
out_data = {}
unimportances = {}

# Length in characters of every description; it does not depend on the route,
# so it is computed once instead of on every iteration.
corpus_length = np.array([len(doc) for doc in textfortoken])

# Sorted (rather than iterating a set) so routes are processed in the same
# order on every run.
for route in sorted(df_api.Route.dropna().unique()):
    print(route)
    keywords, keywords_unimportant = get_predictive_terms(df_api, bin_data, features, route, n=50)

    importances[route] = keywords
    unimportances[route] = keywords_unimportant
    if not keywords:
        # CountVectorizer refuses an empty vocabulary; nothing to score.
        print(route, "--> no predictive terms found, skipping")
        continue

    count_vec = CountVectorizer(vocabulary=list(keywords), stop_words=new_stopwords)
    count_data = count_vec.fit_transform(textfortoken).toarray()

    # Importance of each vocabulary term, laid out in count_data's column order.
    term_weights = np.zeros(count_data.shape[1])
    for term, column in count_vec.vocabulary_.items():
        term_weights[column] = keywords[term]

    # Per document: sum over terms of log(count + 1) * importance. Terms that
    # do not occur contribute log(1) = 0, so no explicit filtering is needed.
    weighted_count = np.log(count_data + 1) @ term_weights

    digitalness = weighted_count/corpus_length

    _df = df_api.copy()
    _df[route] = digitalness

    approved = _df.Status == "Approved for delivery"

    # Median score per route, used to order the box plot from high to low.
    score = {group: grouped[route].median()
             for group, grouped in _df[approved].groupby("Route")}
    order = [group for group, _ in Counter(score).most_common()]
    scores[route] = score

    fig, ax = plt.subplots(figsize=(10,6))
    ax = sns.boxplot(x=route, y="Route",
                     data=_df[approved],
                     orient="h", order=order,
                     palette="Paired", ax=ax)

    slug = route.replace(' ','-').replace(',','').lower()
    ax.set_xlabel(f"'{slug}ness' of apprenticeship standard description")
    ax.set_ylabel("Apprenticeship standard route")
    ax.set_title(route)
    plt.savefig(f"{route}-{int(stop_pc*100)}.png", bbox_inches = "tight")

    out_data[route] = _df
    out_data[route].sort_values(route, ascending=False)[["Title", "Route", route]].to_csv(
        f"{slug}ness-{int(stop_pc*100)}.csv", index=False)

print()
for route, imps in importances.items():
    print(route, "-->", [k for k, v in Counter(imps).most_common(20)])
    print()

# NOTE(review): the original printed an undefined `stop_words`; the stop-word
# list built in this notebook is called `stopwords` -- TODO confirm.
print([out_data, scores, importances, unimportances, stopwords])
        
        

In [None]:
# Re-draw the box plot for the 'Digital' score. The frame and route ordering
# are taken from the per-route results stored by the scoring loop rather than
# from whatever `_df`, `order` and `route` the loop happened to leave behind.
digital_df = out_data['Digital']
digital_order = [group for group, _ in Counter(scores['Digital']).most_common()]

fig, ax = plt.subplots(figsize=(10,6))
ax = sns.boxplot(x='Digital', y="Route",
                 data=digital_df[digital_df.Status == "Approved for delivery"],
                 orient="h", order=digital_order,
                 palette="Paired", ax=ax)

ax.set_title('Digital')
