# Feature Selection: Text (Description of Grant)

**Research Question**
    
Is it possible to predict grant value from description keywords, location-based data and/or other characteristics?

## 0.0 Libraries

In [1]:
%load_ext autoreload
%autoreload 2

#system
import os
import sys
from os.path import join as pj
module_path = os.path.abspath(pj('..','..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# data
import pandas as pd
import numpy as np
np.set_printoptions(precision=4)
from dateutil.relativedelta import relativedelta
import datetime
import random
from operator import itemgetter

# viz
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(font_scale=1.5)
plt.style.use('bmh')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import missingno as msno
from tqdm import tqdm

# configurations
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
import configparser

#stats
import scipy
import statsmodels.api as sm

# utils
from src.d00_utils import print_helper_functions as phf

# ipython
import warnings
warnings.simplefilter('ignore')

# type annotations
from typing import List, Set, Dict, Tuple, Optional
from collections.abc import Iterable

# machine learning
from scratch.deep_learning import Tensor
from sklearn.model_selection import train_test_split

# NLP
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import nltk
nltk.download('wordnet')
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk import PorterStemmer

!python -V

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/marclawson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Python 3.7.4


<Figure size 432x288 with 0 Axes>

In [2]:
# not used in this stub but often useful for finding various files
project_dir = Path().resolve().parents[1]
print(project_dir)

# find .env automagically by walking up directories until it's found, then
# load up the .env entries as environment variables
_ = load_dotenv(find_dotenv())

raw_dir = pj(project_dir, 'data', os.environ.get('RAW_DIR'))
interim_dir = pj(project_dir, 'data', os.environ.get('INTERIM_DIR'))
processed_dir = pj(project_dir, 'data', os.environ.get('PROCESSED_DIR'))

/Users/marclawson/repositories/grantnav_10k_predictor


## 1.0 Word Vectors 

First, we want to see what words or small groups of words appear frequently.  We can compare both count and tfidf vectorizers.

In [3]:
data = pd.read_csv(pj(interim_dir,'grantnav_data.csv'), index_col=0)

In [4]:
pt_definitions = {'CC': 'coordinating conjunction',
'CD': 'cardinal digit',
'DT': 'determiner',
'EX': 'existential there',
'FW': 'foreign word',
'IN': 'preposition/subordinating conjunction',
'JJ': 'This NLTK POS Tag is an adjective (large)',
'JJR': 'adjective, comparative (larger)',
'JJS': 'adjective, superlative (largest)',
'LS': 'list market',
'MD': 'modal (could, will)',
'NN': 'noun, singular (cat, tree)',
'NNS': 'noun plural (desks)',
'NNP': 'proper noun, singular (sarah)',
'NNPS': 'proper noun, plural (indians or americans)',
'PDT': 'predeterminer (all, both, half)',
'POS': "possessive ending (parent\ 's)",
'PRP': 'personal pronoun (hers, herself, him,himself)',
'PRP$': 'possessive pronoun (her, his, mine, my, our )',
'RB': 'adverb (occasionally, swiftly)',
'RBR': 'adverb, comparative (greater)',
'RBS': 'adverb, superlative (biggest)',
'RP': 'particle (about)',
'TO': 'infinite marker (to)',
'UH': 'interjection (goodbye)',
'VB': 'verb (ask)',
'VBG': 'verb gerund (judging)',
'VBD': 'verb past tense (pleaded)',
'VBN': 'verb past participle (reunified)',
'VBP': 'verb, present tense not 3rd person singular(wrap)',
'VBZ': 'verb, present tense with 3rd person singular (bases)',
'WDT': 'wh-determiner (that, what)',
'WP': 'wh- pronoun (who)',
'WRB': 'wh- adverb (how)'}

In [5]:
def create_document_grammars(grantrow: List[Tuple[str, str]], grammar: List[str] = ['NN']) -> List[str]:
    """Returns list of all specified grammar types within text"""
    text = nltk.word_tokenize(grantrow.lower())
    pos_tagged = nltk.pos_tag(text)
    document = []
    for g in grammar:
        grammar_in_doc = [_[0] for _ in \
                    filter(lambda x:x[1].startswith(g), \
                           pos_tagged)]
        document.extend(grammar_in_doc)
    return list(set(document))

In [6]:
documents = []
for row in data['description'].sample(1000):
    documents.append(create_document_grammars(row, grammar=['NN','RBS']))

In [7]:
corpus = [' '.join(_) for _ in documents]

def vectorizer_word_counts(corpus: List[str]) -> pd.DataFrame:

    vec = CountVectorizer(stop_words='english', ngram_range=(1,3))
    vec.fit(corpus)
    vec_mat = vec.transform(corpus)

    def word_frequency_sorter(words, counts):
        sort_result = sorted(zip(words, list(counts)),
                             key=itemgetter(1), reverse=True)
        return sort_result

    def word_counter(mat):
        count = np.array(mat.sum(axis=0))[0]
        return count

    words = vec.get_feature_names()
    _sum = word_counter(vec_mat) 
    df_word_frequency = pd.DataFrame(word_frequency_sorter(
        words, _sum), columns=['word', 'frequency'])
    df_word_frequency.set_index('word', inplace=True)
    return df_word_frequency

In [8]:
phf.print_full(vectorizer_word_counts(corpus).iloc[:10,:])

Unnamed: 0_level_0,frequency
word,Unnamed: 1_level_1
project,288
funding,263
people,182
grant,174
community,169
group,149
costs,126
programme,123
activities,105
children,99


In [9]:
data.shape

(525023, 11)

In [10]:
corpus = data['description']#.sample(10000)

corpus_train, corpus_test = train_test_split(corpus, train_size=0.8, test_size=0.2, shuffle=True)

In [11]:
tvec = TfidfVectorizer(stop_words='english', ngram_range=(1,1), token_pattern='\w+', strip_accents='unicode', max_features=5000)
tvec.fit(corpus_train)
vecmat = tvec.transform(corpus_train)

print("Number of nonzero entries:")
print(vecmat.nnz)
print("Highest count:")
print(vecmat.max())
print("Row means:")
print(vecmat.mean(axis=1))
print("Transform to numpy array format:")
print(vecmat.toarray())

df = pd.DataFrame(tvec.transform(corpus_train).toarray(),
                  columns=tvec.get_feature_names())
df = df.transpose().sort_values(0, ascending=False).transpose()
#df['target'] = data_train.target
df_train = df
display(df)

TfidfVectorizer(max_features=5000, stop_words='english',
                strip_accents='unicode', token_pattern='\\w+')

Number of nonzero entries:
7194440
Highest count:
1.0
Row means:
[[0.0008]
 [0.0008]
 [0.0002]
 ...
 [0.0002]
 [0.0008]
 [0.0004]]
Transform to numpy array format:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


Unnamed: 0,schools,sats,sixth,bodies,colleges,mats,responsible,condition,form,improvements,...,establishment,establishing,established,establish,essex,essentials,essential,especially,esol,zumba
0,0.419304,0.262298,0.26131,0.259912,0.259654,0.258748,0.257941,0.251038,0.248952,0.248049,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420013,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
420014,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
420015,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
420016,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 1.1. Stemming and lemmatizing words

Let's set up some lemmatising and stemming functions to pass as the tokeniser.

#### 1.1.1 2-grams

In [12]:
def lemmatize_stemming(text: str) -> str:
    """Lemmatises str"""
    ps = PorterStemmer()
    return ps.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text: str) -> List[str]:
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [13]:
doc_sample = corpus_train.iloc[10]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))
assert type(doc_sample) == str

original document: 
['A', 'Second', 'World', 'War', 'veteran', 'from', 'Cambridgeshire', 'has', 'been', 'awarded', 'a', 'grant', 'towards', 'travel', 'costs', 'for', 'a', 'commemorative', 'visit', 'to', 'Normandy', 'in', 'June', '2010.', 'Funding', 'has', 'been', 'provided', 'for', 'one', 'Second', 'World', 'War', 'veteran', 'and', 'one', 'carer.']


 tokenized and lemmatized document: 
['second', 'world', 'veteran', 'cambridgeshir', 'award', 'grant', 'travel', 'cost', 'commemor', 'visit', 'normandi', 'june', 'fund', 'provid', 'second', 'world', 'veteran', 'carer']


Now that we have a working tokenizer, we should investigate a tfidf vectoriser. 

In [14]:
corpus = data['description'].sample(10000) # sample first as this could take a long time
corpus_train, corpus_test = train_test_split(corpus, test_size=0.2)

In [15]:
tvec = TfidfVectorizer(stop_words='english', ngram_range=(1,2), token_pattern='\w+', strip_accents='unicode', tokenizer=preprocess)
tvec.fit(corpus_train)
vecmat = tvec.transform(corpus_train)

df = pd.DataFrame(tvec.transform(corpus_train).toarray(),
                  columns=tvec.get_feature_names())
df = df.transpose().sort_values(0, ascending=False).transpose()
display(df)

TfidfVectorizer(ngram_range=(1, 2), stop_words='english',
                strip_accents='unicode', token_pattern='\\w+',
                tokenizer=<function preprocess at 0x7fbd6c486af0>)

Unnamed: 0,flintshir base,enabl flintshir,plan support,inform plan,undertak comprehens,effici review,comprehens effici,chariti undertak,salari inform,offic enabl,...,express fundament,express excitatori,express develop,express enter,express enjoy,express embryon,express effect,express diseas,express differ,zumo children
0,0.251263,0.251263,0.251263,0.251263,0.251263,0.251263,0.251263,0.240302,0.240302,0.232525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7996,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7997,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7998,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df.loc['Mean'] = df.mean()
df.loc['Sum'] = df.sum()

We should check words with the highest mean values to see how the ordered and lem/stem words look. 

In [17]:
columns = [c for c in df.columns if c.startswith('t')]
words_to_check = df[columns].T.sort_values('Mean', ascending=False)
list(words_to_check.index)[:10]

['train',
 'travel',
 'titl',
 'travel cost',
 'teacher',
 'titl covid',
 'travel veteran',
 'time',
 'traine',
 'trip']

In [18]:
#df[word_to_analyse].iloc[:-1].sort_values()

In [19]:
word_to_analyse = 'train'
idx = df[word_to_analyse].iloc[:-1].idxmax()
print(f"index for '{word_to_analyse}' = {idx}")
print()
print(f"ORIGINAL:\n{corpus_train.iloc[idx]}")
print()
ordered_words = df.iloc[idx][df.iloc[idx]>0].sort_values(ascending=False)
#print(f"ORDERED LEM/STEM:\n{' '.join(list(ordered_words.index))}")
#print()
scores = {i: round(v, 2) for i, v in zip(list(ordered_words.index), list(ordered_words.values))}
print(f"ORDERED LEM/STEM:\n{scores}")

index for 'train' = 418

ORIGINAL:
Training Grant (including Trader Training)

ORDERED LEM/STEM:
{'train': 0.42, 'trader train': 0.42, 'includ trader': 0.42, 'trader': 0.42, 'train grant': 0.41, 'grant includ': 0.26, 'includ': 0.2, 'grant': 0.15}


There are some high-scoring groups of words in this example that include probably redundant words (such as 'includ' and 'grant' in 'train grant') and could be replicated elsewhere.  We could try reducing to just 1 n-gram or we could reduce to just nouns.

#### 1.1.2 1-grams (nouns and superlatives)

In [20]:
 pd.DataFrame(data['description'].sample(1000))

Unnamed: 0,description
55783,Dengue shock syndrome (DSS) is the commonest l...
61818,Dance House is involved in a range of outreach...
179624,This school will create a courtyard sensory ga...
164973,This group promote awareness of interest in a...
346490,As requested by the donor
...,...
126172,A Second World War Veteran from Somerset has b...
60538,Funding is required to install new CCTV camera...
27494,Funding under Sport England's COVID-19 Communi...
256247,"Presteigne & Norton Chamber of Trade, Commerce..."


In [21]:
corpus2 = pd.DataFrame(data['description'])
tqdm.pandas()
corpus2['string_nouns'] = corpus2['description'].progress_apply(lambda x: ' '.join(create_document_grammars(x, grammar=['NN','RBS','JJS'])))
corpus2 = corpus2['string_nouns']

100%|██████████| 525023/525023 [13:25<00:00, 651.65it/s] 


In [22]:
corpus2_train, corpus2_test = train_test_split(corpus2, train_size=0.8, test_size=0.2, shuffle=True)

In [23]:
tvec = TfidfVectorizer(stop_words='english', ngram_range=(1,2), token_pattern='\w+', strip_accents='unicode', tokenizer=preprocess, max_features=5000)
tvec.fit(corpus2_train)
vecmat = tvec.transform(corpus2_train)

df_nouns = pd.DataFrame(tvec.transform(corpus2_train).toarray(),
                  columns=tvec.get_feature_names())
df_nouns = df_nouns.transpose().sort_values(0, ascending=False).transpose()
display(df_nouns)

TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english',
                strip_accents='unicode', token_pattern='\\w+',
                tokenizer=<function preprocess at 0x7fbd6c486af0>)

Unnamed: 0,instal vehicl,individu point,vehicl individu,point charg,charg,vehicl,point,instal,individu,plan,...,faith,fairer,fair,failur,factor,fact,facilit,facil villag,facil train,zoom
0,0.368381,0.368381,0.368289,0.368011,0.338802,0.310885,0.30418,0.287229,0.26657,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420013,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
420014,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
420015,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
420016,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
df_nouns.loc['Mean'] = df_nouns.mean()
df_nouns.loc['Sum'] = df_nouns.sum()

We should check words with the highest mean values to see how the ordered and lem/stem words look. 

In [25]:
columns = [c for c in df_nouns.columns if c.startswith('t')]
words_to_check = df_nouns[columns].T.sort_values('Mean', ascending=False)
list(words_to_check.index)[:10]

['train',
 'teacher',
 'trip',
 'train apprenticeship',
 'time',
 'trust',
 'transport',
 'traine',
 'team',
 'travel']

In [28]:
#df_nouns[word_to_analyse].iloc[:-1].sort_values()

In [26]:
word_to_analyse = 'trip'
idx_df = df_nouns[word_to_analyse].iloc[:-1].sort_values(ascending=False)
idx = idx_df[idx_df>0].sample(1).index[0]
print(f"index for '{word_to_analyse}' = {idx}")
print()
print(f"ORIGINAL:\n{corpus2_train.iloc[idx]}")
print()
ordered_words = df_nouns.iloc[idx][df_nouns.iloc[idx]>0].sort_values(ascending=False)
#print(f"ORDERED LEM/STEM:\n{' '.join(list(ordered_words.index))}")
#print()
scores = {i: round(v, 2) for i, v in zip(list(ordered_words.index), list(ordered_words.values))}
print(f"ORDERED LEM/STEM:\n{scores}")

index for 'trip' = 179838

ORIGINAL:
public families methods day sufferers support grant effects conference awareness group trips

ORDERED LEM/STEM:
{'suffer': 0.36, 'awar group': 0.36, 'group trip': 0.33, 'confer': 0.31, 'method': 0.31, 'support grant': 0.3, 'effect': 0.28, 'public': 0.26, 'trip': 0.24, 'awar': 0.24, 'famili': 0.19, 'support': 0.16, 'group': 0.13, 'grant': 0.12}


'parent children' has a higher score than parent and children put together; but how do we account for grouping of words when a user types in an engine?  Do we engineer it in a way that allows for predictions if 'children parent' is typed in.  What about 'parent child'?  There is also the added problem of these nouns appearing together artidificially (there'll be other PS types between nouns, and nouns could move about a sentence.  Perhaps it's better to stick with individual words.

In [49]:
tvec = TfidfVectorizer(stop_words='english', ngram_range=(1,1), token_pattern='\w+', strip_accents='unicode', tokenizer=preprocess, max_features=5000)
tvec.fit(corpus2_train)
vecmat = tvec.transform(corpus2_train)

df_nouns_1n = pd.DataFrame(tvec.transform(corpus2_train).toarray(),
                  columns=tvec.get_feature_names())
df_nouns_1n = df_nouns_1n.transpose().sort_values(0, ascending=False).transpose()
display(df_nouns_1n)

TfidfVectorizer(max_features=5000, stop_words='english',
                strip_accents='unicode', token_pattern='\\w+',
                tokenizer=<function preprocess at 0x7fbd6c486af0>)

Unnamed: 0,charg,vehicl,point,instal,individu,phosphoryl,photoshop,photoreceptor,photographi,photograph,...,falmouth,fall,falkirk,falciparum,faith,fairer,fair,failur,fail,zumba
0,0.500887,0.459614,0.449702,0.424641,0.3941,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420013,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
420014,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
420015,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
420016,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
df_nouns_1n.loc['Mean'] = df_nouns_1n.mean()
df_nouns_1n.loc['Sum'] = df_nouns_1n.sum()

In [51]:
columns = [c for c in df_nouns_1n.columns if c.startswith('e')]
words_to_check = df_nouns_1n[columns].T.sort_values('Mean', ascending=False)
list(words_to_check.index)[:10]

['england',
 'equip',
 'event',
 'emerg',
 'educ',
 'employ',
 'environ',
 'experi',
 'exercis',
 'engag']

In [59]:
word_to_analyse = 'environ'
idx_df = df_nouns_1n[word_to_analyse].iloc[:-1].sort_values(ascending=False)
idx = idx_df[idx_df>0].sample(1).index[0]
print(f"index for '{word_to_analyse}' = {idx}")
print()
print(f"ORIGINAL:\n{corpus2_train.iloc[idx]}")
print()
ordered_words = df_nouns_1n.iloc[idx][df_nouns_1n.iloc[idx]>0].sort_values(ascending=False)
scores = {i: round(v, 2) for i, v in zip(list(ordered_words.index), list(ordered_words.values))}
print(f"ORDERED LEM/STEM:\n{scores}")

index for 'environ' = 228302

ORIGINAL:
area community shrubs pond school project boards environment grounds wider manchester funding children information trees organisation access

ORDERED LEM/STEM:
{'shrub': 0.4, 'pond': 0.35, 'tree': 0.32, 'board': 0.31, 'manchest': 0.31, 'ground': 0.29, 'wider': 0.25, 'inform': 0.23, 'environ': 0.22, 'access': 0.2, 'organis': 0.17, 'area': 0.17, 'children': 0.16, 'school': 0.15, 'commun': 0.12, 'fund': 0.11, 'project': 0.1}


This is looking better.  I can see how a user might type these words into a keyword search bar and how a machine learning model (with limited computational resources) might handle this.

In [53]:
df_nouns_1n_test = pd.DataFrame(tvec.transform(corpus2_test).toarray(),
                  columns=tvec.get_feature_names())
df_nouns_1n_test = df_nouns_1n_test.transpose().sort_values(0, ascending=False).transpose()
display(df_nouns_1n_test)

Unnamed: 0,chicken,climber,hotel,signag,classroom,plant,day,materi,hous,educ,...,fareshar,fareham,fare,fan,famili,falmouth,fall,falkirk,falciparum,zumba
0,0.401735,0.401735,0.355323,0.332787,0.288056,0.258669,0.250242,0.211972,0.20272,0.187869,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105001,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105002,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105003,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
df_nouns_1n_test.loc['Mean'] = df_nouns_1n_test.mean()
df_nouns_1n_test.loc['Sum'] = df_nouns_1n_test.sum()

In [62]:
columns = [c for c in df_nouns_1n_test.columns if c.startswith('c')]
words_to_check = df_nouns_1n_test[columns].T.sort_values('Mean', ascending=False)
list(words_to_check.index)[:10]

['commun',
 'cost',
 'children',
 'capit',
 'club',
 'carer',
 'condit',
 'centr',
 'colleg',
 'core']

In [63]:
word_to_analyse = 'children'
idx_df = df_nouns_1n_test[word_to_analyse].iloc[:-1].sort_values(ascending=False)
idx = idx_df[idx_df>0].sample(1).index[0]
print(f"index for '{word_to_analyse}' = {idx}")
print()
print(f"ORIGINAL:\n{corpus2_test.iloc[idx]}")
print()
ordered_words = df_nouns_1n_test.iloc[idx][df_nouns_1n_test.iloc[idx]>0].sort_values(ascending=False)
scores = {i: round(v, 2) for i, v in zip(list(ordered_words.index), list(ordered_words.values))}
print(f"ORDERED LEM/STEM:\n{scores}")

index for 'children' = 91825

ORIGINAL:
mon project creativity service tennis aim kids workers events machine courses children resources anglesey variety network ynys aid die area workshops cutters grant â£2,461 awards skill order

ORDERED LEM/STEM:
{'cutter': 0.38, 'anglesey': 0.36, 'kid': 0.3, 'machin': 0.28, 'creativ': 0.26, 'tenni': 0.24, 'varieti': 0.23, 'network': 0.21, 'order': 0.21, 'resourc': 0.2, 'worker': 0.2, 'cours': 0.19, 'workshop': 0.17, 'event': 0.16, 'award': 0.15, 'skill': 0.15, 'servic': 0.14, 'area': 0.14, 'children': 0.13, 'grant': 0.1, 'project': 0.08}
