In [1]:
from setup import *
import sys
if DATA_PATH not in sys.path: sys.path.append(DATA_PATH)
from constants import *

In [2]:
%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 12)
pd.set_option('display.max_columns', 200)

Load previously cleaned data

In [6]:
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'))
df.tokens

0         ['python', 'never', 'stop', 'learning', 'what'...
1                       ['Watching', 'Boa', 'vs', 'Python']
2         ['Monty', 'Python', 'The', 'silly', 'walk', 'v...
3         ['Senior', 'Software', 'Engineer', 'Full', 'St...
4         ['Architect', 'Django', 'Solr', 'Platform', 'E...
5           ['peaceful', 'rain', 'Python', 'inevitability']
                                ...                        
183064    ['Las', 'mejores', 'ides', 'para', 'Python', '...
183065    ['Gagal', 'tidur', 'gegara', 'habis', 'vertica...
183066         ['Go', 'boa', 'wkwk', 'Boa', 'vs', 'Python']
183067    ['RT', 'RealPython', 'List', 'of', 'Python', '...
183068                  ['Watching', 'Boa', 'vs', 'Python']
183069    ['Чертова', 'дюжина', 'вакансий', 'в', 'IT', '...
Name: tokens, dtype: object

In [8]:
d = Dictionary.from_documents(([str(s) for s in row]for row in df.tokens))

In [9]:
df.tokens.iloc[0]

"['python', 'never', 'stop', 'learning', 'what', 'you', 'enjoy', 'doing']"

When we said "QUOTE_NONNUMERIC" we didn't mean **ALL** nonnumeric fields ;)

In [10]:
df['tokens'] = df.txt.str.split()
df.tokens

0         [python, never, stop, learning, what, you, enj...
1                               [Watching, Boa, vs, Python]
2           [Monty, Python, The, silly, walk, via, YouTube]
3         [Senior, Software, Engineer, Full, Stack, Pyth...
4         [Architect, Django, Solr, Platform, Engineer, ...
5                   [peaceful, rain, Python, inevitability]
                                ...                        
183064    [Las, mejores, ides, para, Python, Antes, de, ...
183065    [Gagal, tidur, gegara, habis, vertical, limit,...
183066                     [Go, boa, wkwk, Boa, vs, Python]
183067    [RT, RealPython, List, of, Python, API, Wrappe...
183068                          [Watching, Boa, vs, Python]
183069    [Чертова, дюжина, вакансий, в, IT, и, Digital,...
Name: tokens, dtype: object

In [11]:
df.tokens.values[0:3]

array([['python', 'never', 'stop', 'learning', 'what', 'you', 'enjoy', 'doing'],
       ['Watching', 'Boa', 'vs', 'Python'],
       ['Monty', 'Python', 'The', 'silly', 'walk', 'via', 'YouTube']], dtype=object)

In [12]:
d = Dictionary.from_documents(df.tokens)
d

<gensim.corpora.dictionary.Dictionary at 0x7ff70c7f2da0>

In [13]:
tfidf = TfidfModel(d)

TypeError: object of type 'int' has no len()

*Hint-Hint:* `gensim` is sprinting this week at PyCon!

In [19]:
TfidfModel?

In [20]:
TfidfModel(df.txt)

ValueError: not enough values to unpack (expected 2, got 1)

In [21]:
TfidfModel(df.tokens)

ValueError: too many values to unpack (expected 2)

In [14]:
TfidfModel((d.doc2bow(tokens) for tokens in df.tokens))

<gensim.models.tfidfmodel.TfidfModel at 0x7ff6f0171a90>

But there's a simpler way.  
We already have a vocabulary  
with term and document frequencies in a matrix...  

In [15]:
pd.Series(d.dfs)

0          444
1         1658
2        53491
3          611
4         9048
5         2374
         ...  
87141        1
87142        1
87143        1
87144        1
87145        1
87146        1
dtype: int64

In [16]:
pd.Series(d.iteritems())

0        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
1        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
2        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
3        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
4        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
5        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
                               ...                        
87141    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87142    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87143    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87144    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87145    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87146    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
dtype: object

OK, now I get it  

- `document` is a list of strings (ordered sequence of tokens)  
- `bow` or [bag of words] is a list of `Counter`-like mappings between word IDs and their count in each document
- `TfidfModel` is a transformation from a BOW into a BORF,  a "bag of relative frequencies"  

TFIDF = BORF = term frequencies normalized by document occurence counts


In [17]:
pd.Series(d.doc2bow(toks) for toks in df.tokens[:3])

0    [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...
1                   [(8, 1), (9, 1), (10, 1), (11, 1)]
2    [(10, 1), (12, 1), (13, 1), (14, 1), (15, 1), ...
dtype: object

Did it assign 0 to the first word it found?  
Sort-of...  

In [18]:
d.token2id['python']

2

In [27]:
d.token2id['Python']

8

In [28]:
d.token2id['you']

3

In [29]:
d.id2token[0]  # guesses anyone?

'never'

In [20]:
tfidf = TfidfModel(dictionary=d)
tfidf

<gensim.models.tfidfmodel.TfidfModel at 0x7ff6f94669e8>

In [26]:
dfs = pd.Series(OrderedDict(sorted([(d.id2token[i], numdocs) for (i, numdocs) in tfidf.dfs.items()])))
dfs

A           7338
AA             1
AAA            2
AAAA           1
AAAAAA         1
AAAAAAND       2
            ... 
ＴＨＥ            1
Ｗ              2
ＷＡＮＴ           3
ＷＡＲＮＩＮＧ        1
ＹＯＵ           10
𝓩Ᏸ             1
dtype: int64

In [27]:
dfs.iloc[4000:4030]

Bioinformatics    20
Biological         3
Biologist          2
Biologists        13
Biology           17
Biomechanics       1
                  ..
Birkenstocks       1
Birkin             2
Birman            56
Birmann            1
Birmingham        11
Birth             12
dtype: int64

In [28]:
tfidf.num_docs

183070

In [29]:
tfidf.num_nnz

2392557

In [30]:
tfidf.save(os.path.join(DATA_PATH, 'tfidf'))

In [31]:
tfidf2 = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf'))

In [32]:
tfidf2.num_nnz

2392557