# EDA: wiki-news embedding vocabulary vs. spaCy tokenizer

In [14]:
import os
import re
import pandas as pd
import spacy
from collections import Counter
from gensim.models import KeyedVectors

In [15]:
def token_info(tokens):
    """Print how many tokens there are and show the first ten of them.

    Args:
        tokens: A sequence supporting ``len()`` and slicing (e.g. a list of
            strings).
    """
    total = len(tokens)
    head = tokens[:10]
    print('Number of tokens: ', total)
    print('First 10 tokens: ', head)


class TokenTypes:
    """Bucket tokens into coarse character-type categories and print a summary.

    The categories are not mutually exclusive; a token may land in several:

    * ``num_tokens``    - contains at least one ASCII digit.
    * ``symbol_tokens`` - contains no ASCII letter or digit at all (this also
      covers words written in non-Latin scripts and whitespace tokens).
    * ``low_tokens``    - first character is an ASCII lowercase letter.
    * ``up_tokens``     - first character is an ASCII uppercase letter.
    * ``caps_tokens``   - non-empty and made up solely of ASCII uppercase
      letters.

    The bucket lists are created once in ``__init__`` and appended to on every
    call, so calling the same instance twice accumulates results.
    """

    # Compiled once for the class rather than on every call.
    _NUM_RE = re.compile('[0-9]')
    _LOW_RE = re.compile('[a-z]')
    _UP_RE = re.compile('[A-Z]')
    # Used with ``fullmatch`` below. The previous '^[A-Z]*$' + ``match`` also
    # accepted the empty string and a bare '\n' (``$`` matches just before a
    # trailing newline), which wrongly counted whitespace tokens as capslock.
    _CAPS_RE = re.compile('[A-Z]+')
    _LETNUM_RE = re.compile('[a-zA-Z0-9]')

    def __init__(self):
        self.num_tokens = []
        self.symbol_tokens = []
        self.low_tokens = []
        self.up_tokens = []
        self.caps_tokens = []

    def __call__(self, tokens):
        """Classify ``tokens`` into the buckets and print counts and samples.

        Args:
            tokens: Iterable of strings. The "Freq"/"Rare" samples printed are
                simply the first and last ten entries of each bucket, so they
                are only meaningful when ``tokens`` is ordered from most to
                least frequent.
        """
        for t in tokens:
            if self._NUM_RE.search(t):
                self.num_tokens.append(t)
            if not self._LETNUM_RE.search(t):
                self.symbol_tokens.append(t)
            # ``match`` anchors at the start only: these two buckets test the
            # first character, not the whole token.
            if self._LOW_RE.match(t):
                self.low_tokens.append(t)
            if self._UP_RE.match(t):
                self.up_tokens.append(t)
            if self._CAPS_RE.fullmatch(t):
                self.caps_tokens.append(t)
        print('Number of num tokens: ', len(self.num_tokens))
        print('Number of symbol tokens: ', len(self.symbol_tokens))
        print('Number of lowcase tokens: ', len(self.low_tokens))
        print('Number of upper case tokens: ', len(self.up_tokens))
        print('Nubmer of capslock tokens: ', len(self.caps_tokens))

        print('\n')

        print('Freq num tokens: ', self.num_tokens[:10])
        print('Rare num tokens: ', self.num_tokens[-10:], '\n')
        print('Freq symbol tokens: ', self.symbol_tokens[:10])
        print('Rare symbol tokens: ', self.symbol_tokens[-10:], '\n')
        print('Freq low tokens: ', self.low_tokens[:10])
        print('Rare low tokens: ', self.low_tokens[-10:], '\n')
        print('Freq up tokens: ', self.up_tokens[:10])
        print('Rare up tokens: ', self.up_tokens[-10:], '\n')
        print('Freq caps tokens: ', self.caps_tokens[:10])
        print('Rare caps tokens: ', self.caps_tokens[-10:])

### Look at embedding

In [16]:
# Root data folder; `cache` is bound to the same path.
data_dir = cache = '../data'

# Train / test CSV files live directly under the data folder.
train_csv, test_csv = (
    os.path.join(data_dir, csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Full path to the wiki-news embedding text file under the data folder.
emb_path = os.path.join(
    data_dir, 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
)

In [17]:
%%time
emb_tokens = []
with open(emb_path) as f:
    for line in f:
        tok = line.split()[0]
        emb_tokens.append(tok)

CPU times: user 9.45 s, sys: 1.03 s, total: 10.5 s
Wall time: 12.6 s


In [18]:
# Print the size of the embedding token list and its first ten entries.
token_info(emb_tokens)

Number of tokens:  999995
First 10 tokens:  ['999994', ',', 'the', '.', 'and', 'of', 'to', 'in', 'a', '"']


In [19]:
# Bucket the embedding tokens by character type and print counts / samples.
ttemb = TokenTypes()
ttemb(emb_tokens)

Number of num tokens:  89709
Number of symbol tokens:  15047
Number of lowcase tokens:  319245
Number of upper case tokens:  586099
Nubmer of capslock tokens:  55932


Freq num tokens:  ['999994', '1', '2', '10', '20', '11', '3', '12', '15', '18']
Rare num tokens:  ['20files', 'SU-25s', 'DE3', '0.0045', 'FY09E', '60.66', '615p', '60.38', '37.19', '32.37'] 

Freq symbol tokens:  [',', '.', '"', ':', ')', '(', '*', "'", '/', '=']
Rare symbol tokens:  ['ਜੋ', 'આપણે', 'சரி', 'இதன', 'ഞാൻ', '\x0e', '\uf025', 'Армия', '당신이', 'Хочу'] 

Freq low tokens:  ['the', 'and', 'of', 'to', 'in', 'a', 'that', 'is', 'for', 'on']
Rare low tokens:  ['mail-room', 'thambi', 'red-fruited', 'pin-points', 'polytes', 'pyralid', 'programs--including', 'calligraffiti', 'whitespotted', 'sacoglossan'] 

Freq up tokens:  ['The', 'I', 'Wikipedia', 'In', 'UTC', 'A', 'If', 'But', 'This', 'It']
Rare up tokens:  ['Laremy', 'Horra', 'Chiropotes', 'Sunlife', 'Majnoun', 'Bartenura', 'Melkam', 'Iseya', 'Bayyah', 'Vilaya'] 

Fre

### Tokenize data and build vocab

In [20]:
# Read the `question_text` column of each CSV into a plain Python list.
train, test = (
    pd.read_csv(csv_path)['question_text'].tolist()
    for csv_path in (train_csv, test_csv)
)
# Train questions first, then test questions.
data = train + test

In [21]:
%%time
unknown_tokens = []
vocab = Counter()
spacy_en = spacy.load('en')
tokenizer = spacy_en.tokenizer
emb_tokens_set = set(emb_tokens)
for question in data:
    doc = tokenizer(question)
    for token in doc:
        tok_text = token.text
        vocab.update([tok_text])
        if vocab[tok_text] == 1:
            if tok_text not in emb_tokens_set:
                unknown_tokens.append(tok_text)
        
#doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
#for token in doc:
#    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#          token.shape_, token.is_alpha, token.is_stop)

CPU times: user 2min 26s, sys: 174 ms, total: 2min 26s
Wall time: 2min 24s


In [22]:
# Keys view over the vocabulary, ordered from most to least frequent token.
vocab_tokens = {tok: count for tok, count in vocab.most_common()}.keys()

In [23]:
# Bucket the dataset vocabulary by character type and print counts / samples.
ttvocab = TokenTypes()
ttvocab(vocab_tokens)

Number of num tokens:  29530
Number of symbol tokens:  1727
Number of lowcase tokens:  132699
Number of upper case tokens:  113484
Nubmer of capslock tokens:  15169


Freq num tokens:  ['2', '2017', '2018', '1', '3', '10', '5', '4', '12', '6']
Rare num tokens:  ['10–12', '61850', 'Feb-2018', '2200-$2500/mo', '4000/mo', '999999', '450cc', '78.53', '900va', '6740'] 

Freq symbol tokens:  ['?', ',', '.', '"', '-', ')', '(', '/', "'", '&']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['the', 'to', 'a', 'in', 'is', 'of', 'and', 'do', 'are', 'for']
Rare low tokens:  ['noumber', 'sml', 'sgill', 'jauhar', 'clearly"bad', 'myAIR', 'cybershot', 'ganpat', 'rabels', 'photopicker'] 

Freq up tokens:  ['What', 'I', 'How', 'Why', 'Is', 'Can', 'Which', 'Do', 'If', 'India']
Rare up tokens:  ['Dime', 'Entanglements', 'Tamilar', 'Flurry', 'Routines', 'ULTIMATELY', 'Hb', 'Generate', 'Khayami', 'Thetis'] 

Freq caps tokens:  ['

### Capslock tokens

In [24]:
# All-caps tokens that are absent from the embedding as written but present
# once lowercased.
caps_tokens = ttvocab.caps_tokens
emotional_caps_tokens = [
    tok
    for tok in caps_tokens
    if tok.lower() in emb_tokens_set and tok not in emb_tokens_set
]
print(len(emotional_caps_tokens))

766


In [25]:
# Display the collected tokens.
# NOTE(review): this shows the whole list; consider slicing (e.g. [:50]) to keep the output short.
emotional_caps_tokens

['MAINS',
 'MUJ',
 'IITS',
 'ANTIFA',
 'PEGA',
 'BITCOIN',
 'COMED',
 'DUCAT',
 'KLU',
 'SLET',
 'SIOM',
 'QUORA',
 'IARE',
 'JABALPUR',
 'IRSE',
 'BODS',
 'ETHER',
 'MBOX',
 'UNSUBSCRIBE',
 'NGINX',
 'DEMAT',
 'VOLTE',
 'SIMILARITIES',
 'NARCOS',
 'AMET',
 'VANI',
 'AMITY',
 'PERIODIC',
 'SNAPCHAT',
 'PRELIMS',
 'ANONYMOUSLY',
 'VIZAG',
 'ODES',
 'GOLEM',
 'BEGINNER',
 'RESONANCE',
 'MAHABHARATA',
 'CUNTS',
 'JPOP',
 'PAPPU',
 'STUPIDEST',
 'ENROLL',
 'THUNDERBIRD',
 'DISADVANTAGES',
 'BABI',
 'WASE',
 'EUCALYPTUS',
 'CALICUT',
 'MUDD',
 'MYRA',
 'GEFORCE',
 'JALANDHAR',
 'DUOS',
 'DEHRADUN',
 'OCAML',
 'ISSUING',
 'PHARMACISTS',
 'CRYPTOCURRENCY',
 'DORM',
 'TELECOMMUNICATION',
 'JOSSA',
 'QUANT',
 'EDITORIALS',
 'NOLLYWOOD',
 'THERMO',
 'XEM',
 'HYMEN',
 'CELU',
 'MILLING',
 'ANALYTICS',
 'BIGG',
 'AIRTEL',
 'IGNIS',
 'NOIS',
 'GOTRA',
 'VLOG',
 'DELFT',
 'FREQUENCIES',
 'MARATHA',
 'KARGIL',
 'AMRITA',
 'INAT',
 'COMMUNAL',
 'CRISPER',
 'BAJAJ',
 'ROSCA',
 'IONIC',
 'JAAT',
 'CLOAK

### Unknown tokens

In [26]:
# Occurrence counts restricted to the tokens missing from the embedding.
unk_vocab = Counter({tok: vocab[tok] for tok in unknown_tokens})

In [27]:
# Sum the occurrence counts once, then report absolute numbers and the share
# of token occurrences that have no embedding.
n_tokens_total = sum(vocab.values())
n_unk_total = sum(unk_vocab.values())

print('Total number of tokens: ', n_tokens_total)
print('Number of unique tokens: ', len(vocab))
print('Total number of unknown tokens: ', n_unk_total)
print('Number of unique unknown tokens: ', len(unknown_tokens))
print('Percent of unknown tokens: ', 100 * n_unk_total / n_tokens_total)

Total number of tokens:  19837964
Number of unique tokens:  268601
Total number of unknown tokens:  217147
Number of unique unknown tokens:  102145
Percent of unknown tokens:  1.0946032566648474


In [28]:
# Unknown tokens mapped to their counts, ordered from most to least frequent.
unk_common = {tok: count for tok, count in unk_vocab.most_common()}
# Occurrences covered by the 100 most frequent unknown tokens.
top100_occurrences = sum(sorted(unk_vocab.values(), reverse=True)[:100])
print('Number of 100 most common unk tokens: ', top100_occurrences)

Number of 100 most common unk tokens:  81924


In [29]:
# Display unknown tokens with their counts, most frequent first.
# NOTE(review): this shows the whole dict; consider limiting it to the top entries.
unk_common

{"n't": 48294,
 '’s': 8778,
 'n’t': 6897,
 '’m': 2683,
 '’ve': 1256,
 'and/or': 1098,
 'Quorans': 882,
 'C++': 750,
 '’re': 734,
 'BITSAT': 578,
 '_': 468,
 'COMEDK': 363,
 'KVPY': 361,
 '9/11': 342,
 'Quoran': 322,
 "''": 298,
 'WBJEE': 239,
 '\xa0': 239,
 '/math': 231,
 'mtech': 220,
 '1/2': 218,
 'x^2': 205,
 'articleship': 198,
 'VITEEE': 188,
 'f(x': 179,
 'aadhar': 161,
 'UPES': 160,
 'c++': 158,
 '^2': 147,
 '’ll': 146,
 'marksheet': 146,
 '’d': 145,
 'Fortnite': 138,
 'AFCAT': 130,
 'UCEED': 126,
 'bcom': 120,
 'dropshipping': 119,
 'UPSEE': 111,
 '24/7': 104,
 'BNBR': 104,
 'Machedo': 103,
 'AMCAT': 97,
 '\\frac': 96,
 'IITian': 95,
 'IITJEE': 89,
 'm/s': 88,
 'Qoura': 81,
 'M.tech': 80,
 '1/3': 80,
 "Qur'an": 79,
 'ICOs': 79,
 '1/4': 79,
 'NMAT': 77,
 '3/4': 73,
 'L&T': 73,
 'JIIT': 73,
 'hairfall': 73,
 'LNMIIT': 72,
 '2/3': 69,
 'Zerodha': 69,
 'A+': 69,
 'Kavalireddi': 67,
 'm.tech': 66,
 'adhar': 66,
 'MAINS': 66,
 'bhakts': 65,
 '1/': 65,
 'R&D': 63,
 'Doklam': 62,
 'NIC

In [30]:
# Bucket the unknown tokens (most frequent first) by character type.
ttunk = TokenTypes()
ttunk(list(unk_common.keys()))

Number of num tokens:  20459
Number of symbol tokens:  1194
Number of lowcase tokens:  54498
Number of upper case tokens:  33019
Nubmer of capslock tokens:  4068


Freq num tokens:  ['9/11', '1/2', 'x^2', '^2', '24/7', '1/3', '1/4', '3/4', '2/3', '1/']
Rare num tokens:  ['GD1', 'N4200', '100/5', 'Z2600', '10–12', '61850', 'Feb-2018', '2200-$2500/mo', '4000/mo', '900va'] 

Freq symbol tokens:  ['_', "''", '\xa0', '`', ':)', ':(', '"-', '\n', ':/', '\\\\']
Rare symbol tokens:  ['δ᾽', 'ἔργον', 'ἐξήλλακτο', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ["n't", 'n’t', 'and/or', 'mtech', 'x^2', 'articleship', 'f(x', 'aadhar', 'c++', 'marksheet']
Rare low tokens:  ['youvision', 'enrollnment', 'noumber', 'sgill', 'clearly"bad', 'myAIR', 'cybershot', 'ganpat', 'rabels', 'photopicker'] 

Freq up tokens:  ['Quorans', 'C++', 'BITSAT', 'COMEDK', 'KVPY', 'Quoran', 'WBJEE', 'VITEEE', 'UPES', 'Fortnite']
Rare up tokens:  ['ZeDong', 'Kiccha', 'Bhangani', 'GS-', 'Magi

### Unknown tokens whose lowercased form is in the embedding

In [36]:
# Unknown tokens that become known to the embedding once lowercased.
unk_known = [tok for tok in unk_common if tok.lower() in emb_tokens_set]
print(len(unk_known))

3134


In [37]:
# Display the unknown tokens that are recoverable by lowercasing.
# NOTE(review): this shows the whole list; consider slicing to keep the output short.
unk_known

['MAINS',
 'B.SC',
 'Whst',
 'Demonetization',
 'Devastations',
 'Howmany',
 'MUJ',
 'Dominar',
 'theBest',
 'Invento',
 'Gst',
 'IITS',
 'doI',
 'LORs',
 'ANTIFA',
 'Whic',
 'Rubrik',
 'Ssc',
 'Upvote',
 'Ehat',
 'Whta',
 'Navamsa',
 'Isit',
 'PEGA',
 'BITCOIN',
 'Demonetisation',
 'Microservices',
 'WhAt',
 'F1.8',
 'Whichis',
 'Mca',
 'Plzz',
 'WWhat',
 'Downvote',
 'MnC',
 'Vitiate',
 'Altcoins',
 'Whch',
 'Pribumi',
 'bEst',
 'COMED',
 'LoRa',
 'Clickbank',
 'HoW',
 'Altcoin',
 'Whick',
 'Ipcc',
 'Bpd',
 'Hwo',
 'So2',
 'Bts',
 'RaGa',
 'Whuch',
 'DUCAT',
 'KLU',
 'Numpy',
 'Willl',
 'Vashikaran',
 'Masterbation',
 'Iot',
 'Whatif',
 'Whts',
 'Chatbots',
 '.CA',
 'SLET',
 'MacOs',
 'SIOM',
 'QUORA',
 'IARE',
 'Gpa',
 'Ssn',
 'Edx',
 'DoMS',
 'J2ee',
 'Rbi',
 'Empaths',
 'LiteCoin',
 'JABALPUR',
 'IRSE',
 'Hsc',
 'BODS',
 'CSe',
 'Hve',
 'ETHER',
 'spaCy',
 'tanA',
 'Bcz',
 'Chalta',
 'Srk',
 'BlockChain',
 'Deplorables',
 '60FPS',
 'MBOX',
 'UNSUBSCRIBE',
 'DataFrame',
 'PrOPEL',
