# EDA Glove embedding and Spacy tokenizer

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import pandas as pd
import spacy
from collections import Counter
import numpy as np
from utils import *

### Look at embedding

In [3]:
emb_path = 'embeddings/glove.840B.300d/glove.840B.300d.txt'
data_dir = cache = '../data'

train_csv = os.path.join(data_dir, 'train.csv')
test_csv = os.path.join(data_dir, 'test.csv')
emb_path = os.path.join(data_dir, emb_path)

In [6]:
%%time
emb_tokens = []
with open(emb_path) as f:
    for line in f:
        tok = line.split()[0]
        emb_tokens.append(tok)

CPU times: user 21.5 s, sys: 1.24 s, total: 22.7 s
Wall time: 22.7 s


In [7]:
token_info(emb_tokens)

Number of tokens:  2196017
First 10 tokens:  [',', '.', 'the', 'and', 'to', 'of', 'a', 'in', '"', ':']


In [9]:
token_types(emb_tokens)

Number of num tokens:  536635
Number of symbol tokens:  2513
Number of lowcase tokens:  653309
Number of upper case tokens:  1089000
Nubmer of capslock tokens:  153812


Freq num tokens:  ['1', '2', '3', '4', '2012', '5', '2011', '10', '2010', '2009']
Rare num tokens:  ['km/11', 'mono-2', 'otherjuicystar07', 'own36', 'p263', 'r91', 's9100', 'sarah123', 'v205', 'z/28'] 

Freq symbol tokens:  [',', '.', '"', ':', ')', '(', '-', '...', '!', '?']
Rare symbol tokens:  ['?!?!?!?!!', '?!?!?!?!?!?!?!?!?', 'ãƒˆã', 'ａ', '-------------------------------------------------------------------------------------------------------------------------------------------------', 'ɯ', 'вЂ', '回', 'ĵ', 'ÐÐ'] 

Freq low tokens:  ['the', 'and', 'to', 'of', 'a', 'in', 'is', 'for', 'that', 'on']
Rare low tokens:  ['work.Like', 'working.So', 'wried', 'wwent', 'xalisae', 'xtremecaffeine', 'yildirim', 'z/28', 'zipout', 'zulchzulu'] 

Freq up tokens:  ['I', 'The', 'It', 'This', 'A', 'In', 'You', 'We', 'If', 'And']
Rare

### Tokenize data and build vocab

In [10]:
train = pd.read_csv(train_csv)['question_text'].values.tolist()
test = pd.read_csv(test_csv)['question_text'].values.tolist()
data = train + test

In [11]:
%%time
unknown_tokens = []
vocab = Counter()
spacy_en = spacy.load('en')
tokenizer = spacy_en.tokenizer
emb_tokens_set = set(emb_tokens)
for question in data:
    doc = tokenizer(question)
    for token in doc:
        tok_text = token.text
        vocab.update([tok_text])
        if vocab[tok_text] == 1:
            if tok_text not in emb_tokens_set:
                unknown_tokens.append(tok_text)
        
#doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
#for token in doc:
#    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#          token.shape_, token.is_alpha, token.is_stop)

CPU times: user 2min 24s, sys: 185 ms, total: 2min 25s
Wall time: 2min 23s


In [12]:
vocab_tokens = dict(vocab.most_common()).keys()

In [13]:
token_types(vocab_tokens)

Number of num tokens:  29530
Number of symbol tokens:  1727
Number of lowcase tokens:  132699
Number of upper case tokens:  113484
Nubmer of capslock tokens:  15169


Freq num tokens:  ['2', '2017', '2018', '1', '3', '10', '5', '4', '12', '6']
Rare num tokens:  ['10–12', '61850', 'Feb-2018', '2200-$2500/mo', '4000/mo', '999999', '450cc', '78.53', '900va', '6740'] 

Freq symbol tokens:  ['?', ',', '.', '"', '-', ')', '(', '/', "'", '&']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['the', 'to', 'a', 'in', 'is', 'of', 'and', 'do', 'are', 'for']
Rare low tokens:  ['noumber', 'sml', 'sgill', 'jauhar', 'clearly"bad', 'myAIR', 'cybershot', 'ganpat', 'rabels', 'photopicker'] 

Freq up tokens:  ['What', 'I', 'How', 'Why', 'Is', 'Can', 'Which', 'Do', 'If', 'India']
Rare up tokens:  ['Dime', 'Entanglements', 'Tamilar', 'Flurry', 'Routines', 'ULTIMATELY', 'Hb', 'Generate', 'Khayami', 'Thetis'] 

Freq caps tokens:  ['

### Capslock tokens

In [14]:
caps_re = re.compile('^[A-Z]*$')
caps_tokens = []
for t in vocab_tokens:
    if caps_re.match(t):
        caps_tokens.append(t)

In [15]:
emotional_caps_tokens = []
for t in caps_tokens:
    if t not in emb_tokens_set and t.lower() in emb_tokens_set:
        emotional_caps_tokens.append(t)
print(len(emotional_caps_tokens))

307


In [16]:
emotional_caps_tokens

['UPSE',
 'BHIM',
 'IPHO',
 'AADHAR',
 'ANTHE',
 'MESRA',
 'QUORA',
 'IARE',
 'HODL',
 'BLACKI',
 'SHIATS',
 'BRDS',
 'PSUS',
 'NARCOS',
 'SILCHAR',
 'SNAPCHAT',
 'OCER',
 'NITR',
 'UOFT',
 'FUDI',
 'DEPR',
 'NAOH',
 'CRYPTOCURRENCY',
 'EVETS',
 'SBUF',
 'JOSSA',
 'RGIA',
 'CELU',
 'IANG',
 'ARTICLESHIP',
 'GOTRA',
 'MARATHA',
 'PRIST',
 'CAPGEMINI',
 'JAAT',
 'LIATE',
 'OMPRAKASH',
 'HEETS',
 'GOOGEL',
 'ACHEIVEMENTS',
 'RITEE',
 'IRRESPONSIBILITY',
 'ISHANT',
 'BATSMEN',
 'MEDICORE',
 'ACCEDENT',
 'PYURIA',
 'GASSES',
 'CURRNT',
 'INSITED',
 'EXHISTING',
 'FANCHISE',
 'CENSE',
 'SURABHI',
 'IRRITANTS',
 'PANVEL',
 'SAYEED',
 'OLYMPIADS',
 'JUCHE',
 'UNWORKABLE',
 'BLASPHEMING',
 'DHARWAD',
 'ASKD',
 'CHEGG',
 'FIIO',
 'APPOITMENT',
 'HTTRACK',
 'WEBTOON',
 'REFRENCES',
 'SHEIN',
 'BANARAS',
 'GRETEST',
 'GOWDA',
 'REFERNECE',
 'LANDLESS',
 'INGENERAL',
 'FATIHAH',
 'SANJU',
 'BELEIEVE',
 'PARTIT',
 'DAENERYS',
 'TUMMO',
 'BITCOINS',
 'RATIONALIZE',
 'ADND',
 'AMIEN',
 'VEERE',
 'FETO

### Unknown tokens

In [17]:
unk_vocab = {}
for w in unknown_tokens:
    unk_vocab[w] = vocab[w]
unk_vocab = Counter(unk_vocab)

In [18]:
print('Total number of tokens: ', sum(vocab.values()))
print('Number of unique tokens: ', len(vocab))
print('Total number of unknown tokens: ', sum(unk_vocab.values()))
print('Number of unique unknown tokens: ', len(unknown_tokens))
print('Percent of unknown tokens: ', 100 * sum(unk_vocab.values())/sum(vocab.values()))

Total number of tokens:  19837964
Number of unique tokens:  268601
Total number of unknown tokens:  113339
Number of unique unknown tokens:  82595
Percent of unknown tokens:  0.5713237507639393


In [19]:
unk_common = dict(unk_vocab.most_common())
print('Number of 100 most common unk tokens: ', sum(list(unk_common.values())[:100]))

Number of 100 most common unk tokens:  9242


In [20]:
unk_common

{'..': 888,
 'Quorans': 882,
 'Brexit': 510,
 'cryptocurrencies': 506,
 'Redmi': 394,
 '\xa0': 239,
 '/math': 231,
 'x^2': 205,
 'f(x': 179,
 '^2': 147,
 'OnePlus': 130,
 'UCEED': 126,
 'Blockchain': 111,
 'GDPR': 110,
 'demonetisation': 109,
 'Coinbase': 105,
 'BNBR': 104,
 'Machedo': 103,
 'Adityanath': 101,
 'Boruto': 96,
 '\\frac': 96,
 'ethereum': 95,
 'DCEU': 93,
 'IIEST': 90,
 'SJWs': 84,
 'Qoura': 81,
 "Qur'an": 79,
 'LNMIIT': 72,
 'Zerodha': 69,
 'A+': 69,
 'Upwork': 68,
 'Kavalireddi': 67,
 '.net': 65,
 'bhakts': 65,
 '1/': 65,
 'Doklam': 62,
 'NICMAR': 62,
 'Vajiram': 61,
 'Unacademy': 60,
 'w/': 56,
 'AlShamsi': 56,
 'MUOET': 55,
 'chsl': 55,
 'x^3': 55,
 "5'4": 53,
 'Bhakts': 52,
 'HackerRank': 52,
 'Litecoin': 51,
 'Jiren': 51,
 'Awdhesh': 50,
 "A2A'd": 49,
 'altcoin': 49,
 'y^2': 48,
 'eLitmus': 47,
 'altcoins': 47,
 'Cryptocurrency': 47,
 'Ryzen': 46,
 "5'9": 46,
 'SRMJEE': 46,
 '\\sqrt': 44,
 'Baahubali': 44,
 '^3': 44,
 "5'5": 44,
 'Amazon.in': 43,
 "5'3": 42,
 'SGSIT

In [21]:
token_types(list(unk_common.keys()))

Number of num tokens:  16564
Number of symbol tokens:  1519
Number of lowcase tokens:  43548
Number of upper case tokens:  26679
Nubmer of capslock tokens:  2531


Freq num tokens:  ['x^2', '^2', '1/', 'x^3', "5'4", "A2A'd", 'y^2', "5'9", '^3', "5'5"]
Rare num tokens:  ['An+1=', 'An=2', 'NBA2K18', '3.71/4.44', '39/389', 'Z2600', '10–12', 'Feb-2018', '2200-$2500/mo', '900va'] 

Freq symbol tokens:  ['..', '\xa0', ':(', '₹', '"-', 'करना', '\ufeff', '\n', ':/', '️']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['cryptocurrencies', 'x^2', 'f(x', 'demonetisation', 'ethereum', 'bhakts', 'w/', 'chsl', 'x^3', 'altcoin']
Rare low tokens:  ['youvision', 'enrollnment', 'noumber', 'sgill', 'jauhar', 'clearly"bad', 'myAIR', 'ganpat', 'rabels', 'photopicker'] 

Freq up tokens:  ['Quorans', 'Brexit', 'Redmi', 'OnePlus', 'UCEED', 'Blockchain', 'GDPR', 'Coinbase', 'BNBR', 'Machedo']
Rare up tokens:  ['Bhangani', 'GS-', 'Ma

### Look at spacy tokenizer 

In [22]:
from spacy.lang.en.stop_words import STOP_WORDS

In [23]:
print(STOP_WORDS)

{'whether', 'must', 'you', 'becomes', 'otherwise', 'here', 'someone', 'on', 'and', 'no', 'out', 'make', 'everything', 'seemed', 'the', 'next', 'as', 'they', 'whereafter', 'whom', 'while', 'such', 'had', 'yourselves', 'four', 'latter', 'last', 'name', 'over', 'after', 'six', 'very', 'his', 'within', 'through', 'around', 'further', 'am', 'sometime', 'toward', 'behind', 'do', 'put', 'various', 'yours', 'five', 'something', 'quite', 'without', 'would', 'are', 'least', 'formerly', 'amount', 'please', 'thus', 'if', 'latterly', 'because', 'via', 'made', 'already', 'until', 'down', 'be', 'himself', 'doing', 'former', 'seeming', 'take', 'namely', 'side', 'what', 'has', 'against', 'herein', 'hers', 'itself', 'she', 'he', 'off', 'them', 'beforehand', 'due', 'my', 'ten', 'those', 'towards', 'sixty', 'i', 'though', 'show', 'own', 'myself', 'unless', 'during', 'hereupon', 'keep', 'somehow', 'were', 'nine', 'twelve', 'still', 'themselves', 'using', 'but', 'ca', 'a', 'again', 'may', 'often', 'full', '

### Emb vectors

In [13]:
%%time
# read to dict
embs = {}
with open(emb_path) as f:
    count = 0
    for line in f:
        count += 1
        cut = line.find(' ')
        tok = line[:cut]
        emb = line[cut + 1:].split()
        emb = np.array([float(e) for e in emb])
        #embs[tok] = emb

CPU times: user 3min 38s, sys: 952 ms, total: 3min 39s
Wall time: 3min 39s


In [18]:
%%time
a = np.loadtxt(emb_path, usecols=list(range(2, 300)))

ValueError: could not convert string to float: '.'

In [16]:
range(1,200)

range(1, 200)

In [None]:
# times:
# go through lines: 2.5 s
# tok = : 3 s
# emb = line[cut + 1:].split()   23.5 s
# emb = np.array() :  1 min 56 sec
# total: 2 min

In [65]:
low = embs['I']
up = embs['you']

In [66]:
def sim(v1, v2):
    result = sum(v1*v2)/np.linalg.norm(v1)/np.linalg.norm(v2)
    return result

In [67]:
print(sim(low, up))

0.6627086554182071


In [30]:
# read to array
emb_vectors = emb_to_array(emb_path)

In [51]:
mean = np.mean(emb_vectors, 0)

In [52]:
np.mean(np.array([abs(e) for e in mean]))

0.11827651831144917

In [53]:
np.mean(mean)

-0.005838490668689134

In [54]:
np.std(mean)

0.1819384092990661

In [55]:
np.std(emb_vectors, 0)

array([0.44851531, 0.46038846, 0.44955892, 0.45646961, 0.47140167,
       0.4422903 , 0.44041244, 0.44698275, 0.4442726 , 0.59969696,
       0.46547839, 0.44527083, 0.44985766, 0.45156409, 0.44917818,
       0.44906476, 0.44474889, 0.52384112, 0.45641812, 0.4467048 ,
       0.44479669, 0.45699833, 0.45092903, 0.44507482, 0.45484755,
       0.44878899, 0.44448158, 0.43785266, 0.45786598, 0.44926295,
       0.44259412, 0.45113808, 0.45128958, 0.44436349, 0.45377213,
       0.44751607, 0.44129199, 0.46179661, 0.43668213, 0.47212486,
       0.44979053, 0.45925169, 0.44387464, 0.45996361, 0.45231752,
       0.44722584, 0.46001757, 0.4524131 , 0.44885582, 0.45103552,
       0.44425859, 0.46027448, 0.44023431, 0.44849907, 0.44747041,
       0.45154174, 0.44528843, 0.44824609, 0.45852647, 0.45084543,
       0.44968386, 0.44867101, 0.45057548, 0.48588496, 0.46485229,
       0.44705304, 0.44496828, 0.4460746 , 0.44262784, 0.46026568,
       0.46001462, 0.4595112 , 0.46911504, 0.44716035, 0.44971

In [56]:
np.mean(emb_vectors)

-0.005838490668689374

In [57]:
np.std(emb_vectors)

0.48782070623981577

In [66]:
np.random.normal(0, 0.5)

0.16967462959684068

### t.lower() if t not in emb

In [24]:
unk_known = []
for t in list(unk_common.keys()):
    if t.lower() in emb_tokens_set:
        unk_known.append(t)
print(len(unk_known))

2161


In [25]:
unk_known

['Blockchain',
 'Cryptocurrency',
 'Golang',
 'Whst',
 'UPSE',
 'Fiitjee',
 'Demonetization',
 'Howcan',
 'BHIM',
 'Swiggy',
 'GANs',
 'B.arch',
 "DON'T",
 'Jipmer',
 'IPHO',
 'Whichis',
 '9A0',
 'Chutiya',
 'WWhat',
 'BigHit',
 'MnC',
 'Cgl',
 'H2So4',
 'AADHAR',
 'Ibps',
 'BvS',
 'Whatdo',
 'Pribumi',
 'GoLang',
 'AngularJs',
 'Rupay',
 'SHAREit',
 'LoRa',
 'Wbjee',
 'Angularjs',
 'Whatdoes',
 'Lpu',
 'Odias',
 'UpGrad',
 'Whuch',
 'BoJack',
 'pharmD',
 'Tnpsc',
 'ANTHE',
 'Happn',
 'Dogsee',
 'NetLogon',
 'LLb',
 'Whatwas',
 'eBay.in',
 'MESRA',
 'Wherecan',
 'Ncert',
 'ReLEx',
 'QUORA',
 'IARE',
 'UoT',
 'DoMS',
 'Wgat',
 'A.p',
 'TenX',
 'CSe',
 'H0w',
 'Wjat',
 'H2so4',
 'spaCy',
 'tanA',
 'Pagri',
 'BlockChain',
 'Acads',
 'Lnt',
 'theDifference',
 'Metzitzah',
 'ComedK',
 'Watsapp',
 'Whh',
 'PrOPEL',
 'GitHub.com',
 'B.ARCH',
 'Java.lang',
 'Rrb',
 'LnT',
 'HODL',
 'Cdac',
 'WhT',
 'AntiFa',
 'Epf',
 'BLACKI',
 'SHIATS',
 'BRDS',
 'Hown',
 'GloVe',
 'cosA',
 'Jeou',
 'sinA',
 