# EDA paramgram embedding and Spacy tokenizer

In [1]:
import os
import re
import pandas as pd
import spacy
from collections import Counter

In [2]:
def token_info(tokens):
    print('Number of tokens: ', len(tokens))
    print('First 10 tokens: ', tokens[:10])

In [3]:
def token_types(tokens):
    num_tokens = []
    symbol_tokens = []
    low_tokens = []
    up_tokens = []
    caps_tokens = []
    num_re = re.compile('[0-9]')
    low_re = re.compile('[a-z]')
    up_re = re.compile('[A-Z]')
    caps_re = re.compile('^[A-Z]*$')
    letnum_re = re.compile('[a-zA-Z0-9]')
    for t in tokens:
        if num_re.search(t):
            num_tokens.append(t)
        if not letnum_re.search(t):
            symbol_tokens.append(t)
        if low_re.match(t):
            low_tokens.append(t)
        if up_re.match(t):
            up_tokens.append(t)
        if caps_re.match(t):
            caps_tokens.append(t)
    print('Number of num tokens: ', len(num_tokens))
    print('Number of symbol tokens: ', len(symbol_tokens))
    print('Number of lowcase tokens: ' , len(low_tokens))
    print('Number of upper case tokens: ', len(up_tokens))
    print('Nubmer of capslock tokens: ' , len(caps_tokens))
    
    print('\n')
    
    print('Freq num tokens: ', num_tokens[:10])
    print('Rare num tokens: ', num_tokens[-10:], '\n')
    print('Freq symbol tokens: ', symbol_tokens[:10])
    print('Rare symbol tokens: ', symbol_tokens[-10:], '\n')
    print('Freq low tokens: ', low_tokens[:10])
    print('Rare low tokens: ', low_tokens[-10:], '\n')
    print('Freq up tokens: ', up_tokens[:10])
    print('Rare up tokens: ', up_tokens[-10:], '\n')
    print('Freq caps tokens: ', caps_tokens[:10])
    print('Rare caps tokens: ', caps_tokens[-10:])

### Look at embedding

In [4]:
emb_path = 'embeddings/paragram_300_sl999/paragram_300_sl999.txt'
data_dir = cache = '../data'

train_csv = os.path.join(data_dir, 'train.csv')
test_csv = os.path.join(data_dir, 'test.csv')
emb_path = os.path.join(data_dir, emb_path)

In [5]:
%%time
emb_tokens = []
with open(emb_path) as f:
    for line in f:
        tok = line.split()[0]
        emb_tokens.append(tok)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbd in position 731: invalid start byte

In [6]:
token_info(emb_tokens)

Number of tokens:  97525
First 10 tokens:  [',', '.', 'the', 'and', 'to', 'of', 'a', 'in', '"', ':']


In [205]:
token_types(emb_tokens)

Number of num tokens:  536635
Number of symbol tokens:  2513
Number of lowcase tokens:  653309
Number of upper case tokens:  1089000
Nubmer of capslock tokens:  153812


Freq num tokens:  ['1', '2', '3', '4', '2012', '5', '2011', '10', '2010', '2009']
Rare num tokens:  ['km/11', 'mono-2', 'otherjuicystar07', 'own36', 'p263', 'r91', 's9100', 'sarah123', 'v205', 'z/28'] 

Freq symbol tokens:  [',', '.', '"', ':', ')', '(', '-', '...', '!', '?']
Rare symbol tokens:  ['?!?!?!?!!', '?!?!?!?!?!?!?!?!?', 'ãƒˆã', 'ａ', '-------------------------------------------------------------------------------------------------------------------------------------------------', 'ɯ', 'вЂ', '回', 'ĵ', 'ÐÐ'] 

Freq low tokens:  ['the', 'and', 'to', 'of', 'a', 'in', 'is', 'for', 'that', 'on']
Rare low tokens:  ['work.Like', 'working.So', 'wried', 'wwent', 'xalisae', 'xtremecaffeine', 'yildirim', 'z/28', 'zipout', 'zulchzulu'] 

Freq up tokens:  ['I', 'The', 'It', 'This', 'A', 'In', 'You', 'We', 'If', 'And']
Rare

### Tokenize data and build vocab

In [58]:
train = pd.read_csv(train_csv)['question_text'].values.tolist()
test = pd.read_csv(test_csv)['question_text'].values.tolist()
data = train + test

In [124]:
%%time
unknown_tokens = []
vocab = Counter()
spacy_en = spacy.load('en')
tokenizer = spacy_en.tokenizer
emb_tokens_set = set(emb_tokens)
for question in data:
    doc = tokenizer(question)
    for token in doc:
        tok_text = token.text
        vocab.update([tok_text])
        if vocab[tok_text] == 1:
            if tok_text not in emb_tokens_set:
                unknown_tokens.append(tok_text)
        
#doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
#for token in doc:
#    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#          token.shape_, token.is_alpha, token.is_stop)

CPU times: user 2min 22s, sys: 136 ms, total: 2min 22s
Wall time: 2min 21s


In [219]:
vocab_tokens = dict(vocab.most_common()).keys()

In [220]:
token_types(vocab_tokens)

Number of num tokens:  29530
Number of symbol tokens:  1727
Number of lowcase tokens:  132699
Number of upper case tokens:  113484
Nubmer of capslock tokens:  15169


Freq num tokens:  ['2', '2017', '2018', '1', '3', '10', '5', '4', '12', '6']
Rare num tokens:  ['10–12', '61850', 'Feb-2018', '2200-$2500/mo', '4000/mo', '999999', '450cc', '78.53', '900va', '6740'] 

Freq symbol tokens:  ['?', ',', '.', '"', '-', ')', '(', '/', "'", '&']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['the', 'to', 'a', 'in', 'is', 'of', 'and', 'do', 'are', 'for']
Rare low tokens:  ['noumber', 'sml', 'sgill', 'jauhar', 'clearly"bad', 'myAIR', 'cybershot', 'ganpat', 'rabels', 'photopicker'] 

Freq up tokens:  ['What', 'I', 'How', 'Why', 'Is', 'Can', 'Which', 'Do', 'If', 'India']
Rare up tokens:  ['Dime', 'Entanglements', 'Tamilar', 'Flurry', 'Routines', 'ULTIMATELY', 'Hb', 'Generate', 'Khayami', 'Thetis'] 

Freq caps tokens:  ['

### Capslock tokens

In [223]:
caps_re = re.compile('^[A-Z]*$')
caps_tokens = []
for t in vocab_tokens:
    if caps_re.match(t):
        caps_tokens.append(t)

In [237]:
emotional_caps_tokens = []
for t in caps_tokens:
    if t not in emb_tokens_set and t.lower() in emb_tokens_set:
        emotional_caps_tokens.append(t)
print(len(emotional_caps_tokens))

307


In [238]:
emotional_caps_tokens

['UPSE',
 'BHIM',
 'IPHO',
 'AADHAR',
 'ANTHE',
 'MESRA',
 'QUORA',
 'IARE',
 'HODL',
 'BLACKI',
 'SHIATS',
 'BRDS',
 'PSUS',
 'NARCOS',
 'SILCHAR',
 'SNAPCHAT',
 'OCER',
 'NITR',
 'UOFT',
 'FUDI',
 'DEPR',
 'NAOH',
 'CRYPTOCURRENCY',
 'EVETS',
 'SBUF',
 'JOSSA',
 'RGIA',
 'CELU',
 'IANG',
 'ARTICLESHIP',
 'GOTRA',
 'MARATHA',
 'PRIST',
 'CAPGEMINI',
 'JAAT',
 'LIATE',
 'OMPRAKASH',
 'HEETS',
 'GOOGEL',
 'ACHEIVEMENTS',
 'RITEE',
 'IRRESPONSIBILITY',
 'ISHANT',
 'BATSMEN',
 'MEDICORE',
 'ACCEDENT',
 'PYURIA',
 'GASSES',
 'CURRNT',
 'INSITED',
 'EXHISTING',
 'FANCHISE',
 'CENSE',
 'SURABHI',
 'IRRITANTS',
 'PANVEL',
 'SAYEED',
 'OLYMPIADS',
 'JUCHE',
 'UNWORKABLE',
 'BLASPHEMING',
 'DHARWAD',
 'ASKD',
 'CHEGG',
 'FIIO',
 'APPOITMENT',
 'HTTRACK',
 'WEBTOON',
 'REFRENCES',
 'SHEIN',
 'BANARAS',
 'GRETEST',
 'GOWDA',
 'REFERNECE',
 'LANDLESS',
 'INGENERAL',
 'FATIHAH',
 'SANJU',
 'BELEIEVE',
 'PARTIT',
 'DAENERYS',
 'TUMMO',
 'BITCOINS',
 'RATIONALIZE',
 'ADND',
 'AMIEN',
 'VEERE',
 'FETO

### Unknown tokens

In [133]:
unk_vocab = {}
for w in unknown_tokens:
    unk_vocab[w] = vocab[w]
unk_vocab = Counter(unk_vocab)

In [143]:
print('Total number of tokens: ', sum(vocab.values()))
print('Number of unique tokens: ', len(vocab))
print('Total number of unknown tokens: ', sum(unk_vocab.values()))
print('Number of unique unknown tokens: ', len(unknown_tokens))
print('Percent of unknown tokens: ', 100 * sum(unk_vocab.values())/sum(vocab.values()))

Total number of tokens:  19837964
Number of unique tokens:  268601
Total number of unknown tokens:  113339
Number of unique unknown tokens:  82595
Percent of unknown tokens:  0.5713237507639393


In [213]:
unk_common = dict(unk_vocab.most_common())
print('Number of 100 most common unk tokens: ', sum(list(unk_common.values())[:100]))

Number of 100 most common unk tokens:  9242


In [222]:
unk_common

{'..': 888,
 'Quorans': 882,
 'Brexit': 510,
 'cryptocurrencies': 506,
 'Redmi': 394,
 '\xa0': 239,
 '/math': 231,
 'x^2': 205,
 'f(x': 179,
 '^2': 147,
 'OnePlus': 130,
 'UCEED': 126,
 'Blockchain': 111,
 'GDPR': 110,
 'demonetisation': 109,
 'Coinbase': 105,
 'BNBR': 104,
 'Machedo': 103,
 'Adityanath': 101,
 'Boruto': 96,
 '\\frac': 96,
 'ethereum': 95,
 'DCEU': 93,
 'IIEST': 90,
 'SJWs': 84,
 'Qoura': 81,
 "Qur'an": 79,
 'LNMIIT': 72,
 'Zerodha': 69,
 'A+': 69,
 'Upwork': 68,
 'Kavalireddi': 67,
 '.net': 65,
 'bhakts': 65,
 '1/': 65,
 'Doklam': 62,
 'NICMAR': 62,
 'Vajiram': 61,
 'Unacademy': 60,
 'w/': 56,
 'AlShamsi': 56,
 'MUOET': 55,
 'chsl': 55,
 'x^3': 55,
 "5'4": 53,
 'Bhakts': 52,
 'HackerRank': 52,
 'Litecoin': 51,
 'Jiren': 51,
 'Awdhesh': 50,
 "A2A'd": 49,
 'altcoin': 49,
 'y^2': 48,
 'eLitmus': 47,
 'altcoins': 47,
 'Cryptocurrency': 47,
 'Ryzen': 46,
 "5'9": 46,
 'SRMJEE': 46,
 '\\sqrt': 44,
 'Baahubali': 44,
 '^3': 44,
 "5'5": 44,
 'Amazon.in': 43,
 "5'3": 42,
 'SGSIT

In [221]:
token_types(list(unk_common.keys()))

Number of num tokens:  16564
Number of symbol tokens:  1519
Number of lowcase tokens:  43548
Number of upper case tokens:  26679
Nubmer of capslock tokens:  2531


Freq num tokens:  ['x^2', '^2', '1/', 'x^3', "5'4", "A2A'd", 'y^2', "5'9", '^3', "5'5"]
Rare num tokens:  ['An+1=', 'An=2', 'NBA2K18', '3.71/4.44', '39/389', 'Z2600', '10–12', 'Feb-2018', '2200-$2500/mo', '900va'] 

Freq symbol tokens:  ['..', '\xa0', ':(', '₹', '"-', 'करना', '\ufeff', '\n', ':/', '️']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['cryptocurrencies', 'x^2', 'f(x', 'demonetisation', 'ethereum', 'bhakts', 'w/', 'chsl', 'x^3', 'altcoin']
Rare low tokens:  ['youvision', 'enrollnment', 'noumber', 'sgill', 'jauhar', 'clearly"bad', 'myAIR', 'ganpat', 'rabels', 'photopicker'] 

Freq up tokens:  ['Quorans', 'Brexit', 'Redmi', 'OnePlus', 'UCEED', 'Blockchain', 'GDPR', 'Coinbase', 'BNBR', 'Machedo']
Rare up tokens:  ['Bhangani', 'GS-', 'Ma

### Look at spacy tokenizer 

In [166]:
from spacy.lang.en.stop_words import STOP_WORDS

In [167]:
print(STOP_WORDS)

{'several', 'while', 'moreover', 'without', 'since', 'whoever', 'where', 'that', 'anyway', 'another', 'became', 'can', 'doing', 'anything', 'except', 'everything', 'if', 'were', 'else', 'thereafter', 'one', 'between', 'which', 'whose', 'formerly', 'nowhere', 'made', 'or', 'six', 'now', 'perhaps', 'name', 'same', 'become', 'being', 'these', 'former', 'whenever', 'nine', 'just', 'many', 'almost', 'first', 'sometime', 'please', 'twelve', 'besides', 'keep', 'is', 'have', 'latterly', 'anyone', 're', 'there', 'upon', 'get', 'below', 'nevertheless', 'never', 'above', 'serious', 'hence', 'twenty', 'latter', 'before', 'ourselves', 'more', 'onto', 'whom', 'take', 'afterwards', 'in', 'he', 'already', 'here', 'nothing', 'does', 'namely', 'noone', 'could', 'yourselves', 'move', 'using', 'whereas', 'least', 'she', 'behind', 'they', 'none', 'together', 'ever', 'too', 'across', 'than', 'most', 'elsewhere', 'fifteen', 'sometimes', 'our', 'though', 'when', 'seeming', 'anyhow', 'about', 'mine', 'four', '

### Read embedding

In [5]:
import numpy as np
embs = {}
with open(emb_path) as f:
    count = 0
    for line in f:
        if count > 100000:
            break
        count += 1
        cut = line.find(' ')
        tok = line[:cut]
        emb = line[cut + 1:].split()
        emb = np.array([float(e) for e in emb])
        embs[tok] = emb

In [65]:
low = embs['I']
up = embs['you']

In [66]:
def sim(v1, v2):
    result = sum(v1*v2)/np.linalg.norm(v1)/np.linalg.norm(v2)
    return result

In [67]:
print(sim(low, up))

0.6627086554182071
