# EDA google news embedding and Spacy tokenizer

In [1]:
import os
import re
import pandas as pd
import spacy
from collections import Counter
from gensim.models import KeyedVectors
from utils import *

### Look at embedding

In [2]:
emb_path = 'embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
data_dir = cache = '../data'

train_csv = os.path.join(data_dir, 'train.csv')
test_csv = os.path.join(data_dir, 'test.csv')
emb_path = os.path.join(data_dir, emb_path)

In [3]:
embeddings_index = KeyedVectors.load_word2vec_format(emb_path, binary=True)

In [4]:
emb_tokens = embeddings_index.index2word

In [32]:
embeddings_index.vector_size

300

In [5]:
token_info(emb_tokens)

Number of tokens:  3000000
First 10 tokens:  ['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said']


In [6]:
ttemb = TokenTypes()
ttemb(emb_tokens)

Number of num tokens:  35844
Number of symbol tokens:  5172
Number of lowcase tokens:  631815
Number of upper case tokens:  2324222
Nubmer of capslock tokens:  104279


Freq num tokens:  ['1', '2', '3', '5', '4', '6', '7', '8', '9', '0']
Rare num tokens:  ['K9_Duds', 'ponceau_4R_E###', 'ctorres2@bloomberg.net', 'Twickenham_TW1', 'GSX_R####_K9', 'Jailbreak_iPhone_3GS', 'W3C_OASIS', 'Time4_Media_subsidiary', '1p_dearer', 'Audi_A4_saloon'] 

Freq symbol tokens:  ['##', '####', '$', '###', '#.#', '#-#', '%', '#,###', '#.##', '##,###']
Rare symbol tokens:  ['###:#_###:#', '=====_====', 'Ð_¸_Ð', '+##-###-##-#####', '¢_##,###,###,###', '##,###,###_##,###,###_-------------------------------------', '#########_##.####_#.#####', '#,###,###_#,###,###_-------------------------------------------', '##.#:#_##.#:#_##.#:#', '#.#####_+#.#####_#.#####_-#.#####'] 

Freq low tokens:  ['in', 'for', 'that', 'is', 'on', 'with', 'said', 'was', 'the', 'at']
Rare low tokens:  ['prince_Guido_Henckel', 'loess_soi

In [7]:
ttemb.num_tokens

['1',
 '2',
 '3',
 '5',
 '4',
 '6',
 '7',
 '8',
 '9',
 '0',
 '2nd',
 '1st',
 '3_pointer',
 '3rd',
 '3D',
 '4th',
 '3_pointers',
 '3G',
 '5th',
 '6th',
 '7th',
 '8th',
 '9th',
 'Q1',
 'F1',
 'Q3',
 'Q2',
 'Q4',
 'PS3',
 'CO2',
 'MP3',
 '4A',
 '7pm',
 'G8',
 '8pm',
 '4G',
 '3M',
 '3s',
 '5pm',
 '3A',
 'U2',
 'No.1',
 '6pm',
 '5A',
 '3pm',
 '0_Comments',
 '5K',
 'O2',
 '2pm',
 '4pm',
 '0_0',
 'E3',
 '9pm',
 '9am',
 '+1',
 '2A',
 '2B',
 'Class_4A',
 '1pm',
 '2D',
 'ESPN2',
 'F.3d',
 'AZ_az_,0_9',
 '1B',
 '1A',
 'CO2_emissions',
 'Class_3A',
 'B2B',
 'Proposition_8',
 '8am',
 'VH1',
 'G7',
 '1Q',
 'H1',
 'Class_2A',
 '4Q',
 'Windows_Phone_7',
 '3B',
 'A1',
 'Class_5A',
 '3Q',
 'MP3_player',
 'iPhone_3G',
 '3_PTR',
 '2Q',
 'PS2',
 'R2',
 '6am',
 'IPv6',
 '0_errors',
 '£_1m',
 '7am',
 'Comments_0',
 'G1',
 'P2P',
 '+##_0',
 '2b',
 '3Com',
 'omega_3',
 'V8',
 'No.2',
 'MI5',
 '1m',
 'K2',
 'P1',
 'R1',
 '1st_Class',
 '2G',
 '2GB',
 '-1',
 'Update1',
 '3am',
 'Class_1A',
 'ITV1',
 '1GB',
 '2am'

In [8]:
[t for t in ttemb.symbol_tokens if len(t) <= 2]

['##',
 '$',
 '%',
 '&',
 '•',
 '_',
 '£',
 '*',
 '#',
 '®',
 '+',
 '`',
 '@',
 '€',
 '»',
 '™',
 '=',
 '«',
 '·',
 '½',
 '©',
 '**',
 '>>',
 '~',
 '¢',
 '§',
 '¶',
 '■',
 'Â',
 '¥',
 '●',
 '::',
 '¿',
 '¤',
 ':)',
 '°',
 '^',
 '±',
 'â',
 '×',
 '¼',
 'ā',
 '♦',
 '¾',
 '¬',
 '◆',
 '²',
 '行情',
 ';)',
 '³',
 'μ',
 'Ñ',
 '¨',
 '→',
 'º',
 '¡',
 'Ð',
 'à',
 'î',
 'É',
 '月',
 '\x93',
 '__',
 '年',
 '►',
 '\x94',
 ':/',
 '†',
 'Š',
 '>',
 '※',
 'Ó',
 '¹',
 '‰',
 'іѕ',
 '★',
 '∞',
 'ó',
 '\ue06e',
 'Ø',
 '==',
 'ä',
 'ñ',
 '\uf0a7',
 '¯',
 '\uf0b7',
 '♥',
 '─',
 '▪',
 '抯',
 '≥',
 'аѕ',
 '₤',
 'Ÿ',
 '¸',
 'ª',
 '\x9d',
 'æ',
 '&&',
 'Å',
 'ㅡ',
 'ø',
 '\uf06e',
 '的',
 'Ã',
 'Ö',
 '□',
 '\uf02e',
 '◊',
 '‡',
 'Äî',
 '♣',
 'Ê',
 'þ',
 '≤',
 '⋅',
 '❑',
 'û',
 'ß',
 '♠',
 '\x81',
 'β',
 '\x90',
 '뭩',
 '荘',
 '待续',
 'α',
 'é',
 '\uf0d8',
 'è',
 'ö',
 'á',
 'Ü',
 '和',
 '‖',
 'ü',
 '\x96',
 '\uf025',
 '在',
 '\x80',
 '÷',
 'Ń',
 '√',
 '˚',
 '每',
 '\x97',
 'ï',
 'ù',
 'Ω',
 '◗',
 'å',
 'À',
 '∙',
 '了',
 '

In [84]:
mixed_tokens = []
mix_re = re.compile('^[a-zA-Z,.?]*$')
symb_re = re.compile(',.?')
for t in emb_tokens:
    if  symb_re.search(t) and not re.search('#', t):
        mixed_tokens.append(t)

In [86]:
mixed_tokens.__len__()

114

### Tokenize data and build vocab

In [9]:
train = pd.read_csv(train_csv)['question_text'].values.tolist()
test = pd.read_csv(test_csv)['question_text'].values.tolist()
data = train + test

In [10]:
%%time
unknown_tokens = []
vocab = Counter()
spacy_en = spacy.load('en')
tokenizer = spacy_en.tokenizer
emb_tokens_set = set(emb_tokens)
for question in data:
    doc = tokenizer(question)
    for token in doc:
        tok_text = token.text
        vocab.update([tok_text])
        if vocab[tok_text] == 1:
            if tok_text not in emb_tokens_set:
                unknown_tokens.append(tok_text)
        
#doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
#for token in doc:
#    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#          token.shape_, token.is_alpha, token.is_stop)

CPU times: user 2min 34s, sys: 256 ms, total: 2min 35s
Wall time: 2min 33s


In [11]:
vocab_tokens = dict(vocab.most_common()).keys()

In [12]:
ttvocab = TokenTypes()
ttvocab(vocab_tokens)

Number of num tokens:  29530
Number of symbol tokens:  1727
Number of lowcase tokens:  132699
Number of upper case tokens:  113484
Nubmer of capslock tokens:  15169


Freq num tokens:  ['2', '2017', '2018', '1', '3', '10', '5', '4', '12', '6']
Rare num tokens:  ['10–12', '61850', 'Feb-2018', '2200-$2500/mo', '4000/mo', '999999', '450cc', '78.53', '900va', '6740'] 

Freq symbol tokens:  ['?', ',', '.', '"', '-', ')', '(', '/', "'", '&']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['the', 'to', 'a', 'in', 'is', 'of', 'and', 'do', 'are', 'for']
Rare low tokens:  ['noumber', 'sml', 'sgill', 'jauhar', 'clearly"bad', 'myAIR', 'cybershot', 'ganpat', 'rabels', 'photopicker'] 

Freq up tokens:  ['What', 'I', 'How', 'Why', 'Is', 'Can', 'Which', 'Do', 'If', 'India']
Rare up tokens:  ['Dime', 'Entanglements', 'Tamilar', 'Flurry', 'Routines', 'ULTIMATELY', 'Hb', 'Generate', 'Khayami', 'Thetis'] 

Freq caps tokens:  ['

### Capslock tokens

In [13]:
caps_tokens = ttvocab.caps_tokens
emotional_caps_tokens = []
for t in caps_tokens:
    if t not in emb_tokens_set and t.lower() in emb_tokens_set:
        emotional_caps_tokens.append(t)
print(len(emotional_caps_tokens))

433


In [14]:
emotional_caps_tokens

['ENGG',
 'DUCAT',
 'BLAT',
 'IYO',
 'BODS',
 'ETHER',
 'DEMAT',
 'VOLTE',
 'SYLLABUS',
 'NARCOS',
 'VANI',
 'VEDAS',
 'PRELIMS',
 'ANONYMOUSLY',
 'ODES',
 'GOLEM',
 'RESONANCE',
 'CEOL',
 'STUPIDEST',
 'DISCOM',
 'DEPR',
 'BABI',
 'WASE',
 'EUCALYPTUS',
 'GEFORCE',
 'DUOS',
 'BIBS',
 'RATNA',
 'COPR',
 'MATHEMATICAL',
 'QUANT',
 'HYMEN',
 'WEKA',
 'ARTICLESHIP',
 'NAVEL',
 'CALCULATE',
 'GOTRA',
 'VLOG',
 'FREQUENCIES',
 'STANKY',
 'STANK',
 'URDU',
 'CRISPER',
 'ROSCA',
 'SMITE',
 'CLOAK',
 'SMOK',
 'ACHEIVEMENTS',
 'IRRESPONSIBILITY',
 'PERSISTED',
 'MEDICORE',
 'HANGERS',
 'GASSES',
 'EXPLAINATION',
 'COMMAS',
 'INSITED',
 'WEBCAMS',
 'CENSE',
 'SABOT',
 'ALLERGENS',
 'IRRITANTS',
 'TOXICS',
 'VOCATION',
 'INFL',
 'OLYMPIADS',
 'JUCHE',
 'UNWORKABLE',
 'BRAHMAN',
 'BLASPHEMING',
 'COCKSUCKERS',
 'KINEMATIC',
 'VISCOSITY',
 'PREDESTINATION',
 'KASPERSKY',
 'ASPIRANTS',
 'MANAGMENT',
 'COLLATE',
 'JOKA',
 'OCAL',
 'AMPLIFIER',
 'ANSD',
 'POEPLE',
 'NAMO',
 'ESCORTS',
 'EYELASHES',
 '

### Unknown tokens

In [15]:
unk_vocab = {}
for w in unknown_tokens:
    unk_vocab[w] = vocab[w]
unk_vocab = Counter(unk_vocab)

In [16]:
print('Total number of tokens: ', sum(vocab.values()))
print('Number of unique tokens: ', len(vocab))
print('Total number of unknown tokens: ', sum(unk_vocab.values()))
print('Number of unique unknown tokens: ', len(unknown_tokens))
print('Percent of unknown tokens: ', 100 * sum(unk_vocab.values())/sum(vocab.values()))

Total number of tokens:  19837964
Number of unique tokens:  268601
Total number of unknown tokens:  3986997
Number of unique unknown tokens:  121310
Percent of unknown tokens:  20.097813465131804


In [17]:
unk_common = dict(unk_vocab.most_common())
print('Number of 100 most common unk tokens: ', sum(list(unk_common.values())[:100]))

Number of 100 most common unk tokens:  3749654


In [18]:
unk_common

{'?': 1440556,
 'to': 423732,
 'a': 421001,
 'of': 347358,
 'and': 263940,
 ',': 240502,
 '.': 99408,
 '"': 75829,
 "'s": 73188,
 '-': 58940,
 ')': 58022,
 '(': 55035,
 '/': 32971,
 "'": 20582,
 '2017': 9010,
 '’s': 8778,
 '2018': 7589,
 ':': 7266,
 '10': 6945,
 'n’t': 6897,
 '”': 4035,
 '“': 4029,
 '12': 3551,
 '100': 3062,
 '20': 2849,
 '’m': 2683,
 '12th': 2651,
 '15': 2384,
 '!': 2311,
 '30': 2129,
 '[': 2039,
 ']': 2027,
 '…': 2009,
 '50': 2007,
 '18': 1974,
 '11': 1863,
 '16': 1512,
 ';': 1493,
 '14': 1426,
 '}': 1425,
 '17': 1343,
 '40': 1315,
 'favourite': 1300,
 '2016': 1291,
 '13': 1271,
 '’ve': 1256,
 '25': 1185,
 '10th': 1105,
 '2019': 1098,
 'and/or': 1098,
 '60': 1066,
 'bitcoin': 1045,
 'colour': 1022,
 '{': 993,
 'centre': 914,
 '..': 888,
 '90': 885,
 'Quorans': 882,
 '500': 875,
 '1000': 871,
 '200': 870,
 '‘': 861,
 'cryptocurrency': 856,
 '11th': 855,
 'Snapchat': 848,
 'e.g.': 808,
 '24': 802,
 'C++': 750,
 '’re': 734,
 '80': 733,
 '21': 729,
 'travelling': 724,
 '

In [19]:
ttunk = TokenTypes()
ttunk(list(unk_common.keys()))

Number of num tokens:  28150
Number of symbol tokens:  1563
Number of lowcase tokens:  63525
Number of upper case tokens:  36140
Nubmer of capslock tokens:  3458


Freq num tokens:  ['2017', '2018', '10', '12', '100', '20', '12th', '15', '30', '50']
Rare num tokens:  ['10–12', '61850', 'Feb-2018', '2200-$2500/mo', '4000/mo', '999999', '450cc', '78.53', '900va', '6740'] 

Freq symbol tokens:  ['?', ',', '.', '"', '-', ')', '(', '/', "'", ':']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['to', 'a', 'of', 'and', 'n’t', 'favourite', 'and/or', 'bitcoin', 'colour', 'centre']
Rare low tokens:  ['enrollnment', 'noumber', 'sgill', 'jauhar', 'clearly"bad', 'myAIR', 'cybershot', 'ganpat', 'rabels', 'photopicker'] 

Freq up tokens:  ['Quorans', 'Snapchat', 'C++', 'Brexit', 'Redmi', 'KVPY', 'Paytm', 'Ethereum', 'Whatis', 'INTJ']
Rare up tokens:  ['Kiccha', 'Bhangani', 'GS-', 'Magick++', 'NotJesseChen', 'THEORETICAL', 

In [20]:
ttunk.num_tokens

['2017',
 '2018',
 '10',
 '12',
 '100',
 '20',
 '12th',
 '15',
 '30',
 '50',
 '18',
 '11',
 '16',
 '14',
 '17',
 '40',
 '2016',
 '13',
 '25',
 '10th',
 '2019',
 '60',
 '90',
 '500',
 '1000',
 '200',
 '11th',
 '24',
 '80',
 '21',
 '70',
 '19',
 '2000',
 '2020',
 '22',
 '300',
 '23',
 '35',
 '150',
 '45',
 '2015',
 '26',
 '28',
 '75',
 '21st',
 '9/11',
 '32',
 '400',
 '95',
 '27',
 '2014',
 '1.5',
 '65',
 '20th',
 '85',
 '250',
 '99',
 '120',
 '10,000',
 '55',
 '5000',
 '2008',
 '29',
 '19th',
 '2012',
 '360',
 '72',
 '2013',
 '1/2',
 '20s',
 '2010',
 '600',
 '1500',
 'x^2',
 '2.5',
 '10000',
 '3000',
 '36',
 '350',
 '2011',
 '800',
 '48',
 '64',
 '33',
 '31',
 '180',
 '52',
 '3.5',
 '160',
 '90s',
 '100,000',
 '38',
 '80s',
 '130',
 '700',
 '34',
 '110',
 '42',
 '2.0',
 '10k',
 '^2',
 '51',
 '37',
 '30s',
 '98',
 '2009',
 '1980s',
 '18th',
 '2001',
 '2007',
 '1947',
 '1,000',
 '4000',
 '1990',
 '1984',
 '1980',
 '911',
 '1960s',
 '39',
 '2050',
 '1945',
 '2003',
 '140',
 '2002',
 '0.5',