# EDA google news embedding and Spacy tokenizer

In [1]:
import os
import re
import pandas as pd
import spacy
from collections import Counter
from gensim.models import KeyedVectors
from utils import *

In [78]:
class TokenTypes:
    def __init__(self):
        self.num_tokens = []
        self.symbol_tokens = []
        self.low_tokens = []
        self.up_tokens = []
        self.caps_tokens = []
        self.merged_tokens = []
        self.num_alpha_tokens = []
        self.one_symb_tokens = []

    def __call__(self, tokens):
        num_re = re.compile('[0-9]')
        low_re = re.compile('[a-z]')
        up_re = re.compile('[A-Z]')
        caps_re = re.compile('^[A-Z]*$')
        letnum_re = re.compile('[a-zA-Z0-9]')
        merged_re = re.compile('_')
        num_alpha_re = re.compile('^[0-9]+[a-zA-Z]+$')
        one_symb_re = re.compile('^[a-zA-Z]*[^0-9a-zA-Z][a-zA-Z]*$')
        
        for t in tokens:
            if num_re.search(t):
                self.num_tokens.append(t)
            if not letnum_re.search(t):
                self.symbol_tokens.append(t)
            if low_re.match(t):
                self.low_tokens.append(t)
            if up_re.match(t):
                self.up_tokens.append(t)
            if caps_re.match(t):
                self.caps_tokens.append(t)
            if merged_re.search(t):
                self.merged_tokens.append(t)
            if num_alpha_re.match(t):
                self.num_alpha_tokens.append(t)
            if one_symb_re.match(t) and len(t) > 1 and not merged_re.search(t):
                self.one_symb_tokens.append(t)
        print('Number of num tokens: ', len(self.num_tokens))
        print('Number of symbol tokens: ', len(self.symbol_tokens))
        print('Number of lowcase tokens: ', len(self.low_tokens))
        print('Number of upper case tokens: ', len(self.up_tokens))
        print('Nubmer of capslock tokens: ', len(self.caps_tokens))
        print('Nubmer of merged tokens: ', len(self.merged_tokens))
        print('Nubmer of num alpha tokens: ', len(self.num_alpha_tokens))
        print('Nubmer of one symbol tokens: ', len(self.one_symb_tokens))

        print('\n')

        print('Freq num tokens: ', self.num_tokens[:50])
        print('Rare num tokens: ', self.num_tokens[-10:], '\n')
        print('Freq symbol tokens: ', self.symbol_tokens[:10])
        print('Rare symbol tokens: ', self.symbol_tokens[-10:], '\n')
        print('Freq low tokens: ', self.low_tokens[:10])
        print('Rare low tokens: ', self.low_tokens[-10:], '\n')
        print('Freq up tokens: ', self.up_tokens[:10])
        print('Rare up tokens: ', self.up_tokens[-10:], '\n')
        print('Freq caps tokens: ', self.caps_tokens[:10])
        print('Rare caps tokens: ', self.caps_tokens[-10:])
        print('Freq merged tokens: ', self.merged_tokens[:20])
        print('Freq num alpha tokens: ', self.num_alpha_tokens[:20])
        print('Freq one symb tokens: ', self.one_symb_tokens[:20])



### Look at embedding

In [79]:
emb_path = 'embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
data_dir = cache = '../data'

train_csv = os.path.join(data_dir, 'train.csv')
test_csv = os.path.join(data_dir, 'test.csv')
emb_path = os.path.join(data_dir, emb_path)

In [80]:
embeddings_index = KeyedVectors.load_word2vec_format(emb_path, binary=True)

In [81]:
emb_tokens = embeddings_index.index2word

In [82]:
embeddings_index.vector_size

300

In [83]:
token_info(emb_tokens)

Number of tokens:  3000000
First 10 tokens:  ['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said']


In [84]:
ttemb = TokenTypes()
ttemb(emb_tokens)

Number of num tokens:  35844
Number of symbol tokens:  5172
Number of lowcase tokens:  631815
Number of upper case tokens:  2324222
Nubmer of capslock tokens:  104279
Nubmer of merged tokens:  2070978
Nubmer of num alpha tokens:  2255
Nubmer of one symbol tokens:  22548


Freq num tokens:  ['1', '2', '3', '5', '4', '6', '7', '8', '9', '0', '2nd', '1st', '3_pointer', '3rd', '3D', '4th', '3_pointers', '3G', '5th', '6th', '7th', '8th', '9th', 'Q1', 'F1', 'Q3', 'Q2', 'Q4', 'PS3', 'CO2', 'MP3', '4A', '7pm', 'G8', '8pm', '4G', '3M', '3s', '5pm', '3A', 'U2', 'No.1', '6pm', '5A', '3pm', '0_Comments', '5K', 'O2', '2pm', '4pm']
Rare num tokens:  ['K9_Duds', 'ponceau_4R_E###', 'ctorres2@bloomberg.net', 'Twickenham_TW1', 'GSX_R####_K9', 'Jailbreak_iPhone_3GS', 'W3C_OASIS', 'Time4_Media_subsidiary', '1p_dearer', 'Audi_A4_saloon'] 

Freq symbol tokens:  ['##', '####', '$', '###', '#.#', '#-#', '%', '#,###', '#.##', '##,###']
Rare symbol tokens:  ['###:#_###:#', '=====_====', 'Ð_¸_Ð', '+##-###-##-###

In [85]:
ttemb.merged_tokens[500:]

['First_Amendment',
 'integral_part',
 'Great_Depression',
 'scoreless_innings',
 'American_Legion',
 'Finance_Committee',
 'Kobe_Bryant',
 'press_releases',
 'Public_Works',
 'Bear_Stearns',
 'Roger_Federer',
 'State_Condoleezza_Rice',
 'Election_Day',
 'Tony_Blair',
 'Notes_@',
 'Memorial_Hospital',
 '###-####_ext',
 'younger_brother',
 'heavy_rain',
 'House_Speaker',
 'running_mate',
 'Prime_Minister_Manmohan_Singh',
 'parliamentary_elections',
 'radio_stations',
 'Highway_Patrol',
 'place_undue_reliance',
 'Nova_Scotia',
 'Times_Square',
 'starting_lineup',
 'Tamil_Nadu',
 'text_message',
 'due_diligence',
 'identity_theft',
 'homeland_security',
 'Humane_Society',
 'Jesus_Christ',
 'associate_professor',
 'Commonwealth_Games',
 'Southeastern_Conference',
 'pm_EDT',
 'sport_utility',
 'Costa_Rica',
 'drunken_driving',
 'Public_Schools',
 'invisible_item_flow',
 'Funeral_Home',
 'historical_facts',
 'Chief_Operating_Officer',
 'lung_cancer',
 'Robert_Gates',
 'Gold_Coast',
 'master_

In [86]:
ttemb.num_tokens

['1',
 '2',
 '3',
 '5',
 '4',
 '6',
 '7',
 '8',
 '9',
 '0',
 '2nd',
 '1st',
 '3_pointer',
 '3rd',
 '3D',
 '4th',
 '3_pointers',
 '3G',
 '5th',
 '6th',
 '7th',
 '8th',
 '9th',
 'Q1',
 'F1',
 'Q3',
 'Q2',
 'Q4',
 'PS3',
 'CO2',
 'MP3',
 '4A',
 '7pm',
 'G8',
 '8pm',
 '4G',
 '3M',
 '3s',
 '5pm',
 '3A',
 'U2',
 'No.1',
 '6pm',
 '5A',
 '3pm',
 '0_Comments',
 '5K',
 'O2',
 '2pm',
 '4pm',
 '0_0',
 'E3',
 '9pm',
 '9am',
 '+1',
 '2A',
 '2B',
 'Class_4A',
 '1pm',
 '2D',
 'ESPN2',
 'F.3d',
 'AZ_az_,0_9',
 '1B',
 '1A',
 'CO2_emissions',
 'Class_3A',
 'B2B',
 'Proposition_8',
 '8am',
 'VH1',
 'G7',
 '1Q',
 'H1',
 'Class_2A',
 '4Q',
 'Windows_Phone_7',
 '3B',
 'A1',
 'Class_5A',
 '3Q',
 'MP3_player',
 'iPhone_3G',
 '3_PTR',
 '2Q',
 'PS2',
 'R2',
 '6am',
 'IPv6',
 '0_errors',
 '£_1m',
 '7am',
 'Comments_0',
 'G1',
 'P2P',
 '+##_0',
 '2b',
 '3Com',
 'omega_3',
 'V8',
 'No.2',
 'MI5',
 '1m',
 'K2',
 'P1',
 'R1',
 '1st_Class',
 '2G',
 '2GB',
 '-1',
 'Update1',
 '3am',
 'Class_1A',
 'ITV1',
 '1GB',
 '2am'

In [92]:
ttemb.one_symb_tokens

["'re",
 "'ve",
 "'m",
 "'ll",
 'Inc.',
 'Mr.',
 'No.',
 "'d",
 'St.',
 'Dr.',
 'Sept.',
 'Jan.',
 'Dec.',
 'Corp.',
 'Oct.',
 'Nov.',
 'Feb.',
 'Aug.',
 'Co.',
 'in.',
 'Jr.',
 'Sen.',
 'vs.',
 'Rep.',
 'J.',
 'A.',
 'Ms.',
 'E.',
 'S.',
 'W.',
 'M.',
 'Gov.',
 'Ltd.',
 'Calif.',
 'etc.',
 'L.',
 'Mrs.',
 'R.',
 'C.',
 'N.',
 'D.',
 'Rev.',
 'B.',
 "O'Brien",
 "It'sa",
 'Lt.',
 'Capt.',
 'P.',
 'v.',
 'T.',
 'Sr.',
 'G.',
 'H.',
 "I'ma",
 'F.',
 'Gen.',
 'Fla.',
 "O'Connor",
 "it'sa",
 'Pa.',
 'K.',
 "O'Neill",
 "O'Neal",
 'Yahoo!',
 'Q.',
 "isn'ta",
 'Mo.',
 'Mass.',
 "wasn'ta",
 "O'Donnell",
 'Col.',
 'Ill.',
 'Amazon.com',
 'Sacbee.com',
 'FXstreet.com',
 'Va.',
 'Mich.',
 'I.',
 "'S",
 "there'sa",
 "o'clock",
 'Maj.',
 'Minn.',
 'Md.',
 "O'Reilly",
 'V.',
 'Mar.',
 'Ga.',
 "O'Connell",
 'MLB.com',
 'Ind.',
 'Prof.',
 'Wis.',
 "O'Malley",
 "O'Leary",
 "There'sa",
 'Colo.',
 'Café',
 'Apr.',
 'La.',
 "O'Sullivan",
 'Ky.',
 'Tenn.',
 'O.',
 'Ark.',
 'Wash.',
 "Shi'ite",
 'Conn.',
 'M

### Tokenize data and build vocab

In [17]:
train = pd.read_csv(train_csv)['question_text'].values.tolist()
test = pd.read_csv(test_csv)['question_text'].values.tolist()
data = train + test

In [127]:
%%time
with open(train_csv) as f:
    for l in f:
        l.split()

CPU times: user 878 ms, sys: 12 ms, total: 889 ms
Wall time: 888 ms


In [18]:
%%time
unknown_tokens = []
vocab = Counter()
spacy_en = spacy.load('en')
tokenizer = spacy_en.tokenizer
emb_tokens_set = set(emb_tokens)
for question in data:
    doc = tokenizer(question)
    for token in doc:
        tok_text = token.text
        vocab.update([tok_text])
        if vocab[tok_text] == 1:
            if tok_text not in emb_tokens_set:
                unknown_tokens.append(tok_text)
        
#doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
#for token in doc:
#    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#          token.shape_, token.is_alpha, token.is_stop)

CPU times: user 2min 31s, sys: 243 ms, total: 2min 31s
Wall time: 2min 30s


In [19]:
vocab_tokens = dict(vocab.most_common()).keys()

In [20]:
ttvocab = TokenTypes()
ttvocab(vocab_tokens)

Number of num tokens:  29530
Number of symbol tokens:  1727
Number of lowcase tokens:  132699
Number of upper case tokens:  113484
Nubmer of capslock tokens:  15169
Nubmer of merged tokens:  714


Freq num tokens:  ['2', '2017', '2018', '1', '3', '10', '5', '4', '12', '6', '100', '20', '7', '12th', '15', '8', '30', '50', '18', '11', '9', '16', '14', '17', '40', '2016', '13', '1st', '25', '0', '10th', '2019', '2nd', '60', 'WW2', '90', '500', '1000', '200', '11th', '3D', '24', '80', '21', '70', '3rd', '19', '2000', '2020', '22']
Rare num tokens:  ['10–12', '61850', 'Feb-2018', '2200-$2500/mo', '4000/mo', '999999', '450cc', '78.53', '900va', '6740'] 

Freq symbol tokens:  ['?', ',', '.', '"', '-', ')', '(', '/', "'", '&']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['the', 'to', 'a', 'in', 'is', 'of', 'and', 'do', 'are', 'for']
Rare low tokens:  ['noumber', 'sml', 'sgill', 'jauhar', 'clearly"bad', 'myAIR', '

### Capslock tokens

In [21]:
caps_tokens = ttvocab.caps_tokens
emotional_caps_tokens = []
for t in caps_tokens:
    if t not in emb_tokens_set and t.lower() in emb_tokens_set:
        emotional_caps_tokens.append(t)
print(len(emotional_caps_tokens))

433


In [22]:
emotional_caps_tokens

['ENGG',
 'DUCAT',
 'BLAT',
 'IYO',
 'BODS',
 'ETHER',
 'DEMAT',
 'VOLTE',
 'SYLLABUS',
 'NARCOS',
 'VANI',
 'VEDAS',
 'PRELIMS',
 'ANONYMOUSLY',
 'ODES',
 'GOLEM',
 'RESONANCE',
 'CEOL',
 'STUPIDEST',
 'DISCOM',
 'DEPR',
 'BABI',
 'WASE',
 'EUCALYPTUS',
 'GEFORCE',
 'DUOS',
 'BIBS',
 'RATNA',
 'COPR',
 'MATHEMATICAL',
 'QUANT',
 'HYMEN',
 'WEKA',
 'ARTICLESHIP',
 'NAVEL',
 'CALCULATE',
 'GOTRA',
 'VLOG',
 'FREQUENCIES',
 'STANKY',
 'STANK',
 'URDU',
 'CRISPER',
 'ROSCA',
 'SMITE',
 'CLOAK',
 'SMOK',
 'ACHEIVEMENTS',
 'IRRESPONSIBILITY',
 'PERSISTED',
 'MEDICORE',
 'HANGERS',
 'GASSES',
 'EXPLAINATION',
 'COMMAS',
 'INSITED',
 'WEBCAMS',
 'CENSE',
 'SABOT',
 'ALLERGENS',
 'IRRITANTS',
 'TOXICS',
 'VOCATION',
 'INFL',
 'OLYMPIADS',
 'JUCHE',
 'UNWORKABLE',
 'BRAHMAN',
 'BLASPHEMING',
 'COCKSUCKERS',
 'KINEMATIC',
 'VISCOSITY',
 'PREDESTINATION',
 'KASPERSKY',
 'ASPIRANTS',
 'MANAGMENT',
 'COLLATE',
 'JOKA',
 'OCAL',
 'AMPLIFIER',
 'ANSD',
 'POEPLE',
 'NAMO',
 'ESCORTS',
 'EYELASHES',
 '

### Unknown tokens

In [23]:
unk_vocab = {}
for w in unknown_tokens:
    unk_vocab[w] = vocab[w]
unk_vocab = Counter(unk_vocab)

In [24]:
print('Total number of tokens: ', sum(vocab.values()))
print('Number of unique tokens: ', len(vocab))
print('Total number of unknown tokens: ', sum(unk_vocab.values()))
print('Number of unique unknown tokens: ', len(unknown_tokens))
print('Percent of unknown tokens: ', 100 * sum(unk_vocab.values())/sum(vocab.values()))

Total number of tokens:  19837964
Number of unique tokens:  268601
Total number of unknown tokens:  3986997
Number of unique unknown tokens:  121310
Percent of unknown tokens:  20.097813465131804


In [25]:
unk_common = dict(unk_vocab.most_common())
print('Number of 100 most common unk tokens: ', sum(list(unk_common.values())[:100]))

Number of 100 most common unk tokens:  3749654


In [26]:
unk_common

{'?': 1440556,
 'to': 423732,
 'a': 421001,
 'of': 347358,
 'and': 263940,
 ',': 240502,
 '.': 99408,
 '"': 75829,
 "'s": 73188,
 '-': 58940,
 ')': 58022,
 '(': 55035,
 '/': 32971,
 "'": 20582,
 '2017': 9010,
 '’s': 8778,
 '2018': 7589,
 ':': 7266,
 '10': 6945,
 'n’t': 6897,
 '”': 4035,
 '“': 4029,
 '12': 3551,
 '100': 3062,
 '20': 2849,
 '’m': 2683,
 '12th': 2651,
 '15': 2384,
 '!': 2311,
 '30': 2129,
 '[': 2039,
 ']': 2027,
 '…': 2009,
 '50': 2007,
 '18': 1974,
 '11': 1863,
 '16': 1512,
 ';': 1493,
 '14': 1426,
 '}': 1425,
 '17': 1343,
 '40': 1315,
 'favourite': 1300,
 '2016': 1291,
 '13': 1271,
 '’ve': 1256,
 '25': 1185,
 '10th': 1105,
 '2019': 1098,
 'and/or': 1098,
 '60': 1066,
 'bitcoin': 1045,
 'colour': 1022,
 '{': 993,
 'centre': 914,
 '..': 888,
 '90': 885,
 'Quorans': 882,
 '500': 875,
 '1000': 871,
 '200': 870,
 '‘': 861,
 'cryptocurrency': 856,
 '11th': 855,
 'Snapchat': 848,
 'e.g.': 808,
 '24': 802,
 'C++': 750,
 '’re': 734,
 '80': 733,
 '21': 729,
 'travelling': 724,
 '

In [27]:
ttunk = TokenTypes()
ttunk(list(unk_common.keys()))

Number of num tokens:  28150
Number of symbol tokens:  1563
Number of lowcase tokens:  63525
Number of upper case tokens:  36140
Nubmer of capslock tokens:  3458
Nubmer of merged tokens:  708


Freq num tokens:  ['2017', '2018', '10', '12', '100', '20', '12th', '15', '30', '50', '18', '11', '16', '14', '17', '40', '2016', '13', '25', '10th', '2019', '60', '90', '500', '1000', '200', '11th', '24', '80', '21', '70', '19', '2000', '2020', '22', '300', '23', '35', '150', '45', '2015', '26', '28', '75', '21st', '9/11', '32', '400', '95', '27']
Rare num tokens:  ['10–12', '61850', 'Feb-2018', '2200-$2500/mo', '4000/mo', '999999', '450cc', '78.53', '900va', '6740'] 

Freq symbol tokens:  ['?', ',', '.', '"', '-', ')', '(', '/', "'", ':']
Rare symbol tokens:  ['τῆς', 'κατὰ', 'τὴν', 'ἡμετέραν', 'χρῆσιν', 'συνηθείας', '=--', '中庸', '中庸之道', '∅⊂'] 

Freq low tokens:  ['to', 'a', 'of', 'and', 'n’t', 'favourite', 'and/or', 'bitcoin', 'colour', 'centre']
Rare low tokens:  ['enrollnment', 'noumber', 's

In [28]:
ttunk.num_tokens

['2017',
 '2018',
 '10',
 '12',
 '100',
 '20',
 '12th',
 '15',
 '30',
 '50',
 '18',
 '11',
 '16',
 '14',
 '17',
 '40',
 '2016',
 '13',
 '25',
 '10th',
 '2019',
 '60',
 '90',
 '500',
 '1000',
 '200',
 '11th',
 '24',
 '80',
 '21',
 '70',
 '19',
 '2000',
 '2020',
 '22',
 '300',
 '23',
 '35',
 '150',
 '45',
 '2015',
 '26',
 '28',
 '75',
 '21st',
 '9/11',
 '32',
 '400',
 '95',
 '27',
 '2014',
 '1.5',
 '65',
 '20th',
 '85',
 '250',
 '99',
 '120',
 '10,000',
 '55',
 '5000',
 '2008',
 '29',
 '19th',
 '2012',
 '360',
 '72',
 '2013',
 '1/2',
 '20s',
 '2010',
 '600',
 '1500',
 'x^2',
 '2.5',
 '10000',
 '3000',
 '36',
 '350',
 '2011',
 '800',
 '48',
 '64',
 '33',
 '31',
 '180',
 '52',
 '3.5',
 '160',
 '90s',
 '100,000',
 '38',
 '80s',
 '130',
 '700',
 '34',
 '110',
 '42',
 '2.0',
 '10k',
 '^2',
 '51',
 '37',
 '30s',
 '98',
 '2009',
 '1980s',
 '18th',
 '2001',
 '2007',
 '1947',
 '1,000',
 '4000',
 '1990',
 '1984',
 '1980',
 '911',
 '1960s',
 '39',
 '2050',
 '1945',
 '2003',
 '140',
 '2002',
 '0.5',

### custom tokenisation

In [48]:
num_symb_re = re.compile('^[^a-zA-Z]*[0-9]+[^a-zA-Z]*$')

In [49]:
num_alpha_re = re.compile('^[0-9]+[a-zA-Z]+$')

In [66]:
one_symb_re = re.compile('^[a-zA-Z]*[^0-9a-zA-Z][a-zA-Z]*$')

In [69]:
one_symb_re.match('!')

<_sre.SRE_Match object; span=(0, 1), match='!'>

In [101]:
def tokenize_numbers(tok):
    num_symb_re = re.compile('^[^a-zA-Z]*[0-9]+[^a-zA-Z]*$')
    sub_re = re.compile('[0-9]')
    if num_symb_re.match(tok) and len(tok) > 1:
        tok = sub_re.sub('#', tok)
    return tok
        
    
    

In [None]:
def tokenize_numbers(tok):
    num_symb_re = re.compile('^[^a-zA-Z]*[0-9]+[^a-zA-Z]*$')
    sub_re = re.compile('[0-9]')
    num_alpha_re = re.compile('^[0-9][0-9]+[a-zA-Z]+$')
    split_re = 
    if num_symb_re.match(tok) and len(tok) > 1:
        tok = sub_re.sub('#', tok)
    if num_alpha_repha_re.match(tok):
        split_re
    return tok
        

In [104]:
tokenize_numbers('23.23')

'##.##'

In [100]:
re.sub('[0-9]', '#', '2aa')

'#aa'

In [107]:
%%time
unknown_tokens = []
vocab = Counter()
spacy_en = spacy.load('en')
tokenizer = spacy_en.tokenizer
emb_tokens_set = set(emb_tokens)
num_symb_re = re.compile('^[^a-zA-Z]*[0-9]+[^a-zA-Z]*$')
sub_re = re.compile('[0-9]')
for question in data:
    doc = tokenizer(question)
    for token in doc:
        tok_text = token.text
        if num_symb_re.match(tok_text) and len(tok_text) > 1:
            tok_text = sub_re.sub('#', tok_text)
        vocab.update([tok_text])
        if vocab[tok_text] == 1:
            if tok_text not in emb_tokens_set:
                unknown_tokens.append(tok_text)
        
#doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
#for token in doc:
#    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#          token.shape_, token.is_alpha, token.is_stop)

CPU times: user 2min 36s, sys: 184 ms, total: 2min 37s
Wall time: 2min 35s


In [118]:
unk_vocab = {}
for w in unknown_tokens:
    unk_vocab[w] = vocab[w]
unk_vocab = Counter(unk_vocab)

In [119]:
print('Total number of tokens: ', sum(vocab.values()))
print('Number of unique tokens: ', len(vocab))
print('Total number of unknown tokens: ', sum(unk_vocab.values()))
print('Number of unique unknown tokens: ', len(unknown_tokens))
print('Percent of unknown tokens: ', 100 * sum(unk_vocab.values())/sum(vocab.values()))

Total number of tokens:  19837964
Number of unique tokens:  259996
Total number of unknown tokens:  3877055
Number of unique unknown tokens:  112510
Percent of unknown tokens:  19.543613447428374


In [120]:
unk_common = dict(unk_vocab.most_common())

In [121]:
unk_common

{'?': 1440556,
 'to': 423732,
 'a': 421001,
 'of': 347358,
 'and': 263940,
 ',': 240502,
 '.': 99408,
 '"': 75829,
 "'s": 73188,
 '-': 58940,
 ')': 58022,
 '(': 55035,
 '/': 32971,
 "'": 20582,
 '’s': 8778,
 ':': 7266,
 'n’t': 6897,
 '”': 4035,
 '“': 4029,
 '’m': 2683,
 '12th': 2651,
 '!': 2311,
 '[': 2039,
 ']': 2027,
 '…': 2009,
 ';': 1493,
 '}': 1425,
 'favourite': 1300,
 '’ve': 1256,
 '10th': 1105,
 'and/or': 1098,
 'bitcoin': 1045,
 'colour': 1022,
 '{': 993,
 'centre': 914,
 '..': 888,
 'Quorans': 882,
 '#/#': 873,
 '‘': 861,
 'cryptocurrency': 856,
 '11th': 855,
 'Snapchat': 848,
 'e.g.': 808,
 'C++': 750,
 '’re': 734,
 'travelling': 724,
 'counselling': 668,
 'i.e.': 592,
 'Brexit': 510,
 'btech': 507,
 'cryptocurrencies': 506,
 'blockchain': 491,
 'behaviour': 485,
 '#/##': 480,
 'upvotes': 451,
 'programme': 421,
 'Redmi': 394,
 'realise': 391,
 'defence': 373,
 '-#': 369,
 'KVPY': 361,
 'Paytm': 350,
 '21st': 347,
 '<': 337,
 'organisation': 321,
 'grey': 311,
 'cancelled': 