In [1]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
from contextlib import contextmanager
from fastprogress import master_bar, progress_bar
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F

Using TensorFlow backend.


In [2]:
# Location of the fastText crawl vectors in text (.vec) format; read by load_embeddings below.
CRAWL_EMBEDDING_PATH = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'

In [3]:
@contextmanager
def timer(name):
    """Context manager that reports the wall-clock duration of its body.

    Parameters
    -----
    name: str
        Label shown in the printed message.

    On normal exit, prints ``[<name>] done in <seconds> s`` with the elapsed
    time rounded to whole seconds. Nothing is printed if the body raises.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

In [4]:
def get_coefs(word, *arr):
    """Pair a word with its embedding coefficients.

    Parameters
    -----
    word: str
        The token.
    *arr:
        The coefficients (numbers or numeric strings).

    Returns
    -----
    tuple:
        ``(word, vector)`` where ``vector`` is a float32 numpy array built from ``arr``.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

def load_embeddings(path):
    """Read a text-format embedding file into a ``{word: vector}`` dict.

    Each data line is ``word v1 v2 ... vn`` separated by single spaces.

    Parameters
    -----
    path: str
        Path to the embedding text file.

    Returns
    -----
    dict:
        Maps each word to a float32 numpy array. If a word occurs more than
        once, the last occurrence wins.

    Notes
    -----
    * A leading word2vec/fastText style header line (``<count> <dim>``: exactly
      two integer fields on the first line) is skipped. Previously it was stored
      as a bogus word with a 1-element vector.
    * Blank lines are skipped instead of producing an ``''`` entry with an
      empty vector.
    * The file is decoded as UTF-8 explicitly rather than with the platform
      default encoding.
    """
    embeddings = {}
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            fields = line.strip().split(' ')
            if fields == ['']:
                continue  # blank line
            if line_no == 0 and len(fields) == 2 and all(s.isdigit() for s in fields):
                continue  # "<vocab size> <dimension>" header, not a word vector
            embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return embeddings

In [5]:
# Load the crawl vectors into a {word: vector} dict and report how long the load took.
with timer('crawl'):
    crawl_emb_dict = load_embeddings(CRAWL_EMBEDDING_PATH)
# Run a garbage-collection pass right after the load.
gc.collect()

[crawl] done in 84 s


11

In [6]:
%%time
# 9.9G  (author's note; presumably the memory footprint at this point — TODO confirm)
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Comment text of the training rows.
x_train = train['comment_text']
# Binary label: 1 where target >= 0.5, else 0.
y_train = np.where(train['target'] >= 0.5, 1, 0)
# The raw target plus the five auxiliary score columns.
y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
x_test = test['comment_text']
# Drop the full frames; only the columns extracted above are kept.
del train, test
gc.collect()

CPU times: user 8.69 s, sys: 713 ms, total: 9.4 s
Wall time: 8.29 s


In [7]:
import re
# With the apostrophe in this list, words containing ' would be split apart even though many of
# them exist in the embedding, so the apostrophe alone was taken out of the list.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x: str) -> str:
    """Surround every symbol listed in ``puncts`` with single spaces.

    Parameters
    -----
    x: str
        Raw text.

    Returns
    -----
    str:
        The text with each occurrence of a listed symbol replaced by
        ``' <symbol> '``. Symbols are processed in list order; existing
        whitespace is not collapsed.
    """
    spaced = x
    for symbol in puncts:
        # str.replace leaves the text unchanged when the symbol is absent
        spaced = spaced.replace(symbol, f' {symbol} ')
    return spaced

# Same symbols as `puncts`, with "''" prepended.
# NOTE(review): the first entry is two consecutive apostrophes, not a single one — confirm this is intended.
puncts_apos = ["''", ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

# A byte-for-byte duplicate definition of clean_text used to sit here and silently shadowed the
# identical definition earlier in this cell; it was removed so the function is defined only once.

def clean_text_apos(x: str) -> str:
    """Surround every symbol listed in ``puncts_apos`` with single spaces.

    Parameters
    -----
    x: str
        Raw text.

    Returns
    -----
    str:
        The text with each occurrence of a listed symbol replaced by
        ``' <symbol> '``. Symbols are processed in list order, so the
        two-apostrophe entry is handled first.
    """
    for punct in puncts_apos:
        if punct in x:
            x = x.replace(punct, ' {} '.format(punct))
    return x




In [8]:
import operator
from typing import Dict, List
def build_vocab(texts: pd.Series) -> Dict[str, int]:
    """Count whitespace-separated tokens over a column of texts.

    Parameters
    -----
    texts: pandas.Series
        Column of text strings. (The previous annotation said DataFrame, but
        the function has always required a Series of strings.)

    Returns
    -----
    dict:
        Token -> number of occurrences, with keys in first-seen order.

    Raises
    -----
    AttributeError
        If an element is not a string (e.g. NaN).
    """
    vocab: Dict[str, int] = {}
    # Tokenise one text at a time instead of first materialising an array that
    # holds the token list of every text.
    for text in texts.values:
        for word in text.split():
            vocab[word] = vocab.get(word, 0) + 1
    return vocab

def check_coverage(vocab: Dict[str, int], embeddings_index: Dict) -> List[tuple]:
    """Report how much of the vocabulary is covered by the embedding.

    Prints two percentages: the share of distinct words found in
    ``embeddings_index``, and the share of all word occurrences (weighted by
    count) found there. An empty vocabulary is reported as 0.00% instead of
    raising ZeroDivisionError.

    Parameters
    -----
    vocab: dict
        Word -> occurrence count.
    embeddings_index: dict
        Word -> vector mapping; only key membership is checked.

    Returns
    -----
    list:
        ``(word, count)`` tuples for the words missing from
        ``embeddings_index``, ordered by count, highest first.
    """
    nb_known_vocab = 0
    nb_known_words = 0
    nb_unknown_words = 0
    unknown_words = {}
    for word, count in vocab.items():
        # Explicit membership test instead of a bare `except:` around indexing
        if word in embeddings_index:
            nb_known_vocab += 1
            nb_known_words += count
        else:
            unknown_words[word] = count
            nb_unknown_words += count

    total_words = nb_known_words + nb_unknown_words
    vocab_ratio = nb_known_vocab / float(len(vocab)) if vocab else 0.0
    text_ratio = float(nb_known_words) / total_words if total_words else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))

    # Ascending stable sort followed by a reversal: kept as-is so that words with
    # equal counts come out in the same relative order as before.
    return sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

In [9]:
# Three NLTK stemmers, instantiated once at module level; process_stemmer tries them
# in the order Porter -> Lancaster -> Snowball.
from nltk.stem import PorterStemmer
p_stemmer = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
l_stemmer = LancasterStemmer()
from nltk.stem import SnowballStemmer
s_stemmer = SnowballStemmer("english")

In [10]:
import copy
def edits1(word):
    """Return the set of strings one edit away from ``word``.

    An edit is a single-character deletion, a swap of two adjacent characters,
    a replacement of one character by a lowercase ASCII letter, or an insertion
    of a lowercase ASCII letter at any position. Because a character may be
    replaced by itself, a non-empty ``word`` is part of the result.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # insertion at this position
        for letter in alphabet:
            candidates.add(head + letter + tail)
        if not tail:
            continue
        rest = tail[1:]
        # deletion of the character at this position
        candidates.add(head + rest)
        # replacement of the character at this position
        for letter in alphabet:
            candidates.add(head + letter + rest)
        # transposition of this character with the next one
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])
    return candidates

def known(words, embed):
    """Return the set of items from ``words`` that are present in ``embed``."""
    return {candidate for candidate in words if candidate in embed}

def spellcheck(word, word_rank_dict):
    """Pick the best-ranked known word one edit away from ``word``.

    Parameters
    -----
    word: str
        Token to correct.
    word_rank_dict: dict
        Word -> rank; the candidate with the smallest rank wins.

    Returns
    -----
    str:
        The candidate from ``edits1(word)`` that is a key of ``word_rank_dict``
        and has the lowest rank.

    Raises
    -----
    ValueError
        If no candidate is a key of ``word_rank_dict`` (``min`` of an empty set).
    """
    candidates = known(edits1(word), word_rank_dict)
    return min(candidates, key=word_rank_dict.__getitem__)


# Replacement strings for individual symbols; process_stemmer looks up a whole token here when
# its case variants are not in the embedding.
# NOTE(review): the key '“' appears to be listed twice (with the same value) — confirm and drop one.
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }
import unicodedata
def _stem_vector(stemmer, token, embed):
    """Return embed's vector for ``stemmer.stem(token)``, or None if there is none."""
    try:
        stem = stemmer.stem(token)
    except Exception:
        # The old fallback always called token.decode('utf-8'), which only exists on
        # bytes; for a str token it turned any stemmer error into an AttributeError.
        # A str token that cannot be stemmed is now simply treated as "no match".
        if not isinstance(token, bytes):
            return None
        stem = stemmer.stem(token.decode('utf-8'))
    return embed.get(stem, None)


def _fallback_vector(token, embed):
    """Look ``token`` up via case variants, punct_mapping and the three stemmers, in that order."""
    for variant in (token.lower(), token.upper(), token.capitalize()):
        vector = embed.get(variant, None)
        if vector is not None:
            return vector

    corr_word = punct_mapping.get(token, None)
    if corr_word is not None:
        vector = embed.get(corr_word, None)
        if vector is not None:
            return vector

    for stemmer in (p_stemmer, l_stemmer, s_stemmer):
        vector = _stem_vector(stemmer, token, embed)
        if vector is not None:
            return vector
    return None


def _normalize_small_capitals(word):
    """Rewrite small-capital look-alike characters; return ``(new_word, changed)``."""
    char_list = []
    changed = False
    for char in word:
        try:
            uni_name = unicodedata.name(char)
        except ValueError:
            # characters without a Unicode name are dropped
            continue

        if 'LATIN LETTER SMALL CAPITAL' in uni_name:
            # the Unicode name ends with the plain letter, e.g. "... SMALL CAPITAL A"
            char = uni_name[-1]
            changed = True
        if 'CYRILLIC SMALL LETTER GHE WITH STROKE' in uni_name:
            char = 'F'
            changed = True

        char_list.append(char)
    return ''.join(char_list), changed


def process_stemmer(vocab, embed):
    """Alias out-of-vocabulary words to the vector of a closely related known word.

    For every word in ``vocab`` that has no vector in ``embed``, the following
    lookups are tried in order and the first hit is stored as ``embed[word]``:
    lower-case, upper-case, capitalized, ``punct_mapping`` replacement, Porter
    stem, Lancaster stem, Snowball stem. If all fail and the word contains
    small-capital look-alike characters, those characters are rewritten and the
    rewritten word is tried as-is and then through the same sequence
    (second pass).

    Parameters
    -----
    vocab: dict
        Word -> count; only the keys are used.
    embed: dict
        Word -> vector. Modified in place: aliases are added as new keys, so
        an alias added for one word can be found by lookups for later words.

    Returns
    -----
    tuple:
        ``(embed, oov_word_set)``: the same ``embed`` object, and the set of
        words for which no vector could be found.
    """
    oov_word_set = set()
    for word in vocab.keys():
        if embed.get(word, None) is not None:
            continue

        vector = _fallback_vector(word, embed)

        if vector is None:
            legit_word, changed = _normalize_small_capitals(word)
            if changed:
                # second pass, on the rewritten word
                vector = embed.get(legit_word, None)
                if vector is None:
                    vector = _fallback_vector(legit_word, embed)

        if vector is None:
            oov_word_set.add(word)
        else:
            embed[word] = vector

    return embed, oov_word_set

def process_spellcheck(vocab, embed, word_rank_dict, oov_set):
    """Alias remaining OOV words to the vector of their spell-corrected form.

    Parameters
    -----
    vocab: dict
        Word -> count; only the keys (and their order) are used.
    embed: dict
        Word -> vector. Modified in place.
    word_rank_dict: dict
        Word -> rank, passed through to ``spellcheck``.
    oov_set: set
        Words to process; vocabulary words outside this set are skipped.

    Returns
    -----
    dict:
        The same ``embed`` object, with ``embed[word]`` set for every OOV word
        whose correction has a vector.
    """
    for word in vocab.keys():
        if word not in oov_set:
            continue

        try:
            suggestion = spellcheck(word, word_rank_dict)
        except ValueError:
            # spellcheck takes min() over the known candidates and raises ValueError
            # when there are none. Only that case is skipped; the previous bare
            # `except:` also hid every other error.
            continue

        vector = embed.get(suggestion, None)
        if vector is not None:
            embed[word] = vector

    return embed

def make_word_rank(embed):
    """Map each item yielded by iterating ``embed`` to its 0-based position.

    For a dict this gives key -> insertion index. If an item occurs more than
    once, its last position is kept.
    """
    return {word: position for position, word in enumerate(embed)}

In [11]:
# Stack the train and test comments into one Series with a fresh index; missing comments become "".
concat_desc = pd.concat([x_train, x_test], ignore_index=True).fillna("")

In [12]:
%%time
# Put spaces around the symbols in `puncts` for every comment (apostrophes are left attached).
processed_concat_desc = concat_desc.apply(lambda x: clean_text(x))

CPU times: user 11.7 s, sys: 260 ms, total: 12 s
Wall time: 12 s


In [13]:
%%time
# Token -> count over all cleaned train and test comments.
vocab = build_vocab(processed_concat_desc)

CPU times: user 27.6 s, sys: 1.22 s, total: 28.8 s
Wall time: 29.5 s


In [14]:
# Number of distinct tokens.
len(vocab)

482096

In [15]:
# Run a garbage-collection pass; the displayed value is the return value of gc.collect().
gc.collect()

0

In [16]:
%%time
# Baseline coverage of the raw embedding, before any aliases are added.
# `oov` is a list of (word, count) pairs, most frequent first.
oov = check_coverage(vocab, crawl_emb_dict)

Found embeddings for 53.24% of vocab
Found embeddings for  98.91% of all text
CPU times: user 353 ms, sys: 19.7 ms, total: 373 ms
Wall time: 371 ms


In [17]:
# The 100 most frequent out-of-vocabulary tokens before aliasing.
oov[:100]

[('_', 65327),
 ("Trump's", 25361),
 ("aren't", 22714),
 ("Don't", 21779),
 ("wouldn't", 21158),
 ("wasn't", 19896),
 ("You're", 14954),
 ("Let's", 14817),
 ("He's", 12654),
 ("couldn't", 12060),
 ("There's", 11290),
 ("let's", 10409),
 ("what's", 10359),
 ("shouldn't", 10267),
 ("hasn't", 8539),
 ("What's", 8478),
 ("Canada's", 8409),
 ("you've", 8138),
 ('`', 7591),
 ("weren't", 6634),
 ("Here's", 6307),
 ("Obama's", 6232),
 ("They're", 5807),
 ("one's", 5607),
 ("people's", 5597),
 ("you'd", 5440),
 ("we'll", 5293),
 ("they've", 5179),
 ("We're", 5156),
 ("Can't", 4993),
 ("they'll", 4990),
 ("we've", 4944),
 ("today's", 4798),
 ("Trudeau's", 4645),
 ("who's", 4599),
 ("Isn't", 4441),
 ("Alaska's", 4127),
 ("God's", 3650),
 ("he'll", 3471),
 ("ain't", 3241),
 ("women's", 3182),
 ("Didn't", 3120),
 ("Doesn't", 3119),
 ("they'd", 3073),
 ("She's", 3060),
 ("world's", 3032),
 ("America's", 2962),
 ("he'd", 2908),
 ("Clinton's", 2901),
 ("You've", 2833),
 ("We've", 2758),
 ("else's", 25

In [18]:
%%time
# Step 1: alias OOV words via case variants, punct_mapping and stemming.
# process_stemmer adds the aliases to crawl_emb_dict in place and returns the words still missing.
crawl_emb_dict, oov = process_stemmer(vocab, crawl_emb_dict)
# Step 2: rank every key by its position in the (now extended) dict; spellcheck prefers the
# lowest rank. Presumably the file lists frequent words first — TODO confirm.
word_rank = make_word_rank(crawl_emb_dict)
# Step 3: alias the remaining OOV words to their best edit-distance-1 correction, if one exists.
crawl_emb_dict = process_spellcheck(vocab, crawl_emb_dict, word_rank, oov)
# Re-measure coverage; `oov` is rebound from a set of words to a list of (word, count) pairs.
oov = check_coverage(vocab, crawl_emb_dict)

Found embeddings for 84.36% of vocab
Found embeddings for  99.87% of all text
CPU times: user 32.9 s, sys: 44 ms, total: 32.9 s
Wall time: 32.9 s


In [19]:
# The 100 most frequent tokens that are still out of vocabulary after aliasing.
oov[:100]

[('theglobeandmail', 1420),
 ('nationalpost', 438),
 ('2gTbpns', 381),
 ('denverpost', 339),
 ('civilbeat', 285),
 ('RangerMC', 276),
 ('garycrum', 266),
 ('BCLibs', 260),
 ('cashapp24', 237),
 ('dailycaller', 217),
 ('washingtontimes', 210),
 ('Cheetolini', 203),
 ('Tridentinus', 201),
 ('Ontariowe', 184),
 ('financialpost', 179),
 ('MAGAphants', 178),
 ('Nageak', 173),
 ("O'Leary's", 170),
 ('scientificamerican', 161),
 ('motleycrew', 161),
 ('907AK', 158),
 ('ncronline', 158),
 ('talkingpointsmemo', 155),
 ('motherjones', 153),
 ('Outsider77', 151),
 ('Putrumpski', 150),
 ('diverdave', 148),
 ('Mahawker', 148),
 ('TheDonald', 140),
 ('antifluoridationists', 140),
 ('staradvertiser', 133),
 ('Lazeelink', 131),
 ('Pandora17', 128),
 ('22moneybay', 125),
 ('Bozievich', 125),
 ('thedailybeast', 122),
 ('covfefe', 121),
 ('RadirD', 121),
 ('skyofblue', 121),
 ('gubmut', 119),
 ('muckamuck', 118),
 ('conservativereview', 115),
 ('americanthinker', 115),
 ('vancouversun', 114),
 ('McWynnet

In [22]:
# Spot check: display the vector stored under the key 'hasn'.
# NOTE(review): the execution count jumps from 19 to 22 here — re-run top to bottom before sharing.
crawl_emb_dict['hasn']

array([ 6.9900e-02, -6.0200e-01, -1.2310e-01, -5.6850e-01,  3.8700e-02,
       -9.7500e-02,  3.6540e-01,  2.3150e-01, -1.0831e+00,  1.6100e-02,
       -1.6860e-01,  2.8690e-01, -6.5800e-02,  8.4400e-02, -1.3490e-01,
        1.1190e-01, -9.8720e-01,  1.2010e-01,  3.2180e-01,  8.1100e-02,
       -6.7800e-02,  5.3900e-02, -8.5600e-02, -1.4750e-01, -2.0620e-01,
        2.1600e-02,  1.6360e-01, -1.1330e-01,  1.3640e-01,  5.0080e-01,
       -2.1800e-01, -7.1100e-02, -3.4560e-01, -5.9890e-01,  1.0000e-01,
        3.4670e-01, -8.1900e-02, -2.7520e-01, -1.9740e-01,  6.1670e-01,
        8.6100e-02,  1.9800e-02,  1.5950e-01,  5.8010e-01, -1.3000e-03,
       -3.8500e-02,  1.4070e-01, -2.3600e-02,  3.1500e-02, -2.1140e-01,
       -3.9000e-02,  4.4750e-01,  6.0760e-01, -2.3100e-01, -1.6230e-01,
        1.1500e-02, -7.7400e-02,  1.0060e-01,  2.3250e-01,  3.4400e-02,
        1.7830e-01, -1.0450e-01, -2.1590e-01, -2.6670e-01, -3.1260e-01,
       -1.7290e-01, -1.8380e-01,  7.4600e-02,  1.7000e-02, -9.95

In [23]:
# The vocabulary is not used after this point; drop the name and run a collection pass.
del vocab
gc.collect()

760

In [24]:
import joblib
# Roughly 7.67 -> 9.5 (author's note; units not stated, presumably GB of memory — TODO confirm)
# Persist the extended embedding dict (original vectors plus the aliases added above).
# NOTE(review): this writes into ../input, the same directory tree the inputs are read from —
# confirm that location is writable in the target environment.
with open('../input/crawl_emb_processed.joblib', 'wb') as f:
    joblib.dump(crawl_emb_dict, f)