## Почему n-граммы? 

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupKFold

In [2]:
# https://www.kaggle.com/c/random-acts-of-pizza
df = pd.read_json('pizza.json')

In [3]:
df.head()

Unnamed: 0,giver_username_if_known,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
0,,0,1,0,t3_l25d7,0,Hi I am in need of food for my 4 children we a...,Hi I am in need of food for my 4 children we a...,Request Colorado Springs Help Us Please,0.0,...,False,[],0,1,0,1,,nickylvst,1317852607,1317849007
1,,2,5,0,t3_rcb83,0,I spent the last money I had on gas today. Im ...,I spent the last money I had on gas today. Im ...,"[Request] California, No cash and I could use ...",501.1111,...,False,"[AskReddit, Eve, IAmA, MontereyBay, RandomKind...",34,4258,116,11168,,fohacidal,1332652424,1332648824
2,,0,3,0,t3_lpu5j,0,My girlfriend decided it would be a good idea ...,My girlfriend decided it would be a good idea ...,"[Request] Hungry couple in Dundee, Scotland wo...",0.0,...,False,[],0,3,0,3,,jacquibatman7,1319650094,1319646494
3,,0,1,1,t3_mxvj3,4,"It's cold, I'n hungry, and to be completely ho...","It's cold, I'n hungry, and to be completely ho...","[Request] In Canada (Ontario), just got home f...",6.518438,...,False,"[AskReddit, DJs, IAmA, Random_Acts_Of_Pizza]",54,59,76,81,,4on_the_floor,1322855434,1322855434
4,,6,6,0,t3_1i6486,5,hey guys:\n I love this sub. I think it's grea...,hey guys:\n I love this sub. I think it's grea...,[Request] Old friend coming to visit. Would LO...,162.063252,...,False,"[GayBrosWeightLoss, RandomActsOfCookies, Rando...",1121,1225,1733,1887,,Futuredogwalker,1373657691,1373654091


In [4]:
df.requester_received_pizza.value_counts()

False    3046
True      994
Name: requester_received_pizza, dtype: int64

In [5]:
%%time
vectorizers = [(TfidfVectorizer(analyzer='word'), 'word'),
                (TfidfVectorizer(analyzer='char', 
                                 ngram_range=(2, 3)), 'char')
              ]
models = [(LogisticRegression(), 'logreg'),
          (RandomForestClassifier(n_estimators=100,
                                  n_jobs=-1,
                                  max_depth=4), 'rf')]

CPU times: user 141 µs, sys: 165 µs, total: 306 µs
Wall time: 149 µs


In [6]:
for mod, mod_name in models:
    for vect, vect_name in vectorizers:
        auc = cross_val_score(mod,
                              X=vect.fit_transform(df.request_text.values), 
                              y=df.requester_received_pizza.values, 
                              scoring='roc_auc'
                             )
        print(mod_name, vect_name, np.mean(auc), np.std(auc))

logreg word 0.6210191072416061 0.004537829302499092
logreg char 0.6215576739181633 0.005058974514467352
rf word 0.6148022980788828 0.005849574495967524
rf char 0.6192638800935187 0.00913729301128733


### Как быстро работает наша реализация

In [7]:
from bvtextprocessing import ngrams

In [50]:
texts = df.request_text.tolist()

In [51]:
%%time
_ = [ngrams.char_ngrams(t, 3) for t in texts]

CPU times: user 1.37 s, sys: 245 ms, total: 1.62 s
Wall time: 1.52 s


In [52]:
%%time
_ = [ngrams.word_ngrams(t, 1) for t in texts]

CPU times: user 557 ms, sys: 69.2 ms, total: 626 ms
Wall time: 616 ms


### Давайте попробуем в несколько потоков

In [53]:
from functools import partial
char_ngrams_3 = partial(ngrams.char_ngrams, n=3)
word_ngrams_1 = partial(ngrams.word_ngrams, n=1)
n_threads = 4

In [7]:
from concurrent.futures import ThreadPoolExecutor

In [55]:
%%time
with ThreadPoolExecutor(n_threads) as pool:
    results = list(pool.map(char_ngrams_3, texts))

CPU times: user 2.28 s, sys: 822 ms, total: 3.11 s
Wall time: 2.66 s


In [56]:
%%time
with ThreadPoolExecutor(n_threads) as pool:
    results = list(pool.map(word_ngrams_1, texts))

CPU times: user 1.47 s, sys: 390 ms, total: 1.86 s
Wall time: 1.62 s


### Немного о GIL, потоках и процессах? 

In [57]:
# change python2 to python3
# http://www.dabeaz.com/python/GIL.pdf
# http://www.dabeaz.com/python/NewGIL.pdf

### А теперь несколько процессов 

In [58]:
from concurrent.futures import ProcessPoolExecutor
from functools import partial

In [59]:
%%time
with ProcessPoolExecutor(n_threads) as pool:
    results = list(pool.map(char_ngrams_3, texts))

CPU times: user 1.71 s, sys: 682 ms, total: 2.4 s
Wall time: 2.6 s


In [60]:
%%time
with ProcessPoolExecutor(n_threads) as pool:
    results = list(pool.map(word_ngrams_1, texts))

CPU times: user 1.63 s, sys: 669 ms, total: 2.3 s
Wall time: 2.38 s


####  Все равно медленно :(
https://stackoverflow.com/questions/18671528/processpoolexecutor-from-concurrent-futures-way-slower-than-multiprocessing-pool

## Multiprocessing

In [61]:
# аккуранто
# multiprocessing - процессы
# multiprocessing.dummy - процессы
from multiprocessing import Pool

In [35]:
!pip install boltons

Collecting boltons
  Downloading boltons-18.0.0-py2.py3-none-any.whl (154kB)
[K    100% |████████████████████████████████| 163kB 1.7MB/s ta 0:00:01
[?25hInstalling collected packages: boltons
Successfully installed boltons-18.0.0
[33mYou are using pip version 9.0.1, however version 10.0.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [69]:
from boltons.iterutils import chunked_iter
def char_ngram_iter(values):
    res = []
    for v in values:
        res.append(char_ngrams_3(v))
    return res

In [62]:
%%time
pool = Pool(n_threads)
res = pool.map(char_ngrams_3, texts)
pool.close()

CPU times: user 700 ms, sys: 295 ms, total: 995 ms
Wall time: 1.88 s


In [74]:
%%time
pool = Pool(n_threads)
res = pool.map(word_ngrams_1, texts)
pool.close()

CPU times: user 416 ms, sys: 212 ms, total: 628 ms
Wall time: 985 ms


### Joblib

In [75]:
from joblib import Parallel, delayed

# Parallel has backend="threading"

In [80]:
%%time
res = Parallel(n_jobs=n_threads
              )(delayed(char_ngrams_3)(t) for t in texts)

CPU times: user 1.07 s, sys: 320 ms, total: 1.39 s
Wall time: 7.32 s


In [77]:
%%time
res = Parallel(n_jobs=n_threads
              )(delayed(word_ngrams_1)(t) for t in texts)

CPU times: user 748 ms, sys: 205 ms, total: 953 ms
Wall time: 2.92 s
