In [25]:
from sklearn.externals.joblib import Parallel, delayed
from math import sqrt
from time import sleep
from heapq import nlargest
import pandas as pd
import re
import nltk
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import nltk 
nltk.data.path.append("/var/lib/docker/nltk_data")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import seaborn as sns
from textblob import TextBlob
from textblob import Word
from pywsd.utils import lemmatize_sentence



Warming up PyWSD (takes ~10 secs)... took 3.694720506668091 secs.


In [22]:
def clean_str(s):
    """Normalise raw text into lowercase alphanumeric tokens separated by single spaces.

    Processing order:
      1. Every character other than ASCII letters, digits, ``( ) , ! ?``,
         the apostrophe and the backtick is replaced by a space (this covers
         newlines, underscores and all non-ASCII characters).
      2. The contraction suffixes 's, 've, n't, 're, 'd and 'll are split off
         the preceding word (``don't`` -> ``do n't``).
      3. ``, ! ( ) ?`` become spaces so they act as token boundaries.
      4. Any whitespace-delimited token containing ``xx`` or ``XX`` is replaced
         by the placeholder ``xxx`` (presumably redaction masks such as
         ``XXXX-1234`` -- confirm against the data).
      5. The remaining punctuation (apostrophes, backticks) is deleted.
      6. Runs of whitespace are collapsed, then the text is stripped and
         lowercased.

    Whitespace is collapsed *after* all punctuation has been removed, so the
    result never contains consecutive spaces.

    Args:
        s (str): Text to clean.

    Returns:
        str: The cleaned, lowercased text.
    """
    # Whitelist filter: anything not explicitly allowed turns into a separator.
    s = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", s)

    # Split contractions one pattern at a time, in this fixed order, so that
    # each substitution sees the output of the previous one.
    for suffix in (r"'s", r"'ve", r"n't", r"'re", r"'d", r"'ll"):
        s = re.sub(suffix, r" \g<0>", s)

    # Sentence punctuation only serves as a token boundary; it is never kept.
    s = re.sub(r"[,!()?]", " ", s)

    # Mask tokens containing a run of two or more x / X characters. This must
    # run before apostrophes are deleted so that e.g. "x'x" is not fused into
    # a maskable "xx".
    s = re.sub(r"\S*(x{2,}|X{2,})\S*", "xxx", s)

    # Drop whatever punctuation is left (apostrophes and backticks).
    s = re.sub(r"[^\w\s]", "", s)

    # Collapse whitespace last: the removals above can leave adjacent spaces.
    s = re.sub(r"\s+", " ", s)
    return s.strip().lower()

In [29]:
# NLTK's English stop-word list plus two capitalised forms it does not include.
stop_words = {*stopwords.words('english'), 'The', 'This'}

# Shared NLP helpers: `lemmatizer` is read by perform_lemm; `ps` is created
# here but not referenced by any other cell shown in this notebook.
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [27]:
def perform_lemm(sentence):
    """Return ``sentence`` with every token lemmatised as a verb.

    The text is split with ``word_tokenize``; each token is passed to the
    module-level ``lemmatizer`` with ``pos='v'`` and the results are joined
    back together with single spaces.
    """
    verb_lemmas = []
    for token in word_tokenize(sentence):
        verb_lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return ' '.join(verb_lemmas)

In [19]:
# Load the ads CSV into `data`; later cells read its 'value' column as the raw text.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns shown by data.head(10)
# look like index columns written out by earlier to_csv calls -- confirm whether
# they should be dropped or read via index_col.
data = pd.read_csv('data/ads_en_us.csv')

In [23]:
%%time
data['value_clean'] = Parallel(n_jobs=-1, verbose=10)(delayed(clean_str)(i) for i in data['value'].tolist())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1789s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0066s.) Setting batch_size=120.
[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1330s.) Setting batch_size=360.
[Parallel(n_jobs=-1)]: Done 666 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 2706 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 7386 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 12786 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 18186 tasks      | 

CPU times: user 1.32 s, sys: 384 ms, total: 1.7 s
Wall time: 17.1 s


[Parallel(n_jobs=-1)]: Done 97601 out of 97601 | elapsed:   17.0s finished


In [30]:
%%time
data['value_clean_lemm'] = Parallel(n_jobs=-1, verbose=10)(delayed(perform_lemm)(i) for i in data['value_clean'].tolist())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1685s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0268s.) Setting batch_size=28.
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0707s.) Setting batch_size=158.
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 1033 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 3087 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 5457 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 7827 tasks      | ela

CPU times: user 1.79 s, sys: 376 ms, total: 2.17 s
Wall time: 56.6 s


[Parallel(n_jobs=-1)]: Done 97601 out of 97601 | elapsed:   56.5s finished


In [34]:
%%time
data['value_processed_lemm'] = data.value_clean.apply(perform_lemm)

CPU times: user 4min 3s, sys: 133 ms, total: 4min 3s
Wall time: 4min 3s


In [31]:
# Preview the first 10 rows to compare the raw text with the derived text columns.
data.head(10)

Unnamed: 0.1,Unnamed: 0,catid,id,subcatid,value,value_clean,value_clean_lemm
0,0,2,45493,27,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,overview why affirma rehabilitation our innov...,overview why affirma rehabilitation our innova...
1,1,2,45499,27,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,overview why affirma rehabilitation our innov...,overview why affirma rehabilitation our innova...
2,2,2,45522,27,Overview:\r\n\r\nUnder general supervision by ...,overview under general supervision by the supe...,overview under general supervision by the supe...
3,3,2,45543,134,Be your own boss working under a Strong Brand ...,be your own boss working under a strong brand ...,be your own boss work under a strong brand wit...
4,4,2,45545,134,Be your own boss working under a Strong Brand ...,be your own boss working under a strong brand ...,be your own boss work under a strong brand wit...
5,5,2,45546,134,Be your own boss working under a Strong Brand ...,be your own boss working under a strong brand ...,be your own boss work under a strong brand wit...
6,6,2,45549,134,Be your own boss working under a Strong Brand ...,be your own boss working under a strong brand ...,be your own boss work under a strong brand wit...
7,7,2,45550,134,Be your own boss working under a Strong Brand ...,be your own boss working under a strong brand ...,be your own boss work under a strong brand wit...
8,8,2,45552,134,Be your own boss working under a Strong Brand ...,be your own boss working under a strong brand ...,be your own boss work under a strong brand wit...
9,9,2,45553,134,Be your own boss working under a Strong Brand ...,be your own boss working under a strong brand ...,be your own boss work under a strong brand wit...


In [2]:
# Minimal joblib example: sqrt of the first ten perfect squares, run with a
# single worker (n_jobs=1).
Parallel(n_jobs=1)(delayed(sqrt)(k ** 2) for k in range(10))

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]

In [16]:
# Scheduling demo: 16 half-second sleeps fanned out over all cores. With the
# 8 workers reported in the log this completes in about two rounds (~1 s).
# `r` receives the sleep() return values (all None).
r = Parallel(n_jobs=-1, verbose=10)(delayed(sleep)(.5) for _ in range(16))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  16 | elapsed:    0.5s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done   5 out of  16 | elapsed:    0.5s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done   7 out of  16 | elapsed:    0.5s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   9 out of  16 | elapsed:    1.0s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  11 out of  16 | elapsed:    1.0s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  13 out of  16 | elapsed:    1.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:    1.0s finished


In [8]:
def producer():
    """Lazily yield the integers 0 through 5.

    A ``Produced <n>`` line is printed immediately before each value is
    yielded, which makes it visible when a consumer pulls items from the
    generator.
    """
    current = 0
    while current < 6:
        print('Produced %s' % current)
        yield current
        current += 1

In [12]:
# Feed Parallel from the generator to observe pre-dispatching: only about
# 1.5 * n_jobs tasks are pulled from producer() ahead of the workers.
# NOTE(review): with the 8 workers shown in the log that allowance exceeds the
# 6 available items, which would explain why every "Produced" line appears
# before the first task completes -- confirm.
out = Parallel(n_jobs=-1, verbose=100, pre_dispatch='1.5*n_jobs')(
    delayed(sqrt)(value) for value in producer()
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Produced 0
Produced 1
Produced 2
Produced 3
Produced 4
Produced 5
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:    0.5s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    0.6s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   4 out of   6 | elapsed:    0.6s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.6s finished


In [11]:
# Show the square roots collected in `out` by the generator-fed Parallel call.
print(out)

[0.0, 1.0, 1.4142135623730951, 1.7320508075688772, 2.0, 2.23606797749979]
