In [157]:
import Levenshtein
import numpy as np
import time
import pandas as pd
from nltk.util import ngrams
from gensim.models import Word2Vec, KeyedVectors

In [68]:
# Record the consumed time when ran a code block.
class Timer(object):
    def __init__(self, block_name, prefix="----->"):
        self.block_name = block_name
        self.prefix = prefix

    def __enter__(self):
        print(self.prefix+"Started '"+self.block_name+"' block...")
        self.time_start = time.time()

    def __exit__(self, exc_type, exc_val, exc_tb):
        elapsed_time = round(time.time() - self.time_start, 2)
        print(self.prefix+"Finished '"+self.block_name+"' block, time used:", str(elapsed_time)+"s.")

In [3]:
def edit_set_ratio(a, b):
    return Levenshtein.setratio(a.split(), b.split())

In [4]:
def edit_set_ratio1(a, b):
    return Levenshtein.setratio(a, b)

In [5]:
a = ["he she me"] * 5000000
b = ["she hate he"] * 5000000
# print(edit_set_ratio(a[0], b[0]))
# print(edit_set_ratio(a[1], b[1]))
with Timer("veced"):
    print(np.vectorize(edit_set_ratio)(a, b))

----->Started 'veced' block...
[0.77777778 0.77777778 0.77777778 ... 0.77777778 0.77777778 0.77777778]
----->Finished 'veced' block, time used: 14.3s.


In [6]:
t = pd.DataFrame({
    "a": [["he", "she", "me"]] * 5000000,
    "b": [["he", "she", "me"]] * 5000000
})
# with Timer("non-veced"):
#     print(t.apply(lambda x: edit_set_ratio1(x["a"], x["b"]), axis=1))

----->Started 'non-veced' block...
----->Finished 'non-veced' block, time used: 2.05s.


KeyboardInterrupt: 

In [7]:
a = [1, 2, 3]
b = [4, 5, 6]
print(list(map(max, a, b)))

[4, 5, 6]


In [28]:
def _unigrams(words):
    """
        Input: a list of words, e.g., ["I", "am", "Denny"]
        Output: a list of unigram
    """
    assert type(words) == list
    return words

# One line time: 5.04 µs ± 272 ns
def _bigrams(words, join_string='_', skip=0):
    """
      Input: a list of words, e.g., ["I", "am", "Denny"]
      Output: a list of bigram, e.g., ["I_am", "am_Denny"]
      I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L-1):
            for k in range(1,skip+2):
                if i+k < L:
                    lst.append(join_string.join([words[i], words[i+k]]) )
    else:
        # set it as unigram
        lst = _unigrams(words)
    return lst


# One line time: 5.89 µs ± 196 ns
def _trigrams(words, join_string='_', skip=0):
    """
      Input: a list of words, e.g., ["I", "am", "Denny"]
      Output: a list of trigram, e.g., ["I_am_Denny"]
      I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L-2):
            for k1 in range(1,skip+2):
                for k2 in range(1,skip+2):
                    if i+k1 < L and i+k1+k2 < L:
                        lst.append(join_string.join([words[i], words[i+k1], words[i+k1+k2]]) )
    else:
        # set it as bigram
        lst = _bigrams(words, join_string, skip)
    return lst

def unichars(text):
    return _unigrams(list(text))

def bichars(text):
    return _bigrams(list(text))

def trichars(text):
    return _trigrams(list(text))

def unigrams(text):
    return _unigrams(text.split())

def bigrams(text):
    return _bigrams(text.split())

def trigrams(text):
    return _trigrams(text.split())

In [202]:
from scipy.stats import skew, kurtosis

In [76]:
def vectorize_func(func):
    def do_it(arr):
        return [func(_) for _ in arr]
    return do_it

In [91]:
import numba

In [183]:
df = pd.DataFrame({
    "q_ngram": [["1", "2", "3"]]*100000,
    "t_ngram": [["1", "4", "2", "5"]]*100000,
})

In [None]:
with Timer("pandas apply"):
    df['q-len'] = df['a'].apply(lambda x: len(x))
with Timer("vec"):
    df['q-len'] = vectorize_func(len)(df["a"])
with Timer("np vec"):
    df['q-len'] = np.vectorize(len)(df["a"])

In [94]:
prefix = "test"
with Timer("extract ngram length"):
    df['%s_q-len' % prefix] = np.vectorize(len)(df["q_ngram"])
    df['%s_t-len' % prefix] = np.vectorize(len)(df["t_ngram"])
    df['%s_q-log-len' % prefix] = np.vectorize(lambda x: np.log1p(len(x)))(df["q_ngram"])
    df['%s_t-log-len' % prefix] = np.vectorize(lambda x: np.log1p(len(x)))(df["t_ngram"])

----->Started 'extract ngram length' block...
----->Finished 'extract ngram length' block, time used: 3.01s.


In [97]:
print(df[:2])

     q_ngram       t_ngram  test_q-len  test_t-len  test_q-log-len  \
0  [a, b, c]  [a, b, c, d]           3           4        1.386294   
1  [a, b, c]  [a, b, c, d]           3           4        1.386294   

   test_t-log-len  
0        1.609438  
1        1.609438  


In [98]:
prefix = "test"
with Timer("extract ngram length"):
    df['%s_q-len' % prefix] = df['q_ngram'].apply(lambda x: len(x))
    df['%s_t-len' % prefix] = df['t_ngram'].apply(lambda x: len(x))
    df['%s_q-log-len' % prefix] = df['q_ngram'].apply(lambda x: np.log1p(len(x)))
    df['%s_t-log-len' % prefix] = df['t_ngram'].apply(lambda x: np.log1p(len(x)))

----->Started 'extract ngram length' block...
----->Finished 'extract ngram length' block, time used: 3.69s.


In [99]:
print(df[:2])

     q_ngram       t_ngram  test_q-len  test_t-len  test_q-log-len  \
0  [a, b, c]  [a, b, c, d]           3           4        1.386294   
1  [a, b, c]  [a, b, c, d]           3           4        1.386294   

   test_t-log-len  
0        1.609438  
1        1.609438  


In [131]:
def try_divide(x, y):
    """try to divide two numbers"""
    return float(x) / y if y != 0.0 else 0.0
print(try_divide(10, 0))
def jaccard_ratio(a, b):
    a, b = set(a), set(b)
    c = a & b
    return try_divide(float(len(c)), (len(a) + len(b) - len(c)))
def dice_ratio(a, b):
    a, b = set(a), set(b)
    return try_divide(2 * len(a & b), len(a) + len(b))

0.0


In [144]:
def jaccard_ratio(a, b):
    a, b = set(a), set(b)
    c = a & b
    return try_divide(len(c), len(a) + len(b) - len(c))

In [146]:
def jaccard_ratio1(a, b):
    a, b = set(a), set(b)
    c = a & b
    return try_divide(float(len(c)), (len(a) + len(b) - len(c)))

In [127]:
# dice_ratio1 = lambda x, y: try_divide(2 * len(set(a)&set(b)), (len(set(a)) + len(set(b))))

In [147]:
print(jaccard_ratio1(["i", "hate", "him"], ["she", "hate", "me"]))

0.2


In [152]:
with Timer("similay"):
    df['%s_dice-ratio'% prefix] = df.apply(lambda x: jaccard_ratio(x['q_ngram'], x['t_ngram']), axis=1)

----->Started 'similay' block...
----->Finished 'similay' block, time used: 3.92s.


In [153]:
print(df[:2])

     q_ngram       t_ngram  test_dice-ratio
0  [a, b, c]  [a, b, c, d]             0.75
1  [a, b, c]  [a, b, c, d]             0.75


In [154]:
with Timer("similay"):
    df['%s_dice-ratio'% prefix] = np.vectorize(dice_ratio)(df['q_ngram'], df['t_ngram'])

----->Started 'similay' block...
----->Finished 'similay' block, time used: 0.14s.


In [155]:
print(df[:2])

     q_ngram       t_ngram  test_dice-ratio
0  [a, b, c]  [a, b, c, d]             0.75
1  [a, b, c]  [a, b, c, d]             0.75


In [159]:
model = KeyedVectors.load("./stage1/output/word2vec.kv")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [195]:
vector_size = model.vectors.shape[1]
print(vector_size)

200


In [190]:
@numba.jit
def wmd(s1, s2):
    # 从文档整体上来考虑两个文档之间的相似性，这种技术称为词移距离（WMD）,https://blog.csdn.net/qrlhl/article/details/78512598
    # https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.wmdistance
    dis = np.nan_to_num(model.wmdistance(s1, s2))
    return dis if dis < 100 else 100

In [191]:
with Timer("wmd"):
    df['%s_wmd'% prefix] = df.apply(lambda x: wmd(x['q_ngram'], x['t_ngram']), axis=1)

----->Started 'wmd' block...


Compilation is falling back to object mode WITH looplifting enabled because Function "wmd" failed type inference due to: Untyped global name 'model': cannot determine Numba type of <class 'gensim.models.keyedvectors.Word2VecKeyedVectors'>

File "<ipython-input-190-b93a11fadb36>", line 5:
def wmd(s1, s2):
    <source elided>
    # https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.wmdistance
    dis = np.nan_to_num(model.wmdistance(s1, s2))
    ^

  @numba.jit

File "<ipython-input-190-b93a11fadb36>", line 2:
@numba.jit
def wmd(s1, s2):
^

  self.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "<ipython-input-190-b93a11fadb36>", line 2:
@numba.jit
def wmd(s1, s2):
^



----->Finished 'wmd' block, time used: 45.67s.


In [187]:
print(df[:2])

     q_ngram       t_ngram  test_wmd
0  [1, 2, 3]  [1, 4, 2, 5]   0.02527
1  [1, 2, 3]  [1, 4, 2, 5]   0.02527


In [192]:
with Timer("wmd"):
    df['%s_wmd1'% prefix] = np.vectorize(wmd)(df['q_ngram'], df['t_ngram'])

----->Started 'wmd' block...
----->Finished 'wmd' block, time used: 41.16s.


In [189]:
print(df[:2])

     q_ngram       t_ngram  test_wmd  test_wmd1
0  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527
1  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527


In [196]:
def sent2vec(words):
    # 计算embedding均值
    global vector_size
    # assert isinstance(words, list)
    return np.nan_to_num(
        np.array([
            np.zeros(vector_size)]+[model[w] for w in words if w in model]
        ).mean(axis=0)
    )

In [203]:
with Timer("wmd"):
    df['q_sen_vec'] = df["q_ngram"].apply(sent2vec)
    df['t_sen_vec'] = df["t_ngram"].apply(sent2vec)

----->Started 'wmd' block...
----->Finished 'wmd' block, time used: 9.22s.


In [204]:
print(df[:2])

     q_ngram       t_ngram  test_wmd  test_wmd1  \
0  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527   
1  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527   

                                           q_sen_vec  \
0  [-0.003970941637817305, 0.0014471168105956167,...   
1  [-0.003970941637817305, 0.0014471168105956167,...   

                                           t_sen_vec  
0  [-0.0023434016096871347, 0.0011191037250682712...  
1  [-0.0023434016096871347, 0.0011191037250682712...  


In [208]:
with Timer("skew"):
    df['%s_q-skew' % prefix] = df['q_sen_vec'].apply(lambda x :skew(x))

----->Started 'skew' block...
----->Finished 'skew' block, time used: 21.92s.


In [209]:
print(df[:2])

     q_ngram       t_ngram  test_wmd  test_wmd1  \
0  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527   
1  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527   

                                           q_sen_vec  \
0  [-0.003970941637817305, 0.0014471168105956167,...   
1  [-0.003970941637817305, 0.0014471168105956167,...   

                                           t_sen_vec  test_q-skew  
0  [-0.0023434016096871347, 0.0011191037250682712...    -0.188391  
1  [-0.0023434016096871347, 0.0011191037250682712...    -0.188391  


In [210]:
with Timer("skew"):
    df['%s_q-skew1' % prefix] = np.vectorize(skew)(df['q_sen_vec'])

----->Started 'skew' block...
----->Finished 'skew' block, time used: 18.96s.


In [211]:
print(df[:2])

     q_ngram       t_ngram  test_wmd  test_wmd1  \
0  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527   
1  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527   

                                           q_sen_vec  \
0  [-0.003970941637817305, 0.0014471168105956167,...   
1  [-0.003970941637817305, 0.0014471168105956167,...   

                                           t_sen_vec  test_q-skew  \
0  [-0.0023434016096871347, 0.0011191037250682712...    -0.188391   
1  [-0.0023434016096871347, 0.0011191037250682712...    -0.188391   

   test_q-skew1  
0     -0.188391  
1     -0.188391  


In [217]:
with Timer("kurtosis"):
        df['%s_q-kurtosis' % prefix] = df['q_sen_vec'].apply(lambda x :kurtosis(x))
        df['%s_t-kurtosis' % prefix] = df['t_sen_vec'].apply(lambda x :kurtosis(x))

----->Started 'kurtosis' block...
----->Finished 'kurtosis' block, time used: 28.63s.


In [213]:
print(df[:2])

     q_ngram       t_ngram  test_wmd  test_wmd1  \
0  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527   
1  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527   

                                           q_sen_vec  \
0  [-0.003970941637817305, 0.0014471168105956167,...   
1  [-0.003970941637817305, 0.0014471168105956167,...   

                                           t_sen_vec  test_q-skew  \
0  [-0.0023434016096871347, 0.0011191037250682712...    -0.188391   
1  [-0.0023434016096871347, 0.0011191037250682712...    -0.188391   

   test_q-skew1  test_q-kurtosis  test_t-kurtosis  
0     -0.188391        -0.129808        -0.303393  
1     -0.188391        -0.129808        -0.303393  


In [224]:
with Timer("kurtosis"):
        df['%s_q-kurtosis1' % prefix] = np.vectorize(kurtosis)(df['q_sen_vec'])
        df['%s_t-kurtosis1' % prefix] = np.vectorize(kurtosis)(df['t_sen_vec'])

----->Started 'kurtosis' block...
----->Finished 'kurtosis' block, time used: 29.27s.


In [None]:
with Timer("kurtosis"):
        df['%s_q-kurtosis1' % prefix] = [kurtosis(_) for _ in df['q_sen_vec']]
        df['%s_t-kurtosis1' % prefix] = [kurtosis(_) for _ in df['t_sen_vec']]

In [225]:
print(df[:2])

     q_ngram       t_ngram  test_wmd  test_wmd1  \
0  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527   
1  [1, 2, 3]  [1, 4, 2, 5]   0.02527    0.02527   

                                           q_sen_vec  \
0  [-0.003970941637817305, 0.0014471168105956167,...   
1  [-0.003970941637817305, 0.0014471168105956167,...   

                                           t_sen_vec  test_q-skew  \
0  [-0.0023434016096871347, 0.0011191037250682712...    -0.188391   
1  [-0.0023434016096871347, 0.0011191037250682712...    -0.188391   

   test_q-skew1  test_q-kurtosis  test_t-kurtosis  test_q-kurtosis1  \
0     -0.188391        -0.129808        -0.303393         -0.129808   
1     -0.188391        -0.129808        -0.303393         -0.129808   

   test_t-kurtosis1  
0         -0.303393  
1         -0.303393  


In [232]:
from scipy.spatial.distance import cosine, jaccard, cityblock, canberra, euclidean, minkowski, braycurtis, mahalanobis

def cosine_distance(a, b):
    s = np.linalg.norm(a) * np.linalg.norm(b)
    return np.nan_to_num(np.dot(a, b)/s) if s != 0.0 else 0.0


def jaccard_distance(a, b):
    return np.nan_to_num(jaccard(a, b))


def braycurtis_distance(a, b):
    return np.nan_to_num(braycurtis(a, b))


def canberra_distance(a, b):
    return np.nan_to_num(canberra(a, b))


def cityblock_distance(a, b):
    return np.nan_to_num(cityblock(a, b))


def euclidean_distance(a, b):
    return np.nan_to_num(euclidean(a, b))


def minkowski_distance(a, b, p=1):
    return np.nan_to_num(minkowski(a, b, p))

def mahalanobis_distance(a, b):
    return np.nan_to_num(mahalanobis(a, b))

In [233]:
with Timer("extract embed distance"):
        df['%s_cosine-distance' % prefix] = df.apply(lambda x: cosine_distance(x["q_sen_vec"], x["t_sen_vec"]), axis=1)
        df['%s_jaccard-distance' % prefix] = df.apply(lambda x: jaccard_distance(x["q_sen_vec"], x["t_sen_vec"]), axis=1)
        df['%s_canberra-distance' % prefix] = df.apply(lambda x: canberra_distance(x["q_sen_vec"], x["t_sen_vec"]), axis=1)
        df['%s_cityblock-distance' % prefix] = df.apply(lambda x: cityblock_distance(x["q_sen_vec"], x["t_sen_vec"]), axis=1)
        df['%s_euclidean-distance' % prefix] = df.apply(lambda x: euclidean_distance(x["q_sen_vec"], x["t_sen_vec"]), axis=1)
        df['%s_minkowski-distance' % prefix] = df.apply(lambda x: minkowski_distance(x["q_sen_vec"], x["t_sen_vec"]), axis=1)
        df['%s_braycurtis-distance' % prefix] = df.apply(lambda x: braycurtis_distance(x["q_sen_vec"], x["t_sen_vec"]), axis=1)

----->Started 'extract embed distance' block...
----->Finished 'extract embed distance' block, time used: 67.9s.


In [236]:
with Timer("extract embed distance"):
    df['%s_cosine-distance' % prefix] = np.vectorize(cosine_distance)(df["q_sen_vec"], df["t_sen_vec"])
    df['%s_cityblock-distance' % prefix] = np.vectorize(cityblock_distance)(df["q_sen_vec"], df["t_sen_vec"])
    df['%s_euclidean-distance' % prefix] = np.vectorize(euclidean_distance)(df["q_sen_vec"], df["t_sen_vec"])
    df['%s_jaccard-distance' % prefix] = np.vectorize(jaccard_distance)(df["q_sen_vec"], df["t_sen_vec"])
    df['%s_canberra-distance' % prefix] = np.vectorize(canberra_distance)(df["q_sen_vec"], df["t_sen_vec"])
    df['%s_braycurtis-distance' % prefix] = np.vectorize(braycurtis_distance)(df["q_sen_vec"], df["t_sen_vec"])

----->Started 'extract embed distance' block...
----->Finished 'extract embed distance' block, time used: 25.11s.


In [238]:
a = np.load("stage2/input/train_v1_feature_concat.npy")
print(a[:2])

[[ 3.8000000e+01  1.8000000e+01  3.6635616e+00  2.9444390e+00
   1.2244898e-01  6.5217390e-02  2.9920635e-01  4.4801587e-01
   3.7000000e+01  1.7000000e+01  3.6375860e+00  2.8903718e+00
   3.7037037e-02  1.8867925e-02  2.9472348e-01  3.8540572e-01
   3.6000000e+01  1.6000000e+01  3.6109178e+00  2.8332133e+00
   0.0000000e+00  0.0000000e+00  2.9030600e-01  3.4875760e-01
   3.3169815e-01 -1.7374037e-01 -1.7004882e-01 -1.4810610e-01
  -1.5124854e-01  9.9992895e-01  2.2771935e+00  1.9882844e-01
   1.0000000e+00  3.8897835e+01  1.9244862e-01]
 [ 3.8000000e+01  1.5000000e+01  3.6635616e+00  2.7725887e+00
   2.5000000e-01  1.4285715e-01  3.2680443e-01  4.3620846e-01
   3.7000000e+01  1.4000000e+01  3.6375860e+00  2.7080503e+00
   7.8431375e-02  4.0816326e-02  3.2240057e-01  3.8752028e-01
   3.6000000e+01  1.3000000e+01  3.6109178e+00  2.6390574e+00
   0.0000000e+00  0.0000000e+00  3.0972785e-01  3.6288175e-01
   1.3068473e-01 -1.7374037e-01 -1.7450495e-01 -1.4810610e-01
  -1.3898596e-01  9.99

In [240]:
def f(a, b):
    print(a, b)
res1 = [[1, 2], [3, 4]]
res2 = [[5, 6], [7, 8]]
list(map(f, res1, res2))

[1, 2] [5, 6]
[3, 4] [7, 8]


[None, None]

In [246]:
a = pd.DataFrame()
if type(a) == pd.core.frame.DataFrame:
    print(True)
else:
    print(False)

True


In [263]:
from scipy.spatial.distance import cdist, braycurtis

In [271]:
print(braycurtis.__name__)
a = [[0.4, 2, 3], [2, 3, 4]]
b = [[0, 2, 3], [5, 6, 7]]
Y = list(map(braycurtis, a, b))
print(Y)

braycurtis
[0.038461538461538464, 0.3333333333333333]


In [274]:
from fuzzywuzzy import fuzz
fuzz.token_set_ratio(["a", "b", "c"], ["d", "e", "f"])

40

In [275]:
word2vec_model = {
    "1" : [1, 2, 3],
    "2": [4, 5, 6]
}

In [276]:
def sent2vec(text):
    # 计算embedding均值
    global word2vec_model
    return np.nan_to_num(
          np.array([word2vec_model[w] for w in text.split() if w in word2vec_model]
          ).mean(axis=0)
      )

In [284]:
sent2vec(" 1 2 ")
tmp = list(map(sent2vec,(["1 2", "2 1"])))
print(tmp)
np.save("tmp.npy", tmp)

[array([2.5, 3.5, 4.5]), array([2.5, 3.5, 4.5])]


In [285]:
print([] or [1, 2, 3])

[1, 2, 3]


In [450]:
df = pd.DataFrame({
    "query_id": [1, 1, 2, 2, 2, 3, 3],
    "title": ["1 2 3", "4 2 3", '3 3 2', '4 5 6', '5 5 3', '1 2 4', '4 5 6']
})

In [451]:
def findDiff(arr, ngram):
    set_list = list(map(lambda x: set(ngram(x)), arr))
    res = []
    for i, _ in enumerate(set_list):
        result = _
        for __ in set_list:
            if _ != __:
                result = result.difference(__)
        res.append(list(result))
    # print(res)
    return res

In [452]:
cal_diff_set = np.frompyfunc(f, 
    2, 1)

In [453]:
from functools import reduce

In [454]:
def reduce_diff_set_group(arr):
    return list(reduce(lambda a, b: a.difference(b), arr))

In [455]:
def reduce_diff_set(group):
    df = group.apply(lambda x: set(x.split())).values
    res = []
    for i in range(len(df)):
        if i == 0:
            res.append(reduce_diff_set_group(df))
        else:
            tmp = np.copy(df)
            tmp = tmp[np.newaxis, :]
            tmp[:, [0, i]] = tmp[:, [i, 0]]
            tmp = tmp.squeeze()
            res.append(reduce_diff_set_group(tmp))
    return res

In [457]:
grouped = df.groupby("query_id")["title"].agg(reduce_diff_set)
res = []
for _ in grouped:
    res.extend(_)
print(res)

[['1'], ['4'], ['2'], ['4', '6'], [], ['1', '2'], ['5', '6']]


In [504]:
from sklearn.metrics import roc_auc_score

In [516]:
def CalQAuc(df):
    
    auc_score = []

    for name, group in df.groupby('query_id'):
        try:
            auc_score.append(roc_auc_score(group["label"], group["prediction"]
                                       ))
        except:
            auc_score.append(0.5) 

    return np.mean(auc_score)

In [517]:
CalQAuc(pd.DataFrame({
    "query_id": [1, 1, 2, 2, 3],
    "label": [1, 0, 1, 1, 0],
    "prediction": [0.3, 0.5, 0.2, 0.1, 0.6] 
}))

[0.0, 0.5, 0.5]


0.3333333333333333

In [525]:
df = pd.DataFrame({
    "tmp":[0, 0.0, 0, 0.0],
    "tmp1": [1, 2, 0, 0]
})

df = df.loc[:,~((df==0.0).all())]
print(df)

   tmp1
0     1
1     2
2     0
3     0


In [7]:
def try_divide(x, y):
    """try to divide two numbers"""
    return float(x) / y if y != 0.0 else 0.0

In [8]:
# One line time: 4.2 µs ± 109 ns
def dice_ratio(a, b):
    a, b = set(a), set(b)
    return try_divide(2 * len(a & b), len(a) + len(b))

In [10]:
print(dice_ratio(["a", "b", "c"], "acd"))

0.6666666666666666


In [62]:
import Levenshtein
print(Levenshtein.ratio('acf e', 'de f'))

0.2222222222222222


In [61]:
print(Levenshtein.seqratio(["a", "c",  "f"," ", "e"], ["d","e"," ", "f"]))

0.2222222222222222


In [15]:
from fuzzywuzzy import fuzz

In [58]:
fuzz.ratio('Thorkel', 'Thorgier')

67

In [45]:
fuzz.ratio(["89","9", "8"], ["1", "2", "3"])

77

In [66]:
import numpy as np
np.array([
    [1, 2, 3],
    [2, 3, 4]
]).mean(axis=0)

array([1.5, 2.5, 3.5])

In [16]:
import numpy as np
from scipy.sparse import csr_matrix
A = csr_matrix([4, 0, 5])
B = csr_matrix([1, 0, 3])
print(A.multiply(B).sum())
print(A.power(2).sum())

19
41


In [5]:
A.dot(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))

array([[ 9, 12, 15],
       [21, 24, 27],
       [39, 48, 57]], dtype=int64)

In [12]:
def csr_jaccard_sim(x, y):
    xy = x.multiply(y).sum()
    x_norm = x.power(2).sum()
    y_norm = y.power(2).sum()
    return try_divide(xy, math.sqrt(x_norm)+math.sqrt(y_norm)-xy)

In [22]:
A = csr_matrix([[1, 2, 3], [4, 5, 6]], shape=(2, 3))
B = csr_matrix([[1, 2, 4], [4, 5, 7]], shape=(2, 3))

In [26]:
from sklearn.metrics.pairwise import paired_distances
import math

In [24]:
def try_divide(x, y):
    """try to divide two numbers"""
    return float(x) / y if y != 0.0 else 0.0

In [27]:
list(map(csr_jaccard_sim, A, B))

[-1.9594809496552812, -1.2820868764564912]

In [30]:
A = csr_matrix([[1, 2, 3], [4, 5, 6]], shape=(2, 3))
print(A.sum(1))

[[ 6]
 [15]]


In [31]:
import pandas as pd

In [36]:
tmp = pd.DataFrame({
    "a": [[1], [2], [3]],
    "c": [1, 2, 3]
})
print(tmp.squeeze())

     a  c
0  [1]  1
1  [2]  2
2  [3]  3


In [45]:
import numpy as np
np.array([np.array([1, 2, 3])*3, np.array([4, 5, 6])*2]).sum(0)

array([11, 16, 21])

In [49]:
import scipy
print(scipy.stats.pearsonr([1, 2, 1], [4, 1, 6]))

(-0.9176629354822472, 0.2601469382930058)


In [51]:

from sklearn.feature_extraction.text import HashingVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = HashingVectorizer(n_features=2**4)
X = vectorizer.fit_transform(corpus)
print(X.shape)
print(X.toarray())

(4, 16)
[[-0.57735027  0.          0.          0.          0.          0.
   0.          0.         -0.57735027  0.          0.          0.
   0.          0.57735027  0.          0.        ]
 [-0.81649658  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.40824829
   0.          0.40824829  0.          0.        ]
 [ 0.          0.          0.          0.         -0.70710678  0.70710678
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.        ]
 [-0.57735027  0.          0.          0.          0.          0.
   0.          0.         -0.57735027  0.          0.          0.
   0.          0.57735027  0.          0.        ]]


In [52]:
from simhash import Simhash

In [57]:
print(Simhash(['aa', "cc"]).distance(Simhash(['bb', "dd"])))

22


In [59]:
import re
from simhash import Simhash, SimhashIndex
def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]

data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}
objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)

print(index.bucket_size())

s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))

11
['1']
['1', '4']


In [60]:
import numpy as np
def lcs(seq1, seq2):
    seq1 = seq1.split()
    seq2 = seq2.split()
    dp = np.zeros((len(seq1)+1, len(seq2)+1))
    for i in range(len(seq1)):
        for j in range(len(seq2)):
            dp[i + 1][j + 1] = max(dp[i + 1][j], dp[i][j], dp[i][j + 1])
            if seq1[i] == seq2[j]:
                dp[i + 1][j + 1] = max(dp[i][j] + 1, dp[i + 1][j + 1])
    seq1_mask = [ 0 for wd in seq1]
    seq2_mask = [ 0 for wd in seq2]
    ii, jj = len(seq1), len(seq2)
    while ii != 0 and jj != 0:
        if dp[ii][jj] == dp[ii - 1][jj - 1] + 1 and seq1[ii - 1] == seq2[jj - 1]:
            seq1_mask[ii - 1] = 1
            seq2_mask[jj - 1] = 1
            ii = ii - 1
            jj = jj - 1
            continue
        if dp[ii][jj] == dp[ii - 1][jj]:
            ii = ii - 1
        elif dp[ii][jj] == dp[ii][jj - 1]:
            jj = jj - 1
        elif dp[ii][jj] == dp[ii - 1][jj - 1]:
            ii = ii - 1
            jj = jj - 1
    seq1_left = [ wd for wd, mk in zip(seq1, seq1_mask) if mk == 0]
    seq2_left = [ wd for wd, mk in zip(seq2, seq2_mask) if mk == 0]
    
    return np.max(dp), ' '.join(seq1_left), ' '.join(seq2_left)

In [65]:
print(lcs("kiss my ass kk", "fuck you my ass"))

(2.0, 'kiss kk', 'fuck you')


In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer
data = [
            'How are you? I Am fine. blar blar blar blar blar Thanks.',
            'How are you i am fine. blar blar blar blar blar than',
            'This is simhash test.',
            'How are you i am fine. blar blar blar blar blar thank1'
        ]
vec = TfidfVectorizer()
D = vec.fit_transform(data)
voc = dict((i, w) for w, i in vec.vocabulary_.items())
shs = []
for Di in D:
    print(Di.data)
    # features as list of (token, weight) tuples)
    features = zip([voc[j] for j in Di.indices], Di.data)
    shs.append(Simhash(features))


[0.17553445 0.17553445 0.17553445 0.17553445 0.17553445 0.87767223
 0.27500863]
[0.17553445 0.17553445 0.17553445 0.17553445 0.17553445 0.87767223
 0.27500863]
[0.5 0.5 0.5 0.5]
[0.17553445 0.17553445 0.17553445 0.17553445 0.17553445 0.87767223
 0.27500863]


In [78]:
class TestSimhashIndex():
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs, k=10)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)
        print(dups)

#         self.index.delete('1', Simhash(self.data[1]))
#         dups = self.index.get_near_dups(s1)
#         print(dups)

#         self.index.delete('1', Simhash(self.data[1]))
#         dups = self.index.get_near_dups(s1)
#         self.assertEqual(len(dups), 2)

#         self.index.add('1', Simhash(self.data[1]))
#         dups = self.index.get_near_dups(s1)
#         self.assertEqual(len(dups), 3)

#         self.index.add('1', Simhash(self.data[1]))
#         dups = self.index.get_near_dups(s1)
#         self.assertEqual(len(dups), 3)

In [79]:
tmp = TestSimhashIndex()
tmp.setUp()
tmp.test_get_near_dup()

['1', '4', '2']


In [94]:
from gensim.summarization import keywords

In [95]:
text = "Challenges in natural language processing frequently involve speech recognition, natural language understanding, natural language"
print(keywords.get_graph(text))


AttributeError: 'function' object has no attribute 'get_graph'

In [105]:
text = """1427 5661 29788 1427 387 2299,
372 22 1586 1025 218 165 218 27 7092 22 266 25817 4550 
2136 11 60847 9156 1077 797 27 689 3447 11146 108 1667 46829 161702 1113017 548478 24274 5062 46829."""

In [108]:
from summa import summarizer
print(len(summarizer.summarize(text)))

0


In [109]:
from summa import keywords
print(keywords.get_graph(text))

<summa.graph.Graph object at 0x1108ba518>


In [110]:
from gensim.summarization import keywords
print(keywords.get_graph("12 32 44"))

AttributeError: 'function' object has no attribute 'get_graph'