Commit

added a few functions to text.py

mikeqfu committed May 29, 2020
1 parent 765299a commit 40d7a32
Showing 1 changed file with 166 additions and 21 deletions.
187 changes: 166 additions & 21 deletions pyhelpers/text.py
@@ -1,16 +1,26 @@
""" Text-related """

import collections
import collections.abc
import re
import string

import numpy as np
import pandas as pd

from pyhelpers.ops import dict_to_dataframe


# Find similar string from a list of strings
def find_similar_str(x, lookup_list, processor='fuzzywuzzy', **kwargs):
"""
:param x: [str]
:param lookup_list: [iterable]
:param x: [str] a string-type variable
:param lookup_list: [iterable] (a list of) candidate strings for lookup
:param processor: [str] (default: 'fuzzywuzzy')
:param substitution_cost: [int] (default: 100)
:return: [str]
:param kwargs: optional arguments used by
`fuzzywuzzy.fuzz.token_set_ratio()` if `processor == 'fuzzywuzzy'`;
`nltk.edit_distance()` if `processor == 'nltk'`, e.g. `substitution_cost=100`
:return: [str] a string-type variable that should be similar to (or the same as) `x`
"""
assert processor in ('fuzzywuzzy', 'nltk'), '\"processor\" must be either \"fuzzywuzzy\" or \"nltk\".'

@@ -33,12 +43,12 @@ def find_similar_str(x, lookup_list, processor='fuzzywuzzy', **kwargs):
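# A usage sketch for find_similar_str (illustrative assumptions: the candidate
# list is made up, and `fuzzywuzzy` or `nltk` must be installed for the chosen
# processor):
#   find_similar_str('apple', ['app', 'apple2', 'pear'])  # likely 'apple2'
#   find_similar_str('apple', ['app', 'apple2', 'pear'], processor='nltk', substitution_cost=100)
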
# Find from a list the closest, case-insensitive, str to the given one
def find_matched_str(x, lookup_list):
"""
:param x: [str; None] (if None, return None)
:param lookup_list: [iterable]
:return: [str; list; None]
:param x: [str; None] a string-type variable; if None, the function will return None
:param lookup_list: [iterable] (a list of) candidate strings for lookup
:return: [str; list; None] a string-type variable that is case-insensitively the same as `x`
"""
# assert isinstance(x, str), "'x' must be a string."
# assert isinstance(lookup, list) or isinstance(lookup, tuple), "'lookup' must be a list/tuple"
assert isinstance(x, str), "\"x\" must be a string."
assert isinstance(lookup_list, collections.abc.Iterable), "\"lookup_list\" must be iterable"
if x == '' or x is None:
return None
else:
@@ -47,17 +57,152 @@ def find_matched_str(x, lookup_list):
        return y
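
# A usage sketch for find_matched_str (illustrative; the matching logic sits in
# the collapsed part of the diff, so the exact return value is an assumption):
#   find_matched_str('apple', ['Apple', 'pear'])  # expected: 'Apple'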


# Remove punctuation
def remove_punctuation(raw_txt):
"""
:param raw_txt: [str]
:return:
Example:
raw_txt = 'Hello world!'
remove_punctuation(raw_txt) # 'Hello world'
"""
try:
# txt = text.lower().translate(str.maketrans('', '', string.punctuation))
txt = raw_txt.translate(str.maketrans('', '', string.punctuation))
except Exception as e:
print(e)
# txt = ''.join(x for x in text.lower() if x not in string.punctuation)
txt = ''.join(x for x in raw_txt if x not in string.punctuation)
return txt


# Count the number of occurrences of each word
def count_words(raw_txt):
    """
    :param raw_txt: [str] any text
    :return: [dict] number of occurrences of each distinct word in `raw_txt`
    Example:
        raw_txt = 'This is an apple. That is a pear. Hello world!'
        count_words(raw_txt)
    """
    import nltk
    doc_text = str(raw_txt)
    tokens = nltk.word_tokenize(doc_text)
    word_count_dict = dict(collections.Counter(tokens))
    return word_count_dict
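
# For the docstring example above, nltk.word_tokenize() treats punctuation marks
# as separate tokens (assuming NLTK's default tokenizer data is available), so
# the expected result is:
#   {'This': 1, 'is': 2, 'an': 1, 'apple': 1, '.': 2, 'That': 1, 'a': 1,
#    'pear': 1, 'Hello': 1, 'world': 1, '!': 1}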


# Calculate inverse document frequency
def calculate_idf(raw_documents):
    """
    :param raw_documents: [pd.Series] a series of documents, one per element
    :return: [tuple] of length 2 - (term frequencies of each document, inverse document frequency of each token in the corpus)
    Example:
        raw_documents = pd.Series(['This is an apple.', 'That is a pear.', 'It is human being.', 'Hello world!'])
        docs_tf, corpus_idf = calculate_idf(raw_documents)
    """
    assert isinstance(raw_documents, (pd.Series, pd.DataFrame))

    docs_tf = raw_documents.map(count_words)
    tokens_in_docs = docs_tf.map(lambda x: list(x.keys()))

    n = len(raw_documents)
    tokens = [w for tokens in tokens_in_docs for w in tokens]
    tokens_counter = dict_to_dataframe(dict(collections.Counter(tokens)), 'token', 'count')
    tokens_counter['idf'] = np.log(n / (1 + tokens_counter['count'].values))

    corpus_idf = dict(zip(tokens_counter['token'], tokens_counter['idf']))

    return docs_tf, corpus_idf
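
# A worked check of the idf formula above, using the docstring example: with
# n = 4 documents, a token that appears in 1 document gets
# idf = log(4 / (1 + 1)) ≈ 0.693, while 'is', which appears in 3 documents,
# gets idf = log(4 / (1 + 3)) = 0.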


# Calculate term frequency–inverse document frequency (tf-idf)
def calculate_tf_idf(raw_documents):
    """
    :param raw_documents: [pd.Series] a series of documents, one per element
    :return: [pd.Series] tf-idf of `raw_documents`, as a series of {token: tf * idf} dicts
    Example:
        raw_documents = pd.Series(['This is an apple.', 'That is a pear.', 'It is human being.', 'Hello world!'])
        calculate_tf_idf(raw_documents)
    """
    docs_tf, corpus_idf = calculate_idf(raw_documents)
    docs_tf_idf = docs_tf.apply(lambda x: {k: v * corpus_idf[k] for k, v in x.items() if k in corpus_idf})
    return docs_tf_idf
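
# For the docstring example above, each element of the returned series maps a
# token to tf * idf; e.g. 'This' in the first document scores 1 * log(4 / 2) ≈ 0.693,
# whereas 'is' scores 0 because its idf is 0.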


# Compute Euclidean distance of two sentences
def euclidean_distance_between_texts(txt1, txt2):
    """
    :param txt1: [str] any text
    :param txt2: [str] any text
    :return: [numbers.Number] Euclidean distance between the word-count vectors of the two texts
    Example:
        s1 = 'This is an apple.'
        s2 = 'That is a pear.'
        ed = euclidean_distance_between_texts(s1, s2)  # 2.6457513110645907
    """
    if isinstance(txt1, str) and isinstance(txt2, str):
        doc_words = set(txt1.split() + txt2.split())
    else:
        assert isinstance(txt1, list) and isinstance(txt2, list)
        doc_words = set(txt1 + txt2)

    s1_count, s2_count = [], []
    for word in doc_words:
        # note: for string inputs, str.count() counts substring occurrences,
        # so e.g. 'is' is also counted inside 'This'
        s1_count.append(txt1.count(word))
        s2_count.append(txt2.count(word))

    # ed = np.sqrt(np.sum((np.array(s1_count) - np.array(s2_count)) ** 2))
    ed = np.linalg.norm(np.array(s1_count) - np.array(s2_count))

    return ed
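
# A worked check of the docstring example: with substring counting, the two
# count vectors over {'This', 'is', 'an', 'apple.', 'That', 'a', 'pear.'} differ
# by exactly 1 in each of the seven positions, so ed = sqrt(7) ≈ 2.6457513110645907.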


# Compute cosine similarity of two sentences
def cosine_similarity_between_texts(txt1, txt2, cosine_distance=False):
    """
    :param txt1: [str] any text
    :param txt2: [str] any text
    :param cosine_distance: [bool] whether to return cosine distance, i.e. 1 - cosine similarity (default: False)
    :return: [numbers.Number] cosine similarity (or distance)
    Example:
        s1 = 'This is an apple.'
        s2 = 'That is a pear.'
        cosine_similarity_between_texts(s1, s2)  # 0.6963106238227914
        cosine_similarity_between_texts(s1, s2, cosine_distance=True)  # 0.3036893761772086
    """
    if isinstance(txt1, str) and isinstance(txt2, str):
        doc_words = set(txt1.split() + txt2.split())
    else:
        assert isinstance(txt1, list) and isinstance(txt2, list)
        doc_words = set(txt1 + txt2)

    s1_count, s2_count = [], []
    for word in doc_words:
        # note: for string inputs, str.count() counts substring occurrences
        s1_count.append(txt1.count(word))
        s2_count.append(txt2.count(word))

    s1_count, s2_count = np.array(s1_count), np.array(s2_count)

    similarity = np.dot(s1_count, s2_count)
    cos_similarity = np.divide(similarity, np.linalg.norm(s1_count) * np.linalg.norm(s2_count))

    if cosine_distance:
        cos_similarity = 1 - cos_similarity

    return cos_similarity
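
# A worked check of the docstring example: with substring counting, the dot
# product of the two count vectors is 8 and their norms are sqrt(11) and
# sqrt(12), so cos_similarity = 8 / sqrt(132) ≈ 0.6963106238227914.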
