text_encodings.py
import itertools

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

def get_text_encoding(fileText, N, field):
    """Build word-to-index and index-to-word mappings from a column of a TSV file."""
    print("Creating text encodings... ")
    # Load training sentences
    df = pd.read_csv(fileText, delimiter='\t')
    # Lower-case comments and convert to lists of tokens
    comments = []
    for index, row in df.iterrows():
        tokens = nltk.word_tokenize(row[field])
        words = [w.lower() for w in tokens]
        comments.append(words)
    # Get the vocabulary, ordered by frequency
    word_freq = nltk.FreqDist(itertools.chain(*comments))
    vocab = word_freq.most_common()
    # Precompute the stopword set once instead of rebuilding it for
    # every vocabulary entry inside the list comprehension
    stop_words = set(stopwords.words('english'))
    if field == 'DESCRIPTION':
        # Keep alphabetic non-stopwords seen more than 10 times, capped at N
        index_to_word = [w for w, c in vocab
                         if w not in stop_words and w.isalpha() and c > 10]
        index_to_word = index_to_word[:N]
    elif field == 'TITLE':
        # Keep all alphabetic non-stopwords
        index_to_word = [w for w, c in vocab
                         if w not in stop_words and w.isalpha()]
    else:
        # Guard against silently returning an unbound index_to_word
        raise ValueError("Unsupported field: %s" % field)
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
    print("Vocabulary size : %d." % len(index_to_word))
    print("... Text encodings done.")
    return word_to_index, index_to_word

def get_mapped_text(df, w2i, field):
    """Map a DataFrame column to sequences of vocabulary indices."""
    sentences = []
    for index, row in df.iterrows():
        tokens = nltk.word_tokenize(row[field])
        words = [w.lower() for w in tokens if w.isalpha()]
        sentences.append(words)
    # Remove OOV tokens from the text
    for i, sent in enumerate(sentences):
        sentences[i] = [w for w in sent if w in w2i]
    # Tokens to vectors of indices; sentences have different lengths, so the
    # result is a ragged array (recent NumPy requires dtype=object for this)
    mapped_text = np.asarray([[w2i[w] for w in sent] for sent in sentences],
                             dtype=object)
    return mapped_text
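
# Example usage sketch (not part of the original file): builds a vocabulary
# from a TSV and maps its text column to index sequences. The filename
# "titles.tsv", the TITLE column, and N=5000 are hypothetical placeholders.
# NLTK resources needed at runtime (one-time downloads):
#   nltk.download('punkt'); nltk.download('stopwords')
if __name__ == '__main__':
    w2i, i2w = get_text_encoding('titles.tsv', 5000, 'TITLE')
    df = pd.read_csv('titles.tsv', delimiter='\t')
    mapped = get_mapped_text(df, w2i, 'TITLE')
    print(mapped[0])  # first title as a list of vocabulary indices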