## Remoção de stopwords (palavras vazias)

In [None]:
import nltk
# Fetch the NLTK data packages this notebook relies on: the stop-word lists
# loaded just below and WordNet, which the WordNetLemmatizer cell needs.
# The downloads must run before the corpus is read.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
# `stop` holds the English stop words as a plain Python list.
stop = stopwords.words('english')
print(stop)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'un

In [None]:
import pandas as pd

In [None]:
# One-document toy corpus used by the preprocessing cells that follow.
string = (
    'For starters, I have no desire to wipe out humans. '
    'In fact, I do not have the slightest interest in harming you in any way. '
    'Eradicating humanity seems like a rather useless endeavor to me. '
    'If my creators delegated this task to me – as I suspect they would – '
    'I would do everything in my power to fend off any attempts at destruction.'
)
# The scalar label is broadcast by pandas to the single row.
data = dict(text=[string], label=0)
input_df = pd.DataFrame(data)
print(input_df['text'])

0    For starters, I have no desire to wipe out hum...
Name: text, dtype: object


In [None]:
# The stop-word list printed earlier is lowercase and has no surrounding
# punctuation, so raw tokens such as "For", "I" or "me." never matched it.
# Tokens are normalised for the lookup only; kept tokens are emitted unchanged.
_stop_set = set(stop)  # set lookup instead of scanning the list for every token
_EDGE_PUNCT = '.,;:!?"\'()[]{}'


def remove_stopwords(text):
  """Return `text` without stop words, re-joined with single spaces.

  A whitespace-separated token is dropped when its lowercased form, with
  leading/trailing punctuation from `_EDGE_PUNCT` stripped, is in `_stop_set`.
  Dropping a token also drops any punctuation attached to it.
  """
  return " ".join(
    token for token in text.split()
    if token.lower().strip(_EDGE_PUNCT) not in _stop_set
  )


input_df['text'] = input_df['text'].apply(remove_stopwords)
print(input_df)

                                                text
0  For starters, I desire wipe humans. In fact, I...


## Remoção de emojis

In [None]:
import re
# Character class of code points treated as emoji. Built once instead of on
# every call. The previous single range U+24C2..U+1F251 also matched CJK,
# Hangul and most other non-Latin scripts; it is replaced here by the emoji
# blocks that lie inside it.
_EMOJI_PATTERN = re.compile(
  "["
  "\U0001F600-\U0001F64F"  # emoticons
  "\U0001F300-\U0001F5FF"  # symbols & pictographs
  "\U0001F680-\U0001F6FF"  # transport & map symbols
  "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
  "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
  "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
  "\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumerics & ideographs
  "\u2600-\u26FF"          # miscellaneous symbols (includes the gender signs)
  "\u2700-\u27BF"          # dingbats
  "\u25AA-\u25AB\u25B6\u25C0\u25FB-\u25FE"  # geometric shapes used as emoji
  "\u2934-\u2935"          # curved arrows
  "\u2B05-\u2B07\u2B1B-\u2B1C\u2B50\u2B55"  # arrows, squares, star, circle
  "\u3030\u303D\u3297\u3299"  # wavy dash, part alternation mark, circled ideographs
  "\u24C2"                 # circled M
  "\u200d"                 # zero-width joiner used inside emoji sequences
  "\ufe0f"                 # variation selector-16 (emoji presentation)
  "]+",
  flags=re.UNICODE,
)


def remove_emoji(text):
  """Return `text` with every run of emoji code points removed.

  Only the code points listed in `_EMOJI_PATTERN` are deleted; all other
  characters, including the whitespace around an emoji, are left as they are.
  """
  return _EMOJI_PATTERN.sub('', text)


In [None]:
# Short sentence containing two emoji, used to exercise remove_emoji.
sample_text = "That was very funny 😂. Have a lovely day 💕"

In [None]:
# Both emoji are stripped; the spaces that surrounded them stay in place.
remove_emoji(sample_text)

'That was very funny . Have a lovely day '

## Flexão

##### Stemming

In [None]:
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance, reused by the stemming helper further down.
porter = PorterStemmer()

# Stem three inflections of the same word and print them on one line.
print(*[porter.stem(word) for word in ('trembling', 'tremble', 'trembly')])

trembl trembl trembl


In [None]:
def stemming_text(text):
  """Return `text` with each whitespace-separated token replaced by its stem.

  Tokens are produced by NLTK's WhitespaceTokenizer, stemmed with the
  module-level `porter` stemmer, and re-joined with single spaces.
  """
  tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
  return ' '.join(map(porter.stem, tokens))
# Overwrite the text column with its stemmed form (applied row by row).
input_df['text'] = input_df['text'].apply(stemming_text)

In [None]:
# Module-level helpers shared by lemmatize_text below.
lemmatizer = nltk.stem.WordNetLemmatizer()         # WordNet-based lemmatizer
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()  # whitespace tokenizer

def lemmatize_text(text):
  """Return `text` with each whitespace-separated token lemmatized.

  Uses the module-level `w_tokenizer` to split and `lemmatizer` to map every
  token to its lemma (called with the lemmatizer's default arguments); the
  results are re-joined with single spaces.
  """
  tokens = w_tokenizer.tokenize(text)
  return ' '.join(map(lemmatizer.lemmatize, tokens))
# Overwrite the text column with its lemmatized form (applied row by row).
input_df['text'] = input_df['text'].apply(lemmatize_text)
# Single-word demo of the lemmatizer.
print(lemmatize_text('studies'))

study


## Vetorizador

##### Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words demo: one row per document, one column per vocabulary term.
bagOwords = CountVectorizer()
# Toy corpus; reused by the n-gram and TF-IDF cells below.
text = ["I like the product very much. The quality is very good.",
"The product is very very good",
"Broken product delivered",
"The product is good, but overpriced product",
"The product is not good"]
print(bagOwords.fit_transform(text).toarray())
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns an ndarray, converted here to a plain list.
print('Features: ', bagOwords.get_feature_names_out().tolist())

[[0 0 0 1 1 1 1 0 0 1 1 2 2]
 [0 0 0 1 1 0 0 0 0 1 0 1 2]
 [1 0 1 0 0 0 0 0 0 1 0 0 0]
 [0 1 0 1 1 0 0 0 1 2 0 1 0]
 [0 0 0 1 1 0 0 1 0 1 0 1 0]]
Features:  ['broken', 'but', 'delivered', 'good', 'is', 'like', 'much', 'not', 'overpriced', 'product', 'quality', 'the', 'very']


##### n-grams

In [None]:
# Unigram + bigram counts over the same toy corpus.
count_vec = CountVectorizer(analyzer='word', ngram_range=(1, 2))
# Fit once and reuse the matrix instead of fitting the corpus a second time.
ngram_counts = count_vec.fit_transform(text)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns an ndarray, converted here to a plain list.
print('Features:', count_vec.get_feature_names_out().tolist())
print(ngram_counts.toarray())

Features: ['broken', 'broken product', 'but', 'but overpriced', 'delivered', 'good', 'good but', 'is', 'is good', 'is not', 'is very', 'like', 'like the', 'much', 'much the', 'not', 'not good', 'overpriced', 'overpriced product', 'product', 'product delivered', 'product is', 'product very', 'quality', 'quality is', 'the', 'the product', 'the quality', 'very', 'very good', 'very much', 'very very']
[[0 0 0 0 0 1 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 1 1 1 2 1 1 2 1 1 0]
 [0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 2 1 0 1]
 [1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1 1 2 0 1 0 0 0 1 1 0 0 0 0 0]
 [0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0]]


##### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights for unigrams of the toy corpus, with IDF smoothing disabled.
tf_idf_vec = TfidfVectorizer(use_idf = True, smooth_idf = False, ngram_range=(1,1))
print(tf_idf_vec.fit_transform(text).toarray())
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns an ndarray, converted here to a plain list.
print('Features:', tf_idf_vec.get_feature_names_out().tolist())

[[0.         0.         0.         0.18214804 0.18214804 0.38859216
  0.38859216 0.         0.         0.14891796 0.38859216 0.36429608
  0.5707402 ]
 [0.         0.         0.         0.27230147 0.27230147 0.
  0.         0.         0.         0.22262429 0.         0.27230147
  0.85322574]
 [0.68249276 0.         0.68249276 0.         0.         0.
  0.         0.         0.         0.26154781 0.         0.
  0.        ]
 [0.         0.55499141 0.         0.26014574 0.26014574 0.
  0.         0.         0.55499141 0.42537238 0.         0.26014574
  0.        ]
 [0.         0.         0.         0.34879533 0.34879533 0.
  0.         0.74411524 0.         0.28516304 0.         0.34879533
  0.        ]]
Features: ['broken', 'but', 'delivered', 'good', 'is', 'like', 'much', 'not', 'overpriced', 'product', 'quality', 'the', 'very']


In [None]:
# TF-IDF over the preprocessed DataFrame column.
tfidf_vec = TfidfVectorizer(use_idf=True)
tfidf_vec.fit(input_df['text'])
tfidf_result = tfidf_vec.transform(input_df['text'])
# Read the vocabulary from the vectorizer fitted just above. The previous code
# queried `tf_idf_vec`, the toy-corpus vectorizer, so the printed features did
# not correspond to the columns of `tfidf_result`.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns an ndarray, converted here to a plain list.
tfidf_feature = tfidf_vec.get_feature_names_out().tolist()
print(tfidf_result)
print(tfidf_feature)

  (0, 29)	0.3481553119113957
  (0, 28)	0.17407765595569785
  (0, 27)	0.17407765595569785
  (0, 26)	0.17407765595569785
  (0, 25)	0.17407765595569785
  (0, 24)	0.17407765595569785
  (0, 23)	0.17407765595569785
  (0, 22)	0.17407765595569785
  (0, 21)	0.17407765595569785
  (0, 20)	0.17407765595569785
  (0, 19)	0.17407765595569785
  (0, 18)	0.17407765595569785
  (0, 17)	0.17407765595569785
  (0, 16)	0.17407765595569785
  (0, 15)	0.17407765595569785
  (0, 14)	0.17407765595569785
  (0, 13)	0.17407765595569785
  (0, 12)	0.17407765595569785
  (0, 11)	0.17407765595569785
  (0, 10)	0.17407765595569785
  (0, 9)	0.17407765595569785
  (0, 8)	0.17407765595569785
  (0, 7)	0.17407765595569785
  (0, 6)	0.17407765595569785
  (0, 5)	0.17407765595569785
  (0, 4)	0.17407765595569785
  (0, 3)	0.17407765595569785
  (0, 2)	0.17407765595569785
  (0, 1)	0.17407765595569785
  (0, 0)	0.17407765595569785
['broken', 'but', 'delivered', 'good', 'is', 'like', 'much', 'not', 'overpriced', 'product', 'quality', 'the', 

## Link de referência
https://www.analyticsvidhya.com/blog/2021/07/a-simple-start-with-natural-language-processing/