# Natural Language Processing

## Data Cleaning

### Lowercase

In [None]:
# Example sentence containing mixed-case words.
sentence = "Her cat's name is Luna"

In [None]:
# str.lower() returns a new lowercased string; `sentence` itself is unchanged.
lower_sentence = sentence.lower()

In [None]:
print(lower_sentence)

In [None]:
# Several sentences to lowercase in one pass.
sentence_list = [
    "Could you pass me the TV remote?",
    "It is IMPOSSIBLE to find this hotel",
    "Want to go for dinner on Tuesday?"
]

In [None]:
# Apply the same lowercasing to every sentence in the list.
lower_sentence_list = [x.lower() for x in sentence_list]

In [None]:
print(lower_sentence_list)

### Remove Stop Words

In [None]:
import nltk

In [None]:
# Fetch the 'stopwords' resource that nltk.corpus.stopwords reads from.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

In [None]:
# English stop words; used as a mutable list (see remove/append below).
en_stopwords = stopwords.words('english')

In [None]:
print(en_stopwords)

In [None]:
sentence = "it was too far to go to the shop and we did not want her to walk"

In [None]:
# Split on whitespace, keep the words that are not stop words, and re-join.
sentence_no_stopwords = ' '.join([word for word in sentence.split() if word not in en_stopwords])

In [None]:
print(sentence_no_stopwords)

In [None]:
# Customize the list: stop treating 'did' and 'not' as stop words.
# list.remove raises ValueError if the word is not in the list
# (for example when this cell is executed a second time).
en_stopwords.remove('did')
en_stopwords.remove('not')

In [None]:
# Treat 'go' as an additional stop word.
en_stopwords.append('go')

In [None]:
# Same filtering as before, now using the customized stop-word list.
sentence_no_stopwords_custom = ' '.join([word for word in sentence.split() if word not in en_stopwords])

In [None]:
print(sentence_no_stopwords_custom)

### Regular Expression (Regex)

In [None]:
import re

In [None]:
# Raw string (r"..."): backslashes are kept literally rather than read as escapes.
my_folder = r"C:\desktop\notes"
print(my_folder)

In [None]:
# re.search scans the string for the first match; it returns a match object,
# or None when the pattern does not occur.
result_search = re.search("pattern", r"string to contain the pattern")

In [None]:
print(result_search)

In [None]:
# The pattern does not occur in this string.
result_search_2 = re.search("pattern", r"string without it")

In [None]:
print(result_search_2)

In [None]:
string = r"sara was able to help me find the item I needed quickly"

In [None]:
# re.sub returns a copy of the string with each match of "sara" replaced by "Sarah".
new_string = re.sub("sara", "Sarah", string)

In [None]:
print(new_string)

In [None]:
# Sample reviews used by the pattern-matching examples that follow.
customer_reviews = ["Sam was a great help to me in the store",
                   "The cashier was very rude to me. I think her name was Eleanor",
                   "Amazing work from Sadeen!",
                   "sarah was able to help me find the item I needed quickly",
                   "Lucy is such a great addition to the team",
                   "Great service from sara she found me what I wanted"]

In [None]:
# Collects the reviews that match the pattern defined below.
sarah_reviews = []

In [None]:
# "h?" makes the trailing "h" optional, so both "sara" and "sarah" match.
pattern_to_find = r"sarah?"

In [None]:
# Keep every review in which the pattern occurs anywhere (matching is case-sensitive).
for string in customer_reviews:
    if re.search(pattern_to_find, string):
        sarah_reviews.append(string)

In [None]:
print(sarah_reviews)

In [None]:
# "^" anchors the match to the start of the string, so only reviews that
# begin with a capital "A" are kept.
pattern_to_find = r"^A"
a_reviews = [review for review in customer_reviews if re.search(pattern_to_find, review)]

print(a_reviews)

In [None]:
# "$" anchors the match to the end of the string, so only reviews whose
# last character is "y" are kept.
pattern_to_find = r"y$"
y_reviews = [review for review in customer_reviews if re.search(pattern_to_find, review)]

print(y_reviews)

In [None]:
# "(need|want)" is an alternation: the pattern matches "needed" or "wanted"
# anywhere in a review.
pattern_to_find = r"(need|want)ed"
needwant_reviews = [review for review in customer_reviews if re.search(pattern_to_find, review)]

print(needwant_reviews)

In [None]:
# "[^\w\s]" matches any single character that is neither a word character
# nor whitespace; replacing each match with "" strips the punctuation.
pattern_to_find = r"[^\w\s]"
no_punct_reviews = [re.sub(pattern_to_find, "", review) for review in customer_reviews]

print(no_punct_reviews)

### Tokenization

In [None]:
import nltk

In [None]:
# word_tokenize / sent_tokenize need the Punkt tokenizer data.
# Newer NLTK releases load it from the 'punkt_tab' resource while older
# releases use 'punkt', so request both to work with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Two sentences in one string, separated by a period.
sentences = "Her cat's name is Luna. Her dog's name is Max"

In [None]:
# Split the text into sentences.
sent_tokenize(sentences)

In [None]:
sentence = "Her cat's name is Luna"
# Split the sentence into word-level tokens.
word_tokenize(sentence)

In [None]:
sentence_2 = "Her cat's name is Luna and her dog's name is Max"
# Same word-level tokenization on a longer sentence.
word_tokenize(sentence_2)

### Stemming (standardize text)

In [None]:
from nltk import PorterStemmer

In [None]:
# One stemmer instance is reused for all examples below.
ps = PorterStemmer()

In [None]:
# Inflected and derived forms of "connect".
connect_tokens = ['connecting', 'connected', 'connectivity', 'connect', 'connects']

In [None]:
# Print each token next to the stem PorterStemmer produces for it.
for t in connect_tokens:
    print(t, ': ', ps.stem(t))

In [None]:
# Inflected and derived forms of "learn".
learn_tokens = ['learned', 'learning', 'learn', 'learns', 'learner', 'learners']

In [None]:
for t in learn_tokens:
    print(t, ": ", ps.stem(t))

In [None]:
# A plural verb form plus two irregular comparatives.
likes_tokens = ['likes', 'better', 'worse']

In [None]:
for t in likes_tokens:
    print(t, ": ", ps.stem(t))

### Lemmatization (reduces words to a meaningful dictionary base form)

In [None]:
# Fetch the 'wordnet' resource — presumably the data WordNetLemmatizer relies on.
nltk.download('wordnet')

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# One lemmatizer instance is reused for all examples below.
lemmatizer = WordNetLemmatizer()

In [None]:
# Print each token next to its lemma. lemmatize() is called without a
# part-of-speech argument, so NLTK's default part of speech applies.
for t in connect_tokens:
    print(t, ": ", lemmatizer.lemmatize(t))

In [None]:
for t in learn_tokens:
    print(t, ": ", lemmatizer.lemmatize(t))

In [None]:
for t in likes_tokens:
    print(t, ": ", lemmatizer.lemmatize(t))

### N-grams

In [None]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Toy token list; some words ("desk", "mouse", "phone") appear more than once.
tokens = ["apple", "book", "desk", "pen", "desk",  "cat", "dog", "tree", "house","desk", "car", "phone",
             "computer", "laptop", "keyboard", "mouse", "mouse", "chair", "table", "door", "window", "phone", "wall", "floor"]
print(tokens)

In [None]:
# nltk.ngrams(tokens, 1) yields one 1-tuple per token; value_counts() tallies
# them, most frequent first, so [:10] shows the ten most common unigrams.
unigrams = (pd.Series(nltk.ngrams(tokens, 1)).value_counts())
print(unigrams[:10])

In [None]:
# Horizontal bar chart of the ten most frequent unigrams. sort_values() puts
# the counts in ascending order so the largest bar is drawn at the top.
unigrams[:10].sort_values().plot.barh(color='lightsalmon', width=.9, figsize=(12,8))
plt.title("10 Most Frequently Occurring Unigrams")

In [None]:
# Count each pair of adjacent tokens and show the ten most frequent.
bigrams = (pd.Series(nltk.ngrams(tokens, 2)).value_counts())
print(bigrams[:10])

In [None]:
# Count each run of three adjacent tokens and show the ten most frequent.
trigrams = (pd.Series(nltk.ngrams(tokens, 3)).value_counts())
print(trigrams[:10])

### Real-Life Example

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re 
import pandas as pd

In [None]:
# Load the reviews from a CSV file; the relative path is resolved against
# the current working directory.
data = pd.read_csv("tripadvisor_hotel_reviews.csv")

In [None]:
# Print a summary of the columns of the frame.
data.info()

In [None]:
# Preview the first rows.
data.head()

In [None]:
# Look at the raw text of the review stored under index label 0.
data['Review'][0]

In [None]:
# Lowercased copy of the review text, stored in a new column.
data['review_lowercase'] = data['Review'].str.lower()

In [None]:
data.head()

In [None]:
# Start again from NLTK's English stop-word list.
en_stopwords = stopwords.words('english')

In [None]:
# Stop treating 'not' as a stop word so it is kept in the reviews.
# list.remove raises ValueError if this cell is run twice on the same list.
en_stopwords.remove('not')

In [None]:
# Drop stop words from every lowercased review. The stop-word list is turned
# into a set once, so each membership test is a hash lookup instead of a
# scan of the whole list for every word of every review.
en_stopwords_set = set(en_stopwords)
data['review_no_stopwords'] = data['review_lowercase'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in en_stopwords_set)
)

In [None]:
# Inspect the first review after stop-word removal.
data['review_no_stopwords'][0]

In [None]:
# Spell out every "*" as the word "star". The function is applied to the one
# column it reads rather than to each row of the whole frame.
data['review_no_stopwords_no_punct'] = data['review_no_stopwords'].apply(
    lambda text: re.sub(r'[*]', 'star', text)
)

In [None]:
# Preview the frame with the new column.
data.head()

In [None]:
# Remove every character that is neither a word character nor whitespace.
# The function is applied to the one column it reads rather than to each row
# of the whole frame; the capture group was unused and is dropped.
data['review_no_stopwords_no_punct'] = data['review_no_stopwords_no_punct'].apply(
    lambda text: re.sub(r'[^\w\s]', '', text)
)

In [None]:
# Preview the frame after punctuation removal.
data.head()

In [None]:
# Tokenize each cleaned review into a list of word tokens. word_tokenize is
# passed straight to the column, so no per-row lambda over the frame is needed.
data['tokenized'] = data['review_no_stopwords_no_punct'].apply(word_tokenize)

In [None]:
# Inspect the token list of the first review.
data['tokenized'][0]

In [None]:
ps = PorterStemmer()

In [None]:
# Stem every token of every review; each cell of 'stemmed' holds a list of stems.
data['stemmed'] = data['tokenized'].apply(lambda tokens: [ps.stem(token) for token in tokens])

In [None]:
data.head()

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatize every token of every review (starting from the tokens, not the
# stems); each cell of 'lemmatized' holds a list of lemmas.
data['lemmatized'] = data['tokenized'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

In [None]:
# Inspect the lemmas of the first review.
data['lemmatized'][0]

In [None]:
# Flatten the per-review lemma lists into one list. A comprehension builds the
# result in a single pass, whereas sum(..., []) re-copies the growing list for
# every review and becomes quadratic in the number of tokens.
tokens_clean = [token for review_tokens in data['lemmatized'] for token in review_tokens]

In [None]:
# Count how often each single token occurs across all reviews
# (value_counts() lists the most frequent first).
unigrams = (pd.Series(nltk.ngrams(tokens_clean, 1)).value_counts())
print(unigrams)

In [None]:
# Count adjacent token pairs. tokens_clean is a single flat list, so a pair
# may straddle the end of one review and the start of the next.
bigrams = (pd.Series(nltk.ngrams(tokens_clean, 2)).value_counts())
print(bigrams)