# NLP Tutorial

### Elena Kochkina

NESTA HackSTIR

22.10.2019

# Part I

## Imports

In [None]:
# --- NLTK: package, run-time resource downloads, and the helpers used below ---
import nltk
nltk.download('punkt')
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
nltk.download('reuters')
from nltk.corpus import reuters
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords
# --- spaCy: load the small English pipeline once; it is reused by later cells ---
import spacy
spacy_nlp = spacy.load("en_core_web_sm")
# --- Vectorizers, numerics and word embeddings ---
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy
import gensim 
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries above.
warnings.filterwarnings("ignore")

In [None]:
# Running example used throughout Part I: a single multi-sentence paragraph of English prose.
text_example = 'WHEN I found that I was a prisoner a sort of wild feeling came over me. I rushed up and down the stairs, trying every door and peering out of every window I could find; but after a little the conviction of my helplessness overpowered all other feelings. When I look back after a few hours I think I must have been mad for the time, for I behaved much as a rat does in a trap. When, however, the conviction had come to me that I was helpless I sat down quietly—as quietly as I have ever done anything in my life—and began to think over what was best to be done. I am thinking still, and as yet have come to no definite conclusion. Of one thing only am I certain; that it is no use making my ideas known to the Count. He knows well that I am imprisoned; and as he has done it himself, and has doubtless his own motives for it, he would only deceive me if I trusted him fully with the facts. So far as I can see, my only plan will be to keep my knowledge and my fears to myself, and my eyes open. I am, I know, either being deceived, like a baby, by my own fears, or else I am in desperate straits; and if the latter be so, I need, and shall need, all my brains to get through.'

*WHEN I found that I was a prisoner a sort of wild feeling came over me. I rushed up and down the stairs, trying every door and peering out of every window I could find; but after a little the conviction of my helplessness overpowered all other feelings. When I look back after a few hours I think I must have been mad for the time, for I behaved much as a rat does in a trap. When, however, the conviction had come to me that I was helpless I sat down quietly—as quietly as I have ever done anything in my life—and began to think over what was best to be done. I am thinking still, and as yet have come to no definite conclusion. Of one thing only am I certain; that it is no use making my ideas known to the Count. He knows well that I am imprisoned; and as he has done it himself, and has doubtless his own motives for it, he would only deceive me if I trusted him fully with the facts. So far as I can see, my only plan will be to keep my knowledge and my fears to myself, and my eyes open. I am, I know, either being deceived, like a baby, by my own fears, or else I am in desperate straits; and if the latter be so, I need, and shall need, all my brains to get through.*

## Lower casing

In [None]:
# Display the original text before any preprocessing.
text_example

In [None]:
# Lower-case the whole text; the later preprocessing cells start from this version.
lowercase_text_example = text_example.lower()
lowercase_text_example

## Removal of non-alphanumeric characters

**Regular expressions** 

Sequences of characters that define a search pattern. 


[documentation](https://docs.python.org/3/howto/regex.html)

In [None]:
# Dashes join words without surrounding spaces ("quietly—as"), so turn hyphens,
# en-dashes and em-dashes into spaces first; simply deleting them would glue
# the neighbouring words together ("quietlyas").
dash_free_text_example = re.sub(r'[-\u2013\u2014]+', ' ', lowercase_text_example)
# Then drop every remaining character that is not a letter, a digit or a space.
alphanum_text_example = re.sub(r'[^A-Za-z0-9 ]+', '', dash_free_text_example)
alphanum_text_example

Other regular expression examples include:

In [None]:
# URL pattern: optional http(s):// (and www.) prefix, a dotted lower-case host
# name, an optional port and an optional path. The path is limited to
# non-whitespace characters (\S*) so that text following the URL is preserved;
# a greedy `.*` would delete everything up to the end of the string.
url_pattern = re.compile(r'\b(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/\S*)?\b')
# Whole words made of one to three alphanumeric characters.
short_word_pattern = re.compile(r'\b[A-Za-z0-9]{1,3}\b')

content = "Hi I am Elena, my website is https://warwick.ac.uk/ekochkina"
# remove URLs
content = url_pattern.sub('', content)
# remove words of 3 characters or fewer (the surrounding spaces are left in place)
content = short_word_pattern.sub('', content)
content

## Tokenization

Process of splitting the text into chunks, called tokens. 

Often tokens are words; however, tokens can also be bigrams (word pairs), n-grams (sequences of n words), sentences, word chunks or characters. 

[documentation](https://www.nltk.org/api/nltk.tokenize.html)

In [None]:
# Split the lower-cased text (punctuation still present) into a list of sentences.
sent_tokenized_text = sent_tokenize(lowercase_text_example)
print(sent_tokenized_text)

In [None]:
# Split the punctuation-free text into word tokens with NLTK.
word_tokenized_text = word_tokenize(alphanum_text_example)
print(word_tokenized_text)

There are alternative tokenization functions; for example, NLTK provides a special tokenizer for tweets.

[spacy documentation](https://spacy.io/usage/spacy-101)


In [None]:
# Alternative: plain whitespace split.
# Note: this rebinds word_tokenized_text, replacing the word_tokenize() result from the cell above.
word_tokenized_text = alphanum_text_example.split()

In [None]:
# Run the spaCy pipeline on the punctuation-free text; spacy_doc is reused by later cells.
spacy_doc = spacy_nlp(alphanum_text_example)
# Print the text of each token, one per line.
for token in spacy_doc:
    print(token.text)

## Stop-word removal

[nltk corpora documentation](https://www.nltk.org/book/ch02.html)

In [None]:
# Show NLTK's English stop-word list.
stopwords.words("english")

In [None]:
# Build the stop-word collection once, as a set: calling stopwords.words()
# inside the comprehension re-evaluates it for every token and then scans the
# resulting list linearly.
english_stopwords = set(stopwords.words("english"))
# Keep only the tokens that are not English stop words (order preserved).
words_filtered = [w for w in word_tokenized_text if w not in english_stopwords]
words_filtered

alternative:

In [None]:
# Keep every token that spaCy does not flag as a stop word.
# token.text (a str) is stored rather than the Token object itself, so the
# result is a list of strings like the NLTK-based `words_filtered` and the two
# lists can be compared meaningfully.
words_filtered_spacy = [token.text for token in spacy_doc if not token.is_stop]
words_filtered_spacy

In [None]:
# Check whether the spaCy-based and the NLTK-based stop-word filters produced the same list.
words_filtered_spacy==words_filtered

## Stemming


[documentation](http://www.nltk.org/howto/stem.html)

In [None]:
# Porter stemmer instance, reused by the stemming and lemmatization cells below.
ps = PorterStemmer()

In [None]:
# Stem each stop-word-filtered token with the Porter stemmer, keeping the order.
stemmed_words = [ps.stem(w) for w in words_filtered]

stemmed_words

## Lemmatization


In [None]:
# WordNet lemmatizer instance, reused by the cells below.
lem = WordNetLemmatizer()

# Compare lemmatization (with the word treated as a verb, "v") against stemming.
word = "flying"
print("Lemmatized Word:",lem.lemmatize(word,"v"))
print("Stemmed Word:",ps.stem(word))

In [None]:
# Same comparison for a word lemmatized as a noun ("n").
word = "acceptance"
print("Lemmatized Word:",lem.lemmatize(word,"n"))
print("Stemmed Word:",ps.stem(word))

In [None]:
# Lemmatize each filtered token, treating every one as a verb ("v").
# A correct lemma needs the real POS tag of each word; the fixed "v" is a simplification.
lemmatized_words = [lem.lemmatize(w, "v") for w in words_filtered]

lemmatized_words

alternative:

In [None]:
# Lemma string of every spaCy token that is not a stop word, in document order.
words_lemmas_spacy = [token.lemma_ for token in spacy_doc if not token.is_stop]
words_lemmas_spacy

## Part of Speech (POS) tagging

In [None]:
# Tag each stop-word-filtered token with NLTK; the result pairs every token with its tag.
nltk.pos_tag(words_filtered)

alternative:

In [None]:
# Coarse POS tag (token.pos_) of every non-stop-word token;
# use token.tag_ instead for the detailed POS tag.
words_pos_spacy = [token.pos_ for token in spacy_doc if not token.is_stop]
words_pos_spacy

## Parsing and Named Entity Recognition

spaCy has a built-in dependency parser and a Named Entity Recognition system.

In [None]:
# Print the dependency-relation label of every token that is not a stop word.
for token in spacy_doc:
  if not token.is_stop:
    print (token.dep_)

In [None]:
# For each named entity found by spaCy: its text, start/end character offsets and label.
for ent in spacy_doc.ents:
  print(ent.text, ent.start_char, ent.end_char, ent.label_)

## Word frequency

In [None]:
# NOTE(review): this import sits mid-notebook; the other imports live in the first code cell.
from nltk.probability import FreqDist
# Count how often each spaCy lemma (stop words excluded) occurs.
fdist = FreqDist(words_lemmas_spacy)
# The ten most frequent lemmas together with their counts.
fdist.most_common(10)

## Bag of Words

In [None]:
# The sentences produced earlier; each one is treated as a separate document below.
sent_tokenized_text

In [None]:
# Number of sentences, i.e. the number of documents passed to the vectorizers.
len(sent_tokenized_text)

**class sklearn.feature_extraction.text.CountVectorizer**(input=’content’, encoding=’utf-8’, decode_error=’strict’, strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern=’(?u)\b\w\w+\b’, ngram_range=(1, 1), analyzer=’word’, max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class ‘numpy.int64’>)

---
Convert a collection of text documents to a matrix of token counts

[documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [None]:
# Treat every sentence as one document.
documents_example = sent_tokenized_text
# Token counts with scikit-learn's built-in English stop-word list removed.
vectorizer = CountVectorizer(stop_words='english')
bow_matrix = vectorizer.fit_transform(documents_example)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    # .tolist() turns the returned array into a plain list of str
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()
print(len(vocabulary))
print(vocabulary)

In [None]:
# Expand the sparse matrix returned by fit_transform into a dense matrix for display.
bow_matrix.todense()

In [None]:
# Shape of the matrix, read directly from the sparse matrix;
# there is no need to build a dense copy just to inspect its dimensions.
bow_matrix.shape

In [None]:
# TF-IDF weighting over the same sentence "documents".
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Fit and transform once (a second, discarded fit_transform call was redundant).
# Note: this rebinds bow_matrix, replacing the raw-count matrix from the CountVectorizer cell.
bow_matrix = tfidf_vectorizer.fit_transform(documents_example)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    # .tolist() turns the returned array into a plain list of str
    vocabulary = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = tfidf_vectorizer.get_feature_names()
print(len(vocabulary))
print(vocabulary)

In [None]:
# Dense view of the matrix most recently assigned to bow_matrix.
bow_matrix.todense()

## Word2vec embeddings


[gensim documentation](https://radimrehurek.com/gensim/)

### Pre-trained using large Google News corpus

download pre-trained word vectors

In [None]:
# Download the gzip-compressed pre-trained vectors, then decompress them in place.
# NOTE(review): nothing guards against re-running, so a second run downloads the archive again.
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
!gzip -d GoogleNews-vectors-negative300.bin.gz

In [None]:
# List the working directory to confirm the decompressed .bin file is present.
!ls

In [None]:
# Load the pre-trained vectors from the decompressed file, which is in binary word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Similarity scores between two word pairs according to the pre-trained vectors.
print(model.similarity('man', 'woman'))
print(model.similarity('king', 'queen'))

In [None]:
# First five entries of the most-similar list for 'king'.
model.most_similar('king')[:5]

### Training on your own corpus

In [None]:
# Category labels available in the movie_reviews corpus.
movie_reviews.categories()

In [None]:
# Raw text of a single review, to show what one document looks like.
movie_reviews.raw('neg/cv000_29416.txt')

In [None]:
# Raw text of every file in the movie_reviews corpus, in fileids() order.
documents = [movie_reviews.raw(fileid) for fileid in movie_reviews.fileids()]

In [None]:
def preprocess(text):
  """Lower-case `text`, strip punctuation and split it into word tokens.

  Args:
    text: the raw document as a single string.

  Returns:
    The list of tokens produced by `word_tokenize` on the cleaned text.
  """
  lowercase = text.lower()
  # Keep every whitespace character (\s), not only ' ': deleting newlines or
  # tabs would fuse the last word of one line with the first word of the next.
  alphanum = re.sub(r'[^A-Za-z0-9\s]+', '', lowercase)
  tokens = word_tokenize(alphanum)

  return tokens

In [None]:
#preprocess(movie_reviews.raw('neg/cv000_29416.txt'))

In [None]:
# Token list for every raw review, in the same order as `documents`.
preprocessed_docs = [preprocess(d) for d in documents]
  

In [None]:
# gensim 4.0 renamed the Word2Vec `size` argument to `vector_size`; pick the
# keyword that matches the installed version so the cell runs on both.
embedding_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
# Skip-gram (sg=1) model with 50-dimensional vectors and a context window of 3,
# keeping every word (min_count=1) and training with 3 worker threads.
model_from_movie_reviews = gensim.models.Word2Vec(
    preprocessed_docs,
    min_count=1,
    workers=3,
    window=3,
    sg=1,
    **{embedding_dim_kwarg: 50},
)

In [None]:
# Query through the model's KeyedVectors (`.wv`): indexing the Word2Vec model
# directly and calling similarity()/most_similar() on it were removed in
# gensim 4, while `.wv` is available in both gensim 3.x and 4.x.
word_vectors = model_from_movie_reviews.wv
print(word_vectors['king'])
print(word_vectors.similarity('queen', 'king'))
print(word_vectors.most_similar('king')[:5])

In [None]:
# Persist the trained model to a file in the working directory.
model_from_movie_reviews.save("word2vec_movie.model")

In [None]:
# Load the model back from the file written by the save() call.
model_from_file = gensim.models.Word2Vec.load("word2vec_movie.model")