# Vectorizing Text: Transformations and N-grams
In the previous lab, we saw how to clean our data before processing it. In this lab, we introduce the data structure most widely used in text analysis with machine learning techniques: vectors.

We'll look at the bag-of-words model and TF-IDF, and build n-grams on the NLTK Gutenberg corpus.

In [10]:
# Imports
# NLTK: Gutenberg corpus reader, word tokenizer and English stop-word list.
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# gensim: `corpora` provides the Dictionary / bag-of-words helpers, `models`
# provides TF-IDF and the Phrases (n-gram) model.
from gensim import corpora, models
import gensim
import spacy
# Load the spaCy "en_core_web_sm" pipeline once; it is reused to lemmatize
# every document during preprocessing.
nlp = spacy.load("en_core_web_sm")

In [11]:
# Texts selected from the NLTK Gutenberg corpus (works of English literature).
text_names = [
    "austen-emma.txt",
    "shakespeare-macbeth.txt",
    "shakespeare-hamlet.txt",
    "chesterton-ball.txt",
]

# One list of lemmatized tokens per text, in the same order as `text_names`.
documents = []

# Custom stop words, applied on top of the NLTK English stop-word list.
my_stop_words = [u'say', u'\'s', u'Mr', u'be', u'said', u'says', u'saying', u'would',  u'could']

# Build the combined stop-word set ONCE, outside the loop:
#   * `stopwords.words('english')` returns a fresh list on every call, so calling
#     it per token (as a comprehension condition) is needlessly slow; a set also
#     gives O(1) membership tests.
#   * tokens are compared in lower case, so the custom entries are lower-cased
#     too -- otherwise a capitalised entry such as 'Mr' could never match.
stop_words = set(stopwords.words('english'))
stop_words.update(word.lower() for word in my_stop_words)

# Preprocessing of each text: tokenization, lower-casing, removal of NLTK and
# custom stop words, removal of punctuation / non-alphabetic tokens, and
# finally lemmatization with spaCy.
for text_name in text_names:
    text = gutenberg.raw(text_name)
    tokens = word_tokenize(text)
    filtered_tokens = [
        token.lower()
        for token in tokens
        if token.isalpha() and token.lower() not in stop_words
    ]
    # spaCy works on raw text, so the kept tokens are re-joined with spaces
    # before being run through the pipeline.
    doc = nlp(" ".join(filtered_tokens))
    lemmatized_tokens = [token.lemma_ for token in doc]

    documents.append(lemmatized_tokens)


In [12]:
# Build the gensim Dictionary (token <-> integer id mapping) from all
# preprocessed documents.
dictionary = corpora.Dictionary(documents=documents)
# Persist the mapping as a plain-text file so it can be inspected later.
dictionary.save_as_text(fname="old_dictionary.txt")
# Number of distinct tokens in the vocabulary.
print(len(dictionary))

12250


In [13]:
# Convert every document into its bag-of-words form: a list of
# (token_id, count) pairs produced by the dictionary.
corpus = list(map(dictionary.doc2bow, documents))
# Bare expression so the notebook displays the result.
corpus

[[(0, 24),
  (1, 1),
  (2, 1),
  (3, 2),
  (4, 1),
  (5, 3),
  (6, 71),
  (7, 1),
  (8, 1),
  (9, 7),
  (10, 8),
  (11, 3),
  (12, 5),
  (13, 1),
  (14, 16),
  (15, 10),
  (16, 1),
  (17, 2),
  (18, 4),
  (19, 33),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 3),
  (24, 5),
  (25, 1),
  (26, 1),
  (27, 2),
  (28, 1),
  (29, 1),
  (30, 6),
  (31, 28),
  (32, 8),
  (33, 2),
  (34, 1),
  (35, 1),
  (36, 3),
  (37, 2),
  (38, 2),
  (39, 8),
  (40, 1),
  (41, 9),
  (42, 11),
  (43, 2),
  (44, 3),
  (45, 6),
  (46, 1),
  (47, 4),
  (48, 1),
  (49, 60),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 2),
  (54, 1),
  (55, 2),
  (56, 1),
  (57, 1),
  (58, 2),
  (59, 1),
  (60, 31),
  (61, 6),
  (62, 26),
  (63, 63),
  (64, 7),
  (65, 1),
  (66, 5),
  (67, 4),
  (68, 2),
  (69, 7),
  (70, 1),
  (71, 1),
  (72, 7),
  (73, 1),
  (74, 25),
  (75, 1),
  (76, 7),
  (77, 7),
  (78, 4),
  (79, 3),
  (80, 29),
  (81, 2),
  (82, 1),
  (83, 2),
  (84, 49),
  (85, 10),
  (86, 4),
  (87, 23),
  (88, 2),
  (89, 2),
  (9

In [14]:
# Train the phrase detectors:
#   * `bigram` is fitted on the raw token lists;
#   * `trigram` is fitted on the bigram-merged token lists, so it can extend
#     an already merged pair with one more token.
bigram = models.Phrases(documents)
trigram = models.Phrases(bigram[documents])

In [15]:
# Rewrite each document so detected phrases become single tokens
# (e.g. 'miss_taylor'): apply the bigram model first, then the trigram model
# on its output.
documents = [trigram[bigram[tokens]] for tokens in documents]
# Bare expression so the notebook displays the result.
documents

[['emma',
  'jane',
  'austen',
  'volume',
  'chapter',
  'emma',
  'woodhouse',
  'handsome',
  'clever',
  'rich',
  'comfortable',
  'home',
  'happy',
  'disposition',
  'seem',
  'unite',
  'good',
  'blessing',
  'existence',
  'live',
  'nearly',
  'year',
  'world',
  'little',
  'distress',
  'vex',
  'young',
  'two',
  'daughter',
  'affectionate',
  'indulgent',
  'father',
  'consequence',
  'sister',
  'marriage',
  'mistress',
  'house',
  'early',
  'period',
  'mother',
  'die',
  'long',
  'ago',
  'indistinct',
  'remembrance',
  'caress',
  'place',
  'supply',
  'excellent',
  'woman',
  'governess',
  'fall',
  'little',
  'short',
  'mother',
  'affection',
  'sixteen',
  'year',
  'miss_taylor',
  'woodhouse',
  'family',
  'less',
  'governess',
  'friend',
  'fond',
  'daughter',
  'particularly',
  'emma',
  'intimacy',
  'sister',
  'even',
  'miss_taylor',
  'cease',
  'hold',
  'nominal',
  'office',
  'governess',
  'mildness',
  'temper',
  'hardly',
  

In [16]:
# Rebuild the dictionary so the vocabulary also contains the merged
# n-gram tokens, then save it next to the previous one for comparison.
new_dictionary_path = "new_dictionary.txt"
dictionary = corpora.Dictionary(documents)
dictionary.save_as_text(new_dictionary_path)

In [17]:
# Bag-of-words representation of the n-gram-enriched documents.
corpus = list(map(dictionary.doc2bow, documents))

# Show each document's (token_id, count) pairs, one document per line.
for bow_vector in corpus:
    print(bow_vector)

# Vocabulary size of the rebuilt dictionary.
print(len(dictionary))


[(0, 15), (1, 1), (2, 1), (3, 2), (4, 1), (5, 3), (6, 65), (7, 6), (8, 1), (9, 1), (10, 7), (11, 8), (12, 3), (13, 5), (14, 1), (15, 16), (16, 10), (17, 1), (18, 2), (19, 4), (20, 33), (21, 1), (22, 1), (23, 1), (24, 3), (25, 5), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 6), (32, 28), (33, 8), (34, 2), (35, 1), (36, 1), (37, 3), (38, 2), (39, 2), (40, 8), (41, 1), (42, 9), (43, 11), (44, 2), (45, 3), (46, 6), (47, 1), (48, 4), (49, 1), (50, 60), (51, 1), (52, 1), (53, 1), (54, 2), (55, 1), (56, 2), (57, 1), (58, 1), (59, 2), (60, 1), (61, 31), (62, 6), (63, 26), (64, 56), (65, 7), (66, 1), (67, 5), (68, 4), (69, 2), (70, 7), (71, 1), (72, 1), (73, 3), (74, 1), (75, 1), (76, 25), (77, 1), (78, 7), (79, 7), (80, 4), (81, 3), (82, 29), (83, 2), (84, 1), (85, 2), (86, 49), (87, 10), (88, 4), (89, 23), (90, 2), (91, 2), (92, 1), (93, 1), (94, 1), (95, 6), (96, 2), (97, 3), (98, 1), (99, 24), (100, 27), (101, 2), (102, 5), (103, 2), (104, 1), (105, 24), (106, 4), (107, 1), (108, 2), 

In [18]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=corpus)

# Indexing the model with the corpus yields the re-weighted documents as
# (token_id, tfidf_weight) pairs.
tfidf_corpus = tfidf[corpus]

# Print the weighted vector of every document.
for weighted_doc in tfidf_corpus:
    print(weighted_doc)

[(0, 0.005639729223065177), (1, 0.0007519638964086904), (2, 0.0007519638964086904), (3, 0.0003120932151134383), (4, 0.00015604660755671914), (5, 0.0011279458446130355), (6, 0.024438826633282434), (7, 0.002255891689226071), (8, 0.0007519638964086904), (9, 0.0007519638964086904), (10, 0.005263747274860832), (12, 0.0011279458446130355), (13, 0.0018799097410217257), (14, 0.0003759819482043452), (15, 0.0024967457209075063), (17, 0.0007519638964086904), (18, 0.0015039277928173807), (20, 0.01240740429074339), (21, 0.0003759819482043452), (22, 0.0007519638964086904), (23, 0.00015604660755671914), (24, 0.002255891689226071), (25, 0.0037598194820434513), (26, 0.0007519638964086904), (27, 0.0007519638964086904), (28, 0.0003120932151134383), (29, 0.0007519638964086904), (30, 0.0007519638964086904), (32, 0.004369305011588136), (33, 0.006015711171269523), (34, 0.0015039277928173807), (35, 0.0007519638964086904), (36, 0.00015604660755671914), (37, 0.0011279458446130355), (38, 0.0015039277928173807), 