<a href="https://colab.research.google.com/github/michalis0/DataMining_and_MachineLearning/blob/master/week10/Text_Analytics_2_solutions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Solution of the exercise in the "Text_Analytics_2.ipynb" notebook. 


In [1]:
# Import required packages
import gensim
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import bs4 as bs
import urllib.request
import spacy
import string
import math
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import seaborn as sns

# Load English language model of spacy
sp = spacy.load('en_core_web_sm')


In [3]:
# Tokens in document
def get_tokens(document):
  doc_tokens = []
  for token in sp(document):
      if (token.is_punct == False) and (token.is_space == False):
        doc_tokens.append(token.lower_)
  return doc_tokens

In [7]:
# Get texts from Wikipedia
def get_text(url):
  scrapped_data = urllib.request.urlopen(url)
  article = scrapped_data.read()
  parsed_article = bs.BeautifulSoup(article,'lxml')
  paragraphs = parsed_article.find_all('p')
  article_text = ""
  for p in paragraphs:
    article_text += p.text
  return article_text

In [9]:
# Create tokenizer function for preprocessing
def spacy_tokenizer(text):

    # Define stopwords, punctuation, and numbers
    stop_words = spacy.lang.en.stop_words.STOP_WORDS
    punctuations = string.punctuation
    numbers = "0123456789"

    # Create spacy object
    mytokens = sp(text)

    # Lemmatize each token and convert each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Remove stop words and punctuation
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # Remove sufix like ".[1" in "experience.[1"
    mytokens_2 = []
    for word in mytokens:
      for char in word:
        if (char in punctuations) or (char in numbers):
          word = word.replace(char, "")
      if word != "":
        mytokens_2.append(word)

    # Return preprocessed list of tokens
    return mytokens_2

### 2.2 Exercise
Analyze the wikipedia article on [Coronavirus](https://en.wikipedia.org/wiki/Coronavirus) as above. 

In [10]:
# 1. Get text from URL - use the get_text() function defined above
coronavirus = get_text('https://en.wikipedia.org/wiki/Coronavirus')

# 2. Processing - tokenization using the spacy_tokenizer() function
processed_corona = spacy_tokenizer(coronavirus)
processed_corona[:10]

['coronaviruse',
 'group',
 'relate',
 'rna',
 'virus',
 'cause',
 'disease',
 'mammal',
 'bird',
 'human']

In [11]:
# 3. What is the number of occurence of the word "virus"?
count = 0
for word in processed_corona:
  if word == 'virus':
    count += 1
count

67

In [12]:
# 4. Create a Word2Vec representation of the article with a min_count of 1 and a vector size of 50
word2vec_corona = Word2Vec([processed_corona], min_count=1, size=50)


# 5. What is the 10 most similar words of "virus"
word2vec_corona.wv.most_similar('virus')



[('poly', 0.4091958701610565),
 ('peplomer', 0.3946215510368347),
 ('mrnas', 0.39174190163612366),
 ('homodimer', 0.3748185634613037),
 ('amino', 0.36334195733070374),
 ('Î±', 0.35977503657341003),
 ('national', 0.3595016300678253),
 ('continuous', 0.35437700152397156),
 ('bovine', 0.3467666506767273),
 ('french', 0.3427498936653137)]