FastText supports both Continuous Bag of Words and Skip-Gram models. In this note, we will implement the skip-gram model to learn vector representation of words from the Wikipedia articles on artificial intelligence, machine learning, deep learning, and neural networks. 

### Import related modules

In [8]:
import wikipedia

In [1]:
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

In [2]:
from gensim.models.fasttext import FastText

In [3]:
from keras.preprocessing.text import Tokenizer


Using TensorFlow backend.


In [14]:
import re
import numpy 
import matplotlib.pyplot as plt

In [5]:
%matplotlib inline

### 1. Scraping Wikipedia Articles

In [9]:
# use the page method from the wikipedia module. 
# The method returns WikipediaPage object, can retrieve the page contents via the content attribute, 
artificial_intelligence = wikipedia.page("Artificial Intelligence").content
machine_learning = wikipedia.page("Machine Learning").content
deep_learning = wikipedia.page("Deep Learning").content
neural_network = wikipedia.page("Neural Network").content

In [10]:
# tokenize text into sentences using the sent_tokenize() method. 
# The sent_tokenize method returns list of sentences.

artificial_intelligence = sent_tokenize(artificial_intelligence)
machine_learning = sent_tokenize(machine_learning)
deep_learning = sent_tokenize(deep_learning)
neural_network = sent_tokenize(neural_network)

In [12]:
# sentences from the four articles are joined together via the extend method.
artificial_intelligence.extend(machine_learning)
artificial_intelligence.extend(deep_learning)
artificial_intelligence.extend(neural_network)

In [15]:
len(artificial_intelligence)

1154

### 2. Data Preprocessing


In [6]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [13]:
stemmer = WordNetLemmatizer()

In [30]:
def preprocess_text(document):
    # Remove all the special characters, \W is short for [^\w]
    document = re.sub(r'\W', ' ', str(document))  
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Lemmatization
    tokens = [word.lower() for word in document.split()]
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 1]

    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [31]:
s = 'a'
a = re.search(r'\s+[a-zA-Z]\s+', s)

In [32]:
# test function
preprocess_text("Artificial intelligence, is the most advanced technology of the present era")


'artificial intelligence advanced technology present era'

In [33]:
preprocess_text(artificial_intelligence)

'computer science artificial intelligence ai sometimes called machine intelligence intelligence demonstrated machine contrast natural intelligence displayed human colloquially term artificial intelligence often used describe machine computer mimic cognitive function human associate human mind learning problem solving machine become increasingly capable task considered require intelligence often removed definition ai phenomenon known ai effect quip tesler theorem say ai whatever done yet instance optical character recognition frequently excluded thing considered ai become routine technology modern machine capability generally classified ai include successfully understanding human speech competing highest level strategic game system chess go autonomously operating car intelligent routing content delivery network military simulation artificial intelligence classified three different type system analytical human inspired humanized artificial intelligence analytical ai ha characteristic con