In [None]:
"""
1. Download the pre-trained models for the following word
embedding models - check the notebooks uploaded.
a. GloVe
b. Word2Vec
"""

import os

# GloVe

# Download the GloVe model
!wget http://nlp.stanford.edu/data/glove.6B.zip

# Unzip the GloVe model
!unzip glove.6B.zip

# Word2Vec

# Download the Word2Vec model
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

# Unzip the Word2Vec model
!gunzip GoogleNews-vectors-negative300.bin.gz


In [None]:
"""
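2. Create document vectors by the following formula
(the formula itself is missing from this text; the code below sums the
word vectors of each document):
    doc_Vec_i = sum over j of w_j
where:
a. doc_Vec_i : vector of the i-th document in the corpus.
b. w_j : word vector of the j-th word in the document. The word vector is
taken from the pre-trained model.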
"""

import nltk
import numpy as np
import pandas as pd
from gensim.models import FastText
from gensim.models import KeyedVectors
from gensim.models.fasttext import load_facebook_vectors
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize


# Load the GloVe model.
# Raw GloVe text files do not start with the "<vocab_size> <vector_size>"
# header line that the word2vec text format expects, so the loader has to be
# told to discover those values itself. `no_header` requires gensim >= 4.0;
# on older versions convert the file with gensim's glove2word2vec script.
glove_model = KeyedVectors.load_word2vec_format(
    'glove.6B.300d.txt', binary=False, no_header=True
)

# Load the Word2Vec model (Google News vectors, binary word2vec format)
word2vec_model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True
)

# Load the FastText model.
# FastText.load_fasttext_format is deprecated and no longer available in
# current gensim releases. load_facebook_vectors reads the same Facebook
# .bin file and returns keyed vectors that support fasttext_model[word].
# NOTE(review): no visible step downloads 'wiki.en.bin' -- it must already
# be in the working directory.
fasttext_model = load_facebook_vectors('wiki.en.bin')

# Load the corpus; create_doc_vectors reads its 'text' column
corpus = pd.read_csv('corpus.csv')

# Preprocess the corpus
def preprocess(text):
    """Turn a raw string into a list of normalised word tokens.

    The steps run in this order: lower-case the text, split it into runs of
    word characters, drop English stop words and any token that is not
    purely alphabetic, then lemmatise (WordNet) and stem (Porter) each
    remaining token.

    NOTE(review): Porter stems are frequently not dictionary words, so they
    may be absent from pre-trained embedding vocabularies -- confirm that
    stemming is intended before looking the tokens up in such a model.

    Args:
        text: The document text; must support ``.lower()``.

    Returns:
        The list of processed tokens, in their original order.
    """
    # Lower-case first, then split on runs of word characters
    words = RegexpTokenizer(r'\w+').tokenize(text.lower())

    # Keep tokens that are neither stop words nor contain non-letters
    english_stop_words = set(stopwords.words('english'))
    kept = [
        word
        for word in words
        if word not in english_stop_words and word.isalpha()
    ]

    # Lemmatise, then stem, each surviving token
    wordnet = WordNetLemmatizer()
    porter = PorterStemmer()
    return [porter.stem(wordnet.lemmatize(word)) for word in kept]

# Create the document vectors
def create_doc_vectors(model, corpus):
    """Build one vector per document by summing the vectors of its words.

    Args:
        model: Word-embedding lookup supporting ``model[word]``. This is
            either a keyed-vectors object or a full gensim model that
            exposes its keyed vectors as ``model.wv``.
        corpus: DataFrame with a ``'text'`` column. A ``'tokens'`` column
            holding the preprocessed tokens is added (or overwritten) in
            place.

    Returns:
        A list with one NumPy array per row of ``corpus``, in row order.
        Words the model has no vector for are skipped, so a document with
        no known words yields an all-zero vector.
    """
    # Preprocess the corpus (kept as a column, as callers may read it later)
    corpus['tokens'] = corpus['text'].apply(preprocess)

    # Full gensim models are not subscriptable in gensim 4; their vectors
    # live under `.wv`. Plain keyed vectors are used as they are.
    vectors = getattr(model, 'wv', model)
    # Take the dimensionality from the model rather than assuming 300, so a
    # model of another size does not fail on a shape mismatch.
    dim = getattr(vectors, 'vector_size', 300)

    doc_vectors = []
    # Iterate over the values so a non-default DataFrame index cannot break
    # (or reorder) the positional lookup.
    for tokens in corpus['tokens']:
        doc_vector = np.zeros(dim)
        for token in tokens:
            try:
                doc_vector += vectors[token]
            except KeyError:
                # Out-of-vocabulary word: contributes nothing to the sum
                continue
        doc_vectors.append(doc_vector)
    return doc_vectors

# Document vectors built from the GloVe embeddings
glove_doc_vectors = create_doc_vectors(model=glove_model, corpus=corpus)

# Document vectors built from the Word2Vec embeddings
word2vec_doc_vectors = create_doc_vectors(model=word2vec_model, corpus=corpus)

# Document vectors built from the FastText embeddings
fasttext_doc_vectors = create_doc_vectors(model=fasttext_model, corpus=corpus)