## FEATURE ENGINEERING

In [1]:
from IPython.core.interactiveshell import InteractiveShell
%load_ext autoreload
%autoreload 2
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim
import numpy as np
from gensim import corpora
from gensim.models import TfidfModel, LsiModel
import fasttext
import os
from utils import save_pickle_file, load_pickle_file, write_to_text_file, load_fasttext_model
from constants import *

In [3]:
processed_data = load_pickle_file(PROCESSED_DATA_PATH)
processed_documents = processed_data['processed_documents']
lemmas = processed_data['lemmas']
documents = processed_data['documents']

### Bag-of-words model

In [4]:
tdidf_vectorizer = TfidfVectorizer(max_df = 0.8, min_df = 0.02, max_features = 5000, ngram_range=(1,3))
bag = tdidf_vectorizer.fit_transform(processed_documents)
bag_array = bag.toarray()
bow_vocabulary = {y:x for x,y in tdidf_vectorizer.vocabulary_.items()}

In [5]:
bag_array.shape

(6659, 2286)

In [6]:
model_data = {
    'model': bag,
    'X': bag_array,
    'vocabulary': bow_vocabulary,
    'tdidf_vectorizer': tdidf_vectorizer,
}
save_pickle_file(model_data, BOW_MODEL_PATH)

### Word2Vec model

In [7]:
# filename = 'C:\\repos\\GoogleNews-vectors-negative300.bin'
# word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=True)
# word2vec_model.init_sims(replace=True)

In [8]:
word2vec_model = gensim.models.Word2Vec(lemmas, min_count=2, size=100)
word2vec_model.train(lemmas, total_examples=len(lemmas), epochs=word2vec_model.iter)

  


(8257177, 8648925)

In [9]:
def calculate_document_vector(word2vec_model, lemmas):
    lemmas = [lemma for lemma in lemmas if lemma in word2vec_model.wv.vocab]
    return np.mean(word2vec_model[lemmas], axis=0)

In [10]:
document_vectors = [calculate_document_vector(word2vec_model, document_lemmas) 
                    for document_lemmas in lemmas]
document_vectors_array = np.array(document_vectors)

  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
len(word2vec_model.wv.vocab)

30528

In [12]:
document_vectors_array.shape

(6659, 100)

In [13]:
test_word = 'python'
word2vec_model.wv.most_similar(positive=test_word, topn=10)

[('scala', 0.7569723129272461),
 ('bash', 0.7142179608345032),
 ('scripting', 0.705761194229126),
 ('ruby', 0.7035665512084961),
 ('matlab', 0.6969648003578186),
 ('j', 0.6933891773223877),
 ('java', 0.6803004741668701),
 ('jvm', 0.6669284701347351),
 ('clojure', 0.6640642881393433),
 ('perl', 0.6622717380523682)]

In [14]:
model_data = {
    'model': word2vec_model,
    'X': document_vectors_array,
    'vocabulary': word2vec_model.wv.vocab,
}
save_pickle_file(model_data, WORD2VEC_MODEL_PATH)

### LSA model

In [15]:
dictionary = corpora.Dictionary(lemmas)
corpus_gensim = [dictionary.doc2bow(document) for document in lemmas]
tfidf = TfidfModel(corpus_gensim)
corpus_tfidf = tfidf[corpus_gensim]
lsa_model = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)

### FastText model

In [6]:
lemmas_flattened = [item for sublist in lemmas for item in sublist]
words = ' '.join(lemmas_flattened)

In [7]:
path = os.path.join('data','lemmas.txt')
write_to_text_file(words, path)

In [8]:
fasttext.skipgram(input_file=path,
                          output=FASTTEXT_MODEL_PATH,
                          thread=6,
                          silent=False,
                          dim=100,
                          minn=3,
                          maxn=6,
                          epoch=5,
                          lr=0.05,
                          min_count=3)

<fasttext.model.WordVectorModel at 0x7faddc3ec048>

In [4]:
model = load_fasttext_model(FASTTEXT_MODEL_PATH + '.bin')

In [5]:
test_word = 'python'
model.wv.most_similar(test_word, topn=10)

[('cython', 0.8306050300598145),
 ('qml', 0.8013296127319336),
 ('pythonexperience', 0.7899424433708191),
 ('bash', 0.7756275534629822),
 ('javacript', 0.7741864919662476),
 ('bpmn', 0.7604794502258301),
 ('javaprogramming', 0.7591749429702759),
 ('pythonia', 0.7578659653663635),
 ('pytorch', 0.757329523563385),
 ('gitlab', 0.7497174143791199)]

In [6]:
test_words = ['data', 'science']
model.wv.most_similar(positive=test_words,topn=10)

[('datascience', 0.9024331569671631),
 ('dataanalytics', 0.7857604026794434),
 ('bioscience', 0.7785853147506714),
 ('datarelated', 0.7641087770462036),
 ('analytics', 0.7460598945617676),
 ('lifescience', 0.744613766670227),
 ('scienceled', 0.7378937005996704),
 ('dataanalyst', 0.7292890548706055),
 ('dataanalysis', 0.7169116735458374),
 ('datajob', 0.7148062586784363)]