In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.preprocessing import OneHotEncoder

from preprocessing import Preprocessing

In [4]:
# Download stopwords dataset
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jacopo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
df = pd.read_csv('./data/stackechange_csv/datascience.stackexchange.com-posts.csv', sep=',')
df.dtypes

Id                 int64
PostTypeId         int64
CreationDate      object
Score              int64
ViewCount        float64
FavoriteCount    float64
Title             object
Body              object
Tags              object
Topic             object
dtype: object

In [6]:
df['Body'].fillna('', inplace=True)
df['Title'].fillna('', inplace=True)
df['Tags'].fillna('', inplace=True)

In [7]:
# def remove_html(column: str):
#   df[column] = df[column].str.replace('<.{1,6}>', '')

# def apply_lowercase(column: str):
#   df[column] = df[column].str.lower()

# def remove_special_characters(column: str):
#   df[column] = df[column].str.replace('\W', ' ')

# def apply_decontractions(column: str):
#   df[column] = df[column].str.replace("won't", "will not").str.replace("can\'t", "can not").str.replace("n\'t", " not").str.replace("\'re", " are").str.\
#                                                 replace("\'s", " is").str.replace("\'d", " would").str.replace("\'ll", " will").str.\
#                                                 replace("\'t", " not").str.replace("\'ve", " have").str.replace("\'m", " am")

# def preprocess_body(column):
#   remove_html(column)
#   apply_lowercase(column)
#   remove_special_characters(column)


# def preprocess_title(column):
#   remove_special_characters(column)
#   apply_lowercase(column)

In [22]:
df['Title'] = df['Title'].apply(Preprocessing.apply_lowercase)
df['Title'] = df['Title'].apply(Preprocessing.remove_special_characters)
df['Title'] = df['Title'].apply(Preprocessing.remove_html)

df['Title']

# TODO: try gensim simple_preprocess

0        how can i do simple machine learning without h...
1        what open source books  or other materials  pr...
2                                                      nan
3                                                      nan
4                 is data science the same as data mining 
                               ...                        
72866    question about non linearity of activation fun...
72867                                                  nan
72868    is it possible to  link couple connect  certai...
72869                                                  nan
72870                      one word changes everything nlp
Name: Title, Length: 72871, dtype: object

In [23]:
# Tokenization
sentences = []
for title in df['Title'].values:
    words = nltk.word_tokenize(title)
    sentences.append(words)

sentences[1]

['what',
 'open',
 'source',
 'books',
 'or',
 'other',
 'materials',
 'provide',
 'a',
 'relatively',
 'thorough',
 'overview',
 'of',
 'data',
 'science']

In [24]:
# Remove stopwords from vocabulary
from nltk.corpus import stopwords
# stopwords.words('english')
def remove_stopwords(words):
    return [w for w in words if w not in stopwords.words('english')]

sentences = [remove_stopwords(words) for words in sentences]
    
sentences[1]

['open',
 'source',
 'books',
 'materials',
 'provide',
 'relatively',
 'thorough',
 'overview',
 'data',
 'science']

### TF-IDF

In [25]:
from gensim import corpora, similarities
from gensim.models import TfidfModel
corpus = sentences
dictionary = corpora.Dictionary(corpus)
# dictionary.token2id
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]
tfidf = TfidfModel(bow_corpus)

# transform the whole corpus via TfIdf and store in index matrix
nf=len(dictionary.dfs)
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=nf)

query = corpus[1]
query_bow = dictionary.doc2bow(query)

# Compute similarity between query and this index
sims = index[tfidf[query_bow]]
print(f'Similarity between query (2nd document) and 3rd document: {sims[2]}%')
# Similarity between query and each document sorted
res = [e for e in sorted(enumerate(sims), key=lambda x: x[1], reverse=True)]


Similarity between query (2nd document) and 3rd document: 0.0%


### Word2Vec 

In [27]:
from embeddings import Embeddings

Embeddings.word2vec_similarity("hadoop", corpus)

[(['term', 'r', 'squared', 'vif', 'variance', 'inflation', 'factor', 'different', 'normal', 'r', 'squared', 'calculation'], 0.9804725), (['get', 'p', 'value', 'confident', 'interval', 'logisticregression', 'sklearn'], 0.97957623), (['regression', 'scatterplot', 'low', 'r', 'squared', 'high', 'p', 'values'], 0.97941035), (['difference', 'r', 'squared', 'adjusted', 'r', 'squared'], 0.97894543), (['r', 'phi', 'coefficient', 'calculation'], 0.9758071), (['necessary', 'take', 'log', 'transformation', 'data', 'values', 'get', 'minimum', 'mean', 'squared', 'error'], 0.9755108), (['computing', 'adjusted', 'p', 'values', 'batches'], 0.97486234), (['least', 'mean', 'square', 'linear', 'regression', 'discrete', 'values', 'axis'], 0.9743969), (['getting', 'wrong', 'ch2', 'values', 'sklearn', 'chi2'], 0.9724077), (['getting', 'different', 'chi', 'square', 'values', 'sklearn', 'function'], 0.9722057)]


In [13]:
# Word2Vec model
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Training algorithm: 1 for skip-gram; otherwise CBOW.
word2vec = Word2Vec(corpus, min_count=10, sg=0, window=10)

# Precompute L2-normalized vectors. 
# If replace is set, forget the original vectors and only keep the normalized ones = saves lots of memory!
# Note that you cannot continue training after doing a replace. The model becomes effectively read-only = you can call most_similar, similarity etc., but not train.
word2vec.init_sims(replace=True)
word2vec.wv.most_similar('classification')


    

  word2vec.init_sims(replace=True)


KeyError: "Key 'classification' not present in vocabulary"

In [20]:
# Word2Vec similarity
def word2vec_similarity(model, ws1, ws2):
    """Compute cosine similarity between two sets of words."""
    return model.wv.n_similarity(ws1, ws2)

query = "beer"
r = sorted([(d, word2vec_similarity(word2vec, query, d)) for d in corpus if len(d) != 0], key=lambda x: x[1], reverse=True)
print(query)
print(r[:10])

beer
[(['citra', 'hop', 'differ', 'hops'], 0.0), (['first', 'beer', 'ever', 'brewed'], 0.0), (['reduced', 'alcoholic', 'beer', 'made'], 0.0), (['temperature', 'serve', 'beer'], 0.0), (['best', 'angle', 'store', 'beer', 'bottles'], 0.0), (['certain', 'types', 'beer', 'get', 'drunk', 'quickly'], 0.0), (['difference', 'ale', 'lager'], 0.0), (['mull', 'beer'], 0.0), (['ipas', 'cause', 'worse', 'hangovers'], 0.0), (['average', 'brewing', 'time', 'craft', 'beer'], 0.0)]
