# Load data

In [None]:
import pandas as pd
from google.colab import drive
import os
drive.mount('/content/drive/')
os.chdir('/content/drive/My Drive/CA683_Assignment/YelpDataset/20210411')

Mounted at /content/drive/


In [None]:
path = './20210411_final_data_265062.csv' #/content/drive/MyDrive/CA683_Assignment/YelpDataset/20210411/20210411_final_data_265062.csv
review_col_list2 = ["stars","text"]
df = pd.read_csv(path, usecols=review_col_list2)

In [None]:
#df =pd.read_pickle('./pickle_review_df_preprocessed_104756.txt')

In [None]:
df.head(5)

Unnamed: 0,stars,text
0,1.0,10pm on a super bowl Sunday and they're alread...
1,5.0,Holy heck this place is amazing. I love their ...
2,4.0,Amazing shrimp taco. The others were good but...
3,3.0,the chips may well be the only thing worth goi...
4,4.0,Great food and fun atmosphere. Nothing bad to...


In [None]:
df = df[['stars','text']]

In [None]:
import spacy


nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
stop_words

In [None]:
spacy.blank("en")

In [None]:
from string import punctuation

## declare function

In [None]:
import numpy as np
import re
import glob
from smart_open import smart_open
import os
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn import utils
from collections import namedtuple, defaultdict
import logging
from sklearn.feature_extraction.text import TfidfVectorizer


# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    """
    Lower-case a text, replace line breaks with spaces and strip punctuation.

    Ref: https://stackoverflow.com/questions/20802056/python-regular-expression-1
    :param text: string
    :return:
        clean string (lower-cased, ASCII punctuation removed)
    """
    # Local import: the notebook only runs `from string import punctuation`,
    # which leaves the module name `string` unbound.
    import string

    norm_text = text.lower()
    # Replace HTML and plain line breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    norm_text = norm_text.replace('\n', ' ')
    # Drop every ASCII punctuation character in a single pass
    norm_text = norm_text.translate(str.maketrans('', '', string.punctuation))
    return norm_text


def concat_files(dirname, folders):
    """
    Merge the normalized text of every ``*.txt`` file of each folder into one
    file per folder (one source file per output line).

    The merged file is written directly under ``dirname`` and is named after
    the folder, with ``/`` replaced by ``-`` and ``.txt`` appended.
    :param dirname: string of directory
    :param folders: list of folder names (relative to ``dirname``)
    :return
        list of merged file names, in the order of ``folders``
    """
    merged_names = []

    for folder in folders:
        merged_name = folder.replace('/', '-') + '.txt'
        sources = glob.glob(os.path.join(dirname, folder, '*.txt'))
        print('{} records in {}...'.format(len(sources), merged_name))
        merged_names.append(merged_name)

        with smart_open(os.path.join(dirname, merged_name), 'wb') as sink:
            for source in sources:
                with smart_open(source, 'rb') as handle:
                    raw_text = handle.read().decode('utf-8')  # bytes -> str
                    cleaned = normalize_text(raw_text)  # lower-case, no punctuation
                    sink.write(cleaned.encode('utf-8') + b'\n')  # str -> bytes + newline

    return merged_names


def select_imdb(select_num, dirname, files, file_splits, file_sentiments):
    """
    Read up to ``select_num`` lines from each file and wrap every line in a
    ``sent_doc`` namedtuple (words, tags, split, sentiment).

    Note that the last entry of ``files`` is never read (``files[:-1]``).
    Document ids in ``tags`` run consecutively across all files.
    :param select_num: num of rows to select per file
    :param dirname: directory of txt files
    :param files: list of string name of files
    :param file_splits: list of string on train/test split, aligned with ``files``
    :param file_sentiments: list of string on pos/neg sentiment label, aligned with ``files``
    :return:
        list of namedtuple
    """
    sent_doc = namedtuple('sent_doc', ['words', 'tags', 'split', 'sentiment'])
    collected = []
    next_id = 0

    for position, fname in enumerate(files[:-1]):
        split_label = file_splits[position]
        sentiment_label = file_sentiments[position]

        with smart_open(os.path.join(dirname, fname), 'rb', encoding='utf-8') as texts:
            for line_no, line in enumerate(texts):
                if line_no >= select_num:
                    break
                # words and tags must both be lists for doc2vec
                words = gensim.utils.to_unicode(line).split()
                collected.append(sent_doc(words, [next_id], split_label, sentiment_label))
                next_id += 1

    return collected


class DocPreprocess(object):
    """
    Tokenize, optionally bigram-merge, lemmatize and tag a collection of docs.

    After construction the instance exposes:
      - ``simple_doc_tokens``: output of ``gensim.utils.simple_preprocess`` per doc
      - ``bi_detector``: fitted phrase detector (only set when ``build_bi`` is true)
      - ``new_docs``: tokens re-joined into one string per doc
      - ``doc_words``: lemmatized, POS-filtered, stop-word-free tokens per doc
      - ``tagdocs``: ``TaggedDocument`` per doc, tagged with its positional id
    """

    def __init__(self,
                 nlp,
                 stop_words,
                 docs,
                 labels,
                 build_bi=False,
                 min_count=5,
                 threshold=10,
                 allowed_postags=('ADV', 'VERB', 'ADJ', 'NOUN', 'PROPN', 'NUM')):
        """
        :param nlp: spacy nlp object
        :param stop_words: collection of stop words, e.g. spacy.lang.en.stop_words.STOP_WORDS
        :param docs: list, numpy array or series of raw text docs
        :param labels: list, numpy array or series of labels (stored, not used here)
        :param build_bi: when true, detect bigram phrases and merge them into single tokens
        :param min_count: ``min_count`` forwarded to gensim ``Phrases``
        :param threshold: ``threshold`` forwarded to gensim ``Phrases``
        :param allowed_postags: POS tags a token must carry to be kept
            (immutable tuple default, so the default cannot be shared and mutated across instances)
        """
        self.nlp = nlp
        self.stop_words = stop_words
        self.docs = docs
        self.labels = labels
        self.doc_ids = np.arange(len(docs))
        self.simple_doc_tokens = [gensim.utils.simple_preprocess(doc, deacc=True) for doc in self.docs]

        if build_bi:
            self.bi_detector = self.build_bi_detect(self.simple_doc_tokens, min_count=min_count, threshold=threshold)
            self.new_docs = self.make_bigram_doc(self.bi_detector, self.simple_doc_tokens)
        else:
            self.new_docs = self.make_simple_doc(self.simple_doc_tokens)
        self.doc_words = [self.lemmatize(doc, allowed_postags=allowed_postags) for doc in self.new_docs]
        self.tagdocs = [TaggedDocument(words=words, tags=[tag]) for words, tag in zip(self.doc_words, self.doc_ids)]

    def build_bi_detect(self, simple_doc_tokens, min_count, threshold):
        """
        Fit a gensim phrase (bigram) detector on tokenized docs.
        :param simple_doc_tokens: list of token lists
        :param min_count: ``min_count`` forwarded to ``Phrases``
        :param threshold: ``threshold`` forwarded to ``Phrases``
        :return:
            ``Phraser`` wrapping the fitted ``Phrases`` model
        """
        bi_ = gensim.models.phrases.Phrases(simple_doc_tokens, min_count=min_count, threshold=threshold)
        return gensim.models.phrases.Phraser(bi_)  # wrapper enhance efficiency

    def make_bigram_doc(self, bi_detector, simple_doc_tokens):
        """
        Apply the phrase detector to every doc and join the tokens back to text.
        :param bi_detector: object indexable by a token list, returning the merged tokens
        :param simple_doc_tokens: list of token lists
        :return:
            list of space-joined strings, one per doc
        """
        return [" ".join(bi_detector[doc_tokens]) for doc_tokens in simple_doc_tokens]

    def make_simple_doc(self, simple_doc_tokens):
        """
        Join every token list back into one space-separated string.
        :param simple_doc_tokens: list of token lists
        :return:
            list of strings, one per doc
        """
        return [" ".join(doc_tokens) for doc_tokens in simple_doc_tokens]

    def lemmatize(self, doc, allowed_postags):
        """
        Lemmatize words and remove stop_words.
        :param doc: text
        :param allowed_postags: collection of pos tags to keep
        :return:
            list of lemmas of the tokens whose POS tag is allowed and whose
            surface text is not a stop word
        """
        parsed = self.nlp(doc)
        return [
            token.lemma_
            for token in parsed
            if token.pos_ in allowed_postags and token.text not in self.stop_words
        ]



class DocModel(object):
    """Hold a Doc2Vec model together with the tagged docs it is trained on."""

    def __init__(self, docs, **kwargs):
        """
        Create the Doc2Vec model and build its vocabulary from ``docs``.
        :param docs: list of TaggedDocument
        :param kwargs: dictionary of (key,value) for Doc2Vec arguments
        """
        self.model = Doc2Vec(**kwargs)
        self.docs = docs
        self.model.build_vocab(list(self.docs))

    def custom_train(self, fixed_lr=False, fixed_lr_epochs=None):
        """
        Train the model either in one call (recommended) or epoch by epoch.

        In the epoch-by-epoch mode the docs are shuffled before every pass;
        after each pass ``alpha`` is lowered by 0.002 and ``min_alpha`` is set
        to the same value.
        :param fixed_lr: boolean, selects the epoch-by-epoch mode
        :param fixed_lr_epochs: num of passes for the epoch-by-epoch mode
        """
        if fixed_lr:
            for _ in range(fixed_lr_epochs):
                shuffled_docs = utils.shuffle(list(self.docs))
                self.model.train(shuffled_docs,
                                 total_examples=len(self.docs),
                                 epochs=1)
                self.model.alpha -= 0.002
                self.model.min_alpha = self.model.alpha  # fixed learning rate
            return

        self.model.train(list(self.docs),
                         total_examples=len(self.docs),
                         epochs=self.model.epochs)

    def test_orig_doc_infer(self):
        """
        Pick a random doc id, infer a vector from that doc's own words and
        print the result of ``most_similar()`` for it, so one can check
        whether the original doc id comes back as the most similar doc.
        """
        idx = np.random.randint(len(self.docs))
        print('idx: ' + str(idx))
        matches = [candidate for candidate in self.docs if candidate.tags[0] == idx]
        inferred_vec = self.model.infer_vector(matches[0].words)
        neighbours = self.model.docvecs.most_similar([inferred_vec])  # wrap vec in a list
        print(neighbours)


class MeanEmbeddingVectorizer(object):
    """
    Represent each tokenized doc by the plain average of its word vectors.

    Tokens missing from ``word_model.wv.vocab`` are skipped; a doc without any
    known token is mapped to an all-zero vector.
    """

    def __init__(self, word_model):
        """
        :param word_model: model exposing ``wv.vector_size``, ``wv.vocab`` and ``wv.get_vector``
        """
        self.word_model = word_model
        self.vector_size = word_model.wv.vector_size

    def fit(self, docs=None, y=None):
        """
        No-op, present to comply with the scikit-learn transformer protocol.

        ``docs`` and ``y`` are accepted and ignored so that both ``fit()`` and
        the scikit-learn style ``fit(X, y)`` calls work.
        :return:
            self
        """
        return self

    def transform(self, docs):
        """
        :param docs: iterable of token lists
        :return:
            array of shape (number of docs, vector_size)
        """
        return self.word_average_list(docs)

    def word_average(self, sent):
        """
        Compute average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return:
            array of shape (vector_size,); all zeros when no token is in the vocabulary
        """
        word_vectors = self.word_model.wv
        known = [word_vectors.get_vector(word) for word in sent if word in word_vectors.vocab]

        if not known:
            # Empty doc or only out-of-vocabulary tokens: fall back to zeros.
            logging.warning("cannot compute average owing to no vector for {}".format(sent))
            return np.zeros(self.vector_size)
        return np.array(known).mean(axis=0)

    def word_average_list(self, docs):
        """
        Compute average word vector for multiple docs, where docs had been tokenized.
        :param docs: iterable of sentences, each a list of separated tokens
        :return:
            array of shape (number of docs, vector_size); (0, vector_size) for no docs
        """
        rows = [self.word_average(sent) for sent in docs]
        if not rows:
            # np.vstack raises on an empty sequence, so build the empty result directly.
            return np.empty((0, self.vector_size))
        return np.vstack(rows)


class TfidfEmbeddingVectorizer(object):
    """
    Represent each tokenized doc by the average of its idf-weighted word vectors.

    ``fit`` learns one idf weight per word; ``transform`` multiplies every
    in-vocabulary word vector by its weight before averaging.
    """

    def __init__(self, word_model):
        """
        :param word_model: model exposing ``wv.vector_size``, ``wv.vocab`` and ``wv.get_vector``
        """
        self.word_model = word_model
        self.word_idf_weight = None  # mapping word -> idf, populated by fit()
        self.vector_size = word_model.wv.vector_size

    def fit(self, docs, y=None):
        """
        Fit in a list of docs, which had been preprocessed and tokenized,
        such as word bi-grammed, stop-words removed, lemmatized, part of speech filtered.
        Then build up a tfidf model to compute each word's idf as its weight.
        Noted that tf weight is already involved when constructing average word vectors, and thus omitted.
        :param docs: list of docs, which are tokenized
        :param y: ignored, accepted for scikit-learn style ``fit(X, y)`` calls
        :return:
            self
        """
        text_docs = [" ".join(doc) for doc in docs]

        tfidf = TfidfVectorizer()
        tfidf.fit(text_docs)  # must be list of text string

        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)  # used as default value for defaultdict
        self.word_idf_weight = defaultdict(
            lambda: max_idf,
            [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()])
        return self

    def transform(self, docs):
        """
        :param docs: iterable of token lists
        :return:
            array of shape (number of docs, vector_size)
        """
        return self.word_average_list(docs)

    def word_average(self, sent):
        """
        Compute the idf-weighted average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return:
            array of shape (vector_size,); all zeros when no token is in the vocabulary
        """
        word_vectors = self.word_model.wv
        weighted = [
            word_vectors.get_vector(word) * self.word_idf_weight[word]  # idf weighted
            for word in sent
            if word in word_vectors.vocab
        ]

        if not weighted:
            # Empty doc or only out-of-vocabulary tokens: fall back to zeros.
            logging.warning("cannot compute average owing to no vector for {}".format(sent))
            return np.zeros(self.vector_size)
        return np.array(weighted).mean(axis=0)

    def word_average_list(self, docs):
        """
        Compute the weighted average word vector for multiple docs, where docs had been tokenized.
        :param docs: iterable of sentences, each a list of separated tokens
        :return:
            array of shape (number of docs, vector_size); (0, vector_size) for no docs
        """
        rows = [self.word_average(sent) for sent in docs]
        if not rows:
            # np.vstack raises on an empty sequence, so build the empty result directly.
            return np.empty((0, self.vector_size))
        return np.vstack(rows)

## declare data frame

In [None]:
import numpy as np
import gensim


In [None]:
all_docs = DocPreprocess(nlp, stop_words, df['text'], df['stars'])

In [None]:
all_docs_df = pd.DataFrame(all_docs.doc_words)
print('Shape of dm doc2vec...')
display(all_docs_df.shape)


Shape of dm doc2vec...


(275197, 27)

Save dm doc2vec as csv file...


In [None]:

# Join every token column of a row into one space-separated string.
# The token columns are taken from the frame itself instead of a hard-coded
# count of 27, so the cell still works when the longest document has a
# different number of tokens; 'combined' is excluded so re-running is safe.
token_columns = [col for col in all_docs_df.columns if col != 'combined']
all_docs_df['combined'] = all_docs_df[token_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [None]:
all_docs_df['combined'] = all_docs_df['combined'].str.replace("None", "")


In [None]:
all_docs_df[all_docs_df.columns[-1]]

0         pm super bowl sunday close weak wonder hard ro...
1         holy place amazing love chicken taco far favor...
2         amazing shrimp taco good shrimp good come     ...
3         chip thing worth go salsa good expensive booze...
4         great food fun atmosphere bad trip vegas reaso...
                                ...                        
275192    good bubble tea refreshing delicious great var...
275193    yesterday order extra long bbq cheeseburger or...
275194    great service great sushi great korean style f...
275195    great place sushi seaweed salad salmon roll fr...
275196    literally good indian long long time super hig...
Name: combined, Length: 275197, dtype: object

In [None]:
all_docs_df.to_csv(os.path.join('./word_embedding/', 'all_docs_df_string.csv'), index=False, header=False)

In [None]:
df.head(5)

Unnamed: 0,stars,text
0,1.0,10pm on a super bowl Sunday and they're alread...
1,5.0,Holy heck this place is amazing. I love their ...
2,4.0,Amazing shrimp taco. The others were good but...
3,3.0,the chips may well be the only thing worth goi...
4,4.0,Great food and fun atmosphere. Nothing bad to...


In [None]:
all_docs_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
0,pm,super,bowl,sunday,close,weak,wonder,hard,rock,die,,,,,,,,,,,,,,,,,
1,holy,place,amazing,love,chicken,taco,far,favorite,great,customer,service,round,awesome,experience,,,,,,,,,,,,,
2,amazing,shrimp,taco,good,shrimp,good,come,,,,,,,,,,,,,,,,,,,,
3,chip,thing,worth,go,salsa,good,expensive,booze,cheap,woman,draw,,,,,,,,,,,,,,,,
4,great,food,fun,atmosphere,bad,trip,vegas,reasonable,pricing,,,,,,,,,,,,,,,,,,


In [None]:
print('Demo of doc words...')
all_docs.doc_words[5][:10]

Demo of doc words...


['price',
 'high',
 'food',
 'good',
 'service',
 'awesome',
 'gamble',
 'hard',
 'rock',
 'eat']

In [None]:
all_docs.labels.iloc[4]

4.0

In [None]:
import multiprocessing
import sys
from gensim.models.word2vec import Word2Vec

workers = multiprocessing.cpu_count()
print('number of cpu: {}'.format(workers))
assert gensim.models.doc2vec.FAST_VERSION > -1

number of cpu: 2


Word2Vec:
size: Using a higher dimensionality than vocabulary size would more-or-less guarantee 'overfitting'. The training could tend toward an idiosyncratic vector for each word – essentially like a 'one-hot' encoding – that would perform better than any other encoding, because there's no cross-word interference forced by representing a larger number of words in a smaller number of dimensions.
https://stackoverflow.com/questions/45444964/python-what-is-the-size-parameter-in-gensim-word2vec-model-class

In [None]:
word_model = Word2Vec(all_docs.doc_words,
                      min_count=5,
                      size=200,
                      window=5,
                      workers=workers,
                      iter=100)

In [None]:
# Disabled alternative model with 100-dimensional vectors. Every line of the
# call is commented out; with only the first line commented, the indented
# continuation lines were a syntax error.
#word_model1 = Word2Vec(all_docs.doc_words,
#                      min_count=5,
#                      size=100,
#                      window=5,
#                      workers=workers,
#                      iter=100)

## closed

In [None]:
# NOTE(review): `_save_word2vec_format` is not defined in any cell visible here,
# so this call raises NameError unless it is defined elsewhere — confirm the
# intent before re-running this cell.
_save_word2vec_format()

In [None]:
class SaveEmbeddingVectorizer(object):
    """
    Collect the raw (un-averaged) word vectors of tokenized docs.

    Unlike the mean vectorizer, no averaging happens here: every
    in-vocabulary token contributes its own row to the output.
    """

    def __init__(self, word_model):
        self.word_model = word_model
        self.vector_size = word_model.wv.vector_size

    def fit(self):  # comply with scikit-learn transformer requirement
        return self

    def transform(self, docs):  # comply with scikit-learn transformer requirement
        return self.word_average_list(docs)

    def word_average(self, sent):
        """
        Gather the word vectors of a single doc/sentence (no mean is taken).
        :param sent: list of sentence tokens
        :return:
            array with one row per in-vocabulary token, or a single zero
            vector of length vector_size when no token has a vector
        """
        vectors = [
            self.word_model.wv.get_vector(token)
            for token in sent
            if token in self.word_model.wv.vocab
        ]

        if vectors:
            return np.array(vectors)

        # No known token: fall back to a vector of zeros.
        logging.warning("cannot compute average owing to no vector for {}".format(sent))
        return np.zeros(self.vector_size)

    def word_average_list(self, docs):
        """
        Stack the per-doc results of ``word_average`` vertically.
        :param docs: list of sentence in list of separated tokens
        :return:
            array with one row per in-vocabulary token across all docs
            (plus one zero row for every doc without a known token)
        """
        per_doc = [self.word_average(sent) for sent in docs]
        return np.vstack(per_doc)


In [None]:
vec_tr = SaveEmbeddingVectorizer(word_model)
word_vec = vec_tr.transform(all_docs.doc_words)

print('Demo of word averaging doc vector...')
display(word_vec[4])



Demo of word averaging doc vector...


array([-0.73100859,  0.53037333, -0.58329123,  1.62846267,  0.87991852,
        0.27366176,  0.37439668,  0.31001189,  0.5886091 ,  0.25364378,
       -0.31257653, -2.46649408,  1.61072803, -0.43540999,  0.32875502,
        0.15425001, -0.03543835,  0.00551971, -1.01624215, -0.05144159,
       -0.60872883,  0.47550207, -0.48558488,  0.35515547, -0.79492766,
        0.20305444,  0.1457051 ,  0.06479654,  1.43793595,  0.67293614,
        0.98830068, -0.49632972,  0.64017451, -0.28098071, -1.48678672,
       -0.77395618,  0.40729362, -0.16168569,  0.5208267 ,  0.25416315,
        1.6975286 ,  0.36621776, -0.32313335, -1.1038897 , -0.01257934,
        0.33968061, -0.21247689,  0.1328427 , -0.59084111, -0.90501291,
       -0.08454107,  1.32522094, -1.07139111,  0.8720004 , -0.4511869 ,
        1.09826005,  0.02371061,  0.85070676,  1.84448028, -0.88466096,
        0.76252353, -0.23367882,  0.25538737,  0.19234794,  0.50928164,
        0.8666243 , -0.04602426, -0.15665436,  0.18787341,  0.44

In [None]:
np.savetxt(os.path.join('./','word_vec.csv'), word_vec, delimiter=',')

In [None]:
word_vec.shape

(1063674, 200)

## declare function

In [None]:
class MeanEmbeddingVectorizer(object):
    """
    Represent each tokenized doc by the plain average of its word vectors.

    Tokens missing from ``word_model.wv.vocab`` are skipped; a doc without any
    known token is mapped to an all-zero vector.
    """

    def __init__(self, word_model):
        """
        :param word_model: model exposing ``wv.vector_size``, ``wv.vocab`` and ``wv.get_vector``
        """
        self.word_model = word_model
        self.vector_size = word_model.wv.vector_size

    def fit(self, docs=None, y=None):
        """
        No-op, present to comply with the scikit-learn transformer protocol.

        ``docs`` and ``y`` are accepted and ignored so that both ``fit()`` and
        the scikit-learn style ``fit(X, y)`` calls work.
        :return:
            self
        """
        return self

    def transform(self, docs):
        """
        :param docs: iterable of token lists
        :return:
            array of shape (number of docs, vector_size)
        """
        return self.word_average_list(docs)

    def word_average(self, sent):
        """
        Compute average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return:
            array of shape (vector_size,); all zeros when no token is in the vocabulary
        """
        word_vectors = self.word_model.wv
        known = [word_vectors.get_vector(word) for word in sent if word in word_vectors.vocab]

        if not known:
            # Empty doc or only out-of-vocabulary tokens: fall back to zeros.
            logging.warning("cannot compute average owing to no vector for {}".format(sent))
            return np.zeros(self.vector_size)
        return np.array(known).mean(axis=0)

    def word_average_list(self, docs):
        """
        Compute average word vector for multiple docs, where docs had been tokenized.
        :param docs: iterable of sentences, each a list of separated tokens
        :return:
            array of shape (number of docs, vector_size); (0, vector_size) for no docs
        """
        rows = [self.word_average(sent) for sent in docs]
        if not rows:
            # np.vstack raises on an empty sequence, so build the empty result directly.
            return np.empty((0, self.vector_size))
        return np.vstack(rows)


In [None]:
mean_vec_tr = MeanEmbeddingVectorizer(word_model)
doc_vec = mean_vec_tr.transform(all_docs.doc_words)

print('Demo of word averaging doc vector...')
display(doc_vec[4])



Demo of word averaging doc vector...


array([-0.26582885,  0.22404371,  0.1287943 , -0.53785056,  0.07666932,
       -0.25357029,  0.54859519,  0.48824978, -0.16573307,  0.08325347,
       -0.67627025,  0.17856796, -0.25076663,  1.58109665,  0.30031291,
       -0.27396011,  0.01270146, -0.4202247 ,  0.08845488,  0.43136281,
        0.01242834,  0.04251913, -1.2551403 , -0.18725334,  0.10683066,
       -0.86247623,  0.45531714, -0.01915023,  0.27224302,  0.45840284,
       -0.25301138,  0.11330594,  0.47666591, -0.24454993, -0.51720333,
        0.08282014,  0.06324197,  0.68052262, -0.41023502, -0.15548965,
        0.7444976 , -0.60998619, -0.40074015,  0.27874401,  0.12731276,
        0.56095165, -0.43628484,  0.36843908, -0.37784827,  0.32394493,
        0.32103467, -0.00942253,  0.62965661, -0.97230816,  0.09360031,
       -0.62158096,  0.04257327,  0.7877357 ,  0.35178643,  0.53417367,
        0.02722833,  0.11257032, -0.50214124,  0.27306372, -0.59637451,
        0.09255053,  0.33958894, -0.80236363, -0.01565541, -0.33

In [None]:
print('Shape of word-mean doc2vec...')
display(doc_vec.shape)
#print('Save word-mean doc2vec as csv file...')
#np.savetxt(os.path.join(dir_path,'doc_vec.csv'), doc_vec, delimiter=',')

Shape of word-mean doc2vec...


(275197, 200)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275197 entries, 0 to 275196
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   stars   275197 non-null  float64
 1   text    275197 non-null  object 
dtypes: float64(1), object(1)
memory usage: 4.2+ MB


In [None]:
np.savetxt(os.path.join('./word_embedding/','doc_vec.csv'), doc_vec, delimiter=',')

In [None]:
class TfidfEmbeddingVectorizer(object):
    """
    Represent each tokenized doc by the average of its idf-weighted word vectors.

    ``fit`` learns one idf weight per word; ``transform`` multiplies every
    in-vocabulary word vector by its weight before averaging.
    """

    def __init__(self, word_model):
        """
        :param word_model: model exposing ``wv.vector_size``, ``wv.vocab`` and ``wv.get_vector``
        """
        self.word_model = word_model
        self.word_idf_weight = None  # mapping word -> idf, populated by fit()
        self.vector_size = word_model.wv.vector_size

    def fit(self, docs, y=None):
        """
        Fit in a list of docs, which had been preprocessed and tokenized,
        such as word bi-grammed, stop-words removed, lemmatized, part of speech filtered.
        Then build up a tfidf model to compute each word's idf as its weight.
        Noted that tf weight is already involved when constructing average word vectors, and thus omitted.
        :param docs: list of docs, which are tokenized
        :param y: ignored, accepted for scikit-learn style ``fit(X, y)`` calls
        :return:
            self
        """
        text_docs = [" ".join(doc) for doc in docs]

        tfidf = TfidfVectorizer()
        tfidf.fit(text_docs)  # must be list of text string

        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)  # used as default value for defaultdict
        self.word_idf_weight = defaultdict(
            lambda: max_idf,
            [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()])
        return self

    def transform(self, docs):
        """
        :param docs: iterable of token lists
        :return:
            array of shape (number of docs, vector_size)
        """
        return self.word_average_list(docs)

    def word_average(self, sent):
        """
        Compute the idf-weighted average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return:
            array of shape (vector_size,); all zeros when no token is in the vocabulary
        """
        word_vectors = self.word_model.wv
        weighted = [
            word_vectors.get_vector(word) * self.word_idf_weight[word]  # idf weighted
            for word in sent
            if word in word_vectors.vocab
        ]

        if not weighted:
            # Empty doc or only out-of-vocabulary tokens: fall back to zeros.
            logging.warning("cannot compute average owing to no vector for {}".format(sent))
            return np.zeros(self.vector_size)
        return np.array(weighted).mean(axis=0)

    def word_average_list(self, docs):
        """
        Compute the weighted average word vector for multiple docs, where docs had been tokenized.
        :param docs: iterable of sentences, each a list of separated tokens
        :return:
            array of shape (number of docs, vector_size); (0, vector_size) for no docs
        """
        rows = [self.word_average(sent) for sent in docs]
        if not rows:
            # np.vstack raises on an empty sequence, so build the empty result directly.
            return np.empty((0, self.vector_size))
        return np.vstack(rows)

In [None]:
tfidf_vec_tr = TfidfEmbeddingVectorizer(word_model)

In [None]:

tfidf_vec_tr.fit(all_docs.doc_words)  # fit tfidf model first
tfidf_doc_vec = tfidf_vec_tr.transform(all_docs.doc_words)



In [None]:
tfidf_doc_vec.shape

(275197, 200)

In [None]:
# Save tfidf word averaging doc2vec.
print('Shape of tfidf-word-mean doc2vec...')
display(tfidf_doc_vec.shape)
print('Save tfidf-word-mean doc2vec as csv file...')
np.savetxt(os.path.join('./word_embedding/', 'tfidf_doc_vec.csv'), tfidf_doc_vec, delimiter=',')

Shape of tfidf-word-mean doc2vec...


(275197, 200)

Save tfidf-word-mean doc2vec as csv file...


# GloVe
CLOSED - no pretrained GloVe vectors are available for the Yelp dataset, and no tutorial could be found for training or converting embeddings to GloVe on it.

In [None]:
from gensim.test.utils import get_tmpfile, datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec



# Load in GloVe vector.
# Absolute path to the pretrained GloVe vectors. It is used directly:
# gensim's `datapath` helper resolves names inside gensim's bundled test-data
# directory and only returned this path unchanged because it is absolute.
glove_vec_fi = '/content/drive/MyDrive/CA683_Assignment/YelpDataset/102442_related/glove.twitter.27B.200d.txt'
tmp_word2vec_fi = get_tmpfile('tmp_glove2word2vec.txt')

# Convert the GloVe text file to word2vec text format in a temp file
glove2word2vec(glove_vec_fi, tmp_word2vec_fi)

# Load the converted vectors as KeyedVectors
glove_word_model = KeyedVectors.load_word2vec_format(tmp_word2vec_fi)

In [None]:
class MeanEmbeddingVectorizerGlove(object):
    """
    Represent each tokenized doc by the plain average of its GloVe word vectors.

    Tokens missing from ``glove_word_model.wv.vocab`` are skipped; a doc
    without any known token is mapped to an all-zero vector.
    """

    def __init__(self, glove_word_model):
        """
        :param glove_word_model: model exposing ``wv.vector_size``, ``wv.vocab`` and ``wv.get_vector``
        """
        self.glove_word_model = glove_word_model
        self.vector_size = glove_word_model.wv.vector_size

    def fit(self, docs=None, y=None):
        """
        No-op, present to comply with the scikit-learn transformer protocol.

        ``docs`` and ``y`` are accepted and ignored so that both ``fit()`` and
        the scikit-learn style ``fit(X, y)`` calls work.
        :return:
            self
        """
        return self

    def transform(self, docs):
        """
        :param docs: iterable of token lists
        :return:
            array of shape (number of docs, vector_size)
        """
        return self.word_average_list(docs)

    def word_average(self, sent):
        """
        Compute average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return:
            array of shape (vector_size,); all zeros when no token is in the vocabulary
        """
        word_vectors = self.glove_word_model.wv
        known = [word_vectors.get_vector(word) for word in sent if word in word_vectors.vocab]

        if not known:
            # Empty doc or only out-of-vocabulary tokens: fall back to zeros.
            logging.warning("cannot compute average owing to no vector for {}".format(sent))
            return np.zeros(self.vector_size)
        return np.array(known).mean(axis=0)

    def word_average_list(self, docs):
        """
        Compute average word vector for multiple docs, where docs had been tokenized.
        :param docs: iterable of sentences, each a list of separated tokens
        :return:
            array of shape (number of docs, vector_size); (0, vector_size) for no docs
        """
        rows = [self.word_average(sent) for sent in docs]
        if not rows:
            # np.vstack raises on an empty sequence, so build the empty result directly.
            return np.empty((0, self.vector_size))
        return np.vstack(rows)

In [None]:
mean_vec_tr_Glove = MeanEmbeddingVectorizerGlove(glove_word_model)
doc_vec_Glove = mean_vec_tr_Glove.transform(all_docs.doc_words)

print('Demo of word averaging doc vector...')
display(doc_vec_Glove[4])

  


Demo of word averaging doc vector...


array([-3.36566567e-01, -1.11159466e-01,  1.77930407e-02,  1.02578469e-01,
        1.28990591e-01,  1.90640658e-01,  5.73259413e-01, -5.35635948e-02,
       -2.61349324e-02, -2.10969269e-01, -1.07924724e-02, -7.87606016e-02,
       -4.86807376e-01,  4.09824625e-02, -3.72564942e-01,  2.20287666e-01,
        4.27749865e-02,  4.95696627e-02, -2.86673814e-01, -4.95376699e-02,
       -2.89237231e-01,  7.31457919e-02, -1.47759393e-01, -8.19321489e-04,
        1.18219942e-01,  6.85406089e-01,  6.41084015e-02,  1.77990809e-01,
        9.39447656e-02, -9.43731591e-02, -5.30000543e-04,  1.60350017e-02,
        1.12101942e-01, -1.38113603e-01, -2.05214038e-01, -1.66838542e-01,
        3.24341282e-02, -4.01958041e-02,  1.09357357e-01,  2.86124021e-01,
        9.71353352e-02,  7.64728815e-04,  2.05038022e-02, -2.60471880e-01,
        2.23166659e-01, -1.53897271e-01,  2.72532284e-01, -6.57266527e-02,
        1.44903868e-01,  1.51310107e-02, -3.65707949e-02, -3.26788351e-02,
       -1.10615902e-01, -

In [None]:
print('Shape of word-mean doc2vec Glove...')
display(doc_vec_Glove.shape)
#print('Save word-mean doc2vec Glove as csv file...')
#np.savetxt(os.path.join(dir_path,'doc_vec.csv'), doc_vec, delimiter=',')

Shape of word-mean doc2vec Glove...


(102442, 200)

In [None]:
np.savetxt(os.path.join('./','doc_vec_Glove.csv'), doc_vec_Glove, delimiter=',')

In [None]:
class TfidfEmbeddingVectorizerGlove(object):
    """
    Represent each tokenized doc by the average of its idf-weighted GloVe word vectors.

    ``fit`` learns one idf weight per word; ``transform`` multiplies every
    in-vocabulary word vector by its weight before averaging.
    """

    def __init__(self, glove_word_model):
        """
        :param glove_word_model: model exposing ``wv.vector_size``, ``wv.vocab`` and ``wv.get_vector``
        """
        self.glove_word_model = glove_word_model
        self.word_idf_weight = None  # mapping word -> idf, populated by fit()
        self.vector_size = glove_word_model.wv.vector_size

    def fit(self, docs, y=None):
        """
        Fit in a list of docs, which had been preprocessed and tokenized,
        such as word bi-grammed, stop-words removed, lemmatized, part of speech filtered.
        Then build up a tfidf model to compute each word's idf as its weight.
        Noted that tf weight is already involved when constructing average word vectors, and thus omitted.
        :param docs: list of docs, which are tokenized
        :param y: ignored, accepted for scikit-learn style ``fit(X, y)`` calls
        :return:
            self
        """
        text_docs = [" ".join(doc) for doc in docs]

        tfidf = TfidfVectorizer()
        tfidf.fit(text_docs)  # must be list of text string

        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)  # used as default value for defaultdict
        self.word_idf_weight = defaultdict(
            lambda: max_idf,
            [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()])
        return self

    def transform(self, docs):
        """
        :param docs: iterable of token lists
        :return:
            array of shape (number of docs, vector_size)
        """
        return self.word_average_list(docs)

    def word_average(self, sent):
        """
        Compute the idf-weighted average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return:
            array of shape (vector_size,); all zeros when no token is in the vocabulary
        """
        word_vectors = self.glove_word_model.wv
        weighted = [
            word_vectors.get_vector(word) * self.word_idf_weight[word]  # idf weighted
            for word in sent
            if word in word_vectors.vocab
        ]

        if not weighted:
            # Empty doc or only out-of-vocabulary tokens: fall back to zeros.
            logging.warning("cannot compute average owing to no vector for {}".format(sent))
            return np.zeros(self.vector_size)
        return np.array(weighted).mean(axis=0)

    def word_average_list(self, docs):
        """
        Compute the weighted average word vector for multiple docs, where docs had been tokenized.
        :param docs: iterable of sentences, each a list of separated tokens
        :return:
            array of shape (number of docs, vector_size); (0, vector_size) for no docs
        """
        rows = [self.word_average(sent) for sent in docs]
        if not rows:
            # np.vstack raises on an empty sequence, so build the empty result directly.
            return np.empty((0, self.vector_size))
        return np.vstack(rows)

In [None]:
tfidf_vec_tr_Glove = TfidfEmbeddingVectorizerGlove(glove_word_model)

  import sys


In [None]:
tfidf_vec_tr_Glove.fit(all_docs.doc_words)  # fit tfidf model first
tfidf_doc_vec_Glove = tfidf_vec_tr_Glove.transform(all_docs.doc_words)



In [None]:
tfidf_doc_vec_Glove.shape

(102442, 200)

In [None]:
# Save tfidf word averaging doc2vec.
print('Shape of tfidf-word-mean doc2vec...')
display(tfidf_doc_vec_Glove.shape)
print('Save tfidf-word-mean doc2vec as csv file...')
np.savetxt(os.path.join('./', 'tfidf_doc_vec_Glove.csv'), tfidf_doc_vec_Glove, delimiter=',')

Shape of tfidf-word-mean doc2vec...


(102442, 200)

Save tfidf-word-mean doc2vec as csv file...


# Docvec

In [None]:
class DocModel(object):
	"""
	Thin wrapper that owns a gensim Doc2Vec model together with its training corpus.
	"""

	# Amount subtracted from the learning rate after every manually driven epoch.
	_LR_STEP = 0.002
	# Lower bound for the manually decayed learning rate, so it can never reach
	# zero or turn negative when many epochs are requested.
	_LR_FLOOR = 0.0001

	def __init__(self, docs, **kwargs):
		"""
		Create the Doc2Vec model and build its vocabulary from ``docs``.
		:param docs: list of TaggedDocument
		:param kwargs: dictionary of (key,value) for Doc2Vec arguments
		"""
		self.model = Doc2Vec(**kwargs)
		self.docs = docs
		self.model.build_vocab(list(self.docs))

	def custom_train(self, fixed_lr=False, fixed_lr_epochs=None):
		"""
		Train Doc2Vec with two options, without fixed learning rate(recommended) or with fixed learning rate.
		Fixed learning rate also includes implementation of shuffling training dataset.
		:param fixed_lr: boolean
		:param fixed_lr_epochs: num of epochs for fixed lr training (required when fixed_lr is True)
		:raises ValueError: if fixed_lr is True but fixed_lr_epochs is not given
		"""
		if not fixed_lr:
			# Let gensim schedule the learning rate over all configured epochs.
			self.model.train(list(self.docs),
							 total_examples=len(self.docs),
							 epochs=self.model.epochs)
			return

		if fixed_lr_epochs is None:
			raise ValueError('fixed_lr_epochs must be given when fixed_lr=True')

		for _ in range(fixed_lr_epochs):
			# One pass over a freshly shuffled corpus per iteration.
			self.model.train(utils.shuffle(list(self.docs)),
							 total_examples=len(self.docs),
							 epochs=1)
			# Step the rate down by hand, but never below the positive floor.
			self.model.alpha = max(self.model.alpha - self._LR_STEP, self._LR_FLOOR)
			self.model.min_alpha = self.model.alpha  # fixed learning rate

	def test_orig_doc_infer(self):
		"""
		Use the original doc as input for model's vector inference,
		and then compare using most_similar()
		to see if model finds the original doc id be the most similar doc to the input.
		Prints the result; if no doc carries the randomly drawn index as its first
		tag, a short notice is printed and nothing is inferred.
		"""
		idx = np.random.randint(len(self.docs))
		print('idx: ' + str(idx))
		# Stop at the first doc whose first tag equals the drawn index.
		doc = next((d for d in self.docs if d.tags[0] == idx), None)
		if doc is None:
			print('no document tagged {}; skipping inference check'.format(idx))
			return
		inferred_vec = self.model.infer_vector(doc.words)
		print(self.model.docvecs.most_similar([inferred_vec]))  # wrap vec in a list

Key `Doc2Vec` parameters (quoted from the gensim documentation):

- `dm` ({1,0}, optional) – Defines the training algorithm. If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
- `negative` (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
- `hs` ({1,0}, optional) – If 1, hierarchical softmax will be used for model training. If set to 0, and negative is non-zero, negative sampling will be used.
- `sample` (float, optional) – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
- `alpha` (float, optional) – The initial learning rate.
- `min_alpha` (float, optional) – Learning rate will linearly drop to min_alpha as training progresses.
- `epochs` (int, optional) – Number of iterations (epochs) over the corpus. Defaults to 10 for Doc2Vec.

Source: https://radimrehurek.com/gensim/models/doc2vec.html

In [None]:
# Keyword arguments handed to Doc2Vec through DocModel(**dm_args).
dm_args = {
    'dm': 1,  # 1 selects 'distributed memory' (PV-DM)
    'dm_mean': 1,
    'vector_size': 100,
    'window': 5,
    'negative': 5,  # > 0 enables negative sampling with this many noise words
    'hs': 0,  # 0 disables hierarchical softmax, so negative sampling is used
    'min_count': 5,
    'sample': 0,
    'workers': workers,  # not defined in this cell; presumably set in an earlier cell
    'alpha': 0.025,  # initial learning rate
    'min_alpha': 0.025,  # same value as alpha, so there is no lower rate to drop to
    'epochs': 100,
    'comment': 'alpha=0.025'
}

In [None]:
dm = DocModel(docs=all_docs.tagdocs, **dm_args)

In [None]:
dm.custom_train()

In [None]:
# Save doc2vec as feature dataframe.
# Collect the trained document vectors by integer position, one row per document.
dm_doc_vec_ls = [dm.model.docvecs[i] for i in range(len(dm.model.docvecs))]


dm_doc_vec = pd.DataFrame(dm_doc_vec_ls)
print('Shape of dm doc2vec...')
display(dm_doc_vec.shape)

print('Save dm doc2vec as csv file...')
# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs('./word_embedding/', exist_ok=True)
dm_doc_vec.to_csv(os.path.join('./word_embedding/', 'dm_doc_vec.csv'), index=False, header=False)

Shape of dm doc2vec...


(275197, 100)

Save dm doc2vec as csv file...


In [None]:

print('Shape of target labels...')
display(all_docs.labels.shape)
target_labels = all_docs.labels

print('Save target labels...')
# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs('./word_embedding/', exist_ok=True)
target_labels.to_csv(os.path.join('./word_embedding/', 'target_labels.csv'), index=False, header=True)

Shape of target labels...


(275197,)

Save target labels...


# Classification Models

SGDClassifier or Logistic Regression applied on:

- Tf-Idf Weighted Averaging Word Vector
- PV-DM Doc2vec
- Tf-Idf and Doc2vec Concatenated Feature

## Prepare

In [None]:

import os
import pandas as pd

# Read in saved files.


doc_vec = pd.read_csv(os.path.join('./word_embedding/', 'doc_vec.csv'), header=None)
tfidf_doc_vec = pd.read_csv(os.path.join('./word_embedding/', 'tfidf_doc_vec.csv'), header=None)
#doc_vec_Glove = pd.read_csv(os.path.join('./word_embedding/', 'doc_vec_Glove.csv'), header=None)
#tfidf_doc_vec_Glove = pd.read_csv(os.path.join('./word_embedding/', 'tfidf_doc_vec_Glove.csv'), header=None)
dm_doc_vec = pd.read_csv(os.path.join('./word_embedding/', 'dm_doc_vec.csv'), header=None)
target_labels = pd.read_csv(os.path.join('./word_embedding/', 'target_labels.csv'), header=0)

In [None]:
from sklearn.linear_model import LogisticRegression

# Classification via Logistic Model
logistic = LogisticRegression(random_state=1, multi_class='multinomial', solver='saga')

In [None]:
from sklearn.linear_model import SGDClassifier

# (Optional) Classification via stochastic gradient descent classifier.
sgd = SGDClassifier(loss='hinge',
                    verbose=1,
                    random_state=1,
                    learning_rate='invscaling',
                    eta0=1)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:

import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import math
import seaborn as sns

def split_size(df, train=0.8, valid=0.1):
    """
    Turn train/valid fractions into absolute row counts for ``df``.
    :param df: any sized collection (only ``len(df)`` is used)
    :param train: fraction of rows for training
    :param valid: fraction of rows for validation
    :return:
        (train_size, valid_size, test_size); the first two are rounded down and
        the test set receives every remaining row
    """
    total = len(df)
    sizes = [math.floor(total * fraction) for fraction in (train, valid)]
    sizes.append(total - sum(sizes))
    return tuple(sizes)

In [None]:
from sklearn.model_selection import train_test_split




def main(model, df, concate=False, concat_df=None, labels=None, train=0.8, valid=0.1):
    """
    Split the features into train/valid/test sets (stratified) and fit ``model``.
    :param model: estimator exposing ``fit(X, y)``
    :param df: feature table, one row per document
    :param concate: when True, ``concat_df`` is appended column-wise to ``df``
    :param concat_df: extra feature table used only when ``concate`` is True
    :param labels: single-column target labels aligned with ``df``; defaults to the
        notebook-level ``target_labels``
    :param train: fraction of rows kept for training
    :param valid: fraction of rows kept for validation (0 skips the validation split)
    :return:
        (model, train_X, valid_X, test_X, train_y, valid_y, test_y), where
        valid_X and valid_y are None if no validation split was made
    """
    if labels is None:
        labels = target_labels
    if concate:
        df = pd.concat([df, concat_df], axis=1, ignore_index=True)

    # Specify train/valid/test size.
    _, valid_size, test_size = split_size(df, train=train, valid=valid)

    # Prepare test dataset.
    train_X, test_X, train_y, test_y = train_test_split(df,
                                                        labels,
                                                        test_size=test_size,
                                                        random_state=1,
                                                        stratify=labels)

    # Prepare valid dataset (carved out of the remaining training rows).
    valid_X = valid_y = None
    if valid_size != 0:
        train_X, valid_X, train_y, valid_y = train_test_split(train_X,
                                                              train_y,
                                                              test_size=valid_size,
                                                              random_state=1,
                                                              stratify=train_y)

    print('Shape of train_X: {}'.format(train_X.shape))
    print('Shape of valid_X: {}'.format(valid_X.shape if valid_X is not None else (0, 0)))
    print('Shape of text_X: {}'.format(test_X.shape))

    # Labels loaded from csv arrive as a one-column frame; estimators expect a
    # 1-D target, so flatten it for fitting only (the returned splits are untouched).
    model.fit(train_X, np.ravel(train_y))

    return model, train_X, valid_X, test_X, train_y, valid_y, test_y

## Simple Averaging Word Vector

In [None]:

model = RandomForestClassifier()  # or choose sgd.
df = doc_vec
concate = False
concat_df = dm_doc_vec

In [None]:

# __main__
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (220157, 200)
Shape of valid_X: (0, 0)
Shape of text_X: (55040, 200)




In [None]:
def sk_evaluate(model, feature, label, label_names):
    """
    Predict with a fitted model and print its scores on one dataset.
    :param model: fitted estimator exposing ``predict``
    :param feature: feature table to predict on
    :param label: true labels for ``feature``
    :param label_names: display names passed to classification_report (may be None)
    :return:
        (pred, true): the predictions and the true labels as a numpy array
    """
    y_pred = model.predict(feature)
    y_true = np.array(label)

    print('Score on dataset...\n')

    matrix = confusion_matrix(y_true, y_pred)
    print('Confusion Matrix:\n', matrix)

    report = classification_report(y_true, y_pred, target_names=label_names)
    print('\nClassification Report:\n', report)

    accuracy = accuracy_score(y_true, y_pred)
    print('\naccuracy: {:.3f}'.format(accuracy))

    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    print('f1 score: {:.3f}'.format(weighted_f1))

    return y_pred, y_true

In [None]:
print('Performance of Mean Word Vector on training dataset...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Mean Word Vector on training dataset...
Score on dataset...

Confusion Matrix:
 [[ 16906      0      0      0     27]
 [     4   9572      1      3      7]
 [     1      1  16322      6     44]
 [     0      0      5  46704     66]
 [     0      0      2      3 130483]]

Classification Report:
               precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     16933
         2.0       1.00      1.00      1.00      9587
         3.0       1.00      1.00      1.00     16374
         4.0       1.00      1.00      1.00     46775
         5.0       1.00      1.00      1.00    130488

    accuracy                           1.00    220157
   macro avg       1.00      1.00      1.00    220157
weighted avg       1.00      1.00      1.00    220157


accuracy: 0.999
f1 score: 0.999


In [None]:
print('Performance of Mean Word Vector on testing dataset...')
_, _ = sk_evaluate(clf, test_X, test_y, label_names=None)

Performance of Mean Word Vector on testing dataset...
Score on dataset...

Confusion Matrix:
 [[ 2946    71   145   181   891]
 [  943   107   298   333   716]
 [  539    92   520   854  2088]
 [  230    19   206  1399  9840]
 [  213     8    50   816 31535]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.60      0.70      0.65      4234
         2.0       0.36      0.04      0.08      2397
         3.0       0.43      0.13      0.20      4093
         4.0       0.39      0.12      0.18     11694
         5.0       0.70      0.97      0.81     32622

    accuracy                           0.66     55040
   macro avg       0.50      0.39      0.38     55040
weighted avg       0.59      0.66      0.59     55040


accuracy: 0.663
f1 score: 0.588


## Tf-Idf Weighted Averaging Word Vector

In [None]:
model = RandomForestClassifier()  # or choose sgd.
df = tfidf_doc_vec
concate = False
concat_df = dm_doc_vec

In [None]:
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (220157, 200)
Shape of valid_X: (0, 0)
Shape of text_X: (55040, 200)




In [None]:
print('Performance of Tf-Idf Mean Word Vector on training dataset...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Tf-Idf Mean Word Vector on training dataset...
Score on dataset...

Confusion Matrix:
 [[ 16906      0      0      0     27]
 [     4   9572      1      3      7]
 [     1      1  16322      6     44]
 [     0      0      5  46704     66]
 [     0      0      2      3 130483]]

Classification Report:
               precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     16933
         2.0       1.00      1.00      1.00      9587
         3.0       1.00      1.00      1.00     16374
         4.0       1.00      1.00      1.00     46775
         5.0       1.00      1.00      1.00    130488

    accuracy                           1.00    220157
   macro avg       1.00      1.00      1.00    220157
weighted avg       1.00      1.00      1.00    220157


accuracy: 0.999
f1 score: 0.999


In [None]:
#test on testing data
print('Performance of Tf-Idf Mean Word Vector on testing dataset...')
_, _ = sk_evaluate(clf, test_X, test_y, label_names=None)

Performance of Tf-Idf Mean Word Vector on testing dataset...
Score on dataset...

Confusion Matrix:
 [[ 2946    71   145   181   891]
 [  943   107   298   333   716]
 [  539    92   520   854  2088]
 [  230    19   206  1399  9840]
 [  213     8    50   816 31535]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.60      0.70      0.65      4234
         2.0       0.36      0.04      0.08      2397
         3.0       0.43      0.13      0.20      4093
         4.0       0.39      0.12      0.18     11694
         5.0       0.70      0.97      0.81     32622

    accuracy                           0.66     55040
   macro avg       0.50      0.39      0.38     55040
weighted avg       0.59      0.66      0.59     55040


accuracy: 0.663
f1 score: 0.588


## PV-DM Doc2vec

In [None]:
model = RandomForestClassifier()  # or choose sgd.
df = dm_doc_vec
concate = False
concat_df = dm_doc_vec

In [None]:
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (220157, 100)
Shape of valid_X: (0, 0)
Shape of text_X: (55040, 100)




In [None]:
print('Performance of Doc2vec on training dataset...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Doc2vec on training dataset...
Score on dataset...

Confusion Matrix:
 [[ 16933      0      0      0      0]
 [     0   9587      0      0      0]
 [     0      0  16374      0      0]
 [     0      0      0  46775      0]
 [     0      0      0      0 130488]]

Classification Report:
               precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     16933
         2.0       1.00      1.00      1.00      9587
         3.0       1.00      1.00      1.00     16374
         4.0       1.00      1.00      1.00     46775
         5.0       1.00      1.00      1.00    130488

    accuracy                           1.00    220157
   macro avg       1.00      1.00      1.00    220157
weighted avg       1.00      1.00      1.00    220157


accuracy: 1.000
f1 score: 1.000


In [None]:
print('Performance of Doc2vec on testing dataset...')
_, _ = sk_evaluate(clf, test_X, test_y, label_names=None)

Performance of Doc2vec on testing dataset...
Score on dataset...

Confusion Matrix:
 [[  179     0     5    83  3967]
 [   56     2    14   105  2220]
 [   31     0    23   248  3791]
 [    6     0     7   303 11378]
 [    4     0     1   224 32393]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.65      0.04      0.08      4234
         2.0       1.00      0.00      0.00      2397
         3.0       0.46      0.01      0.01      4093
         4.0       0.31      0.03      0.05     11694
         5.0       0.60      0.99      0.75     32622

    accuracy                           0.60     55040
   macro avg       0.61      0.21      0.18     55040
weighted avg       0.55      0.60      0.46     55040


accuracy: 0.598
f1 score: 0.462


## Tf-Idf and Doc2vec Concatenated Feature

### logistic

In [None]:
model = logistic  # or choose sgd.
df = tfidf_doc_vec
concate = True  # set to True.
concat_df = dm_doc_vec

In [None]:
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (220157, 300)
Shape of valid_X: (0, 0)
Shape of text_X: (55040, 300)


  y = column_or_1d(y, warn=True)


In [None]:
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset...
Score on dataset...

Confusion Matrix:
 [[ 12947   1059    941    424   1562]
 [  3577   1641   2151    823   1395]
 [  1767   1012   4978   4099   4518]
 [   782    223   2490  11024  32256]
 [   877    117    793   6842 121859]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.65      0.76      0.70     16933
         2.0       0.40      0.17      0.24      9587
         3.0       0.44      0.30      0.36     16374
         4.0       0.47      0.24      0.32     46775
         5.0       0.75      0.93      0.83    130488

    accuracy                           0.69    220157
   macro avg       0.54      0.48      0.49    220157
weighted avg       0.65      0.69      0.65    220157


accuracy: 0.692
f1 score: 0.653


In [None]:
#test on testing data
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using logistic ...')
_, _ = sk_evaluate(clf, test_X, test_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using logistic ...
Score on dataset...

Confusion Matrix:
 [[ 3187   264   245   115   423]
 [  928   384   553   210   322]
 [  470   259  1202  1004  1158]
 [  203    43   562  2750  8136]
 [  212    26   201  1636 30547]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.64      0.75      0.69      4234
         2.0       0.39      0.16      0.23      2397
         3.0       0.44      0.29      0.35      4093
         4.0       0.48      0.24      0.32     11694
         5.0       0.75      0.94      0.83     32622

    accuracy                           0.69     55040
   macro avg       0.54      0.48      0.48     55040
weighted avg       0.65      0.69      0.65     55040


accuracy: 0.692
f1 score: 0.651


### decision tree

In [None]:
model = DecisionTreeClassifier()  # or choose sgd.
df = tfidf_doc_vec
concate = True  # set to True.
concat_df = dm_doc_vec

In [None]:
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (81953, 300)
Shape of valid_X: (0, 0)
Shape of text_X: (20489, 300)


In [None]:
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset using decision tree...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset using decision tree...
Score on dataset...

Confusion Matrix:
 [[10908     0     0     0     0]
 [    0  5218     0     0     0]
 [    0     0  6198     0     0]
 [    0     0     0 16393     0]
 [    0     0     0     0 43236]]

Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00     10908
           2       1.00      1.00      1.00      5218
           3       1.00      1.00      1.00      6198
           4       1.00      1.00      1.00     16393
           5       1.00      1.00      1.00     43236

    accuracy                           1.00     81953
   macro avg       1.00      1.00      1.00     81953
weighted avg       1.00      1.00      1.00     81953


accuracy: 1.000
f1 score: 1.000


In [None]:
#test on testing data
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using decision tree...')
_, _ = sk_evaluate(clf, test_X, test_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using decision tree...
Score on dataset...

Confusion Matrix:
 [[1356  435  274  263  399]
 [ 414  241  204  199  247]
 [ 266  200  234  380  469]
 [ 218  222  337 1074 2248]
 [ 406  339  513 2268 7283]]

Classification Report:
               precision    recall  f1-score   support

           1       0.51      0.50      0.50      2727
           2       0.17      0.18      0.18      1305
           3       0.15      0.15      0.15      1549
           4       0.26      0.26      0.26      4099
           5       0.68      0.67      0.68     10809

    accuracy                           0.50     20489
   macro avg       0.35      0.35      0.35     20489
weighted avg       0.50      0.50      0.50     20489


accuracy: 0.497
f1 score: 0.500


### random forest

In [None]:
model = RandomForestClassifier()  # or choose sgd.
df = tfidf_doc_vec
concate = True  # set to True.
concat_df = dm_doc_vec

In [None]:
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (81953, 300)
Shape of valid_X: (0, 0)
Shape of text_X: (20489, 300)




In [None]:
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset using RandomForestClassifier ...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset using RandomForestClassifier ...
Score on dataset...

Confusion Matrix:
 [[10908     0     0     0     0]
 [    0  5218     0     0     0]
 [    0     0  6198     0     0]
 [    0     0     0 16393     0]
 [    0     0     0     0 43236]]

Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00     10908
           2       1.00      1.00      1.00      5218
           3       1.00      1.00      1.00      6198
           4       1.00      1.00      1.00     16393
           5       1.00      1.00      1.00     43236

    accuracy                           1.00     81953
   macro avg       1.00      1.00      1.00     81953
weighted avg       1.00      1.00      1.00     81953


accuracy: 1.000
f1 score: 1.000


In [None]:
#test on testing data
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using RandomForestClassifier ...')
_, _ = sk_evaluate(clf, test_X, test_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using RandomForestClassifier ...
Score on dataset...

Confusion Matrix:
 [[ 2156    29    17    51   474]
 [  646    66    40    78   475]
 [  350    32    82   163   922]
 [  160    12    17   251  3659]
 [  169     6     6   130 10498]]

Classification Report:
               precision    recall  f1-score   support

           1       0.62      0.79      0.69      2727
           2       0.46      0.05      0.09      1305
           3       0.51      0.05      0.10      1549
           4       0.37      0.06      0.11      4099
           5       0.65      0.97      0.78     10809

    accuracy                           0.64     20489
   macro avg       0.52      0.39      0.35     20489
weighted avg       0.57      0.64      0.54     20489


accuracy: 0.637
f1 score: 0.539


# DAO_TEST

In [None]:
import time
import datetime

#import cPickle as pickle
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pylab
import re
import scipy as sp
import seaborn

from gensim import corpora, models
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.lda import LDA
#from sklearn.qda import QDA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

plt.rc('figure', figsize=(10,6))
seaborn.set()
colors = seaborn.color_palette()

In [None]:
label_keys =[1, 2, 3, 4, 5]

In [None]:
df = tfidf_doc_vec
concate = True  # set to True.
concat_df = dm_doc_vec

In [None]:
        df = pd.concat([df, concat_df], axis=1, ignore_index=True)
 

In [None]:
   # Specify train/valid/test size.
train_size, valid_size, test_size = split_size(df, train=0.8, valid=0.)  # no need to use valid dataset here
    # Prepare test dataset.
train_X, test_X, train_y, test_y = train_test_split(df,
                                                    target_labels,
                                                    test_size=test_size,
                                                    random_state=1,
                                                    stratify=target_labels)

In [None]:
# Classifiers to compare, with the display name used as key in the results table.
# LogisticRegression gets a larger max_iter because the default run stopped at
# the iteration limit before converging on these features.
clfs = [RandomForestClassifier(), LogisticRegression(max_iter=1000), DecisionTreeClassifier()]
clf_names = ['Random Forest', 'Logistic Regression', 'Decision Tree']

# Estimators expect 1-D targets; flattening the one-column label frame avoids
# the column-vector conversion warning during fit.
train_y_1d = np.ravel(train_y)
test_y_1d = np.ravel(test_y)

NBResults = {}
for clf_name, clf_ in zip(clf_names, clfs):
    clf = clf_.fit(train_X, train_y_1d)
    preds = clf.predict(test_X)

    # NOTE: for single-label multiclass targets, micro-averaged precision,
    # recall and f1 are all equal to accuracy.
    NBResults[clf_name] = {
        'precision': metrics.precision_score(test_y_1d, preds, average='micro'),
        'recall': metrics.recall_score(test_y_1d, preds, average='micro'),
        'f1_score': metrics.f1_score(test_y_1d, preds, average='micro'),
        'accuracy': accuracy_score(test_y_1d, preds),
        'clf_report': classification_report(test_y_1d, preds),
        'clf_matrix': metrics.confusion_matrix(test_y_1d, preds, labels=label_keys),
        'y_predicted': preds,
    }

# Show only the scalar scores: one column per classifier, one row per metric.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
pd.DataFrame(NBResults).T[cols].T

  
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Random Forest,Logistic Regression,Decision Tree
precision,0.635707,0.669872,0.498902
recall,0.635707,0.669872,0.498902
f1_score,0.635707,0.669872,0.498902
accuracy,0.635707,0.669872,0.498902


In [None]:
for model, val in NBResults.items():
    print ('-------'+'-'*len(model))
    print ('MODEL:', model)
    print ('-------'+'-'*len(model))
    print ('The precision for this classifier is ' + str(val['precision']))
    print ('The recall for this classifier is    ' + str(val['recall']))
    print ('The f1 for this classifier is        ' + str(val['f1_score']))
    print ('The accuracy for this classifier is  ' + str(val['accuracy']))
    print ('Here is the classification report:')
    print (val['clf_report'])

--------------------
MODEL: Random Forest
--------------------
The precision for this classifier is 0.6357069647127727
The recall for this classifier is    0.6357069647127727
The f1 for this classifier is        0.6357069647127727
The accuracy for this classifier is  0.6357069647127727
Here is the classification report:
              precision    recall  f1-score   support

           1       0.62      0.79      0.70      2727
           2       0.37      0.04      0.08      1305
           3       0.42      0.04      0.08      1549
           4       0.37      0.06      0.10      4099
           5       0.65      0.97      0.78     10809

    accuracy                           0.64     20489
   macro avg       0.49      0.38      0.35     20489
weighted avg       0.56      0.64      0.54     20489

--------------------------
MODEL: Logistic Regression
--------------------------
The precision for this classifier is 0.6698716384401386
The recall for this classifier is    0.6698716384401

# DAO_TEST_GLOVE

In [None]:
df = tfidf_doc_vec_Glove
concate = True  # set to True.
concat_df = dm_doc_vec

In [None]:
   df = pd.concat([df, concat_df], axis=1, ignore_index=True)

In [None]:
 # Specify train/valid/test size.
train_size, valid_size, test_size = split_size(df, train=0.8, valid=0.)  # no need to use valid dataset here
    # Prepare test dataset.
train_X, test_X, train_y, test_y = train_test_split(df,
                                                    target_labels,
                                                    test_size=test_size,
                                                    random_state=1,
                                                    stratify=target_labels)

In [None]:
# Classifiers to compare, with the display name used as key in the results table.
# LogisticRegression gets a larger max_iter because the default run stopped at
# the iteration limit before converging on these features.
clfs = [RandomForestClassifier(), LogisticRegression(max_iter=1000), DecisionTreeClassifier()]
clf_names = ['Random Forest', 'Logistic Regression', 'Decision Tree']

# Estimators expect 1-D targets; flattening the one-column label frame avoids
# the column-vector conversion warning during fit.
train_y_1d = np.ravel(train_y)
test_y_1d = np.ravel(test_y)

NBResults = {}
for clf_name, clf_ in zip(clf_names, clfs):
    clf = clf_.fit(train_X, train_y_1d)
    preds = clf.predict(test_X)

    # NOTE: for single-label multiclass targets, micro-averaged precision,
    # recall and f1 are all equal to accuracy.
    NBResults[clf_name] = {
        'precision': metrics.precision_score(test_y_1d, preds, average='micro'),
        'recall': metrics.recall_score(test_y_1d, preds, average='micro'),
        'f1_score': metrics.f1_score(test_y_1d, preds, average='micro'),
        'accuracy': accuracy_score(test_y_1d, preds),
        'clf_report': classification_report(test_y_1d, preds),
        'clf_matrix': metrics.confusion_matrix(test_y_1d, preds, labels=label_keys),
        'y_predicted': preds,
    }

# Show only the scalar scores: one column per classifier, one row per metric.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
pd.DataFrame(NBResults).T[cols].T

  
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Random Forest,Logistic Regression,Decision Tree
precision,0.605398,0.661965,0.450388
recall,0.605398,0.661965,0.450388
f1_score,0.605398,0.661965,0.450388
accuracy,0.605398,0.661965,0.450388


In [None]:
for model, val in NBResults.items():
    print ('-------'+'-'*len(model))
    print ('MODEL:', model)
    print ('-------'+'-'*len(model))
    print ('The precision for this classifier is ' + str(val['precision']))
    print ('The recall for this classifier is    ' + str(val['recall']))
    print ('The f1 for this classifier is        ' + str(val['f1_score']))
    print ('The accuracy for this classifier is  ' + str(val['accuracy']))
    print ('Here is the classification report:')
    print (val['clf_report'])

--------------------
MODEL: Random Forest
--------------------
The precision for this classifier is 0.6053980184489238
The recall for this classifier is    0.6053980184489238
The f1 for this classifier is        0.6053980184489238
The accuracy for this classifier is  0.6053980184489238
Here is the classification report:
              precision    recall  f1-score   support

           1       0.64      0.63      0.63      2727
           2       0.38      0.00      0.01      1305
           3       0.50      0.00      0.01      1549
           4       0.32      0.02      0.05      4099
           5       0.61      0.98      0.75     10809

    accuracy                           0.61     20489
   macro avg       0.49      0.33      0.29     20489
weighted avg       0.53      0.61      0.49     20489

--------------------------
MODEL: Logistic Regression
--------------------------
The precision for this classifier is 0.661964956806091
The recall for this classifier is    0.66196495680609

In [None]:
# Compare three baseline classifiers on the TF-IDF-weighted GloVe document
# vectors alone (no concatenation with other feature sets in this run).
df = tfidf_doc_vec_Glove

# 80/20 train/test split; no validation set is needed for this comparison.
train_size, valid_size, test_size = split_size(df, train=0.8, valid=0.)
train_X, test_X, train_y, test_y = train_test_split(df,
                                                    target_labels,
                                                    test_size=test_size,
                                                    random_state=1,
                                                    stratify=target_labels)

# max_iter is raised above the default because lbfgs stopped with
# "TOTAL NO. of ITERATIONS REACHED LIMIT" on these features.
clfs = [RandomForestClassifier(),
        LogisticRegression(max_iter=1000),
        DecisionTreeClassifier()]
clf_names = ['Random Forest', 'Logistic Regression', 'Decision Tree']

NBResults = {}
for clf_name, clf_ in zip(clf_names, clfs):
    # Estimators expect a 1-d target; flattening avoids the column-vector
    # DataConversionWarning raised by column_or_1d during fit.
    clf = clf_.fit(train_X, np.ravel(train_y))
    preds = clf.predict(test_X)

    # Macro averaging is used because, for single-label multiclass targets,
    # micro-averaged precision/recall/F1 are all identical to accuracy and
    # therefore add no information to the comparison table.
    precision = metrics.precision_score(test_y, preds, average='macro')
    recall = metrics.recall_score(test_y, preds, average='macro')
    f1 = metrics.f1_score(test_y, preds, average='macro')
    accuracy = accuracy_score(test_y, preds)
    report = classification_report(test_y, preds)
    matrix = metrics.confusion_matrix(test_y, preds, labels=label_keys)

    data = {'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy,
            'clf_report': report,
            'clf_matrix': matrix,
            'y_predicted': preds}

    NBResults[clf_name] = data

# Summary table: one column per classifier, one row per scalar metric.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
pd.DataFrame(NBResults).T[cols].T


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Random Forest,Logistic Regression,Decision Tree
precision,0.604519,0.636195,0.448631
recall,0.604519,0.636195,0.448631
f1_score,0.604519,0.636195,0.448631
accuracy,0.604519,0.636195,0.448631


In [None]:
# Report the headline scores and the per-class classification report for
# every classifier stored in NBResults.
# NOTE(review): the loop names `model` and `val` stay bound after the loop,
# so any notebook-level `model` is rebound to the last classifier name.
for model, val in NBResults.items():
    banner = '-' * (7 + len(model))  # same width as the "MODEL: <name>" line
    print(banner)
    print('MODEL:', model)
    print(banner)
    print(f"The precision for this classifier is {val['precision']!s}")
    print(f"The recall for this classifier is    {val['recall']!s}")
    print(f"The f1 for this classifier is        {val['f1_score']!s}")
    print(f"The accuracy for this classifier is  {val['accuracy']!s}")
    print('Here is the classification report:')
    print(val['clf_report'])

--------------------
MODEL: Random Forest
--------------------
The precision for this classifier is 0.604519498267363
The recall for this classifier is    0.604519498267363
The f1 for this classifier is        0.604519498267363
The accuracy for this classifier is  0.604519498267363
Here is the classification report:
              precision    recall  f1-score   support

           1       0.62      0.65      0.64      2727
           2       0.18      0.00      0.01      1305
           3       0.62      0.01      0.01      1549
           4       0.28      0.03      0.05      4099
           5       0.61      0.97      0.75     10809

    accuracy                           0.60     20489
   macro avg       0.46      0.33      0.29     20489
weighted avg       0.52      0.60      0.49     20489

--------------------------
MODEL: Logistic Regression
--------------------------
The precision for this classifier is 0.6361950314803065
The recall for this classifier is    0.6361950314803065


In [None]:
# Compare three baseline classifiers on the GloVe document vectors alone
# (no concatenation with other feature sets in this run).
df = doc_vec_Glove

# 80/20 train/test split; no validation set is needed for this comparison.
train_size, valid_size, test_size = split_size(df, train=0.8, valid=0.)
train_X, test_X, train_y, test_y = train_test_split(df,
                                                    target_labels,
                                                    test_size=test_size,
                                                    random_state=1,
                                                    stratify=target_labels)

# max_iter is raised above the default because lbfgs stopped with
# "TOTAL NO. of ITERATIONS REACHED LIMIT" on these features.
clfs = [RandomForestClassifier(),
        LogisticRegression(max_iter=1000),
        DecisionTreeClassifier()]
clf_names = ['Random Forest', 'Logistic Regression', 'Decision Tree']

NBResults = {}
for clf_name, clf_ in zip(clf_names, clfs):
    # Estimators expect a 1-d target; flattening avoids the column-vector
    # DataConversionWarning raised by column_or_1d during fit.
    clf = clf_.fit(train_X, np.ravel(train_y))
    preds = clf.predict(test_X)

    # Macro averaging is used because, for single-label multiclass targets,
    # micro-averaged precision/recall/F1 are all identical to accuracy and
    # therefore add no information to the comparison table.
    precision = metrics.precision_score(test_y, preds, average='macro')
    recall = metrics.recall_score(test_y, preds, average='macro')
    f1 = metrics.f1_score(test_y, preds, average='macro')
    accuracy = accuracy_score(test_y, preds)
    report = classification_report(test_y, preds)
    matrix = metrics.confusion_matrix(test_y, preds, labels=label_keys)

    data = {'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy,
            'clf_report': report,
            'clf_matrix': matrix,
            'y_predicted': preds}

    NBResults[clf_name] = data

# Summary table: one column per classifier, one row per scalar metric.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
pd.DataFrame(NBResults).T[cols].T

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Random Forest,Logistic Regression,Decision Tree
precision,0.618527,0.648543,0.470692
recall,0.618527,0.648543,0.470692
f1_score,0.618527,0.648543,0.470692
accuracy,0.618527,0.648543,0.470692


In [None]:
# Report the headline scores and the per-class classification report for
# every classifier stored in NBResults.
# NOTE(review): the loop names `model` and `val` stay bound after the loop,
# so any notebook-level `model` is rebound to the last classifier name.
for model, val in NBResults.items():
    banner = '-' * (7 + len(model))  # same width as the "MODEL: <name>" line
    print(banner)
    print('MODEL:', model)
    print(banner)
    print(f"The precision for this classifier is {val['precision']!s}")
    print(f"The recall for this classifier is    {val['recall']!s}")
    print(f"The f1 for this classifier is        {val['f1_score']!s}")
    print(f"The accuracy for this classifier is  {val['accuracy']!s}")
    print('Here is the classification report:')
    print(val['clf_report'])

In [None]:
# Compare three baseline classifiers on the GloVe document vectors
# concatenated column-wise with the Doc2Vec (dm_doc_vec) features.
df = doc_vec_Glove
concate = True  # the concatenation below is what this cell evaluates
concat_df = dm_doc_vec

if concate:
    # ignore_index=True relabels the combined columns 0..n-1 so the two
    # feature blocks do not end up with duplicate column labels.
    # NOTE(review): assumes both frames share the same row index - confirm.
    df = pd.concat([df, concat_df], axis=1, ignore_index=True)

# 80/20 train/test split; no validation set is needed for this comparison.
train_size, valid_size, test_size = split_size(df, train=0.8, valid=0.)
train_X, test_X, train_y, test_y = train_test_split(df,
                                                    target_labels,
                                                    test_size=test_size,
                                                    random_state=1,
                                                    stratify=target_labels)

# max_iter is raised above the default because lbfgs stopped with
# "TOTAL NO. of ITERATIONS REACHED LIMIT" on these features.
clfs = [RandomForestClassifier(),
        LogisticRegression(max_iter=1000),
        DecisionTreeClassifier()]
clf_names = ['Random Forest', 'Logistic Regression', 'Decision Tree']

NBResults = {}
for clf_name, clf_ in zip(clf_names, clfs):
    # Estimators expect a 1-d target; flattening avoids the column-vector
    # DataConversionWarning raised by column_or_1d during fit.
    clf = clf_.fit(train_X, np.ravel(train_y))
    preds = clf.predict(test_X)

    # Macro averaging is used because, for single-label multiclass targets,
    # micro-averaged precision/recall/F1 are all identical to accuracy and
    # therefore add no information to the comparison table.
    precision = metrics.precision_score(test_y, preds, average='macro')
    recall = metrics.recall_score(test_y, preds, average='macro')
    f1 = metrics.f1_score(test_y, preds, average='macro')
    accuracy = accuracy_score(test_y, preds)
    report = classification_report(test_y, preds)
    matrix = metrics.confusion_matrix(test_y, preds, labels=label_keys)

    data = {'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy,
            'clf_report': report,
            'clf_matrix': matrix,
            'y_predicted': preds}

    NBResults[clf_name] = data

# Summary table: one column per classifier, one row per scalar metric.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
pd.DataFrame(NBResults).T[cols].T

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Random Forest,Logistic Regression,Decision Tree
precision,0.615208,0.666846,0.476841
recall,0.615208,0.666846,0.476841
f1_score,0.615208,0.666846,0.476841
accuracy,0.615208,0.666846,0.476841


# RNN

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

# LSTM classifier with a 5-way output (one unit per class).
model = Sequential()

# NOTE(review): no Embedding layer is added here, so the inputs are presumably
# sequences of pre-trained embedding vectors - confirm against the code that
# builds X_train.

# Masking layer for pre-trained embeddings: timesteps whose features all
# equal 0.0 (padding) are skipped by the downstream recurrent layer.
model.add(Masking(mask_value=0.0))

# Recurrent layer; only the final state is returned (return_sequences=False).
model.add(LSTM(64, return_sequences=False,
               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer: softmax (not sigmoid) so the 5 outputs form a probability
# distribution over mutually exclusive classes, which is what
# categorical_crossentropy expects.
model.add(Dense(5, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Training callbacks:
#   - EarlyStopping watches val_loss with a patience of 5 epochs.
#   - ModelCheckpoint writes only the best model to disk; the full model is
#     saved, not just its weights (save_weights_only=False).
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5),
    ModelCheckpoint(
        filepath='yelp_lstm_gru_weights.hdf5',
        save_best_only=True,
        save_weights_only=False,
    ),
]

In [None]:
# Fit the network; the validation pair is evaluated every epoch and feeds the
# callbacks, and the returned History object is kept for later inspection.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=2048,
    epochs=150,
    validation_data=(X_valid, y_valid),
    callbacks=callbacks,
)

In [None]:
# Quick sanity check of the TF-IDF document-vector table:
# print its structure summary, then display the first five rows.
tfidf_doc_vec.info()
tfidf_doc_vec.head(n=5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102442 entries, 0 to 102441
Columns: 200 entries, 0 to 199
dtypes: float64(200)
memory usage: 156.3 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,4.241534,-0.346359,-3.430429,1.740263,-0.881717,0.311229,-1.979204,-0.181652,-0.644655,-1.254866,-0.764523,-1.593858,-0.710255,5.791879,6.53138,-0.094709,4.021795,3.892041,-2.560342,0.974002,-3.033901,-5.886605,-1.866182,-4.86161,-0.216919,-1.368133,-5.397421,-1.443106,1.039358,2.114127,-5.281143,4.021247,3.425811,0.292956,0.945736,0.002539,0.709782,-2.012698,2.450704,-3.295984,...,2.739785,-1.350479,4.276579,-5.200751,-1.74818,-2.766015,-1.576548,-2.170029,-2.850688,1.789694,-0.243569,0.094091,-2.139257,3.867438,-1.34551,-1.114932,4.069699,1.102934,-3.819638,-1.040834,-1.044949,-1.570153,-1.82744,-0.271423,3.345941,1.119662,-2.546671,2.289669,-1.249396,4.977643,-5.24302,5.681765,0.641808,5.069246,-0.921466,-1.215854,-3.658603,-0.201084,2.376911,-2.882553
1,-4.256276,7.571361,6.65878,-4.826914,2.247503,-8.215537,2.946768,6.347083,0.474824,-5.031849,4.728989,-1.718058,4.690131,2.545589,0.81382,-0.709955,1.880499,0.800798,-2.287667,-1.95516,0.674691,-5.252521,0.695431,4.10746,-4.718019,1.965088,5.1048,3.402157,-0.837587,2.0111,2.745435,-3.664337,-3.892045,-7.367734,-3.790738,-2.515052,-5.052354,4.241956,-0.099118,2.694211,...,-1.071032,4.440019,-0.080174,0.850033,5.1721,6.572573,1.007506,-1.624678,-1.139039,4.209938,6.290622,-3.402455,-3.605804,-6.764954,2.966686,-3.687046,-1.158803,1.585174,1.999677,-1.591179,2.93365,-1.906195,3.296059,-2.718283,-1.080722,-2.002822,-0.129199,-1.778279,-3.614841,-6.437204,1.577998,4.989604,3.143239,0.496878,5.884332,-2.10894,1.566612,-3.506336,-5.213051,-2.940182
2,-1.655971,2.690683,-0.239536,-1.089188,1.03021,1.825364,1.06476,2.333301,0.073958,-1.068034,1.397268,0.406684,-1.991326,1.66988,0.834133,-0.736294,-3.435335,-0.99051,-0.073842,-0.899898,-3.570755,0.048199,-1.467797,1.780982,-1.141539,-2.411931,1.42555,0.820652,0.43051,1.073295,0.629742,1.864826,1.177647,-1.807969,-0.79446,2.09776,2.439232,2.420574,3.230696,0.632595,...,1.843291,2.876377,-1.803974,-1.793558,-0.390911,-1.739682,-3.015334,0.399557,-2.420814,2.205678,-2.252358,0.618523,2.63501,2.740447,2.677685,-6.293159,-1.923197,1.295902,-0.751943,-4.281597,-2.643601,-0.853006,5.780793,0.947092,0.206086,-1.130668,0.023397,1.084631,2.764695,-1.054006,-2.653031,1.76226,0.672126,1.644587,2.107002,3.911186,2.789545,-4.561905,-0.893687,-3.496453
3,-2.03675,-4.824581,4.206439,-2.848737,1.616262,-3.578638,-1.332959,-1.482494,1.21781,-0.974078,-1.421062,-2.796275,-1.916879,1.660348,-1.376542,-2.248796,-1.713045,0.813905,-0.667315,-2.356043,0.97927,-0.081409,-1.11676,1.621463,-1.32463,-0.408252,4.920693,-0.52533,0.363341,-0.31972,-3.637743,-0.82032,3.643006,2.674333,-3.114033,-3.036408,2.789954,-2.692105,1.084654,3.1675,...,4.865672,-2.302689,4.509201,2.358053,-1.162396,1.977106,0.350616,0.651786,2.509765,3.054227,2.250558,1.893524,-0.308948,0.329153,-1.069556,7.370651,0.400089,2.070981,0.203354,1.388923,0.570043,-0.057124,4.142462,0.414542,-0.128299,-1.227175,0.095076,0.577437,-3.428213,-0.73198,2.572657,3.157101,-3.1243,-2.856102,-2.432476,1.572566,1.261692,-5.538431,-1.663208,5.69881
4,-3.407166,-1.889094,3.155638,-0.763353,0.410485,-1.845323,-4.484404,-3.674747,0.105887,2.348912,3.278773,-1.409052,-2.402828,0.514554,0.610011,-1.077408,-1.824826,-0.865337,-0.888725,-2.125015,-1.113216,-1.629092,-1.249926,-2.004465,-0.209894,2.297168,3.067438,-3.503512,1.606484,-0.844252,1.13046,2.512684,0.436818,2.080024,-6.421183,-4.124078,1.182407,0.216152,2.032705,4.664596,...,7.615785,-0.463298,2.431035,2.073502,3.424167,3.15025,2.360209,1.179626,-0.517552,-0.493751,3.996435,1.174502,-2.608114,1.326632,-0.256042,3.152467,-2.050098,-3.396377,2.757633,1.831535,-0.138438,-0.902062,-0.626832,0.1756,1.505481,-1.170212,0.986665,-0.46688,5.122849,4.613226,-0.041371,3.270757,-5.278025,-1.001384,2.151574,-0.15919,-1.968791,-0.348048,1.632455,4.105619


In [None]:
# Inputs for the main(...) call that follows.
# NOTE(review): `model = model` is a self-assignment; it only re-uses whatever
# `model` is currently bound to in the notebook session. Which estimator that
# is cannot be determined from this cell - confirm before running.
model = model  # or choose sgd.
df = tfidf_doc_vec  # base feature table
concate = True  # set to True.
concat_df = dm_doc_vec  # passed to main() as concat_df

In [None]:
# Run main() (defined elsewhere in the notebook) with the configuration above.
# NOTE(review): judging by the names it is unpacked into, main() returns the
# fitted classifier followed by the train/valid/test features and labels -
# confirm against its definition.
(clf,
 train_X, valid_X, test_X,
 train_y, valid_y, test_y) = main(model, df, concate=concate, concat_df=concat_df)

Shape of train_X: (81953, 300)
Shape of valid_X: (0, 0)
Shape of text_X: (20489, 300)


ValueError: ignored