In [4]:
import pandas as pd
import fasttext
import fasttext_models


print("First thing's first, we need to make our models.\nWe will be building 10 models in total: 5 will be trained on cleaned and lemmatized data, 5 will be trained on the raw text. Each of the 5 for those two training sets will vary by vector length: 25, 50, 100, 200, and 300.")
# Python 3 replacement for execfile('fasttext_models.py'): run the script's
# source inside this notebook's namespace.
# The file is read inside a `with` block so the handle is closed as soon as the
# source has been read, and the source is compiled with its real path so that
# tracebacks point at fasttext_models.py instead of "<string>".
# NOTE(review): the `import fasttext_models` at the top of this cell may already
# execute the same script once - confirm the models are not being built twice.
_script_path = "./fasttext_models.py"
with open(_script_path) as _script_file:
    _script_source = _script_file.read()
exec(compile(_script_source, _script_path, "exec"))

First thing's first, we need to make our models.
We will be building 10 models in total: 5 will be trained on cleaned and lemmatized data, 5 will be trained on the raw text. Each of the 5 for those two training sets will vary by vector length: 25, 50, 100, 200, and 300.
reviews_train.json loaded:
   Id  HelpfulnessNumerator  HelpfulnessDenominator  Score  \
0   1                     1                       1      5   
1   2                     0                       0      1   
2   3                     1                       1      4   
3   4                     3                       3      2   
4   5                     0                       0      5   

                 Summary                                               Text  \
0  Good Quality Dog Food  I have bought several of the Vitality canned d...   
1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...   
2  "Delight" says it all  This is a confection that has been around a fe...   
3         Cou

ValueError: reviews_cleaned.train cannot be opened for training!

In [29]:
# Load the ten saved models for evaluation.
# Naming: mc<i> = model trained on cleaned data, mu<i> = model trained on
# uncleaned data; <i> indexes the vector size in the order 25, 50, 100, 200, 300
# (so mc2 / mu2 are the D100 models used in the rest of the notebook).
_dims = (25, 50, 100, 200, 300)
_model_path = 'data/fasttext_skipgram_{}_D{}.bin'

mc0, mc1, mc2, mc3, mc4 = [
    fasttext.load_model(_model_path.format('cleaned', dim)) for dim in _dims
]

mu0, mu1, mu2, mu3, mu4 = [
    fasttext.load_model(_model_path.format('uncleaned', dim)) for dim in _dims
]

print("\nHere are the top 50 words for the model trained on cleaned data with D100:")
# Fixed: this previously read the word list from mc4 (the D300 model) even though
# the heading above and the variable name both refer to the D100 model, mc2.
mc2_words = mc2.get_words()
print(mc2_words[:50])
print("\nHere are the top 50 words for the model trained on uncleaned data with D100:")
mu2_words = mu2.get_words()
print(mu2_words[:50])


Here are the top 50 words for the model trained on cleaned data with D100:
['>', '<', 'br', '</s>', 'like', 'good', 'taste', 'flavor', 'one', 'get', 'love', 'product', 'make', 'use', 'coffee', 'great', 'try', 'well', 'food', 'buy', 'tea', 'find', 'would', 'eat', 'dog', 'go', 'really', 'time', 'much', 'amazon', 'order', 'also', 'price', 'bag', 'cup', 'give', 'little', 'even', 'drink', 'say', 'think', 'store', 'day', 'cat', 'add', 'box', 'chocolate', 'treat', 'come', 'first']

Here are the top 50 words for the model trained on uncleaned data with D100:
['the', 'I', 'and', 'a', 'to', 'of', 'is', 'it', '</s>', 'for', 'in', 'this', 'that', 'my', 'with', 'have', 'but', 'are', 'was', 'not', 'you', '/><br', 'on', 'as', 'like', 'they', 'so', 'be', 'The', 'or', 'at', 'these', 'just', 'them', 'very', 'from', 'one', 'good', 'It', '"I', 'has', 'can', 'taste', 'will', 'would', 'had', 'all', 'more', 'than', 'when']


In [30]:
# Narrative printed ahead of the numbers: what fasttext's test() reports and why
# the k=1 figures are the ones of interest. Each message is assembled from
# adjacent string literals purely to keep the source lines readable; the text
# that reaches stdout is unchanged.
print(
    "We can measure the accuracy of these classfiers by examining their precision and recall when running on a test set.\n"
    "We will continue to use mc2 and mu2, the models trained with dim=100 on cleaned and uncleaned data respectively."
)
print(
    "The fasttext test() function will calculate the precision and recall \"at k\" for our models. "
    "Note that these are not the usual definitions of precision and recall used in most discussion about binary classifiers. "
    "More precisely precision at k is:\n"
    "      P@k = r / k,\n"
    "      the # of relevant labels r divided by the number of top predictions k."
)
print(
    "The k value determines how many of the top predictions we consider, e.g. do we want to evaluate just the label with the highest rank outputted by the model (k=1), or the top 3 labels (k=3)?\n"
    "Further, prec@1 will either be 1 or 0 in each test case, because the top guess either is or is not the relevant label.\n"
    "For our purposes, just considering precision and recall at 1 makes sense, because there is only one label we care about: the correct one. "
    "When k>1, our models can only get a value of either 1/k or 0 in each test case, making them seem less accurate than they are.\n"
    "It is not explicitly stated in the fasttext documentation, but it is reasonable to say that P@1 will give the same output as we would get using the sci kit library's tools.\n"
    "Specifically, it gives the same results as sklearn.metrics.precision_score() with parameter 'average' set to 'micro' would. "
    "This means that fasttext.test() is calculating metrics globally by counting the total true positives, false negatives and false positives across all the labels and producing one number for the precision and recall score. "
    "Unfortunately, there seems to be no way to change this calculation beyond varying the k value."
)
# NOTE(review): the sentence below stops mid-thought in the original notebook.
print("Now, to calculate accuracy for comparision with our other models (with different embeddings), we")


# Run both D100 models against their test files once per setting and print one
# summary table per run. The settings are k=5, k=2, k=1 and finally no k at all;
# the empty dict reproduces that last call, which relies on test()'s own default.
for test_kwargs in ({'k': 5}, {'k': 2}, {'k': 1}, {}):
    n0, p0, r0 = mc2.test('reviews_cleaned.test', **test_kwargs)
    n1, p1, r1 = mu2.test('reviews_uncleaned.test', **test_kwargs)
    d = {
        'Model': ['Cleaned', 'Uncleaned'],
        'N': [n0, n1],
        'Precision': [p0, p1],
        'Recall': [r0, r1],
    }
    df = pd.DataFrame(data=d)
    print(df)

# Closing interpretation of the tables printed above.
print("\nThough the two models are different, they have both have equal precision and recall. I believe this is because each doc has only one possible label.")

print("\nTherefore, the accuracy of these models is equivalent to their recall and precision. (Partners confirm this pls)\n")

We can measure the accuracy of these classfiers by examining their precision and recall when running on a test set.
We will continue to use mc2 and mu2, the models trained with dim=100 on cleaned and uncleaned data respectively.
The fasttext test() function will calculate the precision and recall "at k" for our models. Note that these are not the usual definitions of precision and recall used in most discussion about binary classifiers. More precisely precision at k is:
      P@k = r / k,
      the # of relevant labels r divided by the number of top predictions k.
The k value determines how many of the top predictions we consider, e.g. do we want to evaluate just the label with the highest rank outputted by the model (k=1), or the top 3 labels (k=3)?
Further, prec@1 will either be 1 or 0 in each test case, because the top guess either is or is not the relevant label.
For our purposes, just considering precision and recall at 1 makes sense, because there is only one label we care about:

In [31]:
# Inspect the embeddings learned by mc2 (the cleaned-data model used throughout
# this notebook): one raw word vector, its nearest neighbours, and an analogy.
print("Now let's take a look at some of the vector embeddings.")
print("Let's look at the vector for \"chocolate\" in the cleaned-data model:\n")
print(mc2.get_word_vector('chocolate'))
print("\nWe can also look at the most similar words, or the vector's 'neighbors.' These are determined by the cosine similarity of vectors.")
print(mc2.get_nearest_neighbors('chocolate'))

print("\nFasttext also lets us try out analogies. Let's see how it does with the following:")
print("\n\"\'hot\' is to \'cold\', what \'good\' is to _____\"")
# fasttext's get_analogies(A, B, C) ranks words close to the vector A - B + C.
# For "hot is to cold what good is to X" we need X = cold - hot + good, so the
# arguments are ('cold', 'hot', 'good'). The previous order ('hot', 'cold', 'good')
# queried hot - cold + good, i.e. "cold is to hot what good is to X", which does
# not match the prompt printed above.
print(mc2.get_analogies('cold', 'hot', 'good'))

Now let's take a look at some of the vector embeddings.
Let's look at the vector for "chocolate" in the cleaned-data model:

[ 8.92456621e-02 -9.63591859e-02  5.99380285e-02  9.64247733e-02
  4.60756496e-02 -6.81003854e-02  6.64454475e-02  1.67603754e-02
 -6.05224706e-02  9.76952687e-02 -1.69180539e-02  6.51830062e-02
  1.29161635e-03  1.36304225e-04  3.21914777e-02 -3.85867171e-02
  6.38445616e-02  1.40688255e-01  2.88648601e-03  2.83443537e-02
  2.13524718e-02  3.63133550e-02 -2.01489497e-02 -7.10156709e-02
 -1.24792894e-02  1.49864722e-02  8.00867677e-02 -1.40068531e-02
  3.85700725e-02 -2.26944825e-03 -6.20292835e-02 -2.59185918e-02
 -1.35131320e-02  6.95468113e-02  5.18812127e-02  3.57772559e-02
  4.59828749e-02  5.87178767e-02 -2.27381978e-02 -5.41094132e-02
  2.19275672e-02  7.17993453e-02 -2.78918557e-02 -1.29705947e-02
  2.38936301e-02 -1.34344334e-02  2.18468644e-02 -6.45970851e-02
 -5.87611576e-04 -2.17915699e-02  2.19032820e-02 -1.86101273e-02
  6.49569137e-03  1.41359083e-

In [69]:
# Get Accuracy
import get_accuracy

# FINALLY!
# This may have been unnecessarily complicated, but now we can compute accuracy
# with get_accuracy()!

# Accuracy of mc1, the cleaned-data model loaded from the D50 file.
print("Accuracy of classifier with dim=50 and trained on cleaned data:")
accuracy_cleaned_d50 = get_accuracy.get_accuracy(mc1)
print(accuracy_cleaned_d50)

# Length of mc2.words, printed twice exactly as the original cell did.
# NOTE(review): the second print may have been meant for a different model or
# attribute (e.g. mu2) - confirm with the author before changing it.
word_count = len(mc2.words)
print(word_count)
print(word_count)

# Quick sanity check: ask the model for a prediction on an arbitrary phrase.
sample_prediction = mc2.predict("Apples and Bananas")
print(sample_prediction)

# TODO: maybe here we can try out some vector addition/subtraction to see what
# kind of meaning the model captures.

Accuracy of classifier with dim=50 and trained on cleaned data:
0.5940399855749355
563
563
(('__label__5',), array([0.74742788]))
