In [1]:
# Import libraries
import pandas as pd
import numpy as np

from datetime import datetime
import regex as re

from surprise import Dataset
from surprise import Reader
from surprise import NormalPredictor
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import train_test_split

from utilities import data_basic_utility as databasic
from utilities import regex_utility as reutil

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams

In [2]:
print(str(len(stopwords.words("english"))))

print(sorted(stopwords.words("english")))

179
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'so

## File Details

To start with, this is basically just a copy paste of the surprise normal sample from Week 12.

# Assignment 3 -- Recommendation Systems

* The final challenge is much like Assignment 2 -- but scoped appropriately for the time and your current abilities.
* It is ratings prediction, just like the movielens recommendations we have seen and many other similar problems.
* The features created are based on Beer Reviews from experts on a website.
* Each beer has been scored between 0 and 5 (on a real scale, so 2.75 or 3.5 is OK).
* The official measure is Mean Average Error (MAE) which is pretty intuitive to work with. Everything supports is and it is easy to interpret.
* A set of features have been created based on the reviewer, the written review, and information about the Beer being reviewed.
* Not all features have to be used, and you can easily create new features using the data if you like.
* The features included are:

![title](Images/A3Features.png)

* Sizes of the files are:
|Size | File|
|---|---|
| 1.9G | features.tsv |
| 88B  | header-features.tsv|
| 48B  | header.tsv |
| 15M  | test.tsv |
| 50M  | train.tsv |
| 16M |  val.tsv |


In [3]:
filePrefix = "A3_012_investigate beer name"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
seed = databasic.get_random_seed()

In [4]:
trainFilePath = baseDataDir + 'train.tsv'
valiFilePath = baseDataDir + 'val.tsv'
featuresFilePath = baseDataDir + 'features.tsv'
testFilePath = baseDataDir + 'test.tsv'

# trainFilePath = baseDataDir + 'train_200k.tsv'
# valiFilePath = baseDataDir + 'vali_200k.tsv'
# featuresFilePath = baseDataDir + 'features_200k.tsv'
# testFilePath = baseDataDir + 'test_200k.tsv'

In [5]:
df_train = pd.read_csv(trainFilePath, sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])

df_vali = pd.read_csv(valiFilePath, sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])


df_test = pd.read_csv(testFilePath, sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])                                


In [6]:
df_train = df_train[["RowID", "BeerName"]]
df_vali = df_vali[["RowID", "BeerName"]]
df_test = df_test[["RowID", "BeerName"]]

In [7]:
df_combined = df_train.append(df_vali).append(df_test)

del df_train
del df_vali
del df_test

Tinker with NLP preprocessing stuff on Beer Name

In [8]:
stopwordsSorted = sorted(stopwords.words("english"))
print(stopwordsSorted)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some',

In [9]:
def print_top(lstTokens, numberToPrint=5):
  for i in range(numberToPrint):
    print("[" + ",".join(lstTokens[i]) + "]")

In [10]:
colName = "BeerName"
# Remove ascii encoding and punctuation
df_combined[colName] = df_combined.apply(lambda x: str(x[colName]).encode("ascii", "ignore").decode(), axis=1)
df_combined[colName] = df_combined.apply(lambda x: reutil.str_strip_punctuation(str(x[colName])), axis=1)

In [11]:
# convert target column into a list of word tokens  
lstTokens = df_combined.apply(lambda x: str(x["BeerName"]).split(" "), axis=1)
lstTokens = lstTokens.to_list()



# Do Text Preprocessing: 
# remove capitalisation, single letter tokens and stop words
lstTokens = list(map(lambda x: list(map(lambda y: y.lower(), x)), lstTokens))
lstTokens = list(map(lambda x: list(filter(lambda y: len(y) >= 2, x)), lstTokens))

print_top(lstTokens)

[rauch,bock]
[rauch,bock]
[rauch,bock]
[rauch,bock]
[rauch,bock]


In [12]:

# sort each token list so we can use bsearch
stopwordsSorted = sorted(stopwords.words("english"))  
lstTokens = list(map(lambda x: databasic.filter_by_words_bsearch(x, stopwordsSorted), lstTokens))

# Create variables for the words and vocab. When we update out tokens lists, we will want to recompile our word list and vocabulary to keep it up to date
words, vocab = databasic.createWordsAndVocabForTokenLists(lstTokens)

# Create a term Frequency distribution
term_fd = nltk.FreqDist(words)

# remove single occurrence words
setSingleWords = set(term_fd.hapaxes())
lstSingleWords = sorted(list(setSingleWords))
lstTokens = list(map(lambda x: databasic.filter_by_words_bsearch(x, lstSingleWords), lstTokens))


print_top(lstTokens)

[rauch,bock]
[rauch,bock]
[rauch,bock]
[rauch,bock]
[rauch,bock]


In [13]:
setTopBigrams = 50
removeTopFrequentTokens = 50


# find the most commong bigrams and join them together to be a single term
if setTopBigrams > 0:
  bigrams = ngrams(words, n = 2)
  fdbigram = nltk.FreqDist(bigrams)
  mostFreqBigrams = fdbigram.most_common(setTopBigrams)    
  print(mostFreqBigrams)


[(('pale', 'ale'), 80431), (('imperial', 'stout'), 31652), (('india', 'pale'), 31589), (('samuel', 'adams'), 31002), (('sierra', 'nevada'), 20164), (('brown', 'ale'), 19706), (('oatmeal', 'stout'), 14025), (('style', 'ale'), 11846), (('ale', 'samuel'), 11155), (('amber', 'ale'), 11035), (('great', 'lakes'), 10825), (('ale', 'sierra'), 10398), (('ale', 'stone'), 10035), (('double', 'ipa'), 9832), (('samuel', 'smiths'), 9729), (('barley', 'wine'), 8201), (('chocolate', 'stout'), 7904), (('red', 'ale'), 7614), (('winter', 'ale'), 7217), (('christmas', 'ale'), 7153), (('ale', 'bells'), 7136), (('minute', 'ipa'), 6950), (('anniversary', 'ale'), 6887), (('pumpkin', 'ale'), 6698), (('stout', 'bells'), 6568), (('barrel', 'aged'), 6333), (('nut', 'brown'), 6049), (('imperial', 'ipa'), 6028), (('grand', 'cru'), 5970), (('bastard', 'ale'), 5905), (('stout', 'founders'), 5865), (('oak', 'aged'), 5634), (('heavy', 'seas'), 5611), (('stout', 'samuel'), 5572), (('lager', 'samuel'), 5516), (('bourbon'

In [14]:
lstWorking = lstTokens.copy()
print_top(lstWorking)

[rauch,bock]
[rauch,bock]
[rauch,bock]
[rauch,bock]
[rauch,bock]


In [15]:


if setTopBigrams > 0:
  # Convert the bigrams to a list of bigram strings (with spaces)
  rep_patterns = list(map(lambda x: x[0], mostFreqBigrams))
  rep_patterns = list(map(lambda x: x[0] + " " + x[1], rep_patterns))

  # Create another list which is the replacements with the
  replacements = list(map(lambda x: x.replace(" ", "_"), rep_patterns))
  
  print(replacements)

  lstWorking = [" ".join(tokens) for tokens in lstWorking] # convert all the token lists back into a single string

  # Loop thought and basically find/replace all the bigrams in the string
  for i in range(0, len(lstWorking)): 
      for j in  range(0,len(rep_patterns)):
          lstWorking[i] = re.sub(rep_patterns[j], replacements[j], lstWorking[i]) # replace with bigram representation 

  lstWorking = [tokens.split(" ") for tokens in lstWorking] # convert back to tokenised lists    


print_top(lstWorking)  

['pale_ale', 'imperial_stout', 'india_pale', 'samuel_adams', 'sierra_nevada', 'brown_ale', 'oatmeal_stout', 'style_ale', 'ale_samuel', 'amber_ale', 'great_lakes', 'ale_sierra', 'ale_stone', 'double_ipa', 'samuel_smiths', 'barley_wine', 'chocolate_stout', 'red_ale', 'winter_ale', 'christmas_ale', 'ale_bells', 'minute_ipa', 'anniversary_ale', 'pumpkin_ale', 'stout_bells', 'barrel_aged', 'nut_brown', 'imperial_ipa', 'grand_cru', 'bastard_ale', 'stout_founders', 'oak_aged', 'heavy_seas', 'stout_samuel', 'lager_samuel', 'bourbon_barrel', 'st_bernardus', 'ipa_india', 'dark_horse', 'new_holland', 'scotch_ale', 'white_ale', 'ale_double', 'russian_imperial', 'belgian_style', 'imperial_porter', 'stout_old', 'milk_stout', 'series_smuttynose', 'summer_ale', 'imperial_india', 'big_beer', 'beer_series', 'special_ale', 'wheat_beer', 'harvest_ale', 'trappistes_rochefort', 'breakfast_stout', 'ipa_stone', 'arrogant_bastard', 'green_flash', 'ale_old', 'golden_ale', 'ale_special', 'smoked_porter', 'ale_br

In [16]:


# Look at the most common words, and remove
if removeTopFrequentTokens > 0:
  setMostFreqWords = term_fd.most_common(removeTopFrequentTokens)
  print(setMostFreqWords)
  # lstMostFreqWordsKeys = sorted(list(map(lambda x: x[0], setMostFreqWords)))
  # lstTokens = list(map(lambda x: databasic.filter_by_words_bsearch(x, lstMostFreqWordsKeys), lstTokens))



[('ale', 351864), ('stout', 125445), ('ipa', 105278), ('pale', 84325), ('imperial', 70272), ('porter', 57977), ('lager', 45827), ('samuel', 40761), ('beer', 38657), ('black', 36881), ('double', 34185), ('india', 33548), ('old', 32536), ('adams', 31939), ('brown', 30044), ('red', 28881), ('hop', 24847), ('stone', 23694), ('wheat', 21158), ('style', 21066), ('amber', 20302), ('sierra', 20265), ('nevada', 20164), ('founders', 18874), ('bells', 18787), ('dark', 18733), ('de', 18689), ('oatmeal', 17744), ('winter', 17622), ('white', 17216), ('series', 16110), ('barrel', 15806), ('anniversary', 15575), ('bock', 15308), ('aged', 15058), ('chocolate', 14353), ('saison', 14221), ('special', 14215), ('st', 13890), ('big', 13796), ('tripel', 13378), ('extra', 13323), ('blue', 12972), ('great', 12726), ('belgian', 11819), ('brooklyn', 11787), ('reserve', 11708), ('pilsner', 11671), ('bourbon', 11540), ('la', 11509)]
