In [None]:
# Import libraries
import os
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
import regex as re
from nltk.corpus import stopwords
from nltk.util import ngrams
from surprise import Dataset
from surprise import Reader
from surprise import NormalPredictor
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import train_test_split

from utilities import data_basic_utility as databasic

In [None]:
# How many English stop words NLTK provides.
print(len(stopwords.words("english")))

# The stop words themselves, in alphabetical order.
print(sorted(stopwords.words("english")))

## File Details

To start with, this is basically a copy-paste of the Surprise "normal" sample from Week 12.

# Assignment 3 -- Recommendation Systems

* The final challenge is much like Assignment 2 -- but scoped appropriately for the time and your current abilities.
* It is ratings prediction, just like the movielens recommendations we have seen and many other similar problems.
* The features created are based on Beer Reviews from experts on a website.
* Each beer has been scored between 0 and 5 (on a real scale, so 2.75 or 3.5 is OK).
* The official measure is Mean Absolute Error (MAE), which is pretty intuitive to work with. Everything supports it and it is easy to interpret.
* A set of features have been created based on the reviewer, the written review, and information about the Beer being reviewed.
* Not all features have to be used, and you can easily create new features using the data if you like.
* The features included are:

![title](Images/A3Features.png)

* Sizes of the files are:

|Size | File|
|---|---|
| 1.9G | features.tsv |
| 88B  | header-features.tsv|
| 48B  | header.tsv |
| 15M  | test.tsv |
| 50M  | train.tsv |
| 16M |  val.tsv |


In [None]:
# Run configuration.
filePrefix = "A3_01_surprisenormal1"

# Directory that holds the assignment .tsv files. Set the A3_DATA_DIR
# environment variable to read the data from somewhere else; when it is unset
# (or empty) the original location is used.
baseDataDir = os.environ.get("A3_DATA_DIR") or "C:/Development/Data/COSC2670/Assignment3/A3data/"
# File names are appended with plain string concatenation, so the directory
# must end with a path separator.
if not baseDataDir.endswith(("/", "\\")):
  baseDataDir += "/"

subrunDir = "subruns/"
# Seed value supplied by the project utility module.
seed = databasic.get_random_seed()

In [None]:
# # RowID  BeerID  ReviewerID  BeerName  BeerType  Label
# #df_train = pd.read_csv(baseDataDir + 'train_wk12.tsv',sep='\t',
# df_train = pd.read_csv(baseDataDir + 'train_200k.tsv',sep='\t',
#                          names=['RowID','BeerID','ReviewerID',
#                                   'BeerName','BeerType','rating'])


# #df_vali = pd.read_csv(baseDataDir + 'val_wk12.tsv',sep='\t',
# df_vali = pd.read_csv(baseDataDir + 'vali_200k.tsv',sep='\t',
#                          names=['RowID','BeerID','ReviewerID',
#                                   'BeerName','BeerType','rating'])
# df_vali.head(10)


In [None]:
# Column order of the features file:
# RowID BrewerID ABV DayofWeek Month DayofMonth Year TimeOfDay Gender Birthday Text Lemmatized POS_Tag
# Alternative inputs (swap the file name): 'features-top500.tsv' or the full 'features.tsv'.
df_features = pd.read_csv(
    baseDataDir + 'features_200k.tsv',
    sep='\t',
    names=[
        'RowID', 'BrewerID', 'ABV', 'DayofWeek', 'Month', 'DayofMonth', 'Year',
        'TimeOfDay', 'Gender', 'Birthday', 'Text', 'Lemmatized', 'POS_Tag',
    ],
)

df_features.head(10)

In [None]:
# Remove the ABV, date/time and Gender columns from the working frame.
# errors="ignore" keeps this cell re-runnable: a second execution would
# otherwise raise KeyError because the columns are already gone.
df_features = df_features.drop(
    columns=["ABV", "DayofWeek", "Month", "DayofMonth", "Year", "TimeOfDay", "Gender"],
    errors="ignore",
)

Look at the Birthday column and convert it into an Age feature.

In [54]:
def convertBirthdayToAge(input):
  """Convert a birthday string such as "Oct 14, 1983" into an age in years.

  The age is the current calendar year minus the birth year; the month and
  day are ignored.

  Args:
    input: Birthday value. The expected form is "<Mon> <day>, <year>"; the
      literal "unknown" (any letter case) marks a missing birthday.

  Returns:
    The age as a positive int, or 0 when the value is not a string (for
    example NaN or None), is "unknown", contains no comma, has a non-numeric
    year, or would give a non-positive age.
  """
  # Missing cells arrive as NaN/None rather than str; treat them as unknown
  # instead of failing on .lower().
  if not isinstance(input, str):
    return 0
  if input.lower() == "unknown" or "," not in input:
    return 0

  try:
    # The year is whatever follows the first comma; int() tolerates the
    # surrounding whitespace.
    year = int(input.split(",")[1])
  except ValueError:
    # Malformed year text: report "no usable age" rather than raising.
    return 0

  age = datetime.now().year - year
  return age if age > 0 else 0
    

In [56]:
# Preview the first 50 rows whose Birthday is something other than "unknown".
df_features.loc[df_features["Birthday"].ne("unknown")].head(50)

Unnamed: 0,RowID,BrewerID,Birthday,Text,Lemmatized,POS_Tag,Age
4,22,1075,"Oct 14, 1983",This is my first rauchbier . Pours a burnt amb...,this be my first rauchbier . pour a burn amber...,DT VBZ PRP$ JJ NN . VBZ DT VBN NN . JJ NN . NN...,38
8,26,1075,"Nov 24, 1974",Poured a slightly cloudy deep amber/red color ...,pour a slightly cloudy deep amber/red color wi...,VBD DT RB JJ JJ VBN NN IN DT JJ NN NN IN NN . ...,47
9,27,1075,"Oct 10, 1988",Big thanks to N2168 for knocking this off my w...,big thanks to n2168 for knock this off my want...,JJ NNS IN NN IN VBG DT RP PRP$ NNS . VBN IN DT...,33
13,31,1075,"Aug 23, 1986",A - Semi aggressive pour produces a 1-1/2 fing...,a - Semi aggressive pour produce a 1-1/2 finge...,DT HYPH NNP JJ VBP VBZ DT JJ NN RB JJ JJ NN WD...,35
14,32,1075,"Dec 13, 1978",Look,look,VB,43
20,38,1075,"Feb 3, 1970","A-Bright , crystal clear , copper with a mediu...","a-bright , crystal clear , copper with a mediu...","NN , NN JJ , NN IN DT JJ JJ NN WDT RB VBZ IN D...",51
25,45,1075,"Jul 25, 1984",Pours a rich burnt caramel hue with some deep ...,pour a rich burn caramel hue with some deep am...,VBZ DT JJ VBN NN NN IN DT JJ NN NNS : NN IN DT...,37
26,46,1075,"Nov 25, 1978","On draught at The Beer Stein , Rauch r Bock is...","on draught at the Beer Stein , Rauch be Bock b...","IN NN IN DT NNP NNP , NNP VBP NNP VBZ JJ JJ IN...",43
33,53,1075,"Jun 29, 1973","I shared this bottle with tenderbranson69 , pi...","I share this bottle with tenderbranson69 , pic...","PRP VBD DT NN IN NN , VBD RP IN NNP POS IN NNP...",48
35,55,1075,"Dec 16, 1976",A,a,DT,45


In [55]:
# Derive the Age column by converting every Birthday value.
df_features["Age"] = df_features["Birthday"].map(convertBirthdayToAge)

df_features.head(20)

Unnamed: 0,RowID,BrewerID,Birthday,Text,Lemmatized,POS_Tag,Age
0,18,1075,unknown,Pours a murky light brown with a 1 inch fizzy ...,pour a murky light brown with a 1 inch fizzy t...,VBZ DT JJ NN JJ IN DT CD NN JJ NN NN WDT VBZ I...,0
1,19,1075,unknown,Faint sudsy head with some with some dissipati...,faint sudsy head with some with some dissipate...,NN JJ NN IN DT IN DT VBG JJ NN . JJ JJ NN . DT...,0
2,20,1075,unknown,A new arrival to the West TN area ... Pours qu...,a new arrival to the West TN area ... pour qui...,"DT JJ NN IN DT NNP NNP NN , VBZ PDT DT NN JJR ...",0
3,21,1075,unknown,Sampled 10/30/11 - Transferring the notes . A ...,sample 10/30/11 - transfer the note . a ruby p...,VBN CD HYPH VBG DT NNS . DT NN VBP IN DT NN NN...,0
4,22,1075,"Oct 14, 1983",This is my first rauchbier . Pours a burnt amb...,this be my first rauchbier . pour a burn amber...,DT VBZ PRP$ JJ NN . VBZ DT VBN NN . JJ NN . NN...,38
5,23,1075,unknown,A,a,DT,0
6,24,1075,unknown,"Pours a mahogany color , rich , with a tan hea...","pour a mahogany color , rich , with a tan head...","VBZ DT NN NN , JJ , IN DT JJ NN . DT NN , VBD ...",0
7,25,1075,unknown,Pours light caramel brown with reddish highlig...,pour light caramel brown with reddish highligh...,NNS JJ NN JJ IN JJ NNS . DT JJ JJ NN VBZ RB VB...,0
8,26,1075,"Nov 24, 1974",Poured a slightly cloudy deep amber/red color ...,pour a slightly cloudy deep amber/red color wi...,VBD DT RB JJ JJ VBN NN IN DT JJ NN NN IN NN . ...,47
9,27,1075,"Oct 10, 1988",Big thanks to N2168 for knocking this off my w...,big thanks to n2168 for knock this off my want...,JJ NNS IN NN IN VBG DT RP PRP$ NNS . VBN IN DT...,33


Tinker with NLP preprocessing stuff on Lemmatized

In [None]:
# Lemmatised text of the row labelled 0.
df_features.at[0, "Lemmatized"]

In [None]:
# Split the first review's lemmatised text on single spaces and sort the tokens.
stringTest = df_features.loc[0, "Lemmatized"]
lstTokens = sorted(stringTest.split(" "))
lstTokens

In [None]:
# Alphabetically sorted copy of NLTK's English stop-word list.
stopwordsSorted = sorted(stopwords.words("english"))
# Print it to eyeball the contents.
print(stopwordsSorted)

In [None]:
# Pass the sample token list and the sorted stop words to the project helper.
# Presumably it returns the tokens that are NOT in the second list, located via
# binary search -- confirm against utilities.data_basic_utility.
lstTokens2 = databasic.filter_by_words_bsearch(lstTokens, stopwordsSorted)
lstTokens2

In [None]:
def print_top(lstTokens, numberToPrint=5):
  """Print the first few token lists, one per line, as "[tok1,tok2,...]".

  Args:
    lstTokens: List of token lists (each an iterable of strings).
    numberToPrint: Maximum number of token lists to print. Fewer are printed
      when lstTokens is shorter; nothing is printed for zero or negative
      values.
  """
  # Slicing instead of indexing over range(numberToPrint) avoids an IndexError
  # when there are fewer token lists than requested.
  for tokens in lstTokens[:max(numberToPrint, 0)]:
    print("[" + ",".join(tokens) + "]")

In [None]:
# Convert the Lemmatized column into one list of word tokens per row.
# Text preprocessing done in this cell: lower-case every token and drop tokens
# shorter than two characters. (Stop words are filtered in a separate step.)
# Rows with missing text (NaN) get an empty token list; passing them through
# str() would otherwise inject the literal token "nan" into the vocabulary.
lstTokens = [
    []
    if pd.isna(text)
    else [token for token in str(text).lower().split(" ") if len(token) >= 2]
    for text in df_features["Lemmatized"]
]

print_top(lstTokens)

In [None]:

# Filter the stop words out of every token list. The stop-word list is sorted
# first; presumably the bsearch helper needs sorted input -- confirm.
stopwordsSorted = sorted(stopwords.words("english"))
lstTokens = [databasic.filter_by_words_bsearch(tokens, stopwordsSorted) for tokens in lstTokens]

# Words and vocabulary as produced by the project helper. Whenever the token
# lists change these should be rebuilt to stay up to date.
words, vocab = databasic.createWordsAndVocabForTokenLists(lstTokens)

# Term frequency distribution over the words.
term_fd = nltk.FreqDist(words)

# Filter out the words that occur exactly once (hapaxes).
setSingleWords = set(term_fd.hapaxes())
lstSingleWords = sorted(setSingleWords)
lstTokens = [databasic.filter_by_words_bsearch(tokens, lstSingleWords) for tokens in lstTokens]


print_top(lstTokens)

In [None]:
# How many of the most frequent bigrams to join into single terms (0 disables).
setTopBigrams = 50
# How many of the most frequent tokens to inspect for removal (0 disables).
removeTopFrequentTokens = 50


# Find the most common bigrams so they can be joined together into single terms.
if setTopBigrams > 0:
  # Count bigrams one review at a time, from the current token lists:
  #  * `words` was built before the single-occurrence words were filtered out
  #    of lstTokens, so it no longer matches the token lists;
  #  * counting per review never pairs the last token of one review with the
  #    first token of the next (`words` appears to be one flat sequence --
  #    confirm against utilities.data_basic_utility).
  bigrams = (bigram for tokens in lstTokens for bigram in ngrams(tokens, n = 2))
  fdbigram = nltk.FreqDist(bigrams)
  # List of ((first, second), count) pairs, most frequent first.
  mostFreqBigrams = fdbigram.most_common(setTopBigrams)
  print(mostFreqBigrams)


In [None]:
# Take a shallow copy of the token lists to work on, leaving the name
# lstTokens bound to the original outer list.
lstWorking = list(lstTokens)
print_top(lstWorking)

In [None]:


if setTopBigrams > 0:
  # Each entry of mostFreqBigrams is ((first, second), count); build the
  # space-separated form of every bigram ...
  bigramPairs = [pair for pair, _count in mostFreqBigrams]
  rep_patterns = [first + " " + second for first, second in bigramPairs]

  # ... and its replacement, with the space turned into an underscore.
  replacements = [pattern.replace(" ", "_") for pattern in rep_patterns]

  print(replacements)

  # Compile every pattern once, outside the per-review loop.
  #  * re.escape: tokens are matched literally, so text such as "..." or "(" is
  #    not interpreted as regex syntax.
  #  * the lookarounds require whitespace (or the string edge) on both sides,
  #    so a bigram only matches whole tokens, never part of a longer token.
  compiledPatterns = [
      re.compile(r"(?<!\S)" + re.escape(pattern) + r"(?!\S)")
      for pattern in rep_patterns
  ]

  # Convert every token list back into a single space-separated string.
  lstWorking = [" ".join(tokens) for tokens in lstWorking]

  # Loop through the reviews and find/replace every bigram in each string.
  for i in range(len(lstWorking)):
    for compiled, replacement in zip(compiledPatterns, replacements):
      # A function replacement inserts the text verbatim (no backslash or
      # group-reference interpretation of the replacement string).
      lstWorking[i] = compiled.sub(lambda _match, text=replacement: text, lstWorking[i])

  # Convert back to token lists; an empty review stays an empty list rather
  # than becoming [""].
  lstWorking = [text.split(" ") if text else [] for text in lstWorking]


print_top(lstWorking)

In [None]:


# Look at the most common words. For now this only prints them: the lines that
# would filter them out of lstTokens are commented out below.
if removeTopFrequentTokens > 0:
  # Despite the "set" prefix, most_common returns a list of (word, count) pairs.
  setMostFreqWords = term_fd.most_common(removeTopFrequentTokens)
  print(setMostFreqWords)
  # lstMostFreqWordsKeys = sorted(list(map(lambda x: x[0], setMostFreqWords)))
  # lstTokens = list(map(lambda x: databasic.filter_by_words_bsearch(x, lstMostFreqWordsKeys), lstTokens))

