# Exploring the use of Natural Language Processing to determine Item-Item Similarity

In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.probability import FreqDist
from spellchecker import SpellChecker

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity 

## Preprocessing

The board games data has a pre-processed description: can we use this description to determine how similar two games will be? 

In [2]:
boardgames_df = pd.read_csv('data/modern_games.csv')

In [3]:
boardgames_df['Description'].sample(10, random_state=42)

1496    jedi knight sith lord awesome warrior mystical...
8943    match color player cooperate try card color or...
8970    founder new village middle age year great plag...
6109    information publisher game pageoperation chast...
8180    description publisherfollowe god player settle...
2301    operational level simulation year conflict con...
7091    seven fastplaying card game player jason tagmi...
4245    mecanisburgo    player sciencefiction theme ga...
9030    player draw ideal castle battlement tower nice...
9363    year scorpius system settle decade government ...
Name: Description, dtype: object

These descriptions have already been processed a little by removing punctuation and capitals, but it's still a little messy. We will start preprocessing by using a more aggressive stemmer (a porter) and removing any lingering stopwords. 

In [None]:
tokenizer = RegexpTokenizer(pattern = r"(?u)\w{3,}")
stopwords_list = stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()
porter = nltk.stem.PorterStemmer()
spell = SpellChecker()

words = set(nltk.corpus.words.words())

def preprocess_text(text, tokenizer):
    # Standardize case (lowercase the text)
    lowered = text.lower()
    # Tokenize text using `tokenizer`
    tokens = tokenizer.tokenize(lowered)
    # Remove stopwords using `stopwords_list`
    text = [word for word in tokens if word not in stopwords_list]
    # Lemmatize the tokenized text
    text = [lemmatizer.lemmatize(word) for word in text]
    text = str(text)

    # Return the preprocessed text
    return text

preprocess_text("This is an example aaa for preprocessing skel.", tokenizer)

"['example', 'aaa', 'preprocessing', 'skel']"

In [5]:
boardgames_df['Description_Preprocessed'] = boardgames_df['Description'].apply(lambda x: preprocess_text(x, tokenizer))

In [6]:
boardgames_df['Description_Preprocessed'].sample(10, random_state=42)

1496    ['jedi', 'knight', 'sith', 'lord', 'awesome', ...
8943    ['match', 'color', 'player', 'cooperate', 'try...
8970    ['founder', 'new', 'village', 'middle', 'age',...
6109    ['information', 'publisher', 'game', 'pageoper...
8180    ['description', 'publisherfollowe', 'god', 'pl...
2301    ['operational', 'level', 'simulation', 'year',...
7091    ['seven', 'fastplaying', 'card', 'game', 'play...
4245    ['mecanisburgo', 'player', 'sciencefiction', '...
9030    ['player', 'draw', 'ideal', 'castle', 'battlem...
9363    ['year', 'scorpius', 'system', 'settle', 'deca...
Name: Description_Preprocessed, dtype: object

# Bag of Words

We examine our descriptions word by word both by counting the words and by examining the frequency of words across documents.

In [7]:
vec = CountVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", stop_words=stopwords_list,
                      min_df=5)
X = vec.fit_transform(boardgames_df.Description_Preprocessed.astype(str))

df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
df.sample(10, random_state=42)

Unnamed: 0,aaa,aaron,abandon,abbey,abbot,abbreviate,abduct,abduction,abilitie,abilitiesit,...,zman,zoch,zombie,zona,zone,zoo,zoom,zucker,zulu,zum
1496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8970,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2301,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9030,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF

In [22]:
tf_vec = TfidfVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", stop_words=stopwords_list, min_df=15)
td_idf_FT = tf_vec.fit_transform(boardgames_df.Description_Preprocessed.astype(str))

game_description_TFIDF = pd.DataFrame(td_idf_FT.toarray(), columns=tf_vec.get_feature_names_out())
df.head()

Unnamed: 0,aka,baron,abandon,abbey,abbot,abbreviate,abduct,abduction,abilities,abilities.1,...,zman,zoch,zombie,zona,zone,zoo,zoom,zucker,zulu,zum
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
game_description_TFIDF.iloc[100].sort_values(ascending=False)[:10]

harbor       0.416647
commodity    0.288844
trading      0.234571
house        0.205387
expand       0.192267
expansion    0.182510
large        0.167790
area         0.166855
die          0.160078
tilelaye     0.159898
Name: 100, dtype: float64

In [18]:
tf_vec.vocabulary_

{'die': 826,
 'game': 1290,
 'seven': 2694,
 'political': 2239,
 'race': 2388,
 'different': 830,
 'region': 2458,
 'germany': 1324,
 'player': 2213,
 'charge': 452,
 'national': 1981,
 'party': 2138,
 'manage': 1815,
 'limited': 1751,
 'resource': 2517,
 'help': 1424,
 'victory': 3248,
 'win': 3337,
 'point': 2227,
 'regional': 2459,
 'election': 957,
 'way': 3303,
 'score': 2642,
 'supply': 2947,
 'depend': 780,
 'size': 2756,
 'second': 2656,
 'medium': 1869,
 'influence': 1562,
 'receive': 2436,
 'grow': 1372,
 'progress': 2329,
 'fair': 1117,
 'number': 2032,
 'platform': 2208,
 'match': 1847,
 'opinion': 2066,
 'end': 984,
 'gamethe': 1300,
 'edition': 942,
 'feature': 1151,
 'old': 2056,
 'west': 3324,
 'support': 2948,
 'update': 3205,
 'rule': 2595,
 'add': 35,
 'short': 2720,
 'variant': 3224,
 'additional': 37,
 'original': 2088,
 'designer': 795,
 'samurai': 2619,
 'set': 2688,
 'medieval': 1867,
 'japan': 1650,
 'compete': 558,
 'gain': 1285,
 'favor': 1145,
 'faction': 11

# Computing similarity based on TF-IDF

In [19]:
# Finding cosine similarity between vectors 

cos_sim = cosine_similarity(game_description_TFIDF, game_description_TFIDF) 

In [20]:
# Storing indices of the data 
indices = pd.Series(boardgames_df.index) 

def recommendations(bggID, cosine_sim = cos_sim, num_reccomendations =10): 
    recommendations = []

    similarity_scores = pd.Series(cosine_sim[bggID]).sort_values(ascending = False) 

    top_n = list(similarity_scores.iloc[1:num_reccomendations+1].index) 

    for i in top_n: 
        recommendations.append(list(boardgames_df.index)[i]) 
    return recommendations 

recommendations(925)

[370, 6221, 3491, 6724, 103, 1963, 7696, 6814, 9831, 5971]

# This is not good..... could we use something else? 

In [None]:
themes_df = pd.read_csv('data/modern_themes.csv')
#we want our bgg id to be the index
themes_df.set_index('BGGId', inplace=True)
#due to memory issues, use a sample to prove the data, run the real thin in virtual machine
themes_sampled = themes_df.sample(2000, random_state=42)