# Exploring the use of Natural Language Processing to determine Item-Item Similarity

In [61]:
from collections import defaultdict
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.probability import FreqDist
from spellchecker import SpellChecker

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity 

## Preprocessing

The board games data has a pre-processed description: can we use this description to determine how similar two games will be? 

In [18]:
boardgames_df = pd.read_csv('data/modern_games.csv')

In [19]:
boardgames_df['Description'].sample(10, random_state=42)

1496    jedi knight sith lord awesome warrior mystical...
8943    match color player cooperate try card color or...
8970    founder new village middle age year great plag...
6109    information publisher game pageoperation chast...
8180    description publisherfollowe god player settle...
2301    operational level simulation year conflict con...
7091    seven fastplaying card game player jason tagmi...
4245    mecanisburgo    player sciencefiction theme ga...
9030    player draw ideal castle battlement tower nice...
9363    year scorpius system settle decade government ...
Name: Description, dtype: object

These descriptions have already been processed a little by removing punctuation and capitals, but it's still a little messy. We will start preprocessing by using a more aggressive stemmer (a porter) and removing any lingering stopwords. 

In [27]:
tokenizer = RegexpTokenizer(pattern = r"(?u)\w{3,}")
stopwords_list = stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()
porter = nltk.stem.PorterStemmer()
spell = SpellChecker()

words = set(nltk.corpus.words.words())

def preprocess_text(text, tokenizer):
    # Standardize case (lowercase the text)
    lowered = text.lower()
    # Tokenize text using `tokenizer`
    tokens = tokenizer.tokenize(lowered)
    # Remove stopwords using `stopwords_list`
    text = [word for word in tokens if word not in stopwords_list]
    # Stem the tokenized text using `stemmer`
    text = [lemmatizer.lemmatize(word) for word in text]
    text = str(text)

    # Return the preprocessed text
    return text

preprocess_text("This is an example aaa for preprocessing skel.", tokenizer)

"['example', 'aaa', 'preprocessing', 'skel']"

In [28]:
boardgames_df['Description_Preprocessed'] = boardgames_df['Description'].apply(lambda x: preprocess_text(x, tokenizer))

In [29]:
boardgames_df['Description_Preprocessed'].sample(10, random_state=42)

1496    ['jedi', 'knight', 'sith', 'lord', 'awesome', ...
8943    ['match', 'color', 'player', 'cooperate', 'try...
8970    ['founder', 'new', 'village', 'middle', 'age',...
6109    ['information', 'publisher', 'game', 'pageoper...
8180    ['description', 'publisherfollowe', 'god', 'pl...
2301    ['operational', 'level', 'simulation', 'year',...
7091    ['seven', 'fastplaying', 'card', 'game', 'play...
4245    ['mecanisburgo', 'player', 'sciencefiction', '...
9030    ['player', 'draw', 'ideal', 'castle', 'battlem...
9363    ['year', 'scorpius', 'system', 'settle', 'deca...
Name: Description_Preprocessed, dtype: object

# Bag of Words

In [59]:
vec = CountVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", stop_words=stopwords_list,
                      min_df=5)
X = vec.fit_transform(boardgames_df.Description_Preprocessed.astype(str))

df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
df.sample(10, random_state=42)

Unnamed: 0,aaa,aaron,abandon,abbey,abbot,abbreviate,abduct,abduction,abilitie,abilitiesit,...,zman,zoch,zombie,zona,zone,zoo,zoom,zucker,zulu,zum
1496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8970,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2301,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9030,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Spellchecking

This is pretty good, but we still have some odd words to spell check. 

In [31]:
def spellcheck(word):
    # remove the word if not english, try to correct if possible 
    if word not in spell:
        return spell.correction(word) 
    else:
        return word

In [57]:
for col in df.columns:
    df.rename(columns={col: spellcheck(col)}, inplace=True)

In [34]:
df

Unnamed: 0,abandon,ability,able,aboard,abstract,academy,accept,access,accessible,acclaim,...,yield,york,young,yoursquoll,yoursquore,yoursquove,zero,zombie,zone,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10471,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10472,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
10473,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
10474,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF

In [63]:
tf_vec = TfidfVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", stop_words=stopwords_list, min_df=5)
td_idf_FT = tf_vec.fit_transform(boardgames_df.Description_Preprocessed.astype(str))

game_description_TFIDF = pd.DataFrame(td_idf_FT.toarray(), columns=tf_vec.get_feature_names_out())
df.head()

Unnamed: 0,aaa,aaron,abandon,abbey,abbot,abbreviate,abduct,abduction,abilitie,abilitiesit,...,zman,zoch,zombie,zona,zone,zoo,zoom,zucker,zulu,zum
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
game_description_TFIDF.iloc[100].sort_values(ascending=False)[:10]

harbor       0.409408
commodity    0.283826
trading      0.230496
house        0.201819
expand       0.188926
amsterdam    0.185600
expansion    0.179339
large        0.164874
area         0.163956
die          0.157297
Name: 100, dtype: float64

In [65]:
tf_vec.vocabulary_

{'die': 2435,
 'game': 3625,
 'seven': 7959,
 'sequential': 7927,
 'political': 6628,
 'race': 7114,
 'different': 2446,
 'region': 7308,
 'germany': 3751,
 'player': 6529,
 'charge': 1425,
 'national': 5758,
 'party': 6298,
 'manage': 5243,
 'limited': 5052,
 'resource': 7443,
 'help': 4102,
 'victory': 9581,
 'win': 9850,
 'point': 6593,
 'regional': 7309,
 'election': 2797,
 'way': 9747,
 'score': 7831,
 'supply': 8671,
 'eighty': 2780,
 'depend': 2316,
 'size': 8142,
 'second': 7875,
 'medium': 5410,
 'influence': 4488,
 'receive': 7230,
 'membership': 5427,
 'grow': 3915,
 'progress': 6842,
 'fair': 3180,
 'number': 5925,
 'lastly': 4917,
 'platform': 6517,
 'match': 5351,
 'opinion': 6052,
 'end': 2873,
 'gamethe': 3668,
 'edition': 2753,
 'feature': 3256,
 'old': 6005,
 'west': 9802,
 'support': 8673,
 'reunite': 7485,
 'update': 9447,
 'rule': 7668,
 'add': 102,
 'short': 8032,
 'variant': 9518,
 'additional': 105,
 'original': 6108,
 'designer': 2350,
 'samurai': 7741,
 'set':

# Computing similarity based on TF-IDF

In [66]:
# Finding cosine similarity between vectors 

cos_sim = cosine_similarity(game_description_TFIDF, game_description_TFIDF) 

In [None]:
# Storing indices of the data 
indices = pd.Series(boardgames_df.index) 

def recommendations(title, cosine_sim = cos_sim): 
    recommended_movies = [] 
    index = indices[indices == title].index[0] 
    similarity_scores = pd.Series(cosine_sim[index]).sort_values(ascending = False) 
    top_10_movies = list(similarity_scores.iloc[1:11].index) 
    for i in top_10_movies: 
        recommended_movies.append(list(boardgames_df.index)[i]) 
    return recommended_movies 
