#**Investigation of *items* database and correlations between items**

**EDA, NLP, Feature Generation**

Andreas Theodoulou and Michael Gaidis (May, 2020)

#0. Configure Environment
**NOT OPTIONAL**

In [2]:
# General python libraries/modules used throughout the notebook
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FormatStrFormatter, AutoMinorLocator
import numpy as np
import seaborn as sns

import os
from itertools import product
import re
import json
import time
from time import sleep, localtime, strftime
import pickle


# Magics
%matplotlib inline


# # NLP packages
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords

# # ML packages
# from sklearn.linear_model import LinearRegression

# !pip install catboost
# from catboost import CatBoostRegressor 

# %tensorflow_version 2.x
# import tensorflow as tf
# import keras as K

# # List of the modules we need to version-track for reference
# modules = ['pandas','matplotlib','numpy','seaborn','sklearn','tensorflow','keras','catboost','pip']

  import pandas.util.testing as tm


In [0]:
# Notebook formatting
# Adjust as per your preferences.  I'm using a FHD monitor with a full-screen browser window containing my IPynb notebook

# format pandas output so we can see all the columns we care about (instead of "col1  col2  ........ col8 col9", we will see "col1 col2 col3 col4 col5 col6 col7 col8 col9" if it fits inside display.width parameter)
pd.set_option("display.max_columns",30)  
pd.set_option("display.max_rows",100)     # Override pandas choice of how many rows to show, so, for example, we can see the full 84-row item_category dataframe instead of the first few rows, then ...., then the last few rows
pd.set_option("display.width", 300)       # Similar to the above for showing more rows than pandas defaults to, we can show more columns than default, if we tune this to our monitor window size
pd.set_option("max_colwidth", None)

#pd.set_option("display.precision", 3)  # Nah, this is helpful, but below is even better
#Try to convince pandas to print without decimal places if a number is actually an integer (helps keep column width down, and highlights data types)
pd.options.display.float_format = lambda x : '{:.0f}'.format(x) if round(x,0) == x else '{:,.3f}'.format(x)

#1. Load Data Files



##1.1) Enter Data File Names and Paths

**NOT Optional**

In [0]:
#  FYI, data is coming from a public repo on GitHub at github.com/migai/Kag
# List of the data files (path relative to GitHub master), to be loaded into pandas DataFrames
data_files = [  "readonly/final_project_data/items.csv",
                "readonly/final_project_data/item_categories.csv",
                "readonly/final_project_data/shops.csv",
                "readonly/final_project_data/sample_submission.csv.gz",
                "readonly/final_project_data/sales_train.csv.gz",
                "readonly/final_project_data/test.csv.gz",
                "data_output/shops_transl.csv",
                "data_output/shops_augmented.csv",
                "data_output/item_categories_transl.csv",
                "data_output/item_categories_augmented.csv",
                "data_output/items_transl.csv",
                "readonly/en_50k.csv"  ]


# Dict of helper code files, to be loaded and imported {filepath : import_as}
code_files = {}  # not used at this time; example dict = {"helper_code/kaggle_utils_at_mg.py" : "kag_utils"}


# GitHub file location info
git_hub_url = "https://raw.githubusercontent.com/migai"
repo_name = 'Kag'
branch_name = 'master'
base_url = os.path.join(git_hub_url, repo_name, branch_name)

##1.2) Load Data Files

In [5]:
# click on the URL link presented to you by this command, get your authorization code from Google, then paste it into the input box and hit 'enter' to complete mounting of the drive
from google.colab import drive  
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [6]:
'''
############################################################
############################################################
'''
# Replace this path with the path on *your* Google Drive where the repo master branch is stored
#   (on GitHub, the remote repo is located at github.com/migai/Kag --> below is my cloned repo location)
GDRIVE_REPO_PATH = "/content/drive/My Drive/Colab Notebooks/NRUHSE_2_Kaggle_Coursera/final/Kag"
'''
############################################################
############################################################
'''

%cd "{GDRIVE_REPO_PATH}"

print("Loading Files from Google Drive repo into Colab...\n")

# Loop to load the data files into appropriately-named pandas DataFrames
for path_name in data_files:
    filename = path_name.rsplit("/")[-1]
    data_frame_name = filename.split(".")[0]
    exec(data_frame_name + " = pd.read_csv(path_name)")
    if data_frame_name == 'sales_train':
        sales_train['date'] = pd.to_datetime(sales_train['date'], format = '%d.%m.%Y')
    print("Data Frame: " + data_frame_name)
    print(eval(data_frame_name).head(2))
    print("\n")

"""
unused at this time...

# Load in any helper functions from the code_files dictionary
#    dictionary key is the path (replace "/"" with "." when using Google Drive + Colab), 
#      and dictionary value is the module reference name
#    note that the directory chain on GitHub (and local repo) from current directory down to the .py file
#      must include a "__init__.py" file (it can be empty) in each of the directories
for filepath, module in code_files.items():
  path_name = filepath.replace("/",".")[:-3]  # Google Drive reference does not use .py, and uses a "." instead of "/" for directory delineation
  exec("import " + path_name + " as " + module)

# Sanity check test
test1 = kag_utils.add_one(2)
print(test1)
"""
optional_code = True  # in a code block: (at top of cell = notice for you; at bottom = prevents Jupyter printing """ comments)

/content/drive/My Drive/Colab Notebooks/NRUHSE_2_Kaggle_Coursera/final/Kag
Loading Files from Google Drive repo into Colab...

Data Frame: items
                                                              item_name  item_id  item_category_id
0                             ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.)         D        0                40
1  !ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]        1                76


Data Frame: item_categories
        item_category_name  item_category_id
0  PC - Гарнитуры/Наушники                 0
1         Аксессуары - PS2                 1


Data Frame: shops
                       shop_name  shop_id
0  !Якутск Орджоникидзе, 56 фран        0
1  !Якутск ТЦ "Центральный" фран        1


Data Frame: sample_submission
   ID  item_cnt_month
0   0           0.500
1   1           0.500


Data Frame: sales_train
        date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0 2013-01-02               0       59    22154    

#2. Explore Data (EDA), Clean Data, and Generate Features

#2.5) ***items*** Dataset: EDA, Cleaning, Correlations, and Feature Generation

---



---



###Thoughts regarding items dataframe
Let's first look at how many training examples we have to work with...

Many of the items have similar names, but slightly different punctuation, or only very slightly different version numbers or types.  (e.g., 'Call of Duty III' vs. 'Call of Duty III DVD')

One can expect that these two items would have similar sales in general, and by grouping them into a single feature category, we can eliminate some of the overfitting that might come as a result of the relatively small ratio of (training set shop-item-date combinations = 2935849)/(total number of unique items = 22170).  (This is an average of about 132 rows in the sales_train data for each shop-item-date combination that we are using to train our model.  Our task is to produce a monthly estimate of sales (for November 2015), so it is relevant to consider training our model based on how many sales in a month vs. how many sales in the entire training set.  Given that the sales_train dataset covers the time period from January 2013 to October 2015 (34 months), we have on average fewer than 4 shop-item combinations in our training set for a given item in any given month.  Furthermore, as we are trying to predict for a particular month (*November* 2015), it is relevant to consider how many rows in our training set occur in the month of November.  The sales_train dataset contains data for two 'November' months out of the total 34 months of data.  Another simple calculation gives us an estimate that our training set contains on average 0.23 shop-item combinations per item for November months.

To summarize:

*  *sales_train* contains 34 months of data, including 2935849 shop-item-date combinations
*  *items* contains 22170 "unique" item_id values

In the *sales_train* data, we therefore have:
*  on average, 132 rows with a given shop-item pair for a given item_id
*  on average, 4 rows with a given shop-item pair for a given item_id in a given month
*  on average, 0.23 rows with a given shop-item pair for a given item_id in all months named 'November'

If we wish to improve our model predictions for the following month of November, it behooves us to use monthly grouping of sales, or, even better, November grouping of sales.  This smooths out day-to-day variations in sales for a better monthly prediction.  However, the sparse number of available rows in the *sales_train* data will contribute to inaccuracy in our model training and predictions.

Imagine if we could reduce the number of item_id values from 22170 to perhaps half that or even less.  Given that the number of rows for training (per item, on a monthly or a November basis) is so small, then such a reduction in the number of item_id values would have a big impact.  (The same is true for creating features to supplement "shop_id" so as to group and reduce the individuality of each shop - and thus effectively create, on average, more rows of training data for each shop-item pair.

###2.5.1) **Translate and Ruminate**
We will start by translating the Russian text in the dataframe, and add our ruminations on possible new features we can generate.

The dataframe *items_transl* (equivalent to *items* plus a column for English translation) is saved as a .csv file so we do not have to repeat the translation process the next time we open a Google Colab runtime.

In [7]:
print(items_transl.info())
print("\n")
print(items_transl.tail(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int64 
 2   item_category_id  22170 non-null  int64 
 3   en_item_name      22170 non-null  object
dtypes: int64(2), object(2)
memory usage: 692.9+ KB
None


                                                   item_name  item_id  item_category_id                                           en_item_name
22160                             ЯРМАРКА ТЩЕСЛАВИЯ (Регион)    22160                40                                   Vanity Fair (Region)
22161                       ЯРОСЛАВ. ТЫСЯЧУ ЛЕТ НАЗАД э (BD)    22161                37                YAROSLAV. Thousands of years ago e (BD)
22162                                                 ЯРОСТЬ    22162                40                                         

###2.5.2) **NLP for feature generation from items data**
Automate the search for commonality among items, and create new categorical feature to prevent overfitting from close similarity between many item names

####Investigate possibility of using NLP to reduce or regularize the items dataset

---

---


In [0]:
# The following is commented out for now; not sure if I'm going to use nltk or wordnet

# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# STOPWORDS = set(stopwords.words('english'))

# nltk.download('wordnet')
# from nltk.stem import WordNetLemmatizer 
# lemmatizer = WordNetLemmatizer() 
# # use wordnet??
# from nltk.corpus import wordnet as wn
# # example uses of wordnet
# w = "volume iiii"
# print(f"Lemmatization of '{w}'':", lemmatizer.lemmatize(w)) 
# w = "rocks"
# print(f"Lemmatization of '{w}'':", lemmatizer.lemmatize(w)) 
# wn.synsets('rus')

# Here is the approach I plan to take to look at item name similarity:
#   1) vectorize item names, with vector elements chosen as follows:
#         a) uncommon words or part numbers found inside delimiters like () or [] or / / etc.
#               1.) large "n" n-grams
#               2.) part numbers and uncommon words
#         b) uncommon words in entire item name (not only delimited words)
#         c) words used in "item_categories" names, supplemented
#         d) special descriptors like edition number, english/russian, etc.  (keep all numbers as "words")
#   2) compute cosine similarity or other method giving special weight to the above tiers
#   3) manually investigate item names with very high similarity, and combine if actually the same name
#   4) set a certain similarity limit, and item groups above that limit will form new item categories (target: 2000 categories)
#   5) check any items not in one of these new categories, and see if they are tested... if so, assign to closest of the new categories
#
# Idea is to then use the 2000 category list instead of item name as a key feature in fitting the model, both to help
#   regularize, and to help generalize to the items in test set that are not in train set
#
# What words are "uncommon"?
#   Use the top 50,000 words in 2018 database of movie/tv subtitles from https://github.com/hermitdave/FrequencyWords
#   The en_50k dataframe has the 50,000 most-commonly found words in this database, along with a number of "counts" 
#   or appearances in that text corpus.  The word count gives us an idea of word popularity (higher count = higher popularity of use)
#   We can then do an inverse-frequency type of word characterization on our item names
# Why not use a pre-existing word vectorizer package to create our item_name vectors?...
#   because this 21,700 item database is somewhat unique in that it is heavily weighted 
#   towards Russian entertainment sales.  We don't want word vectors that ignore things 
#   like xbox versus playstation.  We want to "tweak" the vectors to help us form relevant 
#   item groups, and not just use any word in the item name to dominate in group identification
#   (We know, for example, that the word "rus" is likely to mean "Russian" in our database,
#   whereas a standard vectorizer would either characterize it as "rus" or as "ruthenium" perhaps)

####**Delimited Groups of Words**

Investigating "special" delimited word groups (like this) or [here] or /hobbitville/ that are present in item names, and may be particularly important in creating n>1 n-grams for uniquely identifying items so that we can tell if two items are the same or nearly the same

#####Some Details on The Approach...

In [0]:
# explanation of regex string I'm using to parse the item_name
'''

^\s+|\s*[,\"\/\(\)\[\]]+\s*|\s+$

gm
1st Alternative ^\s+
^ asserts position at start of a line
\s+ matches any whitespace character (equal to [\r\n\t\f\v ])
+ Quantifier — Matches between one and unlimited times, as many times as possible, giving back as needed (greedy)

2nd Alternative \s*[,\"\/\(\)\[\]]+\s*
\s* matches any whitespace character (equal to [\r\n\t\f\v ])
* Quantifier — Matches between zero and unlimited times, as many times as possible, giving back as needed (greedy)
Match a single character present in the list below [,\"\/\(\)\[\]]+
+ Quantifier — Matches between one and unlimited times, as many times as possible, giving back as needed (greedy)
, matches the character , literally (case sensitive)
\" matches the character " literally (case sensitive)
\/ matches the character / literally (case sensitive)
\( matches the character ( literally (case sensitive)
\) matches the character ) literally (case sensitive)
\[ matches the character [ literally (case sensitive)
\] matches the character ] literally (case sensitive)
\s* matches any whitespace character (equal to [\r\n\t\f\v ])
* Quantifier — Matches between zero and unlimited times, as many times as possible, giving back as needed (greedy)

3rd Alternative \s+$
\s+ matches any whitespace character (equal to [\r\n\t\f\v ])
+ Quantifier — Matches between one and unlimited times, as many times as possible, giving back as needed (greedy)
$ asserts position at the end of a line

Global pattern flags
g modifier: global. All matches (don't return after first match)
m modifier: multi line. Causes ^ and $ to match the begin/end of each line (not only begin/end of string)
'''
commented_cell = True  # prevent Jupyter from printing triple-quoted comments

In [0]:
# This cell contains no code to run; it is simply a record of some inspections that were done on the items database

# before removing undesirable characters / punctuation from the item name,
#   let's see if we can find n-grams or useful describers or common abbreviations by looking between the nasty characters
# first, let's see what characters are present in the en_item_name column
'''
nasty_symbols = re.compile('[^0-9a-zA-Z ]')
nasties = set()
for i in range(len(items_transl)):
  n = nasty_symbols.findall(items_transl.at[i,'en_item_name'])
  nasties = nasties.union(set(n))
print(nasties)
{'[', '\u200b', 'ñ', '(', ')', '.', 'à', '`', 'ó', '®', 'Á', 
'\\', 'è', '&', '-', ':', 'ë', '_', 'û', '»', '=', '+', ']', ',', 
'«', 'ú', "'", 'ö', '#', 'ä', ';', 'ü', '"', 'ô', '/', '№', 'é', 
'í', '!', '°', 'å', '*', 'ĭ', 'ð', '?', 'â'}
'''
# From the above set of nasty characters, it looks like slashes, single quotes, double quotes, parentheses, and square brackets might enclose relevant n-grams
# Let's pull everything from en_item_name that is inside ' ', " ", (), or [] and see how many unique values we get, and if they are n-grams or abbreviations, for example
# It also seems that many of the item names end in a single character "D" for example, which should be converted to DVD
# Let's set up columns for ()[]-grams, for last string in the name, and for first string in name, and for text that precedes ":", and for text that surrounds "&" or "+"
#   but first, we will strip out every nasty character except ()[]:&+'"/ and replace the nasties with spaces, then eliminating double spaces
# And, let's add a boolean column for whether or not that item is in the test set

'''
# sanity check:
really_nasty_symbols = re.compile('[^0-9a-zA-Z \(\)\[\]:&+\'"/]')
really_nasties = set()
for i in range(len(items_transl)):
  rn = really_nasty_symbols.findall(items_transl.at[i,'en_item_name'])
  really_nasties = really_nasties.union(set(rn))
print(really_nasties)
{'\u200b', 'ñ', '.', 'à', '`', 'ó', '®', 'Á', '\\', 'è', '-', 'ë', '_', 'û', '»', '=', ',', '«', 'ú', 'ö', '#', 'ä', ';', 'ü', 'ô', '№', 'é', 'í', '!', '°', 'å', '*', 'ĭ', 'ð', '?', 'â'}
OK, looks good
'''
commented_cell = True  # prevent Jupyter from printing triple-quoted comments

#####Add 'delimited' and 'cleaned' data columns; shorten the titles of other columns so dataframe fits better on the screen

In [25]:
items_delimited = items_transl.copy(deep=True)
# delete the wide "item_name" column so we can read more of the data table width-wise
items_delimited = items_delimited.drop("item_name", axis=1).rename(columns = {'en_item_name':'item_name','item_category_id':'i_cat_id'})
#print(items_delimited.head())
items_in_test_set = test.item_id.unique()
items_delimited["i_tested"] = False
for i in items_in_test_set:
  items_delimited.at[i,"i_tested"] = True

# nasty_symbols_re = re.compile('[^0-9a-zA-Z \+\:\&]')  # remove all punctuation and crazy characters, except that we keep "+", ":" and "&"
# really_nasty_symbols_re = re.compile('[^0-9a-zA-Z \(\)\[\]\:\&\+\'"/]')
conjunctions_re = re.compile('\s*[\+\&]\s*')
nasty_symbols_re = re.compile('[^0-9a-zA-Z ]')  # remove all punctuation
really_nasty_symbols_re = re.compile('[^0-9a-zA-Z \(\)\[\]\:\"\/]')
delimiters_re = re.compile('[\(\)\[\]\:\"\/\+\&]')
multiple_whitespace_re = re.compile('[ ]{2,}')
delim_pattern_re = re.compile('^\s+|\s*[,\"\/\(\)\[\]\:]+\s*|\s+$') # special symbols indicating a delimiter --> a space at start or end of item name is considered a delimiter, along with ,/()[]:"
d_to_dvd_re = re.compile('\s+d$')  #several item names end in "d" -- which actually seems to indicate dvd (because the items I see are in category 40: Movies-DVD)... standardize so d --> dvd
digitalin_to_digitalversion_re = re.compile('digital in$') # several items seem to end in "digital in"... maybe in = internet?, but looking at nearby items/categories, 'digital version' looks standard

def text_total_clean(text):
    #text: the original en_item_name
    #return: en_item_name made lowercase, stripped of "really_nasties" and multiple spaces
    #     NOTE: this is not stripping stopwords
    text = text.lower()
    text = d_to_dvd_re.sub(" dvd", text)
    text = digitalin_to_digitalversion_re.sub("digital version",text)
    text = delimiters_re.sub(" ", text)  # replace all delimiters with a space
    text = nasty_symbols_re.sub("", text)  # delete anything other than letters, numbers, and spaces
    text = multiple_whitespace_re.sub(" ", text)  # replace multiple spaces with a single space
    return text

def text_clean_delimited(text):
    #text: the original en_item_name
    #return: en_item_name made lowercase, stripped of "really_nasties" and multiple spaces, in a list of strings that had been separated by one of the above "delimiters"
    #     NOTE: this is not stripping out all punctuation, and not stripping stopwords... we are first getting a look at commonly found phrases
    text = text.lower()
    text = d_to_dvd_re.sub(" dvd", text)
    text = digitalin_to_digitalversion_re.sub("digital version",text)
    text = conjunctions_re.sub(" ", text)         # replace conjunctions + and & with a space
    text = really_nasty_symbols_re.sub("", text)  # just delete the nasty symbols
    text = multiple_whitespace_re.sub(" ", text)  # replace multiple spaces with a single space
    text = delim_pattern_re.split(text)           # split item_name at all delimiters, irrespective of number of spaces before or after the string or delimiter
    text = [x for x in text if x]                 # remove empty strings "" from the list of split items in text
    return text

# add a column of simply cleaned text without any undesired punctuation or delimiters
items_delimited['clean_item_name'] = items_delimited['item_name'].apply(text_total_clean)

# now add a column of lists of delimited (cleaned) text
items_delimited['delim_name_list'] = items_delimited['item_name'].apply(text_clean_delimited)

# have a look at what we got with our delimited text globs
def maxgram(gramlist):
    maxg = 0
    for g in gramlist:
        maxg = max(maxg,len(g.split()))
    return maxg
items_delimited['d_len'] = items_delimited.delim_name_list.apply(lambda x: len(x))
items_delimited['d_maxgram'] = items_delimited.delim_name_list.apply(maxgram)
print(items_delimited.head())
print("\n")
print(items_delimited.describe())

#items_delimited.to_csv("data_output/items_delimited.csv", index=False)

   item_id  i_cat_id                                                              item_name  i_tested                                                     clean_item_name                                                      delim_name_list  d_len  d_maxgram
0        0        40                                           ! POWER IN glamor (PLAST.) D     False                                           power in glamor plast dvd                                        [power in glamor, plast, dvd]      3          3
1        1        76  ! ABBYY FineReader 12 Professional Edition Full [PC, Digital Version]     False   abbyy finereader 12 professional edition full pc digital version   [abbyy finereader 12 professional edition full, pc digital version]      2          6
2        2        40                                               *** In the glory (UNV) D     False                                                in the glory unv dvd                                             [in the glory, 

In [0]:
# make item df easier to read for the following stuff
items_clean_delimited = items_delimited.copy(deep=True).drop("item_name", axis=1).rename(columns = {'clean_item_name':'item_name'})

#####Look at the characteristics of different length n-grams in our delimited set

In [26]:
# Inspect the delimited 1-grams

items_delimited_1gram = items_clean_delimited.copy(deep=True)
items_delimited_1gram["d_1grams"] = items_delimited_1gram.delim_name_list.apply(lambda x: [a for a in x if len(a.split()) == 1]) # column contains all "delimited" 1-grams in the translation

g1 = items_delimited_1gram.d_1grams.apply(pd.Series,1).stack()
g1.index = g1.index.droplevel(-1)
g1.name = 'd_1grams'
del items_delimited_1gram['d_1grams']
items_delimited_1gram = items_delimited_1gram.join(g1)

print(items_delimited_1gram.head())
print("\n")
freq_1grams = items_delimited_1gram.d_1grams.value_counts()
print(f'Number of unique delimited 1-grams: {len(freq_1grams)}')
print(f'Number of unique delimited 1-grams that are duplicated at least once: {len(freq_1grams[freq_1grams > 1])}')
print(freq_1grams[1:10])  # can ignore index 0, as it is the empty string

   item_id  i_cat_id  i_tested                                                          item_name                                                      delim_name_list  d_len  d_maxgram d_1grams
0        0        40     False                                          power in glamor plast dvd                                        [power in glamor, plast, dvd]      3          3    plast
0        0        40     False                                          power in glamor plast dvd                                        [power in glamor, plast, dvd]      3          3      dvd
1        1        76     False   abbyy finereader 12 professional edition full pc digital version  [abbyy finereader 12 professional edition full, pc digital version]      2          6      NaN
2        2        40     False                                               in the glory unv dvd                                             [in the glory, unv, dvd]      3          3      unv
2        2        40     False

In [27]:
# Inspect the delimited 2-grams

items_delimited_2gram = items_clean_delimited.copy(deep=True)
items_delimited_2gram["d_2grams"] = items_delimited_2gram.delim_name_list.apply(lambda x: [a for a in x if len(a.split()) == 2]) # column contains all "delimited" 2-grams in the translation

g2 = items_delimited_2gram.d_2grams.apply(pd.Series,1).stack()
g2.index = g2.index.droplevel(-1)
g2.name = 'd_2grams'
del items_delimited_2gram['d_2grams']
items_delimited_2gram = items_delimited_2gram.join(g2)

print(items_delimited_2gram.tail())
print("\n")
freq_2grams = items_delimited_2gram.d_2grams.value_counts()
print(f'Number of unique delimited 2-grams: {len(freq_2grams)}')
print(f'Number of unique delimited 2-grams that are duplicated at least once: {len(freq_2grams[freq_2grams > 1])}')
print(freq_2grams[1:8])

       item_id  i_cat_id  i_tested                                        item_name                                       delim_name_list  d_len  d_maxgram         d_2grams
22166    22166        54      True  language 1c queries enterprises digital version   [language 1c queries, enterprises, digital version]      3          3  digital version
22167    22167        49      True  1c query language enterprise 8 cd khrustalev ey  [1c query language, enterprise 8, cd, khrustalev ey]      4          3     enterprise 8
22167    22167        49      True  1c query language enterprise 8 cd khrustalev ey  [1c query language, enterprise 8, cd, khrustalev ey]      4          3    khrustalev ey
22168    22168        62     False                               egg for little inu                                  [egg for little inu]      1          4              NaN
22169    22169        69     False                       dragon egg game of thrones                         [dragon egg, game of throne

In [28]:
# Inspect the delimited 4-grams

items_delimited_4gram = items_clean_delimited.copy(deep=True)
items_delimited_4gram["d_4grams"] = items_delimited_4gram.delim_name_list.apply(lambda x: [a for a in x if len(a.split()) == 4]) # column contains all "delimited" 4-grams in the translation

g4 = items_delimited_4gram.d_4grams.apply(pd.Series,1).stack()
g4.index = g4.index.droplevel(-1)
g4.name = 'd_4grams'
del items_delimited_4gram['d_4grams']
items_delimited_4gram = items_delimited_4gram.join(g4)

print(items_delimited_4gram.tail())
print("\n")
freq_4grams = items_delimited_4gram.d_4grams.value_counts()
print(f'Number of unique delimited 4-grams: {len(freq_4grams)}')
print(f'Number of unique delimited 4-grams that are duplicated at least once: {len(freq_4grams[freq_4grams > 1])}')
print(freq_4grams[1:12])

       item_id  i_cat_id  i_tested                                        item_name                                       delim_name_list  d_len  d_maxgram            d_4grams
22165    22165        31     False              nuclear titbit 2 pc digital version                [nuclear titbit 2, pc digital version]      2          3                 NaN
22166    22166        54      True  language 1c queries enterprises digital version   [language 1c queries, enterprises, digital version]      3          3                 NaN
22167    22167        49      True  1c query language enterprise 8 cd khrustalev ey  [1c query language, enterprise 8, cd, khrustalev ey]      4          3                 NaN
22168    22168        62     False                               egg for little inu                                  [egg for little inu]      1          4  egg for little inu
22169    22169        69     False                       dragon egg game of thrones                         [dragon egg,

In [29]:
# Get all of the delimited n-grams that are duplicated at least once in item names
#  range of sizes of delimited phrases (number of 'words'):
min_gram = items_delimited.d_maxgram.min()
max_gram = items_delimited.d_maxgram.max()

gram_freqs = {}   # dict will hold elements that are pd.Series with index = phrase, value = number of repeats in items database item names
for n in range(min_gram,max_gram+1):
    item_ngram = items_clean_delimited.copy(deep=True)
    item_ngram['delim_ngrams'] = item_ngram.delim_name_list.apply(lambda x: [a for a in x if len(a.split()) == n])

    grams = item_ngram.delim_ngrams.apply(pd.Series,1).stack()
    grams.index = grams.index.droplevel(-1)
    grams.name = 'delim_ngrams'
    del item_ngram['delim_ngrams']
    item_ngram = item_ngram.join(grams)

    freq_grams = item_ngram.delim_ngrams.value_counts()
    print(f'Number of unique delimited {n}-grams: {len(freq_grams)}')
    grams_dupe = len(freq_grams[freq_grams > 1])
    print(f'Number of unique delimited {n}-grams that are duplicated at least once: {grams_dupe}\n')
    if grams_dupe > 0:
        gram_freqs[n] = freq_grams.copy(deep=True)
print('Done')

Number of unique delimited 1-grams: 2300
Number of unique delimited 1-grams that are duplicated at least once: 899

Number of unique delimited 2-grams: 3349
Number of unique delimited 2-grams that are duplicated at least once: 943

Number of unique delimited 3-grams: 3453
Number of unique delimited 3-grams that are duplicated at least once: 617

Number of unique delimited 4-grams: 3392
Number of unique delimited 4-grams that are duplicated at least once: 397

Number of unique delimited 5-grams: 2906
Number of unique delimited 5-grams that are duplicated at least once: 235

Number of unique delimited 6-grams: 2112
Number of unique delimited 6-grams that are duplicated at least once: 110

Number of unique delimited 7-grams: 1460
Number of unique delimited 7-grams that are duplicated at least once: 84

Number of unique delimited 8-grams: 1034
Number of unique delimited 8-grams that are duplicated at least once: 40

Number of unique delimited 9-grams: 652
Number of unique delimited 9-grams

OK, so the gram_freqs dictionary holds a bunch of informative stuff that will help us group items together.
Let's also add every unique word from the item_categories names

In [34]:
def cat_name_clean(text):
    #text: the original en_cat_name
    #return: en_cat_name made lowercase, stripped of punctuation and multiple spaces
    #     NOTE: this is not stripping stopwords
    text = text.lower()
    text = delimiters_re.sub(" ", text)  # replace all delimiters with a space
    text = nasty_symbols_re.sub("", text)  # delete anything other than letters, numbers, and spaces
    text = multiple_whitespace_re.sub(" ", text)  # replace multiple spaces with a single space
    text = text + " "  # serves as a delimiter when we combine rows
    return text

catwords = set(item_categories_augmented.en_cat_name.apply(cat_name_clean).sum().split(" "))
print(catwords)

{'', 'movie', 'certificates', 'additional', 'dvd', 'albums', 'one', 'fiction', 'live', 'robots', 'toys', 'compact', 'mats', 'local', 'and', 'gifts', 'pc', 'android', 'artbook', '8', 'sports', 'movies', 'in', 'bags', 'development', 'of', 'educational', 'bluray', '3d', 'tickets', 'materials', '1c', 'souvenirs', 'carriers', 'firm', 'attributes', 'ps4', 'computer', 'office', 'spire', 'literature', 'cards', 'goods', 'board', 'collectors', 'informative', 'card', 'business', 'postcards', 'services', 'for', 'xbox', 'headphones', 'psvita', 'music', 'encyclopedia', 'headsets', 'net', 'vinyl', 'soft', 'enterprise', 'mouse', 'gift', 'programs', 'batteries', 'tools', 'other', 'audiobooks', 'figure', 'system', 'digital', 'windows', 'cinema', 'delivery', 'program', 'stickers', 'publications', 'manga', '360', 'mp3', '4k', 'production', 'accessories', 'guides', 'ps3', 'travel', 'gadgets', 'edition', 'mac', 'payment', 'ps2', 'numeral', 'collector', 'consoles', 'digits', 'game', 'standard', 'piece', 'vid

######Consider as special the n-grams that follow certain descriptive keywords such as "by" or "for" or "from" (? TBD)

In [0]:
# these are words we could consider to highlight n-grams that follow these words, similar to the thinking that "delimited" words may be special
key_modifiers = "by,for,from"  

###Continue on the work to vectorize item names and compute similarities between items (this section should be grouped with the above section eventually, but I have it set apart for now, so it is easy to find)

---

---


In [0]:
# Inspecting the "delimited" n-grams, I've considered the following for use in vectorizing our item names

# Regarding the use of NLTK stopwords, meh... a lot of these stopwords are irrelevant regarding excess text in our item names... 
#    We really should make our own stopword list
s = "a,the,an,only,more,are,any,on,your,just,it,its,it's,has,with,for,by,from"

# make some dictionaries to "lemmatize" certain keywords into standardized 1-grams
# volumes/editions
key_descriptors = {'1': 'first,1st,1ed,v1,vol1,volume1,ver1,version1,edition1,original,standard'}
key_descriptors['2'] = 'second,2nd,ii,2ed,v2,vol2,volume2,ver2,version2,edition2,updated,revised'
key_descriptors['3'] = 'third,3rd,iii,3ed,v3,vol3,volume3,ver3,version3,edition3'
key_descriptors['4'] = 'fourth,4th,iiii,4ed,v4,vol4,volume4,ver4,version4,edition4'
# special, highlighted items
key_descriptors['special'] = 'collector,collection,premium,platinum,gold,silver,black,boxset,classic,trilogy'
# items dealing with business
key_descriptors['biz'] = 'company,business,enterprise,firm,professional'
# items likely to be a Russian version, or translated into Russian
key_descriptors['rus'] = 'russian,rus,rusv,translate,translated'
# items likely to be in English
key_descriptors['eng'] = 'english,eng,engl'
# relatively frequent descriptors that should have substantial weight in the item name vectorization
key_descriptors['0'] = 'essential,preorder,supplement,addition,catalog,reprint'

key_platforms = '1c,1cpublishing,playstation,psp,ps2,ps3,ps4,psvita,psn,pc,android,iphone,phone,tablet,'
key_platforms += 'ipad,xbox,xbox 360,xbox one,xbox360,xboxone,xbox1,xone,apple,mac,microsoft,windows,98,live,marvel,cech4008c'

key_formats = {'dvd': 'pcdvd,dvd,2dvd,3dvd,4dvd,5dvd,6dvd,dvddigipack,dvdbox,mp3dvd,dvdbook,dvdcd'}
key_formats['cd'] = 'cd,mp3cd,2cd,cdbox,compactdisk,audiocd,cddigipack'
key_formats['movie'] = 'bd,2bd,3bd,bluray,blueray,4k,movie,dvdmovie,moviedvd,dvdbd,bddvd,2dvdbd,tv'
key_formats['0'] = 'book,audiobook,digibook,wb,wd,mp3,digipack,jewel,umdcase,keychain,ac,6cm,8cm,12cm,14cm,dicomp,glass,delivery,online,gb,tb,mb'

special_words = 'komplvoprsertekzam1s,softklab,transformer,hobbit,barbie,middleearth,'  # plus any 4-digit numbers like 1812, 2014,...
special_words += 'red30,red20,alien,toy,region,bbc,batman,scee,cyrus,mtg,berserk,disney,spell,'
special_words += 'buh8,b01,retribution,metodmaterialy,destiny,wolfenstein,hhkata,luntik,'
special_words += 'paragon,42,72,buka,injustice,call,action,universal,zhelboks,fallout,cuh1108a,16,cechzc2e,'
special_words += 'pch1108za01,dinosaur,braveheart,titanic,infamous,spiderman,armageddon,universe,'
special_words += 'rezhversiya,outcast,wheelbarrow,yellow,inquisition,pch1008za01,zolushka,xmen,'
special_words += 'metro,battlefront,cuh1208b,wargame,operation,cech3008b,poirot,hhkatv,kitchen,'
special_words += 'icecrown,underworld,shepherd,thief,sensor,shrek,oceans,castlevania,murdered,'
special_words += 'madagascar,shelter,devastation,deadpool,incredibles,jungle,titanfall,pspe1008,cech2508b,skyrim,avatar,maleficent,invizimals,heracles,phantom,toysmlk,aladdin'

#key_categories = 1-grams from all item_categories_augmented (0,1,2,3,4 columns)
  


In [0]:
# Get word counts from en_50k for the delimited terms:
#   combine all words in delim_name_list, and search one-by-one through en_50k
#   create a column for words with zero counts
#   create a column for words with 1 to 1000 counts

In [0]:
en50k_word_list = en_50k.word.to_numpy(dtype='str')
swfreq = []
for sw in special_words.split(","):
    sw_50k_rank = np.where(en50k_word_list == sw)[0]
    if len(sw_50k_rank) > 0:
        swfreq.append(en_50k.at[sw_50k_rank[0],'word_count'])
    else:
        swfreq.append(0)

special_wds_df = pd.DataFrame({"word": special_words.split(","), "counts_50k": swfreq}).sort_values('counts_50k')
print(special_wds_df.head())
print(special_wds_df.describe())

                    word  counts_50k
0   komplvoprsertekzam1s           0
24           wolfenstein           0
25                hhkata           0
26                luntik           0
28                    42           0
       counts_50k
count          91
mean   10,899.868
std    60,340.351
min             0
25%             0
50%           624
75%          3637
max        572997


In [0]:
# inspect item names more closely to see how they relate to the rare delimited words, and to 
#    help understand what certain unusual terms mean 
# "firms" = big record label 
# "figure" = digital version; not hardcopy
#  komplvoprsertekzam1s = Complete Set of Certification Exam. 1C

# enter the desired item_ids here:
to_plot = [617,618]

item_plot_df = sales_train[sales_train.item_id.isin(to_plot)] 
print(item_plot_df.head())

# what does "figure" mean in category name? --> DIGITAL VERSION (as opposed to hardcopy)
# whereas, "Digital Version" in item name actually seems to indicate it is downloaded rather than obtained from DVD, for example

#fig_cat_df = items_transl_with_cat[items_transl_with_cat.item_category_id.isin([77,78])][['item_id','en_item_name','item_category_id','category']] 
#fig_cat_df = items_transl_with_cat[items_transl_with_cat.item_category_id.isin([43,44])][['item_id','en_item_name','item_category_id','category']] 
#fig_cat_df = items_transl_with_cat[items_transl_with_cat.item_category_id.isin([75,76])][['item_id','en_item_name','item_category_id','category']] 
fig_cat_df = items_transl_with_cat[items_transl_with_cat.item_category_id.isin([6])][['item_id','en_item_name','item_category_id','category']] 
print(fig_cat_df.tail())

# see what some of these 'rare' word 1-grams are:
# get item names from rare words
rare_words = 'cech2508b,komplvoprsertekzam1s'
#reg_pattern = re.compile('|'.join(rare_words.split(',')), flags=re.IGNORECASE)
any_rare_words = '|'.join(rare_words.split(','))
rare_items_df = items_delimited[items_delimited.clean_item_name.str.contains(any_rare_words, case=False, na=False)][['item_id','i_cat_id','i_tested','item_name','clean_item_name']] 
print(rare_items_df.head(50))
print(items[items['item_id']==13380])

# "certification exam" is not present in the cleaned, translated item names, nor is "certification"
# "exam " is present in only 26 items... so, komplvoprsertekzam1s is broader than 'exam'
certification = items_delimited[items_delimited.clean_item_name.str.contains('exam ', case=False, na=False)][['item_id','i_cat_id','i_tested','item_name','clean_item_name']] 
print(len(certification))
print(certification.head())

              date  date_block_num  shop_id  item_id  item_price  item_cnt_day
505355  2013-05-07               4       55      618         172             1
933109  2013-09-27               8       55      618         172             1
933124  2013-09-06               8       55      618         172             1
1088977 2013-11-24              10       55      618         172             1
1088978 2013-11-03              10       55      618         172             1
       item_id                                                                        en_item_name  item_category_id                category
8448      8448                                   Accessory: Xbox 360 Hard Drive 500 GB (6FM-00003)                 6  Accessories - XBOX 360
8449      8449                  Accessory: Xbox 360 Wireless Controller Chrome magenta (43G-00062)                 6  Accessories - XBOX 360
8450      8450                    Accessory: Xbox 360 Wireless Controller Black Chrome (43G-00059)     

######Items containing "1C" in their name

In [0]:
# I'm curious about items containing "1C" ... there seem to be quite a few, and if I'm not mistaken,
#    1C is an ERP like Oracle or SAP, but 1C is based in Russia.  Let's see how many items have 1C
#    in their item_name, and what categories these belong to, and what the item names look like (what are common words that accompany "1C"?)
items_delimited_1c = items_clean_delimited.copy(deep=True)
# delete the wide "item_name" column so we can read more of the data table width-wise
items_delimited_1c = items_delimited_1c.drop(['delim_name_list','d_len','d_maxgram'], axis=1)
items_delimited_1c.head()


Unnamed: 0,item_id,i_cat_id,i_tested,item_name
0,0,40,False,power in glamor plast dvd
1,1,76,False,abbyy finereader 12 professional edition full pc digital version
2,2,40,False,in the glory unv dvd
3,3,40,False,blue wave univ dvd
4,4,40,False,box glass dvd


#XX) Below this markdown cell can be ignored

These are just code snippets I used to debug how to accomplish some of the tasks above.  I may want to revisit them some day, so I am too scared to just delete them. :)

Below is a code snippet to plot item sales vs. month for one or more item ids

In [0]:
# enter the desired item_ids here:
to_plot = [617,618]

item_plot_df = sales_train[sales_train.item_id.isin(to_plot)] 
print(item_plot_df.head())

# what does "figure" mean in category name?
#fig_cat_df = items_transl_with_cat[items_transl_with_cat.item_category_id.isin([77,78])][['item_id','en_item_name','item_category_id','category']] 
#fig_cat_df = items_transl_with_cat[items_transl_with_cat.item_category_id.isin([43,44])][['item_id','en_item_name','item_category_id','category']] 
#fig_cat_df = items_transl_with_cat[items_transl_with_cat.item_category_id.isin([75,76])][['item_id','en_item_name','item_category_id','category']] 
fig_cat_df = items_transl_with_cat[items_transl_with_cat.item_category_id.isin([6])][['item_id','en_item_name','item_category_id','category']] 
print(fig_cat_df.tail())

# get item names from rare words
rare_words = 'pch1008za01' #,hhkatv,pspe1008,cech2508b,komplvoprsertekzam1s'
#reg_pattern = re.compile('|'.join(rare_words.split(',')), flags=re.IGNORECASE)
any_rare_words = '|'.join(rare_words.split(','))
rare_items_df = items_clean_delimited[items_clean_delimited.item_name.str.contains(any_rare_words, case=False, na=False)][['item_id','i_cat_id','i_tested','item_name']] 
print(rare_items_df.head(50))
'''

# seems to be one particular bad player... item 2973 from shop 6 in row 484683
#   look more at this item, shop, and item-shop combo:
print("\n")
print(2973 in test.item_id.to_list())
# this item is not in the test set... does it make up a significant amount of the train set?

print(f"Shop 32 = {shops_augmented.at[32,'en_shop_name']}", end="")
print(f", Item 2973 = {items_transl.at[2973,'en_item_name']}\n")

item2973 = sales_train[sales_train.item_id == 2973]
print(item2973.describe())
print("\n")
print(item2973.sort_values('item_price').head(10))
# only one sales_train row entry with price < 1000, and it is this negative outlier

print("\n")
item2973shop32 = item2973[item2973.shop_id == 32]
print(item2973shop32.describe())
print("\n")
print(item2973shop32.sort_values('date').head(15))
# it looks like perhaps this shop had a discount clearance sale in summer 2013, and
#  then never sold the item again

plt.rcParams["figure.figsize"] = [16,5]
fig = plt.figure() #figsize=(16,9))
#item2973.sort_values('date_block_num')['date_block_num'].value_counts().plot(kind='bar')
item2973.date_block_num.value_counts().reset_index().drop('index',axis=1).rename(columns={'date_block_num':'n_train_rows'}).plot(kind='bar')
plt.title('by MONTH (date_block_num), number of rows in sales_train for item 2973')
plt.grid(b=True, which='major', axis='y', color='#666666', linestyle='-')

plt.show()

# looks like this item has little bearing on sales in Nov. 2015, as its sales
#  died off (among all shops) by January 2015
# and, the one entry at row 484683 in sales_train with the negative price can be safely deleted
'''
a=1

              date  date_block_num  shop_id  item_id  item_price  item_cnt_day
505355  2013-05-07               4       55      618         172             1
933109  2013-09-27               8       55      618         172             1
933124  2013-09-06               8       55      618         172             1
1088977 2013-11-24              10       55      618         172             1
1088978 2013-11-03              10       55      618         172             1
       item_id                                                                        en_item_name  item_category_id                category
8448      8448                                   Accessory: Xbox 360 Hard Drive 500 GB (6FM-00003)                 6  Accessories - XBOX 360
8449      8449                  Accessory: Xbox 360 Wireless Controller Chrome magenta (43G-00062)                 6  Accessories - XBOX 360
8450      8450                    Accessory: Xbox 360 Wireless Controller Black Chrome (43G-00059)     

######Below here are some code snippets for future reference; ignore please

</br>

</br>

---
---

</br>

</br>


In [0]:
'''
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
MULTIPLE_WHITESPACE_RE = re.compile('[ ]{2,}')
STOPWORDS = set(stopwords.words('english'))  #using "set" speeds things up a little; note all stopwords are in lowercase
#print("." in STOPWORDS)
def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text... need to do this before removing stopwords because stopwords are all lowercase
    text = REPLACE_BY_SPACE_RE.sub(' ',text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('',text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = MULTIPLE_WHITESPACE_RE.sub(' ',text)
    text = " ".join([word for word in text.split(" ") if word not in stopwords.words('english')]) # delete stopwords from text
 
    return text
'''
# keep this cell for future reference
keep = True