#**EDA and Feature Generation**
## shops, items, item_categories

Andreas Theodoulou and Michael Gaidis (June, 2020)

#**0. Mount Google Drive (Local File Storage/Repo For Colab)**

In [1]:
# click on the URL link presented to you by this command, get your authorization code from Google, then paste it into the input box and hit 'enter' to complete mounting of the drive
from google.colab import drive  
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


#**1. Configure Environment and Load Data Files**

In [2]:
# python libraries/modules used throughout this notebook (with some holdovers from other, similar notebooks)
'''
NOTE: selecting a group of code lines and pressing ctrl-/ will toggle commenting of the code lines, for fast and easy disabling/enabling of stuff
'''

# pandas data(database) storage, EDA, and manipulation
import pandas as pd
### pandas formatting
### Adjust as per your preferences.  Here's what I find works well when using a FHD monitor with a full-screen browser window containing my IPynb notebook:
# pd.set_option("display.max_rows",100)     # Override pandas choice of how many rows to show, so we can see the full 84-row item_category df instead of '...' in the middle
# pd.set_option("display.max_columns",30)   # Similar to row code above, we can show more columns than default
# pd.set_option("display.width", 250)       # Tune this to our monitor window size to avoid horiz scroll bars in output windows (but, the drawback is that we will get output text wrapping)
# pd.set_option("max_colwidth", None)       # This is done, for example, so we can see full item name and not '...' in the middle
### Here's what I find works well for this particular IPynb, when using a FHD laptop monitor with a full-screen browser window containing my IPynb notebook:
pd.set_option("display.max_rows",120)     # Override pandas choice of how many rows to show, so, for example, we can see the full 84-row item_category dataframe instead of the first few rows, then ...., then the last few rows
pd.set_option("display.max_columns",26)   # Similar to row code above, we can show more columns than default  
pd.set_option("display.width", 230)       # Tune this to our monitor window size to avoid horiz scroll bars in output windows (but, the drawback is that we will get output text wrapping)
pd.set_option("max_colwidth", None)       # This is done, for example, so we can see full item name and not '...' in the middle

# pd.set_option("display.precision", 3)  # Nah, this is helpful, but below is even better
# Try to convince pandas to print without decimal places if a number is actually an integer (helps keep column width down, and highlights data types), or with precision = 3 decimals if a float
pd.options.display.float_format = lambda x : '{:.0f}'.format(x) if round(x,0) == x else '{:,.3f}'.format(x)

# Pandas additional enhancements
pd.set_option('compute.use_bottleneck', False)  # speed up operation when using NaNs
pd.set_option('compute.use_numexpr', False)     # speed up boolean operations, large dataframes; DataFrame.query() and pandas.eval() will evaluate the subexpressions that can be evaluated by numexpr


# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# ipynb magic command to allow interactive matplotlib graphics in ipynb notebook
%matplotlib inline  
# a useful reference of contrasting color choices to use when plotting multiple things on a single axis
pltcolors = ['blue','red','green','black','darkorange','fuchsia','teal','gold','violet','olive','firebrick','gray','cyan','sienna','dodgerblue','lime','darkorchid','deeppink','turquoise','tan']
from matplotlib.ticker import MultipleLocator, FormatStrFormatter, AutoMinorLocator
from IPython.display import Javascript      # used to properly code the creation of sns heatmaps in IPynb with Google Colab
from IPython.display import display_html    # used to print out side-by-side dataframes, for example

# computations
import numpy as np
from scipy import sparse
from numba import jit, njit, prange  # speedup for appropriate functions and datatypes (no sets, lists, dictionaries, string functions; use np arrays rather than pandas series or dataframes)
#  If you want Numba to throw an error if it cannot compile a function in a way that speeds up your code, pass the argument nopython=True (e.g. @jit(nopython=True))
from numba import vectorize  # speed up row-wise operations like .apply() --> https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html
# can also do np.vectorize (see pandas 1.0.3 documentation section on enhancing speed of pandas operations)

# file operations
import os
import feather   # this is 3x to 8x faster than pd.read_csv and pd.to_hdf, but file size is 2x hdf and 10x csv.gz
import pickle
import json
from urllib.parse import urlunparse
from pathlib import Path

# misc. python enhancements
# note: for a quick look at what's available for magic commands in this ipynb, enter this into a code cell: '%quickref'
import re
import string
from itertools import product
from collections import OrderedDict
import time
import datetime
from time import sleep, localtime, strftime, tzset, strptime
os.environ['TZ'] = 'EST+05EDT,M4.1.0,M10.5.0'   # allows user to simply print a formatted version of the local date and time; helps keep track of what cells were run, and when
tzset()


# Specialized packages
# -- for network analysis / graphs / clustering (a reasonable alternative to pca, tSNE, or Knn clustering when number of dimensions is huge)
import networkx as nx
from networkx.algorithms import community, cluster
# -- NLP packages ... for now, as of 5/29/20, only using the lemmatizer, due to the 
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

# ML packages
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# !pip install catboost
# from catboost import CatBoostRegressor
# %tensorflow_version 2.x
# import tensorflow as tf
# import keras as K

# # List of the modules we need to version-track for reference
modules = ['pandas','matplotlib','numpy','scipy','numba','seaborn','sklearn','tensorflow','keras','catboost','pip','nltk','networkx']
print(f'done: {strftime("%a %X %x")}')

  import pandas.util.testing as tm


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
done: Thu 08:27:52 06/18/20


In [3]:
#  Except for fast-loading (large filesize) feather format files, 
#   the data is coming from a public repo on GitHub at github.com/migai/Kag that has been synced to my local repo on Google Drive

'''
############################################################
############################################################
'''
# Replace this path with the path on *your* Google Drive where the repo master branch is stored
#   (on GitHub, the remote repo is located at github.com/migai/Kag --> below is my cloned repo location)
GDRIVE_REPO_PATH = "/content/drive/My Drive/Colab Notebooks/NRUHSE_2_Kaggle_Coursera/final/Kag"
OUT_OF_REPO_PATH = "/content/drive/My Drive/Colab Notebooks/NRUHSE_2_Kaggle_Coursera/final"   # place > 100MB files here, because they won't sync with GitHub

traintest_loaded = True   # set this to True if you plan to load the .ftr or the .csv.gz version of the traintest dataframe, and skip the calculations below that generated it
ftr_file_load_employed = True #False #True  # set to True if you wish to load the .ftr version or the .csv.gz version... it's faster, but its a 10x larger file, and won't work in the GitHub repo push


# if using large feather file for fast loading, use the routine here
#   note that this is too large to push to GitHub, so if you want to go this route, 
#   you'll first have to load (more slowly) the 'data_output/traintest.csv.gz' file 
#   with pandas read_csv, and then store the file as feather type (outside your local GitHub repo)
#   Or, you can just recreate the dataframe by running the first few code cells that do merging and data manipulation
# load feather files manually for now
if (traintest_loaded and ftr_file_load_employed):
    print('ftr files source directory: ', end='')
    %cd "{OUT_OF_REPO_PATH}"
    traintest = pd.read_feather('traintest.ftr', columns=None, use_threads=True)
    print("Loading ftr Files from Google Drive (outside repo) into Colab... \n\nData Frame: traintest (from ftr)")
    print(traintest.head(2))

'''
############################################################
############################################################
'''

data_files = []
# List of the data files (path relative to GitHub master), to be loaded into pandas DataFrames
if (traintest_loaded and not ftr_file_load_employed):
    data_files = [ "data_output/traintest.csv.gz" ]
                
data_files += [  #"readonly/final_project_data/shops.csv",
                #"data_output/shops_transl.csv",
                "data_output/shops_augmented.csv",
                "data_output/shops_new.csv",
               
                #"readonly/final_project_data/items.csv",
                #"data_output/items_transl.csv",
                "data_output/items_augmented.csv",
                #"data_output/items_new.csv",
                #"data_output/items_clustered_22170.csv.gz",
               
                #"readonly/final_project_data/item_categories.csv",
                #"data_output/item_categories_transl.csv",
                "data_output/item_categories_augmented.csv",
                #"readonly/en_50k.csv",
               
                "readonly/final_project_data/sales_train.csv.gz",
                #"data_output/sales_train_cleaned.csv.gz",
               
                #"readonly/final_project_data/sample_submission.csv.gz",
                "readonly/final_project_data/test.csv.gz"
                ]


# Dict of helper code files, to be loaded and imported {filepath : import_as}
code_files = {}  # not used at this time; example dict = {"helper_code/kaggle_utils_at_mg.py" : "kag_utils"}


# GitHub file location info
git_hub_url = "https://raw.githubusercontent.com/migai"
repo_name = 'Kag'
branch_name = 'master'
base_url = os.path.join(git_hub_url, repo_name, branch_name)

if data_files:
    print('\n\ncsv files source directory: ', end='')
    %cd "{GDRIVE_REPO_PATH}"

    print("\nLoading csv Files from Google Drive repo into Colab...\n")

    # Loop to load the data files into appropriately-named pandas DataFrames
    for path_name in data_files:
        filename = path_name.rsplit("/")[-1]
        data_frame_name = filename.split(".")[0]
        exec(data_frame_name + " = pd.read_csv(path_name)")
        # if data_frame_name == 'sales_train':
        #     sales_train['date'] = pd.to_datetime(sales_train['date'], format = '%d.%m.%Y')
        print(f'DataFrame {data_frame_name}, shape = {eval(data_frame_name).shape} :')
        print(eval(data_frame_name).head(2))
        print("\n")
else: 
    %cd "{GDRIVE_REPO_PATH}"
    
print(f'\nDataFrame Loading Complete: {strftime("%a %X %x")}\n')

ftr files source directory: /content/drive/My Drive/Colab Notebooks/NRUHSE_2_Kaggle_Coursera/final
Loading ftr Files from Google Drive (outside repo) into Colab... 

Data Frame: traintest (from ftr)
   day  DoW  DoM  week  qtr  season  month  price  sales  shop_id  item_id                                  item_name  it_test  item_category_id   item_category_name  it_cat_test item_cat3 item_cat4         shop_name sh_cat  \
0    0  Tue    1     0    0       2      0     99      1        2      991    3d action puzzle dinosaur tyrannosaurus    False                67  Gifts - Development         True     Gifts     Gifts  Adygea TC "Mega"   Mega   
1    0  Tue    1     0    0       2      0   2599      1        2     1472  assassin creed 3 xbox 360 russian version    False                23     Games - XBOX 360         True     Games      Xbox  Adygea TC "Mega"   Mega   

   sh_test district    city  
0     True    South  Adygea  
1     True    South  Adygea  


csv files source directory:

#**2. Merge data sets and create day, week, quarter, and season feature columns**

In [4]:
def clean_merge_augment(day0 = datetime.datetime(2013,1,1),
                        delete_rows = [2909818,2909401,2326930,2257299,1163158,484683],
                        merge_shops = {0: 57, 1: 58, 11: 10},
                        delete_shops = []):
    """
    Parameters:
    day0 = datetime.datetime object representing the day you wish to use as your reference when creating time-based features
    delete_rows = list of integer row numbers that you wish to delete from the sales_train data set
    merge_shops = dictionary of integer shop_id key:value pairs where shop(=key) is merged into shop(=value)
    delete_shops = list of integer shop_id numbers that you wish to fully delete from the sales_train data set
        # looks like it could be safe to delete these shops from sales_train: [8, 13, 23, 32, 33, 40]
        # should probably delete categories 8, 80 (= 'tickets') and probably 81,82 (= 'net carriers')

    Global Variables: this function assumes you have the following pandas dataframes available globally:
    1) unaltered sales_train
    2) unaltered test
    3) items_augmented (contains 'item_id', 'item_tested', 'item_category_id', and 'orig_eng_name_transl')
    4) item_categories_augmented ('item_category_id','en_cat_name','item_cat_tested','item_category3','item_category4')
    5) shops_augmented ('shop_id', 'en_shop_name', 'shop_city', 'shop_federal_district',  'shop_city_population',  'shop_tested')
    6) shops_new ('shop_id', 'shop_type', 'fd_popdens',  'fd_gdp')

    This function does the following:
    1) cleans (deletes) outlier rows from the training set that appear to be erroneous or irrelevant entries
    2) merge 3 shops into other shops where it appears that the sales_train set simply has different names for the 
        same shop at different time periods (shop 0 absorbed by 57; shop 1 absorbed by 58, shop 11 absorbed by 10)
    3) optionally delete shops entirely from the sales_train data set (e.g., for irrelevant shops)
    4) append the test set rows to the sales_train rows, using a date of November 1, 2015 for test
    5) adjust the 'date' column on the merged dataset to be in datetime format, so it looks like a string of format: 'YYYY-M-D'
    6) merge the aforementioned data sets into the merged sales_train + test DataFrame

    Then, creates and inserts new time-based feature columns as follows:
    Given a dataframe with a 'date' column containing strings like '2015-10-30', create new time-series columns:
    1. 'day'    = integer value of day number, starting at day = 0 for parameter day0, and incrementing by calendar day number (not by transaction day number)... 
                    Thus, 'day' may not include all possible integers from start to finish.  It only assigns integer values (based on the calendar) to days when 
                    there are transactions in the input dataframe --> if the input dataframe has no transactions on a particular day, that day's 'calendar' integer 
                    value will not be present in the column (will be = 0)
    2. 'DoW'    = day of week = 3-character text string of weekday by name (Sun, Mon, ...)
    3. 'DoM'    = day of month = 1-31
    4. 'week'   = integer value of week number, with week = 0 at time= parameter day0.  However, unlike 'day', the 'week' number is aligned not to start at day0, but rather
                    so that there is a full 'week' of 7 days that ends on Oct. 31, 2015 (the final day of training data).  This results in week = 0 having only 5 days in it.
                    n.b., the final week of October, 2015 is assigned 'week' number = 147.  Artifically assigning test to Nov. 1, 2015 results in test week = 148
    5. 'month'  = renamed from "date_block_num" of original data set (no changes).  Integer values from 0 to 33 represent the months starting at day0.  Test month=34 is Nov. 2015.
    6. 'qtr'    = quarter = integer number of 3-month chunks of time, aligned with the end of October, 2015.  day0 is included in 'qtr' = 0, but 'qtr'=0 only contains 1 month (Jan 2013) of data due to the alignment
                    The months of August, Sept, Oct 2015 form 'qtr' = 11.  "qtr" in this sense is just 3-month chunks... it is not the traditional Q1,Q2,Q3,Q4 beginning Jan 1, but instead is more like
                    date_block_num in that it is monotonically increasing integers, incremented every 3 months such that #11 ends at the end of our training data
    7. 'season' = integer number of 3-month chunks of time, reset each year (allowed values = 0,1,2,3)... not quite the same as spring-summer-winter-fall, or Q1,Q2,Q3,Q4, but instead shifted to 
                    better capture seasonal spending trends aligned in particular with high December spending
                    2 = Dec 1 to Feb 28 (biggest spending season), 3 = Mar 1 to May 31, 0 = June 1 to Aug 30 (lowest spending season), 1 = Sept 1 to Nov 30

    Finally, drop the date column from the dataframe, and sort the dataframe by ['day','shop_id','item_id']  (original dataframe seems to be sorted by month, but unsorted within each month)

    returns: the cleaned/dated/merged/feature-augmented DataFrame
    """

    print(f'Shape of original sales_train data set = {sales_train.shape}')

    # remove outlier rows from training set (first make a DataFrame copy so we can reuse sales_train later, if we need to)
    sales_train_cleaned = sales_train.copy(deep=True)
    print('Rows being deleted:')
    for i in sorted(delete_rows, reverse=True):   # delete the rows in reverse order to be sure we don't run into issues with indexing
        print(f'  {i}')
        sales_train_cleaned.drop(sales_train_cleaned.index[i],inplace=True)
    print(f'Shape of sales_train_cleaned after {len(delete_rows)} outlier rows were removed: {sales_train_cleaned.shape}')
    
    # Merge the 3 shops we are nearly certain must correctly fit into the other shops' dropout regions:
    sales_train_cleaned.shop_id = sales_train_cleaned.shop_id.replace(merge_shops)
    print(f'Shape of sales_train_cleaned after merging shops as in {merge_shops}: {sales_train_cleaned.shape}')

    # Remove irrelevant shops entirely from the sales_train_cleaned DataFrame:
    if delete_shops:
        sales_train_cleaned = sales_train_cleaned.query('shop_id != @delete_shops')
        print(f'Shape of sales_train_cleaned after deleting shops {delete_shops}: {sales_train_cleaned.shape}')

    # sales_train_cleaned = sales_train_cleaned[sales_train_cleaned.shop_id != 9]
    # sales_train_cleaned = sales_train_cleaned[sales_train_cleaned.shop_id != 13]
    # print(f'Shape of sales_train_cleaned after removal of shops: {sales_train_cleaned.shape})
    # print(f'{sales_train_cleaned.shop_id.nunique()} shops remaining in sales_train_cleaned DataFrame: {sorted(sales_train_cleaned.shop_id.unique())})

    sales_train_cleaned = sales_train_cleaned.astype({'date_block_num':np.int8,'shop_id':np.int8,'item_id':np.int16,
                                                    'item_price':np.float32,'item_cnt_day':np.int16}).reset_index(drop=True)

    # merge dataframes so we optionally include test elements in our EDA and feature generation
    test_prep = test.copy(deep=True)
    test_prep['date_block_num'] = 34
    test_prep['date'] = '1.11.2015' #pd.Timestamp(year=2015, month=11, day=1)
    sales_traintest_cleaned = sales_train_cleaned.append(test_prep).fillna(0)

    traintest = sales_traintest_cleaned.merge(items_augmented[['item_id','item_category_id','item_tested','item_name']],on='item_id',how='left').reset_index(drop=True)
    traintest = traintest.merge(item_categories_augmented[['item_category_id','en_cat_name','item_cat_tested','item_category3','item_category4']],on='item_category_id',how='left').reset_index(drop=True)
    traintest = traintest.merge(shops_augmented[['shop_id', 'en_shop_name', 'shop_city', 'shop_federal_district', 'shop_tested']], on='shop_id',how='left').reset_index(drop=True)  
    traintest = traintest.merge(shops_new[['shop_id', 'shop_type']], on='shop_id',how='left').reset_index(drop=True)
    traintest = traintest[['date', 'date_block_num', 'item_price', 'item_cnt_day', 'shop_id', 'item_id', 'item_name', 'item_tested', 'item_category_id', 'en_cat_name', 'item_cat_tested',
                                'item_category3', 'item_category4', 'en_shop_name', 'shop_type','shop_tested', 'shop_federal_district', 'shop_city']]
    traintest.columns = ['date', 'month', 'price', 'sales', 'shop_id', 'item_id', 'item_name', 'it_test', 'item_category_id', 'item_category_name', 'it_cat_test', 'item_cat3', 'item_cat4', 
                            'shop_name', 'sh_cat', 'sh_test', 'district', 'city']
    traintest.item_name.astype(str)
    print(f'Shape of traintest after merging: {traintest.shape}')
        
    # Add in the time-based feature columns
    traintest.date =  pd.to_datetime(traintest.date, dayfirst=True, infer_datetime_format=True)
    traintest.insert(1,'day', traintest.date.apply(lambda x: (x - day0).days))
    traintest.insert(2,'DoW', traintest.date.apply(lambda x: x.strftime('%a')))  # lambda x: (x.weekday()+1)%7 )  # 0=Sun, 1=Mon, ... 6= Sat   # use x.strftime('%A') to get full text string of day (Sunday, Monday, ...)
    traintest.insert(3,'DoM', traintest.date.apply(lambda x: x.day))
    traintest.insert(4,'week', (traintest.day+2) // 7 )             # add the 2 days so we have end of a week coinciding with end of training data Oct. 31, 2015
    traintest.insert(5,'qtr', (traintest.month + 2) // 3 )          # add the 2 months so we have end of a quarter aligning with end of training data Oct. 31, 2015
    traintest.insert(6,'season', (traintest.month + 2) % 4 ) 
    traintest.drop('date',axis=1,inplace=True)
    traintest = traintest.sort_values(['day','shop_id','item_id']).reset_index(drop=True)  # note that the train dataset is sorted by month, but nothing obvious within the month; we sort it here for consistent results in calculations below
    print(f'Shape of traintest after creating time-based feature columns: {traintest.shape}')
    print(f'traintest DataFrame creation done: {strftime("%a %X %x")}\n')
    return traintest

print(f'\nDone: {strftime("%a %X %x")}\n')


Done: Thu 08:28:04 06/18/20



In [5]:
if not traintest_loaded:
    print(f'traintest dataframe creation started: {strftime("%a %X %x")}\n')
    traintest = clean_merge_augment()

    # optional save file as feather type (big file; don't store inside repo) and/or csv.gz type (inside repo)
    %cd "{OUT_OF_REPO_PATH}"
    traintest.to_feather('traintest.ftr')
    print("traintest.ftr feather file stored on google drive, outside repo")
    %cd "{GDRIVE_REPO_PATH}"
    # alternative, or, in addition, can save as csv.gz for < 100 MB storage and sync with GitHub
    compression_opts = dict(method='gzip',
                            archive_name='traintest.csv')  
    traintest.to_csv('data_output/traintest.csv.gz', index=False, compression=compression_opts)
    print("traintest.csv.gz file stored on google drive in data_output directory")
    print(f'traintest file save done: {strftime("%a %X %x")}')

display(traintest[traintest.week == 102].tail(2))

# Copy in case we screw up tt; don't want to recreate traintest
tt = traintest.copy(deep=True)

print(f'\ntraintest done: {strftime("%a %X %x")}')

Unnamed: 0,day,DoW,DoM,week,qtr,season,month,price,sales,shop_id,item_id,item_name,it_test,item_category_id,item_category_name,it_cat_test,item_cat3,item_cat4,shop_name,sh_cat,sh_test,district,city
2257039,718,Sat,20,102,8,1,23,399,1,59,21970,shar predictor soccer ball,False,69,Gifts - Souvenirs,True,Gifts,Gifts,"Yaroslavl shopping center ""Altair""",SEC,True,Central,Yaroslavl
2257040,718,Sat,20,102,8,1,23,499,1,59,22060,epic bluray dvd,True,37,Movie - Blu-Ray,True,Movies,Movies,"Yaroslavl shopping center ""Altair""",SEC,True,Central,Yaroslavl



traintest done: Thu 08:28:04 06/18/20


#2.5) ***items*** Dataset: EDA, Cleaning, Correlations, and Feature Generation

---



---



###2.5.1) Initial data exploration and Russian -> English translation

####Thoughts regarding items dataframe
Let's first look at how many training examples we have to work with...

Many of the items have similar names, but slightly different punctuation, or only very slightly different version numbers or types.  (e.g., 'Call of Duty III' vs. 'Call of Duty III DVD')

One can expect that these two items would have similar sales in general, and by grouping them into a single feature category, we can eliminate some of the overfitting that might come as a result of the relatively small ratio of (training set shop-item-date combinations = 2935849)/(total number of unique items = 22170).  (This is an average of about 132 rows in the sales_train data for each shop-item-date combination that we are using to train our model.  Our task is to produce a monthly estimate of sales (for November 2015), so it is relevant to consider training our model based on how many sales in a month vs. how many sales in the entire training set.  Given that the sales_train dataset covers the time period from January 2013 to October 2015 (34 months), we have on average fewer than 4 shop-item combinations in our training set for a given item in any given month.  Furthermore, as we are trying to predict for a particular month (*November* 2015), it is relevant to consider how many rows in our training set occur in the month of November.  The sales_train dataset contains data for two 'November' months out of the total 34 months of data.  Another simple calculation gives us an estimate that our training set contains on average 0.23 shop-item combinations per item for November months.

To summarize:

*  *sales_train* contains 34 months of data, including 2935849 shop-item-date combinations
*  *items* contains 22170 "unique" item_id values

In the *sales_train* data, we therefore have:
*  on average, 132 rows with a given shop-item pair for a given item_id
*  on average, 4 rows with a given shop-item pair for a given item_id in a given month
*  on average, 0.23 rows with a given shop-item pair for a given item_id in all months named 'November'

If we wish to improve our model predictions for the following month of November, it behooves us to use monthly grouping of sales, or, even better, November grouping of sales.  This smooths out day-to-day variations in sales for a better monthly prediction.  However, the sparse number of available rows in the *sales_train* data will contribute to inaccuracy in our model training and predictions.

Imagine if we could reduce the number of item_id values from 22170 to perhaps half that or even less.  Given that the number of rows for training (per item, on a monthly or a November basis) is so small, then such a reduction in the number of item_id values would have a big impact.  (The same is true for creating features to supplement "shop_id" so as to group and reduce the individuality of each shop - and thus effectively create, on average, more rows of training data for each shop-item pair.

####Translate and Ruminate
We will start by translating the Russian text in the dataframe, and add our ruminations on possible new features we can generate.

The dataframe *items_transl* (equivalent to *items* plus a column for English translation) is saved as a .csv file so we do not have to repeat the translation process the next time we open a Google Colab runtime.

In [6]:
print(items_augmented.info())
print("\n")
print(items_augmented.tail(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   item_id               22170 non-null  int64 
 1   item_name             22169 non-null  object
 2   item_tested           22170 non-null  bool  
 3   item_category_id      22170 non-null  int64 
 4   orig_eng_name_transl  22170 non-null  object
dtypes: bool(1), int64(2), object(2)
memory usage: 714.6+ KB
None


       item_id                                            item_name  item_tested  item_category_id                                   orig_eng_name_transl
22160    22160                                   vanity fair region        False                40                                   Vanity Fair (Region)
22161    22161           yaroslav thousand of year ago e bluray dvd        False                37                YAROSLAV. Thousands of years ago e (BD)
22162    221

###2.5.2) **NLP for feature generation from items data**
Automate the search for commonality among items, and create new categorical feature to prevent overfitting from close similarity between many item names

####**Delimited Groups of Words**

Investigating "special" delimited word groups (like this) or [here] or /hobbitville/ that are present in item names, and may be particularly important in creating n>1 n-grams for uniquely identifying items so that we can tell if two items are the same or nearly the same

#####Some details on the approach, and code for helper functions to clean and separate the text:

In [7]:
# explanation of regex string I'm using to parse the item_name
'''

^\s+|\s*[,\"\/\(\)\[\]]+\s*|\s+$

gm
1st Alternative ^\s+
^ asserts position at start of a line
\s+ matches any whitespace character (equal to [\r\n\t\f\v ])
+ Quantifier — Matches between one and unlimited times, as many times as possible, giving back as needed (greedy)

2nd Alternative \s*[,\"\/\(\)\[\]]+\s*
\s* matches any whitespace character (equal to [\r\n\t\f\v ])
* Quantifier — Matches between zero and unlimited times, as many times as possible, giving back as needed (greedy)
Match a single character present in the list below [,\"\/\(\)\[\]]+
+ Quantifier — Matches between one and unlimited times, as many times as possible, giving back as needed (greedy)
, matches the character , literally (case sensitive)
\" matches the character " literally (case sensitive)
\/ matches the character / literally (case sensitive)
\( matches the character ( literally (case sensitive)
\) matches the character ) literally (case sensitive)
\[ matches the character [ literally (case sensitive)
\] matches the character ] literally (case sensitive)
\s* matches any whitespace character (equal to [\r\n\t\f\v ])
* Quantifier — Matches between zero and unlimited times, as many times as possible, giving back as needed (greedy)

3rd Alternative \s+$
\s+ matches any whitespace character (equal to [\r\n\t\f\v ])
+ Quantifier — Matches between one and unlimited times, as many times as possible, giving back as needed (greedy)
$ asserts position at the end of a line

Global pattern flags
g modifier: global. All matches (don't return after first match)
m modifier: multi line. Causes ^ and $ to match the begin/end of each line (not only begin/end of string)
'''
print(f'done: {strftime("%a %X %x")}')  # prevent Jupyter from printing triple-quoted comments

done: Thu 08:28:04 06/18/20


In [8]:
# This cell contains no code to run; it is simply a record of some inspections that were done on the items database

# before removing undesirable characters / punctuation from the item name,
#   let's see if we can find n-grams or useful describers or common abbreviations by looking between the nasty characters
# first, let's see what characters are present in the en_item_name column
'''
nasty_symbols = re.compile('[^0-9a-zA-Z ]')
nasties = set()
for i in range(len(items_transl)):
  n = nasty_symbols.findall(items_transl.at[i,'en_item_name'])
  nasties = nasties.union(set(n))
print(nasties)
{'[', '\u200b', 'ñ', '(', ')', '.', 'à', '`', 'ó', '®', 'Á', 
'\\', 'è', '&', '-', ':', 'ë', '_', 'û', '»', '=', '+', ']', ',', 
'«', 'ú', "'", 'ö', '#', 'ä', ';', 'ü', '"', 'ô', '/', '№', 'é', 
'í', '!', '°', 'å', '*', 'ĭ', 'ð', '?', 'â'}
'''
# From the above set of nasty characters, it looks like slashes, single quotes, double quotes, parentheses, and square brackets might enclose relevant n-grams
# Let's pull everything from en_item_name that is inside ' ', " ", (), or [] and see how many unique values we get, and if they are n-grams or abbreviations, for example
# It also seems that many of the item names end in a single character "D" for example, which should be converted to DVD

# ignore the :&+' stuff for now...
# Let's set up columns for ()[]-grams, for last string in the name, and for first string in name, and for text that precedes ":", and for text that surrounds "&" or "+"
#   but first, we will strip out every nasty character except ()[]:&+'"/ and replace the nasties with spaces, then eliminating double spaces

'''
# sanity check:
really_nasty_symbols = re.compile('[^0-9a-zA-Z \(\)\[\]:&+\'"/]')
really_nasties = set()
for i in range(len(items_transl)):
  rn = really_nasty_symbols.findall(items_transl.at[i,'en_item_name'])
  really_nasties = really_nasties.union(set(rn))
print(really_nasties)
{'\u200b', 'ñ', '.', 'à', '`', 'ó', '®', 'Á', '\\', 'è', '-', 'ë', '_', 'û', '»', '=', ',', '«', 'ú', 'ö', '#', 'ä', ';', 'ü', 'ô', '№', 'é', 'í', '!', '°', 'å', '*', 'ĭ', 'ð', '?', 'â'}
OK, looks good
'''
print(f'done: {strftime("%a %X %x")}')  # prevent Jupyter from printing triple-quoted comments

done: Thu 08:28:04 06/18/20


In [9]:
#  Start by defining stopwords and delimiters and punctuation that we wish to remove
#  Then, create a couple of functions to use for text cleaning, and for extracting delimited text n-grams

# stopwords to remove from item names (these are only a bit better than arbitrary selections from large stopwords lists -- may be worth adjusting them)
stop_words = "a,the,an,only,more,are,any,on,your,just,it,its,has,with,for,by,from".split(",")

# pre-compile regex strings to use for fast symbol removal or delimiting
nasty_symbols_re = re.compile(r'[^0-9a-zA-Z ]')  # remove all punctuation
really_nasty_symbols_re = re.compile(r'[^0-9a-zA-Z ,;\"\/\(\)\[\]\:\-\@]')  # remove nasties, but leave behind the delimiters
delimiters_re = re.compile(r'[,;\"\/\(\)\[\]\:\-\@\u00AB\u00BB~<>]')  # unicodes are << and >> thingies
# special symbols indicating a delimiter --> a space at start or end of item name will be removed at split time, along with ,;/()[]:"-@~<<>><>
delim_pattern_re = re.compile(r'^\s+|\s*[,;\"\/\(\)\[\]\:\-\@\u00AB\u00BB~<>]+\s*|\s+$') 
multiple_whitespace_re = re.compile(r'[ ]{2,}')

# pre-compile some specific regex strings to deal with inconsistencies in item names (more of this will be done later, after delimiting)
cleanup_text = {}
cleanup_text['preorder'] = re.compile(r'pre.?order')
cleanup_text[' dvd'] = re.compile(r'\s+d$')  #several item names end in "d" -- which actually seems to indicate dvd (because the items I see are in category 40: Movies-DVD)... standardize so d --> dvd
cleanup_text['digital version'] = re.compile(r'digital in$') # several items seem to end in "digital in"... maybe in = internet?, but looking at nearby items/categories, 'digital version' looks standard
cleanup_text['bluray dvd'] = re.compile(r'\bbd\b|\bblu\s+ray\b|\bblu\-ray\b|\bblueray\b|\bblue\s+ray\b|\bblue\-ray\b')
cleanup_text['007 : james bond : skyfall'] = re.compile(r'\bskyfall\b|\bskayfoll\b')
cleanup_text[' and '] = re.compile(r'[\&\+]')
cleanup_text[' xbox'] = re.compile(r'\bx[^0-9a-zA-Z ]box')  # anything like "x box" or "x-box" or "x%box" gets converted to a standard "xbox"
cleanup_text[' ps'] = re.compile(r'\bp[^0-9a-zA-Z ]s')      # attempt to do the same with "p-s4" --> "ps4"
cleanup_text['sim city'] = re.compile(r'\bsim(s)?(\s)?(city)?\b')
cleanup_text['watchdog'] = re.compile(r'\bwatch(\s)?dog\b')
cleanup_text['bloodborne'] = re.compile(r'\bbloodborn(e)?\b')
cleanup_text['plant vs zombie'] = re.compile(r'\bplant\b.*zombie\b')
cleanup_text['tom clancy'] = re.compile(r'\btom clancy(s)?\b')
cleanup_text['pirate caribbean'] = re.compile(r'\bpirate\b.*car\w*\b')
cleanup_text['one of you'] = re.compile(r'\bone of (yo)?u\b')
cleanup_text['titanfall'] = re.compile(r'\btitan(\s)?fall\b')

def maid_service(text):
    """
    Compact routine to implement multiple regex substitutions using the above 'cleanup_text' dictionary
    """
    text = text.lower()
    for repl_text, pattern in cleanup_text.items():
        text = pattern.sub(repl_text, text)
    #r = re.compile(r'\bskayfoll\b')   # can add 'quickie' items here if you don't want to add to above dictionary, or if you want to perform something other than re.sub
    #text = r.sub('skyfall',text)  
    return text

def text_total_clean(text):
    """
    Gives a punctuation-free, cleaned, lemmatized version of the original English translation
    inputs: (text): the original en_item_name single-string, uncleaned, translated version of the Russian item name
    returns: single-string text, made lowercase, stripped of "really_nasties" and multiple spaces, and every word lemmatized
    """
    text = maid_service(text)
    text = delimiters_re.sub(" ", text)  # replace all delimiters with a space; other nasties get simply deleted
    text = nasty_symbols_re.sub("", text)  # delete anything other than letters, numbers, and spaces
    text = multiple_whitespace_re.sub(" ", text)  # replace multiple spaces with a single space
    text = text.strip() # remove whitespace around string
    # lemmatize each word
    text = " ".join([lemmatizer.lemmatize(w) for w in text.split(" ") if w not in stop_words])
    text = maid_service(text)
    return text

def text_clean_delimited(text):
    """
    Gives a punctuation-free, cleaned version of the original English translation, 
        but the function returns a list of strings instead of a single string,
        with each element in the list corresponding to text that was separated from neighboring
        text with one of the above-defined 'delimiter' characters
        (so, rather than analyzing the full item name for n-grams, we define an item's important
        n-grams as being separated by such delimiters.  It greatly reduces the number of n-grams we need to analyze)
    inputs: (text): the original en_item_name single-string, uncleaned, translated version of the Russian item name
    returns: en_item_name made lowercase, stripped of "really_nasties" and multiple spaces, 
        in a list of strings that had been separated by one of the above 'delimiters',
        and, with every word in every string lemmatized 
    """
    text = maid_service(text)
    text = really_nasty_symbols_re.sub("", text)  # just delete the nasty symbols
    text = multiple_whitespace_re.sub(" ", text)  # replace multiple spaces with a single space
    t = delim_pattern_re.split(text)           # split item_name at all delimiters, irrespective of number of spaces before or after the string or delimiter
    text = []
    for i in t:
        text.append(maid_service(i))
    text = [x.strip() for x in text if x != ""]           # remove empty strings "" from the list of split items in text, and remove whitespace outside text n-gram
    # lemmatize each word
    lemtext = []
    for ngram in text:
        lemtext.append(" ".join([lemmatizer.lemmatize(w) for w in ngram.split(" ") if w not in stop_words]))
    return lemtext

print(f'done: {strftime("%a %X %x")}')

done: Thu 08:28:04 06/18/20


#####Add 'delimited' and 'cleaned' data columns; shorten the titles of other columns so dataframe fits better on the screen

In [10]:
items_delimited = items_augmented[['item_id','orig_eng_name_transl','item_tested','item_category_id']].copy(deep=True)
# delete the wide "item_name" column so we can read more of the data table width-wise
items_delimited = items_delimited.rename(columns = {'orig_eng_name_transl':'item_name','item_category_id':'i_cat_id','item_tested':'i_tested'})
items_in_test_set = test.item_id.unique()
# items_delimited["i_tested"] = False
# for i in items_in_test_set:
#   items_delimited.at[i,"i_tested"] = True


# add item_category name with delimiter to the item_name, as this will be useful info for grouping similar items (remove delimiting punctuation from cat names first, so it stays as one chunk of text)
items_delimited['item_name'] = items_delimited.apply(lambda x: text_total_clean(item_categories_augmented.at[x.i_cat_id,'en_cat_name']) + " : " + x.item_name, axis=1)

# add a column of simply cleaned text without any undesired punctuation or delimiters
items_delimited['clean_item_name'] = items_delimited['item_name'].apply(text_total_clean)

# now add a column of lists of delimited (cleaned) text
items_delimited['delim_name_list'] = items_delimited['item_name'].apply(text_clean_delimited)

# remove duplicate entries and single-character 1-grams to assist with operations to come later in this notebook
alphnum = list(string.ascii_lowercase) + list('1234567890')  # get rid of all length=1 1-grams
def remove_dupes_singles(gramlist):
    unwanted = set(alphnum)
    dupe_gramset = unwanted
    return [x for x in gramlist if x not in dupe_gramset and not dupe_gramset.add(x)]
items_delimited.delim_name_list = items_delimited.delim_name_list.apply(lambda x: remove_dupes_singles(x) )


# have a look at what we got with our delimited text globs
def maxgram(gramlist):
    maxg = 0
    for g in gramlist:
        maxg = max(maxg,len(g.split()))
    return maxg
items_delimited['d_len'] = items_delimited.delim_name_list.apply(lambda x: len(x))
items_delimited['d_maxgram'] = items_delimited.delim_name_list.apply(maxgram)

#items_delimited.to_csv("data_output/items_delimited.csv", index=False)

print(f'done: {strftime("%a %X %x")}')
print("\n")
print(items_delimited.describe())
print("\n")
print(items_delimited.iloc[31][:])
print("\n")
items_delimited.head()

done: Thu 08:28:12 06/18/20


         item_id  i_cat_id  d_len  d_maxgram
count      22170     22170  22170      22170
mean  11,084.500    46.291  3.347      4.497
std    6,400.072    15.941  1.336      1.984
min            0         0      1          2
25%    5,542.250        37      2          3
50%   11,084.500        40      3          4
75%   16,626.750        58      4          5
max        22169        83     13         17


item_id                                                                                                 31
item_name                                              movie bluray dvd : 007: COORDINATES "SKAYFOLL» (BD)
i_tested                                                                                              True
i_cat_id                                                                                                37
clean_item_name       movie bluray dvd 007 coordinate 007 james bond 007 : james bond : skyfall bluray dvd
delim_name_list    [movie bl

Unnamed: 0,item_id,item_name,i_tested,i_cat_id,clean_item_name,delim_name_list,d_len,d_maxgram
0,0,movie dvd : ! POWER IN glamor (PLAST.) D,False,40,movie dvd power in glamor plast dvd,"[movie dvd, power in glamor, plast, dvd]",4,3
1,1,"program home and office digital : ! ABBYY FineReader 12 Professional Edition Full [PC, Digital Version]",False,76,program home and office digital abbyy finereader 12 professional edition full pc digital version,"[program home and office digital, abbyy finereader 12 professional edition full, pc, digital version]",4,6
2,2,movie dvd : *** In the glory (UNV) D,False,40,movie dvd in glory unv dvd,"[movie dvd, in glory, unv, dvd]",4,2
3,3,movie dvd : *** BLUE WAVE (Univ) D,False,40,movie dvd blue wave univ dvd,"[movie dvd, blue wave, univ, dvd]",4,2
4,4,movie dvd : *** BOX (GLASS) D,False,40,movie dvd box glass dvd,"[movie dvd, box, glass, dvd]",4,2


In [11]:
# # adjust items_augmented to remove delimited name list, and instead put a clean item name in the file (but with no prefix of item category)
# # add a column of simply cleaned text without any undesired punctuation or delimiters
# items_augmented.delim_name_list = items_augmented.item_name
# items_delimited.item_name = items_augmented.delim_name_list.apply(text_total_clean)
# items_augmented.columns = ['item_id','item_category_id','item_name','item_tested','orig_eng_name_transl']
# items_augmented = items_augmented[['item_id','item_name','item_tested','item_category_id','orig_eng_name_transl']]
# # items_augmented.to_csv("data_output/items_augmented.csv", index=False)
# items_augmented.head()

In [12]:
# do some more text manipulation to help ensure items are properly grouped
#   also, expand the breadth of n-gram matches to ignore things like version number, in an effort to reduce the final number of clusters that are generated
#   (looking for perhaps 200 clusters instead of 2000+ that we get without this extra treatment (see 'items_nlp_clusters_v3...ipynb' ))

highlight_roots = OrderedDict()
cleanup_sub = OrderedDict()
cleanup_final = OrderedDict()
#cleanup_complete_replace = OrderedDict()

# for some matches, I want to only make a new entry in the list to standardize games to root values (e.g., "assasin creed special ops" = "assasin creed part 2")
#     The new list element will be a 5-gram, for example, to give it substantial weight when grouping items
#     The original list of delimited text will remain the same, so as to catch matches like "assasin creed special ops dvd bluray english version"

# use replacement text = '' if you want the create operation to use the match as a base string (possibly adding to it with fill_strings until n_gram size is reached)
games1 = "adventure of tintin|advanced warfare|army of two|assasin creed|angry bird|batman|battlefield|behind enemy line|black ops|bloodborne|borderland|call of duty|chaggington funny train"
games2 = "child of light|dark soul|dead space|diablo|disney infinity|dragon age|elder scroll|far cry|fifa|final fantasy|fury|game of throne|god of war|gran turismo|grand theft auto|harry potter|hobbit|injustice god|james bond"
games3 = "jurassic|lord of ring|mario|masha and bear|max payne|medal of honor|men in black|metal gear solid|mickey mouse|might and magic|modern warfare|mortal kombat|nba|need speed|nhl|ninja storm|one of you|pirate caribbean"
games4 = "plant vs zombie|pro evolution|resident evil|secret of unicorn|shadow of mordor|sherlock holmes|sid meiers civilization|simcity|skylander|sniper elite|star war|stick of truth|street fighter"
games5 = "tiger wood|titanfall|tom clancy|tomb raider|total war|transformer|walking dead|warhammer|watchdog|witcher|world of warcraft"
popular_games = re.compile(rf'\b({games1}|{games2}|{games3}|{games4}|{games5})\b')
highlight_roots['compress game names to root values'] =     {'optype':['create'], 'reg_pattern':popular_games, 
                                                                'replacement_text':'', 'final_gram_n':5, 
                                                                'fill_strings':['game','computer','electronic','multirelease']}

lego = re.compile(r'\blego\b')
#lego = re.compile(rf'\b(lego.*({games1}|{games2}|{games3}|{games4}|{games5})?|({games1}|{games2}|{games3}|{games4}|{games5}).*lego)\b')
highlight_roots['lego products'] =                          {'optype':['create'], 'reg_pattern':lego,
                                                                'replacement_text':'lego brand lego style game'}

popular_companies = re.compile(rf'\b(1c|abbyy)\b')
highlight_roots['highlight product origins'] =              {'optype':['create'], 'reg_pattern':popular_companies, 
                                                                'replacement_text':'', 'final_gram_n':4, 
                                                                'fill_strings':['educational','software','learning']}

# for some of the matches, I just want to do an inplace substitution
fix_accessory_game = re.compile(r'\baccessory game\b')
cleanup_sub['game accessory'] =         {'optype':['sub'], 'replacement_text':'game accessory', 'reg_pattern':fix_accessory_game}

biz = re.compile(r'\b(firm|enterprise|company|corporation|shop|store|outlet)\b')
cleanup_sub['standardize biz'] =        {'optype':['sub'], 'replacement_text':'business', 'reg_pattern':biz}

digit = re.compile(r'\b(digital|download|online)(\s?(version|edition|set|box set))?\b')
cleanup_sub['special edition'] =        {'optype':['sub'], 'replacement_text':'online digital version', 'reg_pattern':digit}

special = re.compile(r'\b(collector|premier|platinum|special|suite)(.*(version|edition|set|box set|suite))?\b')
cleanup_sub['special edition'] =        {'optype':['sub'], 'replacement_text':'special version', 'reg_pattern':special}

std = re.compile(r'\b(standard|std)(\s?(edition|version|set|box set))?\b')
cleanup_sub['standard edition'] =       {'optype':['sub'], 'replacement_text':'standard version', 'reg_pattern':std}

russia = re.compile(r'\b(russian|ru)(\s?(edition|version|set|box set|documentation|instruction|language|format|subtitle|feature))?\b')
cleanup_sub['russian version'] =        {'optype':['sub'], 'replacement_text':'russian language version', 'reg_pattern':russia}

engl = re.compile(r'\b(english|en|eng|engl)(\s?(edition|version|set|box set|documentation|instruction|language|format|subtitle|feature))?\b')
cleanup_sub['english version'] =        {'optype':['sub'], 'replacement_text':'english language version', 'reg_pattern':engl}


# for other matches, I want to create a new n-gram and insert it into the list, and also do an inplace substitution
#   substitution text is made longer or shorter, depending on rough importance to matching (longer matching n-grams get more weight)
yo = re.compile(r'\b(yo|yoyo|yo yo)\b')
cleanup_final['yo yo yo'] =         {'optype':['sub','create'], 'replacement_text':'yo yo toy game fun', 'reg_pattern':yo}

music = re.compile(r'\b(cd mirex|mirex cd|cd mirex cd|vinyl|cd.*production firm|cd.*local production|mp3)(\s?(cd mirex|mirex cd|cd mirex cd|vinyl|cd.*production firm|cd.*local production|mp3))?\b')
cleanup_final['music media'] =      {'optype':['sub','create'], 'replacement_text':'music media', 'reg_pattern': music}

dvdclean = re.compile(r'\b(\d\s?)?(disc\s?)?(\d\s?)?dvd\b')
cleanup_final['dvd'] =             {'optype':['sub','create'], 'replacement_text':'dvd', 'reg_pattern':dvdclean}

brdvd = re.compile(r'\b(4k\s?)?(\d\s?)?(bluray\s?)?(\d\s?)?(dvd\s?)?(and\s?)?(\d\s?)?(disc\s?)?(4k\s?)?(\d\s?)?bluray(\s?and)?(\s?4k)?(\s?(\d\s?)?dvd)?(\s?4k)?(\s?and)?(\s?(\d\s?)?dvd)?\b|\b2bd\b')
cleanup_final['bluray dvd'] =      {'optype':['sub','create'], 'replacement_text':'bluray dvd', 'reg_pattern':brdvd}

br3d=re.compile(r'\b(\d\s?)?(disc)?\s?(\d\s?)?(dvd)?\s?(and)?\s?(3d\s?(\d\s?)?(dvd)?\s?(\d\s?)?bluray\s?(\d\s?)?(dvd)?|(\d\s?)?bluray\s?(\d\s?)?(dvd)?\s?3d)\s?(\d\s?)?(bluray dvd)?\s?(3d)?\s?(and)?\s?(\d\s?)?(dvd)?\s?(3d)?\b')
cleanup_final['3d bluray dvd'] =   {'optype':['sub','create'], 'replacement_text':'3d bluray dvd', 'reg_pattern':br3d}

macregx = re.compile(r'\b(support\s?)?(mac|ipad|macbook|powerbook|imac|apple)(\s?support)?\b')
cleanup_final['pc'] =              {'optype':['create'], 'replacement_text':'mac computing platform product', 'reg_pattern':macregx}

pcregx = re.compile(r'\b(support\s?)?(pc|windows|microsoft windows)(\s?support)?\b')
cleanup_final['pc'] =              {'optype':['create'], 'replacement_text':'pc computing platform product', 'reg_pattern':pcregx}

playsta = re.compile(r'\b(support\s?)?p(laystation|sp|\s?s|\s?s?\s?(move|2|3|4|pro|vita|vita 1000))\b')
cleanup_final['sony playstn'] =    {'optype':['create'], 'replacement_text':'sony playstation gaming platform', 'reg_pattern':playsta}

xbox = re.compile(r'\bx?\s?box\s?(one|360|live)(.*(kinect|knect))?\b')
cleanup_final['microsoft xbox'] =  {'optype':['create'], 'replacement_text':'microsoft xbox gaming platform','reg_pattern':xbox}

kinect = re.compile(r'\b(support)?\s?m?\s?s?\s?(kinect|knect)\b')
cleanup_final['microsoft knect'] = {'optype':['create'], 'replacement_text':'microsoft xbox gaming platform', 'reg_pattern':kinect}

msoffice = re.compile(r'\b(microsoft office|ms office|m office|office mac|office home|office professional|home and office|office student|office enterprise)\b')
cleanup_final['ms office'] =       {'optype':['create'], 'replacement_text':'microsoft office productivity software', 'reg_pattern':msoffice}

educate = re.compile(r'\b(education|educational|development|course|school|history|lesson|accounting|b8)\b')
cleanup_final['educational dev'] = {'optype':['create'], 'replacement_text':'educational development training lessons', 'reg_pattern':educate}

paycard = re.compile(r'\b(payment|card|ticket|debit)(\s?(card|ticket|debit))?\b')
cleanup_final['payment card'] =    {'optype':['create'], 'replacement_text':'payment card ticket', 'reg_pattern':paycard}

licenses = re.compile(r'\b(subscription|renewal|1 year|extension|license)(.*(subscription|renewal|1 year|extension|license))?(.*(subscription|renewal|1 year|extension|license))?\b')
cleanup_final['licenses'] =        {'optype':['create'], 'replacement_text':'license renewal subscription extension', 'reg_pattern':licenses}

download = re.compile(r'\b(online|digital|download|access|without disc|without disk|epay)(.*(online|digital|download|access|without disc|without disk|epay|version|edition))?\b')
cleanup_final['downloads'] =       {'optype':['create'], 'replacement_text':'online download version', 'reg_pattern':download}

ship = re.compile(r'\b(courier|delivery|deliver|postage|mail|send|ship|shipment)(.*(courier|delivery|deliver|postage|mail|send|ship|shipment))?\b')
cleanup_final['shipping'] =        {'optype':['create'], 'replacement_text':'shipping delivery postage mail', 'reg_pattern':ship}

virus = re.compile(r'\b(point pixel|kaspersky|panda|drweb|eset nod32|security|antivirus|virus)(.*(point pixel|kaspersky|panda|drweb|eset nod32|security|antivirus|virus|software))?\b')
cleanup_final['antivirus'] =       {'optype':['create'], 'replacement_text':'antivirus defender internet security software', 'reg_pattern':virus}

print(f'done: {strftime("%a %X %x")}')

done: Thu 08:28:13 06/18/20


In [13]:
# here are the routines to implement the above pattern-matching instructions, and modify the delim_items_list column of the dataframe

def expand_gram(gram,final_gram_n,fill_strings):
    for f in range(final_gram_n - len(gram.split())):
        gram = gram + " " + fill_strings[f]
    return gram

def cleanup_service(gramlist=["word1 this is a 6 gram", "word1", "two gram", "three gram string"], 
                    pattern_dict=OrderedDict({'replace 0007 with 007':{'optype':['sub'],'reg_pattern':re.compile(r'\b0007\b'), 'replacement_text':'007', 
                                                                       'final_gram_n':4, 'fill_strings':['game','computer','electronic']},
                                             'replace skayfall with skyfall':{'optype':['sub','create'],'reg_pattern':re.compile(r'\bskayfall\b'), 'replacement_text':'skyfall'}})):
    """
    for text modification in the items_delimited dataframe, in an effort to help standardize terms to better highlight similarities between items,
    and to help group items a bit more broadly in some cases, so we create fewer clusters with the following graph/network analysis.

    gramlist = list of delimited n-grams provided typically from a single cell from 'items_delimited' DF, at a single row, in column = 'delim_name_list'
    pattern_dict = ordered dictionary of lists where operations are done in the order created by user (e.g., clean up "dvd" variants before cleaning up "bluray dvd" variants, so the latter becomes simpler in regex)
        keys = representative text, describing what is being done (somewhat irrelevant to this function)
        values = dict{  
                    optype = list of strings indicating if one wants to do one or more of the following 4 types of operation on the gramlist
                            'sub': (sub)stitute regex matches, searching each element in the gramlist,  (len(gramlist) remains the same, but each string in gramlist may shrink or grow or remain unchanged)
                            'create': (create) new "standardized" list elements if a match is found within the gramlist (so len(gramlist) grows by 1 for each match); original gramlist strings remain unchanged
                            'complete_replace': wherever you have matching elements in the gramlist, replace the entire gramlist element with the pattern_dict key (len(gramlist) remains the same, but n in each n-gram may change)
                    reg_pattern = regex patterns to find/substitute/create/replace, 
                    replacement_text = the text to put in place of the reg_pattern match, or to use when creating a new gramlist list element
                    final_gram_n = integer; desired final n-gram length (if desired) 
                    fill_strings= list(padding strings used to append on to the shorter regex matches to make final string = n grams in length, in order from most important to least)... must be long enough!
                    }
    """
    print_counter = 0
    previous_gramlist = gramlist.copy()
    for key_text, op_details in pattern_dict.items():
        optype = op_details['optype']
        do_sub = 'sub' in optype
        do_create = 'create' in optype
        do_replace = 'complete_replace' in optype
        do_regfind = do_create or do_replace
        reg_pattern = op_details['reg_pattern']
        replacement_text = op_details['replacement_text']
        gram_set_n = False  # don't try to expand the n-gram to an (n+x)-gram unless the information is provided
        if 'final_gram_n' in op_details.keys():
            gram_set_n = True
            final_gram_n = op_details['final_gram_n']
        if 'fill_strings' in op_details.keys():
            fill_strings = op_details['fill_strings']
        else:
            gram_set_n = False

        updated_gramlist = previous_gramlist.copy()

        if do_sub:   # do substitutions first (cleanup), then do create, then do full replace
            for idx, gram in enumerate(previous_gramlist):
                updated_gramlist[idx] = reg_pattern.sub(replacement_text, gram)
            previous_gramlist = updated_gramlist.copy()

        if do_regfind:
            for idx, gram in enumerate(previous_gramlist):
                patt_find = reg_pattern.findall(previous_gramlist[idx])                                 # ['education ', ('', '', '20', '500'), ('21', 'failed', '', '')]
                patt_find = [[match.strip()] if type(match) == str else match for match in patt_find]   # [['education'], ['20', '500'], ['21', 'failed']]
                patt_find = [[x.strip() for x in match if (x != "")] for match in patt_find]            # converts findall from mix of strings and tuples(including empty strings) to compact list of lists of string matches
                flat_patt_find = [x for p in patt_find for x in p]                                      # ['education', '20', '500', '21', 'failed']
                if len(flat_patt_find) > 0: #find_list:  # proceed only if we have found some matches
                    if do_create:  # do creations before full replacements
                        new_grams = []
                        for nmatch, match_str in enumerate(flat_patt_find):
                            if match_str:  # make sure it's not an empty list that was found as one of the matching groups
                                if replacement_text:
                                    new_grams.append(replacement_text)
                                elif gram_set_n:
                                    new_grams.append(expand_gram(match_str, final_gram_n, fill_strings))
                                else:
                                    new_grams.append(match_str)
                        updated_gramlist += new_grams

                
                    if do_replace:
                        print('You should not be in replace; not employed at this time')
                        modlist[idx] = replacement_text

        previous_gramlist = updated_gramlist.copy()
    return updated_gramlist

print(f'done: {strftime("%a %X %x")}')

done: Thu 08:28:13 06/18/20


In [14]:
%%time
# Test it on a few rows  highlight_roots, cleanup_sub, cleanup_sub_create
# dlist = items_delimited.at[36,'delim_name_list'].copy()

# for i in range(16000,16030):
#     dlist = items_delimited.at[i,'delim_name_list'].copy()
#     # print(dlist)
#     for clean_dict in [highlight_roots,cleanup_sub,cleanup_final]:
#         dlist = cleanup_service(dlist,clean_dict)
#     #print(dlist)

# Let's remove duplicate entries and unwanted stuff
clean_items_delim = items_delimited.copy(deep=True)
alphnum = list(string.ascii_lowercase) + list('1234567890')  # get rid of all length=1 1-grams
def remove_dupes(gramlist):
    unwanted = set(['and','weighed in','given y'] + alphnum)
    gramset = unwanted
    return [x for x in gramlist if x not in gramset and not gramset.add(x)]

for clean_dict in [highlight_roots,cleanup_sub,cleanup_final]: #[cleanup_games,cleanup_sub,cleanup_sub_create]:
    clean_items_delim.delim_name_list = clean_items_delim.delim_name_list.apply(lambda x: remove_dupes(cleanup_service(x,clean_dict)))

clean_items_delim['d_len'] = clean_items_delim.delim_name_list.apply(lambda x: len(x))
clean_items_delim['d_maxgram'] = clean_items_delim.delim_name_list.apply(maxgram)

print(f'done: {strftime("%a %X %x")}\n')
display(clean_items_delim.head())

done: Thu 08:28:18 06/18/20



Unnamed: 0,item_id,item_name,i_tested,i_cat_id,clean_item_name,delim_name_list,d_len,d_maxgram
0,0,movie dvd : ! POWER IN glamor (PLAST.) D,False,40,movie dvd power in glamor plast dvd,"[movie dvd, power in glamor, plast, dvd]",4,3
1,1,"program home and office digital : ! ABBYY FineReader 12 Professional Edition Full [PC, Digital Version]",False,76,program home and office digital abbyy finereader 12 professional edition full pc digital version,"[program home and office digital, abbyy finereader 12 professional edition full, pc, digital version, abbyy educational software learning, pc computing platform product, microsoft office productivity software, educational development training lessons, online download version]",9,6
2,2,movie dvd : *** In the glory (UNV) D,False,40,movie dvd in glory unv dvd,"[movie dvd, in glory, unv, dvd]",4,2
3,3,movie dvd : *** BLUE WAVE (Univ) D,False,40,movie dvd blue wave univ dvd,"[movie dvd, blue wave, univ, dvd]",4,2
4,4,movie dvd : *** BOX (GLASS) D,False,40,movie dvd box glass dvd,"[movie dvd, box, glass, dvd]",4,2


CPU times: user 4.95 s, sys: 1.65 ms, total: 4.95 s
Wall time: 4.95 s


In [15]:
# make item df easier to read for the following stuff
items_clean_delimited = clean_items_delim.copy(deep=True).drop("item_name", axis=1).rename(columns = {'clean_item_name':'item_name'})

display(items_clean_delimited.describe())
print("\n")
display(items_clean_delimited.head())

print(f'\ndone: {strftime("%a %X %x")}')

Unnamed: 0,item_id,i_cat_id,d_len,d_maxgram
count,22170.0,22170.0,22170.0,22170.0
mean,11084.5,46.291,3.991,4.628
std,6400.072,15.941,2.092,1.993
min,0.0,0.0,1.0,2.0
25%,5542.25,37.0,2.0,3.0
50%,11084.5,40.0,3.0,4.0
75%,16626.75,58.0,5.0,5.0
max,22169.0,83.0,15.0,18.0






Unnamed: 0,item_id,i_tested,i_cat_id,item_name,delim_name_list,d_len,d_maxgram
0,0,False,40,movie dvd power in glamor plast dvd,"[movie dvd, power in glamor, plast, dvd]",4,3
1,1,False,76,program home and office digital abbyy finereader 12 professional edition full pc digital version,"[program home and office digital, abbyy finereader 12 professional edition full, pc, digital version, abbyy educational software learning, pc computing platform product, microsoft office productivity software, educational development training lessons, online download version]",9,6
2,2,False,40,movie dvd in glory unv dvd,"[movie dvd, in glory, unv, dvd]",4,2
3,3,False,40,movie dvd blue wave univ dvd,"[movie dvd, blue wave, univ, dvd]",4,2
4,4,False,40,movie dvd box glass dvd,"[movie dvd, box, glass, dvd]",4,2



done: Thu 08:28:18 06/18/20


In [16]:
#%%time
# Inspect a single n, gathered from all possible delimited n-grams (4.64sec to run this cell without GPU, 4.01sec with GPU)
n_in_ngram = 4    # look at, e.g. length-4 (4-grams) strings of words
print_top_f = 10  # printout the top xx ngram strings, sorted by frequency of occurrence in the data

total_dupe_grams = 0
item_ngram = items_clean_delimited.copy(deep=True)
item_ngram['delim_ngrams'] = item_ngram.delim_name_list.apply(lambda x: [a for a in x if len(a.split()) == n_in_ngram])
display(item_ngram.head())
item_ngram = item_ngram.explode('delim_ngrams').reset_index(drop=True) # < 0.2sec this method (CPU)

freq_grams = item_ngram.delim_ngrams.value_counts()
grams_dupe = len(freq_grams[freq_grams > 1])
print(f'done: {strftime("%a %X %x")}')
print('\n')
print(f'Number of unique delimited {n_in_ngram}-grams: {len(freq_grams)}')
print(f'Number of unique delimited {n_in_ngram}-grams that are duplicated at least once: {grams_dupe}\n')
print(f'Top {print_top_f:d} {n_in_ngram:d}-grams by frequency of appearance in item names:')
print(freq_grams[:print_top_f])
print('\n')
display(item_ngram.head())

Unnamed: 0,item_id,i_tested,i_cat_id,item_name,delim_name_list,d_len,d_maxgram,delim_ngrams
0,0,False,40,movie dvd power in glamor plast dvd,"[movie dvd, power in glamor, plast, dvd]",4,3,[]
1,1,False,76,program home and office digital abbyy finereader 12 professional edition full pc digital version,"[program home and office digital, abbyy finereader 12 professional edition full, pc, digital version, abbyy educational software learning, pc computing platform product, microsoft office productivity software, educational development training lessons, online download version]",9,6,"[abbyy educational software learning, pc computing platform product, microsoft office productivity software, educational development training lessons]"
2,2,False,40,movie dvd in glory unv dvd,"[movie dvd, in glory, unv, dvd]",4,2,[]
3,3,False,40,movie dvd blue wave univ dvd,"[movie dvd, blue wave, univ, dvd]",4,2,[]
4,4,False,40,movie dvd box glass dvd,"[movie dvd, box, glass, dvd]",4,2,[]


done: Thu 08:28:18 06/18/20


Number of unique delimited 4-grams: 3464
Number of unique delimited 4-grams that are duplicated at least once: 378

Top 10 4-grams by frequency of appearance in item names:
pc computing platform product               2882
educational development training lessons    1736
sony playstation gaming platform            1262
1c educational software learning            1042
microsoft xbox gaming platform               773
game pc standard version                     756
microsoft office productivity software       619
music cd production business                 397
gift gadget robot sport                      295
program home and office                      277
Name: delim_ngrams, dtype: int64




Unnamed: 0,item_id,i_tested,i_cat_id,item_name,delim_name_list,d_len,d_maxgram,delim_ngrams
0,0,False,40,movie dvd power in glamor plast dvd,"[movie dvd, power in glamor, plast, dvd]",4,3,
1,1,False,76,program home and office digital abbyy finereader 12 professional edition full pc digital version,"[program home and office digital, abbyy finereader 12 professional edition full, pc, digital version, abbyy educational software learning, pc computing platform product, microsoft office productivity software, educational development training lessons, online download version]",9,6,abbyy educational software learning
2,1,False,76,program home and office digital abbyy finereader 12 professional edition full pc digital version,"[program home and office digital, abbyy finereader 12 professional edition full, pc, digital version, abbyy educational software learning, pc computing platform product, microsoft office productivity software, educational development training lessons, online download version]",9,6,pc computing platform product
3,1,False,76,program home and office digital abbyy finereader 12 professional edition full pc digital version,"[program home and office digital, abbyy finereader 12 professional edition full, pc, digital version, abbyy educational software learning, pc computing platform product, microsoft office productivity software, educational development training lessons, online download version]",9,6,microsoft office productivity software
4,1,False,76,program home and office digital abbyy finereader 12 professional edition full pc digital version,"[program home and office digital, abbyy finereader 12 professional edition full, pc, digital version, abbyy educational software learning, pc computing platform product, microsoft office productivity software, educational development training lessons, online download version]",9,6,educational development training lessons


#####Gather all info for duplicated n-grams in our delimited set

In [17]:
%%time 
# Should take < 4sec on CPU

# Get all of the delimited n-grams that are duplicated at least once in item names
#  range of sizes of delimited phrases (number of 'words'):

min_gram = 1
max_gram = items_delimited.d_maxgram.max()

total_dupe_grams = 0
gram_freqs = {}   # dict will hold elements that are pd.Series with index = phrase, value = number of repeats in items database item names
for n in range(min_gram,max_gram+1):
    item_ngram = items_clean_delimited.copy(deep=True)
    item_ngram['delim_ngrams'] = item_ngram.delim_name_list.apply(lambda x: [a for a in x if len(a.split()) == n])
    item_ngram = item_ngram.explode('delim_ngrams').reset_index(drop=True)  
    freq_grams = item_ngram.delim_ngrams.value_counts()
    print(f'Number of unique delimited {n}-grams: {len(freq_grams)}')
    grams_dupe = len(freq_grams[freq_grams > 1])
    print(f'Number of unique delimited {n}-grams that are duplicated at least once: {grams_dupe}\n')
    if grams_dupe > 0:
        gram_freqs[n] = freq_grams[freq_grams > 1].copy(deep=True)
        total_dupe_grams += grams_dupe
print(f'\nTotal number of unique, delimited, duplicated n-grams for all n: {total_dupe_grams}')
print(f'\nGram Processing Done: {strftime("%a %X %x")}\n')

Number of unique delimited 1-grams: 2817
Number of unique delimited 1-grams that are duplicated at least once: 1169

Number of unique delimited 2-grams: 4236
Number of unique delimited 2-grams that are duplicated at least once: 1191

Number of unique delimited 3-grams: 3867
Number of unique delimited 3-grams that are duplicated at least once: 751

Number of unique delimited 4-grams: 3464
Number of unique delimited 4-grams that are duplicated at least once: 378

Number of unique delimited 5-grams: 2852
Number of unique delimited 5-grams that are duplicated at least once: 295

Number of unique delimited 6-grams: 1936
Number of unique delimited 6-grams that are duplicated at least once: 144

Number of unique delimited 7-grams: 1259
Number of unique delimited 7-grams that are duplicated at least once: 68

Number of unique delimited 8-grams: 829
Number of unique delimited 8-grams that are duplicated at least once: 31

Number of unique delimited 9-grams: 522
Number of unique delimited 9-gram

In [18]:
'''
May 25: try adjusting code to incude ngrams in range 1 and up, but reduce weight for n-grams that contain many common words
'''
start_n = 0
finish_n = 10
# first, inspect data to see what are the common n-grams of little value in determining cluster coupling
df_busy_grams=pd.DataFrame({'n3_names':gram_freqs[3].index[start_n:finish_n], 'n3_counts':gram_freqs[3].values[start_n:finish_n],
                 'n4_names':gram_freqs[4].index[start_n:finish_n], 'n4_counts':gram_freqs[4].values[start_n:finish_n],
                 'n5_names':gram_freqs[5].index[start_n:finish_n], 'n5_counts':gram_freqs[5].values[start_n:finish_n]
                 })
display(df_busy_grams)

Unnamed: 0,n3_names,n3_counts,n4_names,n4_counts,n5_names,n5_counts
0,music music media,3582,pc computing platform product,2882,program home and office digital,333
1,online download version,2097,educational development training lessons,1736,antivirus defender internet security software,245
2,movie bluray dvd,1787,sony playstation gaming platform,1262,xbox 360 russian language version,151
3,russian language version,1688,1c educational software learning,1042,batman game computer electronic multirelease,136
4,game pc digital,1125,microsoft xbox gaming platform,773,call of duty game computer,135
5,game xbox 360,501,game pc standard version,756,warhammer game computer electronic multirelease,115
6,payment card ticket,371,microsoft office productivity software,619,angry bird game computer electronic,97
7,gift soft toy,366,music cd production business,397,star war game computer electronic,93
8,english language version,346,gift gadget robot sport,295,lego brand lego style game,91
9,cinema special version,332,program home and office,277,borderland game computer electronic multirelease,82


In [19]:
# format data for feeding into word vector creator

count_bins = [0, 2, 4, 8, 16, 32, 128, 1024, 32768]
idf_weights = [8,7,6,5,4,3,2,1]  # more weight for ngrams with lower counts

notfirst = False
for n,s in gram_freqs.items():
    a=len(s)
    n_array = np.ones(a,dtype=np.int32)*n
    gram_count = s.values.astype(np.int32)
    gram_string0 = s.index.to_numpy(dtype='str')
    gram_string = [re.compile(r'\b' + gs + r'\b') for gs in gram_string0]  # I'm not looking for partial words; n-grams must match at word boundaries
    weight_bin = pd.cut(s,count_bins,labels=idf_weights,retbins=False).astype(np.int32)

    if notfirst:
        n_arrays = np.concatenate((n_arrays,n_array))
        gram_counts = np.concatenate((gram_counts,gram_count))
        gram_strings = np.concatenate((gram_strings,gram_string))
        weight_bins = np.concatenate((weight_bins,weight_bin))
    else:
        n_arrays = n_array
        gram_counts = gram_count
        gram_strings = gram_string
        weight_bins = weight_bin
        notfirst = True

print(n_arrays[:5],gram_counts[:5],gram_strings[:5],weight_bins[:5])
print(len(n_arrays))

[1 1 1 1 1] [2742 1835 1360  893  816] [re.compile('\\bpc\\b') re.compile('\\bregion\\b')
 re.compile('\\bjewel\\b') re.compile('\\b1c\\b') re.compile('\\bcd\\b')] [1 1 1 2 2]
4065


In [20]:
# use np matrix storage to speed this up... the following code cell takes about 3 min using np, vs. 8 min with pandas dataframe calculations
#   also, reducing np matrix to hold only ngrams of size 3 or greater (5/25/20) takes 48 sec on CPU
def make_word_vecs(item_names, ngram_re_patterns, ngram_ns, ngram_weights):
    """Output is word vectors for input containing item names (english transl)"""

    # create np zeros array of size (number of items, word vector length)
    n_items = len(item_names)
    wv_len = len(ngram_ns)
    item_vec_array = np.zeros((n_items, wv_len), dtype = np.int32)

    for g in range(wv_len):
        gram_pattern = ngram_re_patterns[g] 
        gram_len = ngram_ns[g]
        gram_weight = ngram_weights[g]
        for i in range(n_items):
            if gram_pattern.search(item_names[i]):
                item_vec_array[i,g] = 2 * gram_len * gram_weight  # use weighting function 2 * (n= length of ngram) * (idf weight from binning above)
    return item_vec_array


In [21]:
%%time
item_word_vectors = make_word_vecs(items_clean_delimited.loc[:,'item_name'].to_numpy(dtype='str'), gram_strings,n_arrays,weight_bins)

CPU times: user 2min 48s, sys: 233 ms, total: 2min 48s
Wall time: 2min 48s


In [22]:
# # intermediate point: can save word vectors here for the 22170 items
#np.savez_compressed('data_output/item_word_vectorsCompressed.npz', arrayname = item_word_vectors)
# # ...
# iwv = np.load("data_output/item_word_vectors.npz")
# item_word_vectors = iwv['arrayname']
# print(item_word_vectors.shape)

#####Use scipy sparse matrices instead of pandas... faster, and less memory use

In [23]:
item_vec_matrix = sparse.csr_matrix(item_word_vectors)

In [24]:
%%time
# <2sec for 21,700 items x 4000+ ngrams; output is a csr matrix of type int64
dots = item_vec_matrix.dot(item_vec_matrix.transpose()) 

CPU times: user 1.45 s, sys: 443 ms, total: 1.89 s
Wall time: 1.9 s


In [25]:
# wicked fast way to get top K # of items by dot product value (i.e., closest K items to the item of interest)
# https://stackoverflow.com/questions/31790819/scipy-sparse-csr-matrix-how-to-get-top-ten-values-and-indices
# also, great reference for speeding up python here: https://colab.research.google.com/drive/1nMDtWcVZCT9q1VWen5rXL8ZHVlxn2KnL

@jit(cache=True)
def row_topk_csr(data, indices, indptr, K):
    """Take a sparse scipy csr matrix, and for each column, find the K largest 
    values in that column (like argmax or argsort[:K]).  Return the row indices 
    and associated values for each column as two separate np arrays of 
    length = number of columns in sparse matrix.  Inputs are data/indices/indptr
    of csr matrix, and integer K.  Call function like this:
    rows, vals = row_topk_csr(csr_name.data, csr_name.indices, csr_name.indptr, K)
    Use jit by importing jit and prange from numba, and decorating with
    @jit(cache=True) immediately before this function definition
    (adopted from https://stackoverflow.com/users/3924566/deepak-saini ) """

    m = indptr.shape[0] - 1
    max_indices = np.zeros((m, K), dtype=indices.dtype)
    max_values = np.zeros((m, K), dtype=data.dtype)
    # for i in prange(m):
    #     top_inds = np.argsort(data[indptr[i] : indptr[i + 1]])[::-1][:K]
    #     max_indices[i] = indices[indptr[i] : indptr[i + 1]][top_inds]
    #     max_values[i] = data[indptr[i] : indptr[i + 1]][top_inds]
    for i in prange(m):
        top_inds = np.arange(22190-K,22190)
        tops = np.argsort(data[indptr[i] : indptr[i + 1]])[::-1][:K]
        top_inds[:len(tops)] = tops
        #print(i,top_inds)
        max_indices[i] = indices[indptr[i] : indptr[i + 1]][top_inds]
        max_values[i] = data[indptr[i] : indptr[i + 1]][top_inds]

    return max_indices, max_values


In [26]:
%%time
dots.setdiag(0)
print(dots.indptr.shape)
kval = 20
closest_indices, highest_values = row_topk_csr(dots.data, dots.indices, dots.indptr, K=kval)  # Changed K from 10 to 2 on 5/25/20 to 3 on 5/26 to 15 with fakedots

  self._set_arrayXarray(i, j, x)


(22171,)
CPU times: user 6.4 s, sys: 485 ms, total: 6.88 s
Wall time: 6.92 s


In [27]:
#print(closest_indices.shape)
print(closest_indices[:5,:7]) #[:10,:])
print(highest_values[8000:8008,:7])

[[ 9920  9922 16973 21346 21661  9932 21667]
 [ 1155  1154  1156  1152  1153  1157  1182]
 [17212 16518 16519 16521 16616 16691 16692]
 [19630  9633 20027 10463 10462  9029 12427]
 [ 9172  2716  9449  9450  9451  7732  7845]]
[[ 656  480  480  480  480  480  480]
 [ 864  864  864  864  864  864  656]
 [ 224  224  224  224  224  224  224]
 [1024 1024 1024 1024 1024  224  224]
 [ 152  152  152  152  152  152  152]
 [2448  928  864  864  864  864  864]
 [2448  736  736  528  480  480  480]
 [ 772  720  708  708  708  672  672]]


In [28]:
similar_items = pd.DataFrame({'item_id':range(22170)}) #,'close_item_idx':closest_indices,'close_item_dot':highest_values})
similar_items['close_item_idx'] = [closest_indices[x][:kval] for x in range(22170)]
similar_items['close_item_dot'] = [highest_values[x][:kval] for x in range(22170)]
similar_items = similar_items.merge(items_clean_delimited[['item_id','i_tested','i_cat_id']], on='item_id')
similar_items['close_item_cat'] = similar_items.close_item_idx.apply(lambda x: [items_augmented.at[i,'item_category_id'] for i in x])
print(similar_items.head())


   item_id                                                                                                                                close_item_idx  \
0        0        [9920, 9922, 16973, 21346, 21661, 9932, 21667, 12449, 20043, 21420, 15819, 16616, 13950, 10811, 8125, 10290, 14864, 17831, 8635, 8631]   
1        1                      [1155, 1154, 1156, 1152, 1153, 1157, 1182, 1177, 1174, 1172, 1184, 1170, 1181, 5730, 3873, 3876, 3878, 3877, 3875, 3874]   
2        2  [17212, 16518, 16519, 16521, 16616, 16691, 16692, 16973, 17125, 17251, 18732, 17265, 17352, 17518, 17831, 17910, 18530, 18531, 18532, 16513]   
3        3                [19630, 9633, 20027, 10463, 10462, 9029, 12427, 13115, 2486, 4662, 5639, 5638, 5603, 5600, 5582, 5163, 4668, 4661, 4569, 4660]   
4        4         [9172, 2716, 9449, 9450, 9451, 7732, 7845, 17724, 17725, 17726, 17727, 17728, 17729, 17730, 17732, 17738, 17739, 17740, 17741, 17742]   

                                                               

In [29]:
# create a graph with nodes = item ids in test set, and edge weights = dot product values

# we will use the "community" algorithms to determine useful groupings of other items around/including the test items
# ##### start with a graph containing the 5100 items in the test set as starter nodes, and add in the 10 highest-match wordvector items if dot product > threshold
# TAKING A LEAP... gonna try with 21700 full items dataset / top 10 matches

edge_threshold = 100  # dot product (edge weight) must be greater than this for two item_ids to be connected in the graph

graph_items = similar_items[['item_id','close_item_idx']].copy(deep=True).explode('close_item_idx').reset_index(drop=True)
graph_weights = similar_items[['item_id','close_item_dot']].copy(deep=True).explode('close_item_dot').reset_index(drop=True)
graph_items['weight'] = graph_weights.loc[:]['close_item_dot']
graph_items.columns = ['item1_id','item2_id','weight']

print(len(graph_items))
graph_items = graph_items[graph_items.weight > edge_threshold]
print(len(graph_items))
graph_items.head()
# depending on threshold, we may end up dropping some of the test items (for example, we lose item 22154 if threshold = 150, but not if threshold = 100)
# K=15
# 332550
# x
# 284132 (thresh 100)
# 265912 (200)
# 143099 (500)

# K=10
# 221700
# x
# 192094 (100)

# K = 20
# 443400 -> 374990 (100)

443400
375055


Unnamed: 0,item1_id,item2_id,weight
0,0,9920,288
1,0,9922,288
2,0,16973,288
3,0,21346,288
4,0,21661,288


In [30]:
%%time
# import pandas df into weighted-edge graph:
G = nx.from_pandas_edgelist(graph_items, 'item1_id', 'item2_id', ['weight'])

CPU times: user 4.59 s, sys: 82.9 ms, total: 4.67 s
Wall time: 4.68 s


In [31]:
%%time
# employ a clustering method that utilizes the edge weights
communities2 = community.asyn_lpa_communities(G, weight='weight', seed=42)

CPU times: user 44.2 s, sys: 14.3 ms, total: 44.2 s
Wall time: 44.2 s


#####Extract and display community (cluster) information

In [32]:
num_communities = 0
community_items = set()
cluster_nodes = []
n_nodes = []
weight_avgs = []
weight_sums = []
weight_maxs = []
weight_mins = []
weight_stds = []
for i,c in enumerate(communities2):
    num_communities += 1
    community_items = community_items | set(c)
    nodelist = list(c)
    n_nodes.append(len(nodelist))
    edgeweights = []
    for m in range(n_nodes[-1]-1):
        for n in range(m+1,n_nodes[-1]):
            try:
                edgeweights.append(G.edges[nodelist[m], nodelist[n]]['weight'])
            except:
                pass
    cluster_nodes.append(nodelist)
    weight_avgs.append(np.mean(edgeweights))
    weight_sums.append(np.sum(edgeweights))
    weight_maxs.append(np.max(edgeweights))
    weight_mins.append(np.min(edgeweights))
    weight_stds.append(np.std(edgeweights))

print(num_communities)

1403


In [33]:
weight_avgs = [round(x) for x in weight_avgs]
community_df = pd.DataFrame({'n_nodes':n_nodes,'w_avg':weight_avgs,'w_sum':weight_sums,'w_max':weight_maxs,'w_min':weight_mins,'w_std':weight_stds,'cluster_members':cluster_nodes})
print(community_df.head())
print("\n")
print(community_df.describe())

   n_nodes  w_avg   w_sum  w_max  w_min     w_std  \
0      111    351  472856  21220    212   651.611   
1       33   2599  956500   4000    160 1,568.929   
2       27    330   78836   4368    256   341.139   
3        4   3424   20544   3424   3424         0   
4       21   1537  195140   6284    104 1,473.718   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               cluster_members  
0  [0, 2

In [34]:
cluster_items = community_df[['n_nodes','cluster_members']].copy(deep=True).explode('cluster_members').reset_index(drop=True)
print(f'community_df length: {len(community_df)}')
print(f'cluster_items df length: {len(cluster_items)}')
print(f'number of unique item ids contained in clusters: {cluster_items.cluster_members.nunique()}')
for nn in [9,24,49]:
    nn_community = community_df.query("n_nodes > @nn").copy(deep=True)
    print(f'number of clusters with at least {nn+1} items as members: {len(nn_community)}')
    print(nn_community.describe())
    print('\n')

community_df length: 1403
cluster_items df length: 20405
number of unique item ids contained in clusters: 20405
number of clusters with at least 10 items as members: 423
       n_nodes     w_avg       w_sum     w_max   w_min     w_std
count      423       423         423       423     423       423
mean    40.201   991.073 331,544.217 5,495.102 382.809   686.413
std     73.157 1,041.634 631,964.521 6,309.470 404.129   902.919
min         10       142        9572       168     104         0
25%         15   445.500       65140      1328     196   149.370
50%         23       678      135700      3452     256   399.994
75%     41.500 1,130.500      359504      7088     400   879.486
max       1177     11251     7393596     41760    3192 9,408.742


number of clusters with at least 25 items as members: 195
       n_nodes     w_avg       w_sum     w_max   w_min     w_std
count      195       195         195       195     195       195
mean    68.687   882.395 585,480.205 6,718.790 289.928 

In [35]:
# with K=15 and threshold = 100, we get 1624 clusters, quantiles of n_nodes = 2 min, 2, 4 med, 11, 1161 max; 479 clusters with at least 10 items 479/10... 182/25...73/50;  20405 items actually clustered (out of 21700)
# with K=15 and threshold = 200, we get 1664 clusters, quantiles of n_nodes = 2 min, 2, 4 med, 10, 1164 max; 431 clusters with at least 10 items, 19787 items actually clustered (out of 21700)
# with K=15 and threshold = 500, we get 1843 clusters, quantiles of n_nodes = 2 min, 2, 3 med,  6,  236 max; 319 clusters with at least 10 items 319/10... 103/25...31/50;  13422 items actually clustered (out of 21700)
# with K=10 and threshold = 100, we get 1962 clusters, quantiles of n_nodes = 2 min, 2, 4 med, 10, 1096 max; 529 clusters with at least 10 items 529/10... 152/25...56/50;  20404 items actually clustered (out of 21700)
# community_df length: 1962
# cluster_items df length: 20404
# number of unique item ids contained in clusters: 20404
# number of clusters with at least 10 items as members: 529
#        n_nodes     w_avg       w_sum     w_max   w_min     w_std
# count      529       529         529       529     529       529
# mean    28.371 1,118.951 151,510.904 4,402.389 423.274   719.163
# std     56.233 1,174.116 292,205.958 5,415.598 359.267   967.964
# min         10       132        2904       132     104         0
# 25%         12       509       36548      1108     228   149.643
# 50%         17       791       75604      2604     320   385.788
# 75%         27      1293      165472      5668     504   987.902
# max       1096     11827     3816308     41760    2940 9,873.388


# number of clusters with at least 25 items as members: 152
#        n_nodes     w_avg       w_sum     w_max   w_min     w_std
# count      152       152         152       152     152       152
# mean    61.704   917.283 309,274.921 4,552.974 339.474   537.762
# std     97.197 1,102.951 480,567.401 5,888.855 251.088   947.367
# min         25       167       21352       244     104         0
# 25%         30   436.250      108438      1088     196   106.725
# 50%         39       639      191836      2586     272   255.362
# 75%         58 1,001.750      317200      5629     400   671.011
# max       1096     11495     3816308     41760    2448 9,873.388


# number of clusters with at least 50 items as members: 56
#        n_nodes   w_avg       w_sum     w_max   w_min     w_std
# count       56      56          56        56      56        56
# mean   110.143 670.875 458,976.500 4,891.786 268.071   423.299
# std    148.554 403.079 562,950.003 5,501.423 108.325   516.756
# min         50     167       84276       372     104     5.652
# 25%     56.500 403.500      198484      1288     196    95.334
# 50%         72 558.500      314638      2720     256   255.362
# 75%    106.250 823.500      534489      6053     320   575.131
# max       1096    1989     3392540     22140     608 2,978.267
#


#####################################################
## use clusters n>49, k=20, thresh = 100
###################################

# # with K=20 and threshold = 100, we get 1387 clusters, quantiles of n_nodes = 2 min, 2, 4 med, 13, 1319 max; 422 clusters with at least 10 items 422/10... 179/25...78/50;  20406 items actually clustered (out of 21700)
# community_df length: 1387
# cluster_items df length: 20406
# number of unique item ids contained in clusters: 20406
# number of clusters with at least 10 items as members: 422
#        n_nodes     w_avg       w_sum     w_max   w_min     w_std
# count      422       422         422       422     422       422
# mean    40.405   987.488 334,640.844 5,371.365 382.237   706.833
# std     79.321 1,002.201 663,848.830 6,042.819 366.626   930.412
# min         10       142        9836       168     104         0
# 25%         15   445.500       61230      1387     196   168.636
# 50%         22   688.500      130628      3394     256   424.621
# 75%         39 1,149.750      363573      6901     416   910.528
# max       1319     11251     7558860     41760    3192 9,339.161


# number of clusters with at least 25 items as members: 179
#        n_nodes     w_avg       w_sum     w_max   w_min     w_std
# count      179       179         179       179     179       179
# mean    73.682   878.994 633,793.497 6,350.324 295.330   606.438
# std    113.674   860.866 924,124.723 6,362.418 230.057   891.521
# min         25       165       36576       484     104     9.963
# 25%         33   435.500      198778      2168     164   180.056
# 50%         44       623      388704      4276     256   377.799
# 75%         75 1,072.500      667122      7824     342   767.243
# max       1319      8846     7558860     41760    2448 9,339.161


# number of clusters with at least 50 items as members: 78
#        n_nodes     w_avg         w_sum     w_max   w_min     w_std
# count       78        78            78        78      78        78
# mean   124.269   822.628 1,054,526.103 7,313.077 248.205   559.201
# std    158.805 1,056.614 1,253,915.801 7,405.189 106.790 1,093.614
# min         50       179         68600       528     104     9.963
# 25%     63.250   405.250        453204      2467     167   154.897
# 50%         83   562.500        666060      4136     244   300.057
# 75%    138.750   891.250       1162252      9802     272   628.492
# max       1319      8846       7558860     41760     592 9,339.161

# updated regexes, 6/16/20... k=20, thr=100
# community_df length: 1403
# cluster_items df length: 20405
# number of unique item ids contained in clusters: 20405
# number of clusters with at least 10 items as members: 429
#        n_nodes     w_avg       w_sum     w_max   w_min     w_std
# count      429       429         429       429     429       429
# mean    39.816   964.718 330,242.284 5,215.291 363.347   666.080
# std     71.759 1,017.564 639,627.695 5,820.533 306.239   891.461
# min         10       146        7288       168     104         0
# 25%         15       445       62944      1348     196   171.419
# 50%         23       660      126668      3328     260   384.643
# 75%         40      1121      355724      6904     408   843.593
# max       1149     11251     7195364     41760    2940 9,400.753


# number of clusters with at least 25 items as members: 190
#        n_nodes     w_avg       w_sum     w_max   w_min     w_std
# count      190       190         190       190     190       190
# mean    69.747   898.179 609,067.453 6,547.221 283.116   629.240
# std    100.097   913.846 870,781.482 6,344.391 156.825   935.361
# min         25       160       40240       244     104         0
# 25%         31       442      195290      2121     185   208.694
# 50%         43   641.500      357134      4500     256   383.428
# 75%         69 1,068.750      669944      8082     356   766.780
# max       1149      9381     7195364     41760    1296 9,400.753


# number of clusters with at least 50 items as members: 77
#        n_nodes   w_avg         w_sum     w_max   w_min     w_std
# count       77      77            77        77      77        77
# mean       122 719.208   992,967.844 7,237.299 259.273   467.077
# std    142.094 527.528 1,099,653.689 6,194.954 127.151   446.270
# min         50     160         81852       564     104     9.931
# 25%         60     421        413644      2592     152   184.071
# 50%         81     545        672344      5248     224   314.437
# 75%        140     894       1159600      9896     320   645.293
# max       1149    3379       7195364     27240     784 2,267.530

#####Encode cluster categories
* I could use the index number as item_cluster_id to feed into training of the model & predictions (simplest, easiest)

* Or, I could try to give the clusters a bit more meaning... perhaps higher cluster numbers if higher weights between nodes on average, but
```
community_df.w_avg.nunique()
``` 
shows there are not enough unique average weights to use this as a cluster category code.

* I could sort on w_avg, then on number of nodes as perhaps the next most important defining characteristic of a given cluster.  Then, to make the categorization unique, I could take the w_avg value and sum it with the index (row number)... (with the sorting, this favors even more the clusters with high average item-to-item similarity)
```
community_df = community_df.sort_values(['w_avg','n_nodes']).reset_index(drop=True)
community_df['item_cluster_id'] = community_df.index + community_df['w_avg']
community_df.head()
```
This is not too difficult, but does make cluster category codes rather large, and requires more RAM for storage of training data


* Lastly, I could do some sort of mean encoding for sales, such as using the average of cluster sales over a certain number of important months (such as months 25 to 33) using the training data with outliers removed.  It's similar to the above "weighted" method, but a bit more complex

Since I am not sure if any of this will help, particularly with decision tree models, I will just use index number for now, and perhaps revisit this in the future.  However, since there are many unclustered items, I will deal with them as follows:
* clustered items cluster code = 100 + index_row_number
* unclustered items cluster code = original item_category_id (0-83)

In [38]:
# unravel / explode the cluster node lists... we know this will not duplicate item ids, from the counting we did above (nodes are not duplicated in clusters)
item_clusters = community_df.copy(deep=True).explode('cluster_members').reset_index().rename(columns = {'index':'item_cluster','cluster_members':'item_id'})
display(item_clusters.head())
print(f't1: {strftime("%a %X %x")}')

n_cluster_codes = item_clusters.item_cluster.nunique()
all_items = items_augmented.item_id.unique()
n_items_total = len(all_items)
items_in_clusters = item_clusters.item_id.unique()
items_in_test = test.item_id.unique()
items_in_train = tt.query('month < 34').item_id.unique()
items_in_train2533 = tt.query('((month < 34) & (month > 24))').item_id.unique()
print(f't2: {strftime("%a %X %x")}')

items_unclustered = [x for x in all_items if x not in items_in_clusters]
print(f't3: {strftime("%a %X %x")}')
test_unclustered = [x for x in items_in_test if x in items_unclustered]
print(f't4: {strftime("%a %X %x")}')
train_unclustered = [x for x in items_in_train if x in items_unclustered]
print(f't5: {strftime("%a %X %x")}')
train2533_unclustered = [x for x in items_in_train2533 if x in items_unclustered]
print(f't6: {strftime("%a %X %x")}')
test_trained = [x for x in items_in_test if x in items_in_train]
print(f't7: {strftime("%a %X %x")}')
test_trained2533 = [x for x in items_in_test if x in items_in_train2533]
print(f't8: {strftime("%a %X %x")}')

# Merge cluster information with item dataset and utilize original item_category_id if a particular item is not in one of the NLP-defined clusters
items_clustered = items_clean_delimited[['item_id','i_cat_id','i_tested','item_name']].merge(item_clusters,on='item_id',how='left')
items_clustered = items_clustered[['item_id','item_name','i_cat_id','i_tested','item_cluster','n_nodes']] #,'w_avg','w_sum','w_max','w_min','w_std']]
items_clustered.columns = ['item_id','item_name','item_category_id','item_tested','item_cluster','n_items_in_cluster'] #'w_avg','w_sum','w_max','w_min','w_std']

items_clustered.item_cluster = items_clustered.apply(lambda x: x.item_cluster+100 if x.item_cluster >= 0 else x.item_category_id, axis = 1)

print(f'done: {strftime("%a %X %x")}')


Unnamed: 0,item_cluster,n_nodes,w_avg,w_sum,w_max,w_min,w_std,item_id
0,0,111,351,472856,21220,212,651.611,0
1,0,111,351,472856,21220,212,651.611,2
2,0,111,351,472856,21220,212,651.611,20
3,0,111,351,472856,21220,212,651.611,12828
4,0,111,351,472856,21220,212,651.611,14366


t1: Thu 09:18:14 06/18/20
t2: Thu 09:18:14 06/18/20
t3: Thu 09:28:16 06/18/20
t4: Thu 09:28:17 06/18/20
t5: Thu 09:28:17 06/18/20
t6: Thu 09:28:18 06/18/20
t7: Thu 09:28:18 06/18/20
t8: Thu 09:28:18 06/18/20
done: Thu 09:28:19 06/18/20


In [39]:
print(f'\nNumber of "cluster category" codes: {n_cluster_codes:,d}')
print(f'Number of "cluster category" codes after merging unclustered items with original category codes: {items_clustered.item_cluster.nunique():,d}\n')

print(f'Number of items not included in clusters: {len(items_unclustered):,d} (out of {n_items_total:,d})')
print(f'Number of test items not included in clusters: {len(test_unclustered):,d} (out of {len(items_in_test):,d})')
print(f'Number of sales_train items not included in clusters: {len(train_unclustered):,d} (out of {len(items_in_train):,d})')
print(f'Number of sales_train items from only months including 25-33 not included in clusters: {len(train2533_unclustered):,d} (out of {len(items_in_train2533):,d})\n')

print(f'Number of test items not included in sales_train: {(len(items_in_test) - len(test_trained)):,d} (out of {len(items_in_test):,d} test items)')
print(f'Number of test items not included in sales_train months 25-33: {(len(items_in_test) - len(test_trained2533)):,d} (out of {len(items_in_test):,d} test items)\n')

# Perhaps calculate size of clusters with % of test items and with % of test items not in train set... i.e., better determination of "coverage" of clusters for test items
#    then, maybe run a loop to optimize threshold and k

display(items_clustered.head())


Number of "cluster category" codes: 1,403
Number of "cluster category" codes after merging unclustered items with original category codes: 1,409

Number of items not included in clusters: 1,765 (out of 22,170)
Number of test items not included in clusters: 251 (out of 5,100)
Number of sales_train items not included in clusters: 1,739 (out of 21,806)
Number of sales_train items from only months including 25-33 not included in clusters: 704 (out of 10,747)

Number of test items not included in sales_train: 363 (out of 5,100 test items)
Number of test items not included in sales_train months 25-33: 485 (out of 5,100 test items)



Unnamed: 0,item_id,item_name,item_category_id,item_tested,item_cluster,n_items_in_cluster
0,0,movie dvd power in glamor plast dvd,40,False,100,111
1,1,program home and office digital abbyy finereader 12 professional edition full pc digital version,76,False,105,19
2,2,movie dvd in glory unv dvd,40,False,100,111
3,3,movie dvd blue wave univ dvd,40,False,110,5
4,4,movie dvd box glass dvd,40,False,116,84


In [41]:
# # how many test items are represented by clusters?
# tested_clustered = items_clustered[items_clustered.item_tested==True][['item_id','item_category_id','item_cluster_id','item_name']]
# tested_clustered['unclustered'] = tested_clustered.apply(lambda x: np.NaN if x.item_cluster_id > 0  else x.item_id, axis = 1)
# print(tested_clustered.head(10))
# print('\n')
# print(tested_clustered.item_id.nunique())
# unclustered = tested_clustered.unclustered.unique()
# unclustered = [x for x in unclustered if x > 0]
# print(len(unclustered))
# train_items = sales_train.item_id.unique()
# print(len(train_items))
# print(len(items_augmented))
# untrained = [x for x in unclustered if x not in train_items]
# print(len(untrained))
# print(len(items_augmented) - len(train_items) - len(untrained))

In [40]:
# save what we have; maybe refine later

compression_opts = dict(method='gzip',
                        archive_name='items_clustered_21700c.csv')  
items_clustered.to_csv('data_output/items_clustered_21700c.csv.gz', index=False, compression=compression_opts)

print(f'File Saved at: {strftime("%a %X %x")}')

File Saved at: Thu 09:33:46 06/18/20
