# Product Invoice Classification

* Inv_ID (Invoice ID) : Unique number representing Invoice created by supplier/vendor

* Vendor Code (Vendor ID) : Unique number representing Vendor/Seller in the procurement system

* GL_Code: Account’s Reference ID

* Inv_Amt : Invoice Amount

* Item Description : Description of Item Purchased Example: “Corporate Services Human Resources Contingent Labor/Temp Labor Contingent Labor/Temp Labor”

* Product Category : Category of Product for which Invoice is raised A pseudo product category is represented in the dataset as CLASS-???, where ? is a digit.* 

## Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
# for saving the pipeline
import joblib

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [5]:
from sklearn.pipeline import Pipeline

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
from feature_engine.encoding import OrdinalEncoder
from feature_engine.selection import DropFeatures

In [8]:
from custom_functions import Mapper, Custom_Fillna, splitter

In [9]:
import json

## Data

In [10]:
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')
sample_submission_data = pd.read_csv('sample_submission.csv')

## Transformations

In [11]:
## map for the target
tmp = train_data.groupby('Product_Category', as_index = False).agg(counts = ('Inv_Id','count')).sort_values('counts')
tmp['ranks'] = range(1,len(tmp)+1)
map_target = dict(tmp[['Product_Category','ranks']].values)

In [12]:
#saving json
with open('map_target.json', 'w') as fp:
    json.dump(map_target, fp)

In [13]:
#loading json
with open('map_target.json', 'r') as fp:
    map_target = json.load(fp)

In [14]:
train_data['Map_Product_Category'] = train_data.Product_Category.map(map_target)

In [15]:
train_data = splitter(data = train_data)

###  split data

In [16]:
seed = 12

In [17]:
initial_features = ['Inv_Id', 'Vendor_Code', 'GL_Code', 'Inv_Amt', 'Item_Description']
target = 'Map_Product_Category'

In [18]:
X = train_data[initial_features]
y = train_data[target]

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=seed)

### Pipeline

<b> expanding

In [20]:
X_train = splitter(data = X_train)

<b> maps

In [21]:
def get_dictionary_rank1_tomap(data, variable):
    tmp = data.groupby([variable,'Map_Product_Category'], as_index = False).agg(counts = ('Inv_Id','count'))
    tmp["Rank"] = tmp.groupby([variable])["counts"].rank("dense", ascending=False)
    tmp = tmp[tmp.Rank == 1]
    map_rank =  dict(tmp[[variable,'Map_Product_Category']].values)
    return map_rank

In [22]:
TypeEnt_number_maps = get_dictionary_rank1_tomap(data = train_data, variable = 'TypeEnt_number')

In [23]:
#saving json
with open('TypeEnt_number_maps.json', 'w') as fp:
    json.dump(TypeEnt_number_maps, fp)

In [24]:
#loading json
with open('TypeEnt_number_maps.json', 'r') as fp:
    TypeEnt_number_maps = json.load(fp)

In [25]:
to_drop = ['Inv_Id','Vendor_Code','GL_Code', 'Item_Description','TypeEnt','GCL']

<b> the pipeline

In [26]:
category_prediction_pipeline = Pipeline([
    ('GCL_Code-cardinal-ordering',
     OrdinalEncoder(encoding_method='ordered', variables=['GCL_number'])
    ),
    
    ('TypeEnt_number_map_modes', 
    Mapper(variables = ['TypeEnt_number'], mappings = TypeEnt_number_maps)
    ),
    
    ('drop_features',
     DropFeatures(features_to_drop= to_drop)
    ),
    
    ('Fill_na',
    Custom_Fillna(variables = ['TypeEnt_number'], fill_value = 0)
    ),
    
    ('KNN', OneVsRestClassifier(KNeighborsClassifier(n_neighbors = 4 )))
    
])

In [27]:
category_prediction_pipeline.fit(X_train, y_train)

Pipeline(steps=[('GCL_Code-cardinal-ordering',
                 OrdinalEncoder(variables=['GCL_number'])),
                ('TypeEnt_number_map_modes',
                 Mapper(mappings={'1000': 16, '1003': 35, '1005': 35,
                                  '1006': 35, '1007': 35, '1008': 35,
                                  '1010': 35, '1011': 35, '1012': 35,
                                  '1013': 35, '1014': 35, '1016': 35,
                                  '1018': 35, '1019': 35, '1021': 35,
                                  '1022': 35, '1024': 35, '1025': 35,
                                  '1026': 35, '1027': 35, '1028': 9, '1030': 3...
                                  '1034': 25, '1035': 29, '1036': 27,
                                  '1037': 35, '1038': 35, ...},
                        variables=['TypeEnt_number'])),
                ('drop_features',
                 DropFeatures(features_to_drop=['Inv_Id', 'Vendor_Code',
                                                '

In [28]:
y_train_pred = category_prediction_pipeline.predict(X_train)

In [29]:
y_train_pred

array([33, 35, 29, ..., 35, 36, 34], dtype=int64)

<b> validation

In [30]:
X_val = splitter(data = X_val)

In [31]:
y_val_pred = category_prediction_pipeline.predict(X_val)

In [32]:
y_val_pred

array([34, 36, 27, ..., 30, 33, 33], dtype=int64)

<b> test data

In [33]:
test_data = splitter(data = test_data)

In [34]:
X_test = test_data[initial_features]

In [35]:
X_test = splitter(data = X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [36]:
y_test_pred = category_prediction_pipeline.predict(X_test)

In [37]:
y_test_pred.shape

(2446,)

In [38]:
X_test['yhat'] = y_test_pred

In [39]:
X_test

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,TypeEnt,TypeEnt_number,GCL,GCL_number,yhat
0,15003,VENDOR-2513,GL-6050310,56.13,Travel and Entertainment Miscellaneous Company...,VENDOR,2513,GL,6050310,36
1,15008,VENDOR-1044,GL-6101400,96.56,Final Site Clean Up Store Construction Advance...,VENDOR,1044,GL,6101400,34
2,15013,VENDOR-1254,GL-6101400,55.93,Arabian American Development Co Final Site Cle...,VENDOR,1254,GL,6101400,34
3,15019,VENDOR-1331,GL-2182000,32.62,Corporate Services Contingent Labor/Temp Labor...,VENDOR,1331,GL,2182000,32
4,15020,VENDOR-2513,GL-6050310,25.81,Fortune National Corp Miscellaneous Company Ca...,VENDOR,2513,GL,6050310,36
...,...,...,...,...,...,...,...,...,...,...
2441,22995,VENDOR-2237,GL-6101400,7.75,General Contractor General Requirements Final ...,VENDOR,2237,GL,6101400,34
2442,23002,VENDOR-1066,GL-6050310,37.35,Ground Transportation Miscellaneous Company Ca...,VENDOR,1066,GL,6050310,36
2443,23006,VENDOR-1883,GL-2182000,42.16,Auto Leasing Corporate Services Corning Inc /N...,VENDOR,1883,GL,2182000,33
2444,23009,VENDOR-2120,GL-6100410,63.52,Artworking/Typesetting Production 2007Mar Aqui...,VENDOR,2120,GL,6100410,31


## saving machine

In [40]:
joblib.dump(category_prediction_pipeline, 'category_prediction_pipeline.joblib') 

['category_prediction_pipeline.joblib']