This attempt uses logistic regression (one-vs-rest, on TF-IDF features) as the baseline classifier.

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import LogisticRegression

from sklearn import metrics

import pickle

In [2]:
# Load the cleaned, combined text dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../datasets/cleaned/combined_text.csv')

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,combined_text,category,asin
0,I have a 9 year old Badger 1 that needs replac...,appliances,B00004U9JP
1,model number This may help InSinkErator Model ...,appliances,B00004U9JP
2,can I replace Badger 1 1/3 with a Badger 5 1/2...,appliances,B00004U9JP
3,Does this come with power cord and dishwasher ...,appliances,B00004U9JP
4,loud noise inside when turned on. sounds like ...,appliances,B00004U9JP


In [4]:
# Build a mapping from category name to integer label.
# Labels follow the order in which pandas' unique() returns the categories.
category_list = list(df.category.unique())
# enumerate() assigns the position directly, avoiding a linear
# list.index() scan for every category.
category_dict = {name: idx for idx, name in enumerate(category_list)}
print(category_dict)

{'appliances': 0, 'arts_crafts_and_sewing': 1, 'automotive': 2, 'baby': 3, 'beauty': 4, 'cell_phones_and_accessories': 5, 'clothing_shoes_and_jewelry': 6, 'electronics': 7, 'grocery_and_gourmet_food': 8, 'health_and_personal_care': 9, 'home_and_kitchen': 10, 'industrial_and_scientific': 11, 'musical_instruments': 12, 'office_products': 13, 'patio_lawn_and_garden': 14, 'pet_supplies': 15, 'software': 16, 'sports_and_outdoors': 17, 'tools_and_home_improvement': 18, 'toys_and_games': 19, 'video_games': 20}


In [5]:
# Encode the category names as the integer labels from category_dict.
# An explicit map + astype gives a proper integer column instead of relying
# on DataFrame.replace's implicit object->int downcasting (deprecated in
# recent pandas). It also fails loudly if any value has no entry in
# category_dict (e.g. when this cell is accidentally run a second time).
df['category'] = df['category'].map(category_dict).astype('int64')
df.head()

Unnamed: 0,combined_text,category,asin
0,I have a 9 year old Badger 1 that needs replac...,0,B00004U9JP
1,model number This may help InSinkErator Model ...,0,B00004U9JP
2,can I replace Badger 1 1/3 with a Badger 5 1/2...,0,B00004U9JP
3,Does this come with power cord and dishwasher ...,0,B00004U9JP
4,loud noise inside when turned on. sounds like ...,0,B00004U9JP


In [6]:
# Extra stop words to add on top of scikit-learn's built-in English list.
my_stop_words = []#extra words. todo: create a text file to load from
# ENGLISH_STOP_WORDS.union(...) yields a frozenset, but recent scikit-learn
# versions validate `stop_words` and accept only 'english', a list, or None.
# Convert to a (sorted, hence deterministic) list so the vectorizer is valid
# on both old and new releases.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(my_stop_words))

# Unigram TF-IDF over purely alphabetic tokens, with stop words removed.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words=stop_words,
    analyzer='word',
    token_pattern=r'[a-zA-Z]+',
)

# Feature matrix and target labels used by the cells below.
X = vectorizer.fit_transform(df['combined_text'])
y = df['category']

In [7]:
%%time
lr = LogisticRegression(multi_class='ovr',max_iter=300,n_jobs=-1)
clf = OneVsRestClassifier(lr).fit(X, y)

Wall time: 15min 26s


In [8]:
# Predicted labels for X, the same matrix the classifier was fit on.
y_hat = clf.predict(X)

In [18]:
# Score on (X, y), the same data passed to fit() above, so this is
# training accuracy rather than a held-out estimate.
clf.score(X, y)

0.6839148902959502

In [9]:
# Accuracy of the predictions in y_hat against the labels in y.
metrics.accuracy_score(y_true=y, y_pred=y_hat)

0.6839148902959502

In [15]:
# Class probability estimates for every row of X.
probs = clf.predict_proba(X)

In [16]:
# Probability estimates for the first row of X.
probs[0]

array([0.31374618, 0.07596179, 0.13106712, 0.01746021, 0.00793469,
       0.00815209, 0.00181063, 0.0152032 , 0.00208402, 0.21833584,
       0.00762823, 0.00257229, 0.00303513, 0.00821017, 0.0142055 ,
       0.00572368, 0.00683542, 0.04472159, 0.0508844 , 0.05894804,
       0.00547978])

In [10]:
# Print, for every category, the words with the largest (most indicative) and
# smallest (most counter-indicative) logistic-regression coefficients.
inv_category_dict = {v: k for k, v in category_dict.items()}

n_words = 40  # how many words to show at each end of the ranking

# The vocabulary does not change between classes, so fetch it once.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# older releases that lack get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = [str(w) for w in vectorizer.get_feature_names_out()]
else:
    feature_names = list(vectorizer.get_feature_names())

# Iterate the fitted per-class estimators directly: OneVsRestClassifier.coef_
# is not available on recent scikit-learn versions, and pairing each estimator
# with clf.classes_ avoids assuming that row order equals the label ids.
for label, estimator in zip(clf.classes_, clf.estimators_):
    coef = estimator.coef_.ravel()

    # Sort (coefficient, word) pairs once, ascending; words are unique, so the
    # reversed tail is exactly the descending ranking.
    ranked = sorted(zip(coef, feature_names))
    bottom_words = ranked[:n_words]
    top_words = list(reversed(ranked[-n_words:]))

    print(f"===Category: {inv_category_dict[label]}===")
    print("-Top-")
    print([word for _, word in top_words])
    print("-Bottom-")
    print([word for _, word in bottom_words])
    print('')

===Category: appliances===
-Top-
['whirlpool', 'maytag', 'frigidaire', 'ge', 'amana', 'kenmore', 'chimney', 'jenn', 'washer', 'keg', 'hood', 'kegs', 'refrigerator', 'ice', 'roper', 'fridge', 'hotpoint', 'dryer', 'ductless', 'cooktop', 'element', 'range', 'broan', 'duct', 'ja', 'humidifier', 'cycle', 'kitchenaid', 'freezer', 'vent', 'burners', 'ducted', 'disposal', 'wr', 'kegerator', 'da', 'knobs', 'ventless', 'filter', 'ww']
-Bottom-
['phone', 'camera', 'lens', 'safe', 'battery', 'seat', 'car', 'coffee', 'case', 'mm', 'hair', 'bed', 'charger', 'bottle', 'bike', 'table', 'windows', 'galaxy', 'speakers', 'laptop', 'mattress', 'chair', 'card', 'cable', 'pool', 'speaker', 'computer', 'keyboard', 'honda', 'drive', 'bowl', 'iphone', 'oil', 'wear', 'vacuum', 'microwave', 'cream', 'play', 'tablet', 'engine']

===Category: arts_crafts_and_sewing===
-Top-
['singer', 'janome', 'bobbin', 'mannequin', 'bernina', 'yarn', 'brother', 'airbrush', 'serger', 'juki', 'stitch', 'sizzix', 'bobbins', 'cricut

===Category: home_and_kitchen===
-Top-
['juicer', 'duvet', 'cuisinart', 'curtain', 'comforter', 'slicer', 'futon', 'kettle', 'cooker', 'toaster', 'roomba', 'vitamix', 'dyson', 'blender', 'induction', 'shams', 'headboard', 'pitcher', 'carafe', 'krups', 'infuser', 'dough', 'purifier', 'beater', 'topper', 'peeler', 'ironing', 'stools', 'pan', 'ksm', 'mug', 'frother', 'foodsaver', 'waring', 'ozone', 'mattress', 'eureka', 'bodum', 'miele', 'jvm']
-Bottom-
['camera', 'crib', 'bike', 'phone', 'lens', 'stroller', 'play', 'gate', 'bassinet', 'feeder', 'doll', 'toy', 'software', 'tire', 'headphones', 'gun', 'scope', 'tires', 'macbook', 'pool', 'iphone', 'antenna', 'radio', 'ram', 'guitar', 'chart', 'holster', 'roof', 'galaxy', 'ink', 'waist', 'tripod', 'honda', 'ear', 'mic', 'lamp', 'headset', 'music', 'nikon', 'ford']

===Category: industrial_and_scientific===
-Top-
['stethoscope', 'toothpaste', 'microscope', 'amscope', 'winch', 'stethoscopes', 'hydrometer', 'epoxy', 'oreck', 'littman', 'filame

===Category: video_games===
-Top-
['game', 'ds', 'vita', 'saitek', 'controller', 'multiplayer', 'wii', 'console', 'joystick', 'dlc', 'psn', 'steam', 'controllers', 'nyko', 'xbox', 'kinect', 'razer', 'dsi', 'mechanical', 'dance', 'ps', 'code', 'campaign', 'offline', 'yoke', 'edition', 'account', 'dpi', 'character', 'gamepad', 'mod', 'play', 'psp', 'headset', 'games', 'op', 'simulator', 'violence', 'steelseries', 'mods']
-Bottom-
['phone', 'fit', 'water', 'size', 'unit', 'printer', 'bag', 'camera', 'bottle', 'machine', 'hair', 'samsung', 'radio', 'lens', 'speaker', 'blade', 'canon', 'diameter', 'office', 'door', 'filter', 'product', 'bike', 'ipad', 'seat', 'handle', 'apple', 'car', 'record', 'bulb', 'pump', 'nikon', 'kit', 'oil', 'china', 'mattress', 'dvd', 'height', 'tall', 'lid']



In [19]:
# Persist the fitted classifier to disk.
# NOTE(review): only `clf` is saved here; the fitted `vectorizer` is not, so
# loading this file alone is not enough to classify new raw text. Confirm it
# is stored elsewhere.
filename = '../models/category_logistic_regression_ovr.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)