This attempt uses only non-neural-network classifiers.

In [17]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import LogisticRegression

from sklearn import metrics

In [2]:
# Read the cleaned, combined text dataset into a DataFrame.
# The path is relative to the notebook's working directory.
DATA_PATH = '../datasets/cleaned/combined_text.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,combined_text,category,asin
0,I have a 9 year old Badger 1 that needs replac...,appliances,B00004U9JP
1,model number This may help InSinkErator Model ...,appliances,B00004U9JP
2,can I replace Badger 1 1/3 with a Badger 5 1/2...,appliances,B00004U9JP
3,Does this come with power cord and dishwasher ...,appliances,B00004U9JP
4,loud noise inside when turned on. sounds like ...,appliances,B00004U9JP


In [7]:
# Build a label encoding: each distinct category name is assigned an integer
# id in order of first appearance in the `category` column.
category_list = df['category'].unique().tolist()
category_dict = {name: idx for idx, name in enumerate(category_list)}
print(category_dict)

{'appliances': 0, 'arts_crafts_and_sewing': 1, 'automotive': 2, 'baby': 3, 'beauty': 4, 'cell_phones_and_accessories': 5, 'clothing_shoes_and_jewelry': 6, 'electronics': 7, 'grocery_and_gourmet_food': 8, 'health_and_personal_care': 9, 'home_and_kitchen': 10, 'industrial_and_scientific': 11, 'musical_instruments': 12, 'office_products': 13, 'patio_lawn_and_garden': 14, 'pet_supplies': 15, 'software': 16, 'sports_and_outdoors': 17, 'tools_and_home_improvement': 18, 'toys_and_games': 19, 'video_games': 20}


In [8]:
# Swap the category names in the `category` column for their integer ids
# from `category_dict`; every other column is left untouched.
df['category'] = df['category'].replace(category_dict)
df.head()

Unnamed: 0,combined_text,category,asin
0,I have a 9 year old Badger 1 that needs replac...,0,B00004U9JP
1,model number This may help InSinkErator Model ...,0,B00004U9JP
2,can I replace Badger 1 1/3 with a Badger 5 1/2...,0,B00004U9JP
3,Does this come with power cord and dishwasher ...,0,B00004U9JP
4,loud noise inside when turned on. sounds like ...,0,B00004U9JP


In [27]:
# Stop words: scikit-learn's built-in English list plus any project-specific
# additions.
my_stop_words = []  # extra words. TODO: load these from a text file
stop_words = text.ENGLISH_STOP_WORDS.union(my_stop_words)

# Unigram TF-IDF features over purely alphabetic tokens.
# NOTE(review): the saved coefficient printout further down lists tokens such
# as 'safe?' and '3ds', which this token_pattern cannot produce -- the stored
# outputs look stale relative to this cell; re-run the notebook to confirm.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    # `stop_words` above is a frozenset; newer scikit-learn releases only
    # accept 'english', a list, or None here, so hand over a (sorted, hence
    # deterministic) list. Older releases accept a list as well.
    stop_words=sorted(stop_words),
    analyzer='word',
    token_pattern=r'[a-zA-Z]+',
)

# X: sparse TF-IDF document-term matrix; y: the (already integer-encoded)
# category labels.
X = vectorizer.fit_transform(df['combined_text'])
y = df['category']

In [34]:
%%time
lr = LogisticRegression(multi_class='ovr',max_iter=300,n_jobs=-1)
clf = OneVsRestClassifier(lr).fit(X, y)

Wall time: 25min 33s


In [35]:
# Predict a category id for every document.
# NOTE(review): X is the same matrix clf was fitted on, so these are
# predictions on the training data, not on a held-out set.
y_hat = clf.predict(X)

In [36]:
# Fraction of documents whose predicted category id equals the true one.
# NOTE(review): y_hat was predicted on the data the model was trained on, so
# this is training accuracy and likely overstates performance on unseen text.
metrics.accuracy_score(y_true=y, y_pred=y_hat)

0.6935751481051644

In [37]:
# Map integer class ids back to category names for the report.
inv_category_dict = {v: k for k, v in category_dict.items()}

# The vocabulary is the same for every class, so fetch it once rather than
# once per class. Newer scikit-learn releases replaced get_feature_names()
# with get_feature_names_out(); support whichever one is available.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

# For each class, print the 40 words with the largest and the 40 with the
# smallest logistic-regression coefficients.
# The fitted per-class estimators are read directly (newer scikit-learn no
# longer exposes coef_ on OneVsRestClassifier) and paired with clf.classes_,
# so each row is labelled by the class it was actually trained for instead of
# by a hand-maintained counter.
for label, estimator in zip(clf.classes_, clf.estimators_):
    coef = estimator.coef_.ravel()

    # Sort (coefficient, word) pairs once; words are unique, so the ordering
    # is total and both ends can be sliced from the same sorted list.
    words = sorted(zip(coef, feature_names))
    top_words = words[-40:][::-1]   # largest coefficients first
    bottom_words = words[:40]       # smallest coefficients first

    print(f"===Category: {inv_category_dict[label]}===")
    print("-Top-")
    print([x[1] for x in top_words])
    print("-Bottom-")
    print([x[1] for x in bottom_words])
    print('')

===Category: appliances===
-Top-
['whirlpool', 'maytag', 'kenmore', 'ge', 'frigidaire', 'washer', 'jenn', 'amana', 'hood', 'chimney', 'refrigerator', 'fridge', 'keg', 'ice', 'dryer', 'model', 'cooktop', 'broan', 'kegs', 'hotpoint', 'element', 'ductless', 'roper', 'range', 'duct', 'humidifier', 'lg', 'kitchenaid', 'vent', 'freezer', 'burners', 'ducted', 'knobs', 'stove', 'filter', 'disposal', 'cycle', 'bosch', 'clothes', 'insinkerator']
-Bottom-
['phone', 'camera', 'lens', 'case', 'safe?', 'seat', 'battery', 'car', 'hair', 'bed', 'charger', 'coffee', 'galaxy', 'bottle', 'safe', 'windows', 'microwave', 'bike', 'table', 'cable', 'card', 'drive', 'laptop', 'pool', 'speakers', 'mattress', 'chair', 'bowl', 'iphone', 'play', 'computer', 'wear', 'vacuum', 'keyboard', 'speaker', 'cream', 'honda', 'washed', 'oil', 'bar']

===Category: arts_crafts_and_sewing===
-Top-
['singer', 'janome', 'brother', 'bernina', 'bobbin', 'yarn', 'airbrush', 'mannequin', 'easel', 'bobbins', 'juki', 'sewing', 'serger

===Category: home_and_kitchen===
-Top-
['juicer', 'duvet', 'cuisinart', 'comforter', 'cooker', 'roomba', 'slicer', 'kettle', 'curtain', 'futon', 'toaster', 'blender', 'dyson', 'vitamix', 'induction', 'carafe', 'krups', 'headboard', 'pitcher', 'topper', 'peeler', 'beater', 'dough', 'shams', 'pan', 'purifier', 'ironing', 'stools', 'foodsaver', 'mixer', 'infuser', 'breville', 'mattress', 'mug', 'waring', 'wilton', 'eureka', 'chef', 'sunbeam', 'frother']
-Bottom-
['camera', 'crib', 'bike', 'phone', 'lens', 'play', 'gate', 'stroller', 'toy', 'doll', 'tire', 'feeder', 'pool', 'software', 'bassinet', 'macbook', 'gun', 'headphones', 'iphone', 'scope', 'tires', 'waist', 'antenna', 'radio', 'music', 'nikon', 'roof', 'ear', 'ink', 'ram', 'holster', 'sony', 'dell', 'lamp', 'mic', 'guitar', 'asus', 'tripod', 'ipod', 'chart']

===Category: industrial_and_scientific===
-Top-
['toothpaste', 'microscope', 'stethoscope', 'winch', 'amscope', 'stethoscope?', 'epoxy', 'oreck', 'caulk', 'calibration', 'vac'

===Category: video_games===
-Top-
['game', 'vita', 'controller', '3ds', 'saitek', 'multiplayer', 'joystick', 'wii', 'dlc', 'console', 'controllers', 'game?', 'psn', 'razer', 'steam', 'kinect', 'ds', 'xbox', 'nyko', '2ds', 'dsi', 'dance', 'mechanical', 'edition', 'offline', 'campaign', 'code', 'ps2', 'play', 'account', 'headset', 'mx', 'ps', 'dpi', 'psp', 'playstation', 'ps4', 'yoke', 'steam?', 'games']
-Bottom-
['phone', 'fit', 'water', 'size', 'unit', 'camera', 'printer', 'bag', 'samsung', 'machine', 'bottle', 'hair', 'lens', 'product', 'canon', 'filter', 'radio', 'speaker', 'blade', 'ipad', 'car', 'diameter', 'door', 'record', 'office', 'handle', 'bike', 'dvd', 'rules', 'seat', 'air', 'nikon', 'kit', 'oil', 'apple', 'made?', 'bulb', 'pump', 'metal', 'bed']

