In [1]:
# Import required libraries
# requirements are
# pandas>=0.19
# scikit_learn>=0.18

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

In [2]:
# Read data
df = pd.read_csv('Products.csv', 
                 usecols=['Product title', 'Category', 'description', 'description-short'])

In [3]:
# Print out product' categories with theirs #occurences
df.Category.value_counts()

Home | Hobby & UnderhÃ¥llning | Radiostyrt | DrÃ¶nare                      15
Home | Hobby & UnderhÃ¥llning | Hoverboards                                 6
Home | Hemmet | Bygg | El & Elinstallation | Batterier                      5
Home | Fritid & TrÃ¤ning | TrÃ¤ning & Tillskott | Aktivitetsarmband         3
Home | Hemmet | Bygg | El & Elinstallation | Belysning | LED-Lampor E27     2
Name: Category, dtype: int64

In [4]:
# Add target column with numerical representation product' category
mapped_values = {index: value for value, index in enumerate(df.Category.unique())}

df['target'] = df.Category.map(mapped_values)

In [5]:
# Add data column with aggregated text from featured columns
# Product title, description, and description-short
df['data'] = df[['Product title', 'description', 'description-short']].apply(lambda x: ' '.join(x), axis=1)

In [6]:
# Split dataset into train and test sets (80/20)
df_train, df_test = train_test_split(df, test_size = 0.2)

# Create vector representation of text "Bag of words"
vectorizer = TfidfVectorizer(sublinear_tf=True, 
                             max_df=0.5,
                             stop_words='english')

# Calculate TFIDF scores for tokens in product' data description
# for both train and test sets
X_train = vectorizer.fit_transform(df_train.data.as_matrix())
X_test = vectorizer.transform(df_test.data.as_matrix())

# Create train and test labels for classifier
y_train, y_test = df_train.target, df_test.target

# Create Multinomial Naive Bayes model (can be changed by anyoneelse model)
clf = MultinomialNB(alpha=.01)

# Train model
clf.fit(X_train, y_train)

# Get predicted values from pretrainded model
pred = clf.predict(X_test)

# Print out model' evaluation metrics (confusion matrix and precision/recall/f1-score)
print(metrics.classification_report(y_test, pred))
print(metrics.confusion_matrix(y_test, pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         1
          1       1.00      1.00      1.00         1
          2       1.00      1.00      1.00         1
          3       1.00      1.00      1.00         3
          4       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00         7

[[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 3 0]
 [0 0 0 0 1]]


In [7]:
# Print out test product with predicted product' categories
# Category - text representation of product' category
# target - numerical representation of an actual product' category
# predicted - numerical representation of an predicted product' category
df_test['predicted'] = pred
df_test.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Product title,Category,description,description-short,target,data,predicted
19,Parrot Bebop Drone 2 RÃ¶d/Svart,Home | Hobby & UnderhÃ¥llning | Radiostyrt | D...,<p>Ett tillbehÃ¶r med perfekt passform och til...,Parrot Bebop Drone 2 RÃ¶d/Svart,3,Parrot Bebop Drone 2 RÃ¶d/Svart <p>Ett tillbeh...,3
6,Belkin Wemo Led Starterkit 2X E27 Bulbs + Wemo...,Home | Hemmet | Bygg | El & Elinstallation | B...,"<font size=""2"">â€‹</font>Med WeMo-startpaketet...",BELKIN WEMO LED STARTERKIT 2x E27 BULBS + WEMO...,2,Belkin Wemo Led Starterkit 2X E27 Bulbs + Wemo...,2
9,Smartband 2 Swr12 Vit,Home | Fritid & TrÃ¤ning | TrÃ¤ning & Tillskot...,<p><b>â€‹</b><span><b>FÃ¶lj dina rÃ¶relser ida...,<p>SMARTBAND 2 SWR12 WHITE</p>,0,Smartband 2 Swr12 Vit <p><b>â€‹</b><span><b>FÃ...,0
4,Ultra Power Mn 1400 K2 Lr14,Home | Hemmet | Bygg | El & Elinstallation | B...,<p>BÃ¤st fÃ¶r apparater som drar mycket strÃ¶m...,"<p>Ultra Power MN 1400 K2 LR14</p>\n<div id=""m...",1,Ultra Power Mn 1400 K2 Lr14 <p>BÃ¤st fÃ¶r app...,1
24,"Denver Hoverboard 10"" Vit",Home | Hobby & UnderhÃ¥llning | Hoverboards,"<p><strong>Motoriserad balans-brÃ¤da, Ã¤ven ka...","Denver HoverBoard 10"" Vit",4,"Denver Hoverboard 10"" Vit <p><strong>Motoriser...",4
21,Parrot Bebop 2 Skycontroller RÃ¶d/Sv,Home | Hobby & UnderhÃ¥llning | Radiostyrt | D...,<p>Ett tillbehÃ¶r med perfekt passform och til...,Parrot Bebop 2 SkyController RÃ¶d/Sv,3,Parrot Bebop 2 Skycontroller RÃ¶d/Sv <p>Ett ti...,3
30,Denver DrÃ¶nare med Hd-Kamera,Home | Hobby & UnderhÃ¥llning | Radiostyrt | D...,<p>2.4GHz drÃ¶nare med inbyggd HD-kamera och g...,Denver DrÃ¶nare med HD-kamera,3,Denver DrÃ¶nare med Hd-Kamera <p>2.4GHz drÃ¶na...,3
