# Data Exploration

In [18]:
import pandas as pd
import numpy as np

In [101]:
from sklearn.model_selection import train_test_split
# Load the tab-separated training file; its first row holds the column names.
dataset = pd.read_csv('data/train.tsv', sep='\t', header=0)

# Features are every column except the target; the target is the listing price.
X = dataset.drop(columns='price')
Y = dataset['price']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [102]:
# Preview the training features; pandas abbreviates the rendering of a frame this large.
X_train

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,shipping,item_description
604635,604635,Enzo,3,Women/Shoes/Mules & Clogs,,0,Enzo Angiolini Mules/Clogs. Super Cute with so...
496799,496799,Black dress,3,Women/Dresses/Full-Length,,1,Xl long black dress Solid under with lace over...
1035231,1035231,2 items for Brittany,1,Electronics/Cell Phones & Accessories/Cables &...,,0,- Urban Decay Eyeshadow This has never been us...
628659,628659,Texas budle,2,Women/Tops & Blouses/T-Shirts,,0,Very cute!! No flaws
261459,261459,North face rain coat,3,Women/Athletic Apparel/Jackets,The North Face,0,Sea foam green rain coat Size 18 XL in youth I...
...,...,...,...,...,...,...,...
359783,359783,LuLaRoe Leggings OS,2,"Women/Athletic Apparel/Pants, Tights, Leggings",Independent,1,Solid Mustard OS leggings
152315,152315,Lularoe 2 XL Carly,1,Women/Dresses/Asymmetrical Hem,LuLaRoe,0,New with tags Non smoking home
963395,963395,Coach Poppy,2,Women/Women's Accessories/Wallets,,0,This gorgeous medium size bag is like new I us...
117952,117952,Micheal Kors beanie and scarf,1,Women/Women's Accessories/Scarves & Wraps,Michael Kors,1,All new and authentic


In [103]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric feature columns.
X_train.describe()

Unnamed: 0,train_id,item_condition_id,shipping
count,1186028.0,1186028.0,1186028.0
mean,740909.0,1.90761,0.4473031
std,428073.1,0.9032736,0.4972155
min,0.0,1.0,0.0
25%,370261.8,1.0,0.0
50%,740533.5,2.0,0.0
75%,1111653.0,3.0,1.0
max,1482533.0,5.0,1.0


In [109]:
# Preview the training target: the price values for the rows in the training split.
Y_train

604635     26.0
496799     16.0
1035231    14.0
628659      7.0
261459     51.0
           ... 
359783     18.0
152315     37.0
963395     66.0
117952     28.0
305711      6.0
Name: price, Length: 1186028, dtype: float64

In [104]:
# Distinct category labels seen in the training split, in alphabetical order.
# str() is applied to every value because sorted() cannot compare mixed types;
# presumably missing categories are NaN floats and become the label 'nan' — confirm.
categories = sorted({str(c) for c in X_train.category_name})
print(f'There are {len(categories)} including:')

# Show up to 10 example categories. They are drawn WITHOUT replacement so no
# category is listed twice (np.random.choice samples with replacement by
# default), and from a seeded generator so the examples are the same on every run.
category_sample_size = min(10, len(categories))
category_rng = np.random.RandomState(0)
print("\n".join(category_rng.choice(categories, category_sample_size, replace=False)))

There are 1272 including:
Men/Blazers & Sport Coats/Double Breasted
Sports & Outdoors/Fan Shop/NCAA
Kids/Gear/Activity Centers & Entertainers
Women/Tops & Blouses/Blouse
Men/Tops/Button-Front
Men/Coats & Jackets/Poncho
Home/Kitchen & Dining/Water Coolers & Filters
Women/Pants/Casual Pants
Beauty/Bath & Body/Other
Men/Athletic Apparel/Jerseys


In [105]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the token vocabulary from the training item names and turn each name
# into a vector of token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train['name'])

Raw counts are a good start, but term frequencies are better because they account for discrepancies in name length.

In [108]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw token counts with TF-IDF. The weights are fitted on the
# training counts only and returned as the transformed training matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [111]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression from the TF-IDF name features to the training prices.
# NOTE(review): nothing here constrains predictions to be non-negative; the
# prediction outputs further down include negative prices.
regression = LinearRegression()
regression.fit(X_train_tfidf, Y_train)

In [112]:
# Push the held-out names through the already-fitted vectoriser and TF-IDF
# weights. Only transform() is called, so nothing is re-learned from the test split.
X_test_counts = count_vect.transform(X_test['name'])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [113]:
# Predict a price for every row of the held-out test features.
test_predictions = regression.predict(X_test_tfidf)

In [135]:
# Find the position of each extreme prediction once, so the value printed and
# the row printed always refer to the same item.
cheapest_pos = int(np.argmin(test_predictions))
priciest_pos = int(np.argmax(test_predictions))

# The matching test row is printed on its own lines: a Series repr spans several
# lines, and appending it to the sentence pushes its first row out of alignment.
print(f'The cheapest thing predicted ({test_predictions[cheapest_pos]}) was')
print(X_test.iloc[cheapest_pos])
print(f'The most expensive thing predicted ({test_predictions[priciest_pos]}) was')
print(X_test.iloc[priciest_pos])

The cheapest thing predicted (-221.57795255046105) was train_id                                                       1105354
name                                                Rain Design mStand
item_condition_id                                                    2
category_name        Electronics/Computers & Tablets/Components & P...
brand_name                                                         NaN
shipping                                                             1
item_description     Designer laptop riser that looks great with yo...
Name: 1105354, dtype: object
The most expensive thing predicted (1123.0273187881558) was  856.0


In [162]:
# Count the negative predictions with a vectorised NumPy reduction (the builtin
# sum() walks the array one element at a time in Python), and only warn when
# there is actually something to clip.
negative_count = int(np.count_nonzero(test_predictions < 0))
if negative_count:
    print(f'Warning: {negative_count} predictions are negative. These will be clipped to 0.')



In [159]:
from sklearn.metrics import mean_squared_log_error
# Floor negative predictions at 0 before scoring, then take the square root of
# the mean squared log error to get the RMSLE.
clipped_predictions = np.clip(test_predictions, 0, None)
msle = mean_squared_log_error(Y_test, clipped_predictions)
rmsle = np.sqrt(msle)
print(f'Root mean squared log error: {rmsle:.2f}')

Mean squared error: 0.73


In [126]:
# A few hand-written listing names to sanity-check the model on. The last one is
# the name of the test item that received the lowest prediction.
docs_new = [
    'Cozy Boots',
    "Victoria's Secret Swim Suit",
    'nails and nail polish set',
    'Rain Design mStand',
]

# Reuse the already-fitted vectoriser and TF-IDF weights; nothing is refit here.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [127]:
# Predict a price for each of the hand-written example names.
regression.predict(X_new_tfidf)

array([  24.11252732,   20.25974502,   13.65111033, -221.57795255])