In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy import sparse
import xgboost as xgb
from sklearn.metrics import confusion_matrix, mean_squared_error
import math



In [2]:
# Load the training table from a tab-separated file in the working directory.
dat = pd.read_table("train.tsv", sep = "\t")

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dat.describe()

Unnamed: 0,train_id,item_condition_id,price,shipping
count,1482535.0,1482535.0,1482535.0,1482535.0
mean,741267.0,1.90738,26.73752,0.4472744
std,427971.1,0.9031586,38.58607,0.4972124
min,0.0,1.0,0.0,0.0
25%,370633.5,1.0,10.0,0.0
50%,741267.0,2.0,17.0,0.0
75%,1111900.0,3.0,29.0,1.0
max,1482534.0,5.0,2009.0,1.0


In [4]:
# (number of rows, number of columns) of the loaded table.
dat.shape

(1482535, 8)

In [5]:
dat.columns.values  # list the column names as an array

array(['train_id', 'name', 'item_condition_id', 'category_name',
       'brand_name', 'price', 'shipping', 'item_description'], dtype=object)

In [6]:
# NOTE(review): set() presumably also counts a missing value (NaN) as one entry if the
# column contains any, so this may be one more than the number of real brands — confirm.
len(set(dat["brand_name"]))  # number of distinct brand_name values

4810

In [7]:
# Peek at the first ten values of the free-text name column.
dat["name"].head(10)

0       MLB Cincinnati Reds T Shirt Size XL
1          Razer BlackWidow Chroma Keyboard
2                            AVA-VIV Blouse
3                     Leather Horse Statues
4                      24K GOLD plated rose
5          Bundled items requested for Ruie
6        Acacia pacific tides santorini top
7      Girls cheer and tumbling bundle of 7
8                     Girls Nike Pro shorts
9    Porcelain clown doll checker pants VTG
Name: name, dtype: object

In [8]:
# Peek at the first five category_name values; they are '/'-separated category paths.
dat["category_name"].head(5)

0                                    Men/Tops/T-shirts
1    Electronics/Computers & Tablets/Components & P...
2                          Women/Tops & Blouses/Blouse
3                   Home/Home Décor/Home Décor Accents
4                              Women/Jewelry/Necklaces
Name: category_name, dtype: object

In [9]:
len(set(dat["category_name"]))  # number of distinct category_name values (1288 in this run)

1288

In [189]:
# Number of rows per item_condition_id value, most frequent first.
dat["item_condition_id"].value_counts()

1    640549
3    432161
2    375479
4     31962
5      2384
Name: item_condition_id, dtype: int64

In [8]:
# Optional: work on a random subsample of the rows to speed up iteration
# (frac=0.3 keeps 30% of them, sampled without replacement).
#dat = dat.sample(frac=0.3, replace=False)

# Target vector: the price column.
y = dat["price"]

# First pass:
# - create a document-term matrix (dtm) for item_description
# - remove name, category name, and brand name
# - 1B: try with tf-idf instead of 1-hot encoding
# The feature frame is therefore everything except the target and those three
# columns; a single non-mutating drop builds it in one step.
train = dat.drop(["price", "category_name", "name", "brand_name"], axis=1)


# Second pass:
# - add dtm's for category name and name (remove '/' and impute with ' ')

# Third pass:
# - try PCA and/or ICA on document-term matrices and see if it helps

# Optional word-embedding path:
# - train word2vec embeddings on the text
# - cluster listing embeddings


# Fourth pass:
# - include feature engineering by transforming the category groups (or 1-hot encode category)
# - group prices into buckets
# - find tf-idf for the words

In [9]:
# Hold out 30% of the rows for evaluation. random_state pins the shuffle so that the
# split, and every metric computed from it, is the same after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    train, y, test_size=0.3, random_state=42
)

In [10]:
x_train.shape  # (rows, columns) of the training split

(1037774, 4)

In [11]:
# (rows, columns) of the held-out split.
x_test.shape

(444761, 4)

In [12]:
# Keep the row identifiers of each split so rows can be traced back to the source table.
train_id, test_id = (split["train_id"] for split in (x_train, x_test))

In [13]:
# Cap on the vocabulary size of the item_description document-term matrix.
n_feats = 1500
#vec = CountVectorizer(max_features = n_feats, stop_words = 'english')
# Tf-idf weighting is used in place of the commented-out raw counts above.
vec = TfidfVectorizer(max_features = n_feats, stop_words = 'english')
# Learn the vocabulary from the training split only and transform it in the same call.
# NOTE(review): astype('U') presumably turns missing descriptions into the literal
# text 'nan' — confirm that this is acceptable for the vocabulary.
vec_fit = vec.fit_transform(x_train['item_description'].values.astype('U')) # have to convert to unicode

In [14]:
# Reduce the tf-idf matrix to 300 dense components. Despite the variable name this is
# TruncatedSVD, not PCA. random_state seeds its randomized solver so the components
# are reproducible.
pca = TruncatedSVD(n_components=300, random_state=42)
# NOTE: this overwrites vec_fit with the reduced matrix, so the cell is not safe to
# re-run on its own — re-run the vectorizer cell first to restore the tf-idf matrix.
vec_fit = pca.fit_transform(vec_fit)

In [70]:
#num_feats = x_train[["shipping", "item_condition_id"]].values #why is this creating a memory error?
#num_feats = x_train["item_condition_id"] #numeric features
#x_train_final = sparse.hstack([vec_fit, num_feats]) # gives me a dimension error?

In [24]:
# Check what kind of object selecting a single column returns (a pandas Series).
type(x_train["shipping"])

pandas.core.series.Series

In [15]:
#item_condition = sparse.csr_matrix(x_train["item_condition_id"].values).T #converts df to sparse matrix (t for transpose)

In [16]:
# Wrap the shipping values in a sparse matrix and transpose it (.T).
# NOTE(review): no later cell shown here reads `shipping`; it may be left over from the
# sparse.hstack experiment in the commented-out cell above — confirm before removing.
shipping = sparse.csr_matrix(x_train["shipping"].values).T 

In [30]:
# Append the two numeric columns to the right of the reduced text features.
x_train_final = np.column_stack(
    (vec_fit, x_train[["shipping", "item_condition_id"]].values)
)
x_train_final.shape

(1037774, 302)

In [86]:
# Same column bind as the previous cell.
# NOTE(review): the output recorded under this cell reports 2002 columns while the
# identical cell above reports 302, so it looks like stale output from a run with a
# different number of text features — confirm and clear it.
x_train_final = np.c_[vec_fit, x_train[["shipping", "item_condition_id"]]]

(1037774, 2002)

In [31]:
# Transform the held-out split with the vectorizer and SVD that were fitted on the
# training split (transform only, so nothing is re-fitted on test data).
test_fit = vec.transform(x_test["item_description"].values.astype('U'))
test_fit = pca.transform(test_fit)
# Sparse versions of the two numeric features, transposed with .T.
item_condition_test = sparse.csr_matrix(x_test["item_condition_id"].values).T
# Fixed: this was previously built from item_condition_id, duplicating the line above.
shipping_test = sparse.csr_matrix(x_test["shipping"].values).T
# Final test matrix: reduced text features followed by shipping and item_condition_id,
# in the same column order as the training matrix.
x_test_final = np.c_[test_fit, x_test[["shipping", "item_condition_id"]]]

In [None]:
# Fit a gradient-boosted regressor with default hyper-parameters on the training matrix.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(x_train_final, y_train)

# Score it on the held-out split; the printed number is the mean squared error.
actuals = y_test
predictions = xgb_model.predict(x_test_final)
print(mean_squared_error(y_true=actuals, y_pred=predictions))

In [80]:
# Score the model on its own training data (mean squared error) so the result can be
# compared with the held-out error as an overfitting check.
actuals2 = y_train
predictions2 = xgb_model.predict(x_train_final)
print(mean_squared_error(y_true=actuals2, y_pred=predictions2))

1196.02922005


In [3]:
# Not overfitting, but the fit is still poor: the ~1200 printed above is the MSE (an RMSE of about 35), not the RMSE.
def rmsle(y, y_pred):
    """Compute, print and return the root mean squared logarithmic error.

    RMSLE = sqrt(mean((log(y_pred + 1) - log(y + 1)) ** 2))

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted values; must contain as many elements as ``y``.

    Returns
    -------
    float
        The RMSLE. The value is also printed, as the earlier version of this
        function did, so existing cells keep showing it.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or contain a value <= -1
        (the logarithm is undefined there).
    """
    # np.asarray discards any pandas index, so values are paired by position.
    # Label-based lookups such as y[i] fail on a Series whose index was shuffled
    # by train_test_split.
    y_true = np.asarray(y, dtype=float).ravel()
    y_hat = np.asarray(y_pred, dtype=float).ravel()

    if y_true.shape != y_hat.shape:
        raise ValueError(
            "y and y_pred must have the same length, got %d and %d"
            % (y_true.size, y_hat.size)
        )
    if y_true.size == 0:
        raise ValueError("rmsle is undefined for empty input")
    if (y_true <= -1).any() or (y_hat <= -1).any():
        raise ValueError("rmsle is undefined for values <= -1")

    # log1p(x) == log(x + 1), evaluated element-wise without a Python-level loop.
    squared_log_errors = (np.log1p(y_hat) - np.log1p(y_true)) ** 2
    score = float(np.sqrt(squared_log_errors.mean()))
    print(score)
    return score

In [4]:
# Score the held-out predictions. This needs `actuals` and `predictions` from the model
# cell to exist in the current kernel session; the NameError recorded below shows the
# cell was run without them.
rmsle(actuals.values, predictions)

NameError: name 'actuals' is not defined