# run here

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk

from nltk.corpus import stopwords

from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer

from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, cross_val_score, train_test_split

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
# Read the tab-separated Mercari training file and preview the first rows.
orig_train = pd.read_csv(
    "/Users/yf/Documents/big data team project/mercari/train.tsv",
    delimiter='\t',
)
orig_train.head()

In [None]:
# Keep only the rows whose price is non-zero (a zero price is treated here
# as a missing outcome), then report the remaining frame size.
has_price = orig_train["price"].ne(0)
orig_train = orig_train.loc[has_price]
orig_train.shape

# to here

In [None]:
# check punctuation
# Display the punctuation characters exposed by the standard-library `string` module.
from string import punctuation
punctuation

In [None]:
# check stopwords
import nltk
# One-time corpus download; uncomment if the stopwords corpus is not installed.
#nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words; read as a module-level global by remove_stopwords().
stop_words = stopwords.words('english')
stop_words

In [None]:
# define functions for text preprocessing
import string
# remove punctuations
def remove_punctuation(sentence: str) -> str:
    """Return ``sentence`` with every ``string.punctuation`` character deleted."""
    kept_chars = [ch for ch in sentence if ch not in string.punctuation]
    return ''.join(kept_chars)
# remove stop words
def remove_stopwords(x, stopword_list=None):
    """Lowercase ``x`` and drop every token that is a stop word.

    Parameters
    ----------
    x : str
        Text to filter.
    stopword_list : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words``
        list is used, which keeps existing one-argument calls working.

    Returns
    -------
    str
        The remaining lowercase tokens joined by single spaces.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    blocked = set(stopword_list)
    # split() without an argument collapses runs of whitespace, so no empty
    # tokens (and no double spaces) survive, e.g. after punctuation removal.
    return ' '.join(token for token in x.lower().split() if token not in blocked)
# lowercase
def to_lower(x):
    """Return a lowercase copy of the string ``x``."""
    lowered = x.lower()
    return lowered

In [None]:
# stem the words
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()


def _stem_tokens(text):
    """Stem each whitespace-separated token of ``text`` with the Porter stemmer."""
    return ' '.join(porter.stem(token) for token in text.split())


# Missing values are replaced by empty strings first, because the string
# helpers below fail on NaN.
# Stemming runs last and token by token: PorterStemmer.stem() operates on a
# single word, so applying it to a whole description treats the full text as
# one "word" and leaves the individual words unstemmed.
orig_train['item_description'] = (
    orig_train['item_description']
    .fillna('')
    .apply(remove_punctuation)
    .apply(remove_stopwords)
    .apply(to_lower)
    .apply(_stem_tokens)
)
# The item name is cleaned the same way but, as before, not stemmed.
orig_train['name'] = (
    orig_train['name']
    .fillna('')
    .apply(remove_punctuation)
    .apply(remove_stopwords)
    .apply(to_lower)
)

In [None]:
# check df again
# Preview the frame after the text columns were rewritten in the cell above.
orig_train.head()

In [None]:
# check item description
# Show a ten-row slice of the processed item descriptions.
orig_train['item_description'][115:125]

In [None]:
# tokenize item description
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Zero-price rows were dropped without resetting the index, so the label 120
# is not guaranteed to exist any more; pick the example row by position.
text1 = orig_train['item_description'].iloc[120]
tokens = word_tokenize(text1)
print(tokens)

In [None]:
# apply countVectorize to name, main category, category, sub category
category_levels = ['main_category', 'category', 'sub_category']

# The three category-level columns are not created anywhere earlier in this
# notebook, so derive them from category_name when they are missing.
# NOTE(review): assumes category_name is formatted "main/category/sub" - confirm against the data.
if not set(category_levels).issubset(orig_train.columns):
    split_levels = (
        orig_train['category_name']
        .fillna('Other/Other/Other')
        .str.split('/', n=2, expand=True)
        .reindex(columns=range(3))   # guarantee three columns even if no row has three parts
        .fillna('Other')
    )
    for position, level in enumerate(category_levels):
        orig_train[level] = split_levels[position]

# One vectorizer per column: re-fitting a single shared instance overwrites
# its vocabulary, so the column meanings of the earlier matrices would be lost.
count_vectorizers = {}
count_matrices = {}
for column in ['name'] + category_levels:
    count_vectorizers[column] = CountVectorizer(min_df=10)
    count_matrices[column] = count_vectorizers[column].fit_transform(orig_train[column])

x_name = count_matrices['name']
x_main_category = count_matrices['main_category']
x_category = count_matrices['category']
x_sub_category = count_matrices['sub_category']

# `cv` stays bound to the vectorizer fitted last (sub_category), as it was before.
cv = count_vectorizers['sub_category']

In [None]:
# Report the (rows, features) shape of each count matrix.
for label, matrix in (
    ("Item Name Shape: ", x_name),
    ("Main Category Shape: ", x_main_category),
    ("Category Shape: ", x_category),
    ("Sub Category Shape: ", x_sub_category),
):
    print(label + str(matrix.shape))

In [None]:
#### Count Vectorizer Example ####
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Convert to plain str so both code paths print the same list.
print([str(name) for name in feature_names])
# Dense view is fine here: the toy corpus is only four short documents.
print(X.toarray())

In [None]:
# apply LabelBinarizer to brand
lb = LabelBinarizer(sparse_output=True)
# Missing brands become the explicit label 'missing' (the same placeholder the
# reduced training frame uses); a mix of NaN and strings cannot be encoded.
brand_labels = orig_train['brand_name'].fillna('missing').astype(str)
x_brand = lb.fit_transform(brand_labels)
print("Item Brand Shape: " + str(x_brand.shape))

In [None]:
# Labels learned by the LabelBinarizer that was fitted on brand_name.
lb.classes_

In [None]:
# apply get_dummies to item_condition_id, shipping
# By default get_dummies only encodes object/string/category columns and passes
# numeric columns through untouched. Casting to str makes sure both columns are
# really one-hot encoded instead of being kept as raw numeric codes.
dummy_source = orig_train[["item_condition_id", "shipping"]].astype(str)
# An explicit integer dtype keeps the indicator matrix numeric on every pandas version.
x_dummies = csr_matrix(pd.get_dummies(dummy_source, sparse=True, dtype=np.uint8).values)
print("Dummy Shape: " + str(x_dummies.shape))

In [None]:
# TF-IDF features for the item description: unigrams and bigrams, English stop
# words removed, vocabulary capped at 55000 features.
# (TfidfVectorizer = CountVectorizer followed by TfidfTransformer.)
tv = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=55000,
)
x_description = tv.fit_transform(orig_train['item_description'])
print(f"Item Description Shape: {x_description.shape}")

In [None]:
# Build a table mapping every vocabulary term to its weight in tv.idf_.
# NOTE: tv.idf_ holds corpus-level inverse-document-frequency weights, not
# per-document TF-IDF scores; the `tfidf` name and column label are kept so
# the printed output and any later use stay unchanged.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on releases that predate it.
if hasattr(tv, 'get_feature_names_out'):
    feature_names = tv.get_feature_names_out()
else:
    feature_names = tv.get_feature_names()
tfidf = pd.DataFrame({'tfidf': tv.idf_}, index=feature_names)
# Lowest weights
print(tfidf.sort_values(by=['tfidf'], ascending=True).head(10))
# Highest weights
print(tfidf.sort_values(by=['tfidf'], ascending=False).head(10))

In [None]:
# Concatenate all feature matrices column-wise and store the result in CSR format.
feature_blocks = (
    x_dummies,
    x_description,
    x_brand,
    x_name,
    x_main_category,
    x_category,
    x_sub_category,
)
sparse_merge = hstack(feature_blocks).tocsr()

# run here

In [None]:
# Get 10% of the Training Data
train = pd.read_csv("/Users/yf/Documents/big data team project/mercari/train.tsv", sep = '\t')
# Rows with a price of 0 are excluded before sampling.
train = train[train.price != 0]
# A fixed seed makes the 10% sample, and every score computed from it,
# reproducible across kernel restarts.
reduced_X_train = train.sample(frac=0.1, random_state=42).reset_index(drop=True)
# Target is log1p(price), so an RMSE on it is an RMSLE on the raw price.
reduced_y_train = np.log1p(reduced_X_train['price'])

In [None]:
# Fast cleaning: fill missing values with placeholders and cast the
# category-like columns to str.
for column, placeholder in (('category_name', 'Other'), ('brand_name', 'missing')):
    reduced_X_train[column] = reduced_X_train[column].fillna(placeholder).astype(str)
for column in ('shipping', 'item_condition_id'):
    reduced_X_train[column] = reduced_X_train[column].astype(str)
reduced_X_train['item_description'] = reduced_X_train['item_description'].fillna('None')

In [None]:
# Row and column counts of the sampled training frame.
reduced_X_train.shape

# to here

In [None]:
%%time
# topic modeling + LDA

from sklearn.decomposition import LatentDirichletAllocation

# Initialize CountVectorizer
cvectorizer = CountVectorizer(max_features=20000,stop_words='english',lowercase=True)

# Fit it to our dataset
cvz = cvectorizer.fit_transform(reduced_X_train['item_description'])

# Initialize LDA Model with 10 Topics
lda_model = LatentDirichletAllocation(n_components=10,random_state=42)

# Fit it to our CountVectorizer Transformation
X_topics = lda_model.fit_transform(cvz)

# Define variables
n_top_words = 10
topic_summaries = []

# Get the topic words
topic_word = lda_model.components_

# Get the vocabulary from the text features
vocab = cvectorizer.get_feature_names()

# Display the Topic Models
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' | '.join(topic_words)))

In [None]:
# define rmsle with cross validation
def rmsle_cv(model):
    """Return the mean cross-validated RMSE of ``model`` on the description text.

    The target ``reduced_y_train`` is ``log1p(price)``, so this RMSE is the
    RMSLE of the price predictions.

    Parameters
    ----------
    model : estimator
        Unfitted or fitted scikit-learn estimator/pipeline that accepts raw
        text; ``cross_val_score`` clones it for every fold.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """
    # Pass the splitter itself. Calling .get_n_splits() on it only yields the
    # fold count as an int, which silently discards shuffle and random_state.
    kf = KFold(shuffle=True, random_state=42)
    # Score the estimator that was passed in, not whatever global `pipe`
    # happens to be bound at call time.
    neg_mse = cross_val_score(
        estimator=model,
        X=reduced_X_train["item_description"],
        y=reduced_y_train,
        scoring="neg_mean_squared_error",
        cv=kf,
    )
    rmse = np.sqrt(-neg_mse)
    return rmse.mean()

In [None]:
from sklearn.linear_model import Ridge
import eli5
# Baseline: raw token counts of the description fed into a Ridge regressor.
vec = CountVectorizer()
clf = Ridge(random_state=42)
pipe = make_pipeline(vec, clf).fit(reduced_X_train["item_description"], reduced_y_train)
cv_rmsle = rmsle_cv(pipe)
# Explain the fitted model's prediction for the description at row label 129.
eli5.show_prediction(estimator=clf, vec=vec, doc=reduced_X_train.loc[129, 'item_description'])

In [None]:
# Baseline: token counts with English stop words removed, then Ridge.
vec = CountVectorizer(stop_words='english')
clf = Ridge(random_state=42)
pipe = make_pipeline(vec, clf).fit(reduced_X_train['item_description'], reduced_y_train)
cv_sw_rmsle = rmsle_cv(pipe)
# Explain the fitted model's prediction for the description at row label 1297.
eli5.show_prediction(estimator=clf, vec=vec, doc=reduced_X_train.loc[1297, 'item_description'])

In [None]:
# Baseline: TF-IDF weighted description tokens, then Ridge.
vec = TfidfVectorizer()
clf = Ridge(random_state=42)
pipe = make_pipeline(vec, clf).fit(reduced_X_train["item_description"], reduced_y_train)
tfidf_rmsle = rmsle_cv(pipe)
# Explain the fitted model's prediction for the description at row label 1297.
eli5.show_prediction(estimator=clf, vec=vec, doc=reduced_X_train.loc[1297, 'item_description'])

In [None]:
# Baseline: TF-IDF with English stop words removed, then Ridge.
vec = TfidfVectorizer(stop_words='english')
clf = Ridge(random_state=42)
pipe = make_pipeline(vec, clf).fit(reduced_X_train["item_description"], reduced_y_train)
tfidf_sw_rmsle = rmsle_cv(pipe)
# Explain the fitted model's prediction for the description at row label 1297.
eli5.show_prediction(estimator=clf, vec=vec, doc=reduced_X_train.loc[1297, 'item_description'])

In [None]:
# Baseline: TF-IDF over unigrams and bigrams with English stop words removed, then Ridge.
vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
clf = Ridge(random_state=42)
pipe = make_pipeline(vec, clf).fit(reduced_X_train["item_description"], reduced_y_train)
tfidf_sw_ng_rmsle = rmsle_cv(pipe)
# Explain the fitted model's prediction for the description at row label 1297.
eli5.show_prediction(estimator=clf, vec=vec, doc=reduced_X_train.loc[1297, 'item_description'])

In [None]:
# Side-by-side RMSLE of the five baseline pipelines.
for score, label in (
    (cv_rmsle, " | CountVectorizer"),
    (cv_sw_rmsle, " | CountVectorizer | Stop Words"),
    (tfidf_rmsle, " | TF-IDF"),
    (tfidf_sw_rmsle, " | TF-IDF | Stop Words"),
    (tfidf_sw_ng_rmsle, " | TF-IDF | Stop Words | N-Grams"),
):
    print("RMSLE Score: " + str(score) + label)

# run here

In [None]:
from sklearn.pipeline import FeatureUnion

default_preprocessor = CountVectorizer().build_preprocessor()

def build_preprocessor(field):
    """Build a preprocessor that reads column ``field`` out of a row array.

    The returned callable takes one row of ``reduced_X_train.values``, picks
    the entry at the position of ``field`` in ``reduced_X_train.columns`` and
    passes it through CountVectorizer's default preprocessor.
    """
    field_idx = reduced_X_train.columns.tolist().index(field)

    def preprocess_row(row):
        return default_preprocessor(row[field_idx])

    return preprocess_row

# One vectorizer per input field. Each receives the full row and uses its own
# preprocessor to pick out the column it is responsible for.
vectorizer = FeatureUnion([
    # Item name: unigram and bigram counts, vocabulary capped at 50000 features
    ('name', CountVectorizer(
        ngram_range=(1, 2),
        max_features=50000,
        preprocessor=build_preprocessor('name'))),
    # '.+' matches the whole field, so each distinct value becomes one token
    ('category_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('category_name'))),
    ('brand_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('brand_name'))),
    # Raw strings: '\d' in a normal string literal is an invalid escape
    # sequence (a SyntaxWarning on Python 3.12+); the pattern itself is unchanged.
    ('shipping', CountVectorizer(
        token_pattern=r'\d+',
        preprocessor=build_preprocessor('shipping'))),
    ('item_condition_id', CountVectorizer(
        token_pattern=r'\d+',
        preprocessor=build_preprocessor('item_condition_id'))),
    # Description: TF-IDF over unigrams and bigrams, English stop words removed
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=55000,
        stop_words='english',
        preprocessor=build_preprocessor('item_description'))),
])

In [None]:
# Create Transformed Train Set
# The raw value array is passed row by row to every vectorizer in the
# FeatureUnion; each per-field preprocessor selects its own column from the row.
reduced_Xt_train = vectorizer.fit_transform(reduced_X_train.values)
reduced_Xt_train

In [None]:
# calculate rmsle
def get_rmsle(y, pred):
    """Return the root of the mean squared error between ``y`` and ``pred``.

    Callers in this notebook pass log1p-transformed prices, which makes the
    result an RMSLE on the original price scale.
    """
    mse = mean_squared_error(y, pred)
    return np.sqrt(mse)

In [None]:
%%time
# Ridge Cross Validation

# Create 3-Fold CV
cv = KFold(n_splits=3, shuffle=True, random_state=42)
for train_ids, valid_ids in cv.split(reduced_Xt_train):
    # Define Ridge Model
    model_ridge = Ridge(solver = "sag", fit_intercept=True, random_state=42)
    
    # Fit Ridge Model
    model_ridge.fit(reduced_Xt_train[train_ids], reduced_y_train[train_ids])
    
    # Predict & Evaluate Training Score
    y_pred_train = model_ridge.predict(reduced_Xt_train[train_ids])
    rmsle_train = get_rmsle(y_pred_train, reduced_y_train[train_ids])
    
    # Predict & Evaluate Validation Score
    y_pred_valid = model_ridge.predict(reduced_Xt_train[valid_ids])
    rmsle_valid = get_rmsle(y_pred_valid, reduced_y_train[valid_ids])
    
    print(f'Ridge Training RMSLE: {rmsle_train:.5f}')
    print(f'Ridge Validation RMSLE: {rmsle_valid:.5f}')

In [None]:
%%time
# Lasso Cross Validation

# Create 3-Fold CV
cv = KFold(n_splits=3, shuffle=True, random_state=42)
for train_ids, valid_ids in cv.split(reduced_Xt_train):
    # Define Lasso Model
    model_lasso = Lasso(fit_intercept=True, random_state=42)
    
    # Fit Lasso Model
    model_lasso.fit(reduced_Xt_train[train_ids], reduced_y_train[train_ids])
    
    # Predict & Evaluate Training Score
    y_pred_train = model_lasso.predict(reduced_Xt_train[train_ids])
    rmsle_train = get_rmsle(y_pred_train, reduced_y_train[train_ids])
    
    # Predict & Evaluate Validation Score
    y_pred_valid = model_lasso.predict(reduced_Xt_train[valid_ids])
    rmsle_valid = get_rmsle(y_pred_valid, reduced_y_train[valid_ids])
    
    print(f'Lasso Training RMSLE: {rmsle_train:.5f}')
    print(f'Lasso Validation RMSLE: {rmsle_valid:.5f}')

In [None]:
%%time
# ElasticNet Cross Validation

# Create 3-Fold CV
cv = KFold(n_splits=3, shuffle=True, random_state=42)
for train_ids, valid_ids in cv.split(reduced_Xt_train):
    # Define ElasticNet Model
    model_enet = ElasticNet(random_state=42)
    
    # Fit ElasticNet Model
    model_enet.fit(reduced_Xt_train[train_ids], reduced_y_train[train_ids])
    
    # Predict & Evaluate Training Score
    y_pred_train = model_enet.predict(reduced_Xt_train[train_ids])
    rmsle_train = get_rmsle(y_pred_train, reduced_y_train[train_ids])
    
    # Predict & Evaluate Validation Score
    y_pred_valid = model_enet.predict(reduced_Xt_train[valid_ids])
    rmsle_valid = get_rmsle(y_pred_valid, reduced_y_train[valid_ids])
    
    print(f'ElasticNet Training RMSLE: {rmsle_train:.5f}')
    print(f'ElasticNet Validation RMSLE: {rmsle_valid:.5f}')

In [None]:
%%time
# LightGBM Cross Validation

# Create 3-Fold CV
cv = KFold(n_splits=3, shuffle=True, random_state=42)
for train_ids, valid_ids in cv.split(reduced_Xt_train):
    # Define LGBM Model
    model_lgb = LGBMRegressor(num_leaves=31, n_jobs=-1, learning_rate=0.1, n_estimators=500, random_state=42)
    
    # Fit LGBM Model
    model_lgb.fit(reduced_Xt_train[train_ids], reduced_y_train[train_ids])
    
    # Predict & Evaluate Training Score
    y_pred_train = model_lgb.predict(reduced_Xt_train[train_ids])
    rmsle_train = get_rmsle(y_pred_train, reduced_y_train[train_ids])
    
    # Predict & Evaluate Validation Score
    y_pred_valid = model_lgb.predict(reduced_Xt_train[valid_ids])
    rmsle_valid = get_rmsle(y_pred_valid, reduced_y_train[valid_ids])
    
    print(f'LGBM Training RMSLE: {rmsle_train:.5f}')
    print(f'LGBM Validation RMSLE: {rmsle_valid:.5f}')

In [None]:
# Ensemble step1
# train_X / train_y / test_X / test_y are used by all ensemble cells but are
# not defined anywhere earlier in this notebook, so create the hold-out split
# here; a fresh kernel would otherwise fail with a NameError.
# NOTE(review): the 80/20 ratio is an assumption - adjust if another split was intended.
train_X, test_X, train_y, test_y = train_test_split(
    reduced_Xt_train, reduced_y_train, test_size=0.2, random_state=42
)

# Define LGBM Model
model_lgb = LGBMRegressor(num_leaves=31, n_jobs=-1, learning_rate=0.1, n_estimators=500, random_state=42)

# Fit LGBM Model on the training part of the split
model_lgb.fit(train_X, train_y)

# Predict the hold-out part (predictions are in log1p-price space)
lgbm_y_pred = model_lgb.predict(test_X)

In [None]:
# Ensemble step2
# Ridge regressor (lsqr solver) fitted on the training split
model_ridge = Ridge(solver = "lsqr", fit_intercept=True, random_state=42)
model_ridge.fit(train_X, train_y)

# Predict the hold-out split with the fitted Ridge model
ridge_y_pred = model_ridge.predict(test_X)

In [None]:
# Ensemble step3
# Blend: unweighted average of the Ridge and LGBM predictions
ensemble_y_pred = (ridge_y_pred + lgbm_y_pred) / 2

# Score the blended predictions against the hold-out target
ensemble_rmsle = get_rmsle(ensemble_y_pred, test_y)
print(f'Ensemble RMSLE: {ensemble_rmsle:.5f}')

In [None]:
# Prediction
# Undo the log1p transform for each model, then average the two predictions.
ensemble_y = (np.expm1(lgbm_y_pred)+np.expm1(ridge_y_pred))/2
# A cell only renders its last expression, so the bare `ensemble_y[200:220]`
# line never showed anything. Put predictions and back-transformed actual
# values side by side so both are displayed.
pd.DataFrame({
    'predicted_price': ensemble_y[200:220],
    'actual_price': np.asarray(np.expm1(test_y[200:220])),
})

# to here