## Importing Libraries

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import Ridge


## Loading Dataset

In [2]:
# Loading dataset
train = pd.read_csv(r'C:\Users\Ojas\Desktop\Mercari Price Suggestions\Mercari Price Suggestions\train.tsv', sep = '\t')
test = pd.read_csv(r'C:\Users\Ojas\Desktop\Mercari Price Suggestions\Mercari Price Suggestions\test.tsv', sep = '\t')

In [5]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [5]:
print("Training data size is: " + str(train.shape))

Training data size is: (1482535, 8)


There are total 1482535 observations in the training dataset.

In [7]:
# Exploring testing data
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


In [8]:
print("Test data size is: " + str(test.shape))


Test data size is: (693359, 7)


There are around 700,000 observations in the test dataset.

## Decreasing size of Training DataSet

Sampling only 10% of the dataset now, to improve processing speed.

In [6]:
decrease_train = train.sample(frac=0.1).reset_index(drop=True)
train= decrease_train

##  Data Cleaning 

In [8]:
train['category_name'] = train['category_name'].fillna('Other').astype(str)
train['brand_name'] = train['brand_name'].fillna('missing').astype(str)
train['shipping'] = train['shipping'].astype(str)
train['item_condition_id'] = train['item_condition_id'].astype(str)
train['item_description'] = train['item_description'].fillna('None')

### Target Variable in different dataframe


In [13]:
target = np.log1p(train['price'])

## Topic Modelling with LDA

# LDA – Latent Dirichlet Allocation 
Base of LDA is Probabilistic Graphical Models


### Use of Topic Modeling
There are several scenarios when topic modeling can prove useful. Here are some of them:

1. Text classification – Topic modeling can enhance classification by combining similar words together in topics rather than using each word as a feature
2. Recommender Systems – Using a similarity measure we can build recommender systems. If our system would recommend articles for readers, it will recommend articles with a topic structure similar to the articles the user has already read.
3. Discovering Themes in Texts – Useful for detecting trends in online publications for example

### Working of LDA
LDA is an iterative algorithm. Here are the two main steps:

1. In the initialization stage, each word is assigned to a random topic.
2. Iteratively, the algorithm goes through each word and reassigns the word to a topic taking into consideration: What’s the probability of the word belonging to a topic and What’s the probability of the document to be generated by a topic

Each topic in a document are percentages that all add up to 1.

In [14]:
# Importing Libraries
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, cross_val_score, train_test_split

from sklearn.linear_model import Ridge

In [15]:
from sklearn.decomposition import LatentDirichletAllocation

# Making changes to the countvectorizer function
cvec = CountVectorizer(max_features=55000, stop_words ='english', lowercase=True)
# Fitting to the dataset4
cvz = cvec.fit_transform(train['item_description'])
# Initializing LDA with 10 topics
lda_model = LatentDirichletAllocation(n_topics=10,random_state=42)

# Fitting to the CountVectorizer Transformation
x_topics = lda_model.fit_transform(cvz) 

# Defining Variables
n_top_words = 10
topic_summaries = []

# Getting the topic words
topic_word = lda_model.components_

# Getting the vocabulary from the text features
vocab = cvec.get_feature_names()

# Displaying the topic models
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'. format(i, ' | '.join(topic_words)))



Topic 0: condition | good | used | great | size | excellent | worn | wear | small | times
Topic 1: size | worn | cute | new | super | black | brand | fit | dress | small
Topic 2: rm | iphone | shipping | plus | free | quality | stickers | note | color | high
Topic 3: free | shipping | price | bundle | new | firm | items | save | check | brand
Topic 4: new | brand | box | used | oz | authentic | opened | sealed | body | set
Topic 5: shipping | new | gold | free | rm | item | items | silver | price | ship
Topic 6: size | new | brand | black | tags | worn | pink | white | blue | shirt
Topic 7: description | free | home | smoke | pet | used | lip | matte | lipstick | jeans
Topic 8: bag | skin | new | leather | use | pocket | hair | oil | used | black
Topic 9: case | iphone | new | phone | comes | works | brush | charger | screen | box


## Eli5

### Eli5  – Explain it Like I'm 5
- It's a library that allows you to see what our model has learned from the text features.
- Observing the features helps us to **understand how our classifier works**. 


### Use of Eli5
Looking at features helps to understand how classifier works. Maybe even more importantly, it helps us to notice preprocessing bugs, data leaks, issues with task specification - all these nasty problems that we get in a real world.

### Working of Eli5
It shows us the correlation of each feature/text with the target variable. We can observe features and weights because we’re using a bag-of-words vectorizer and a linear classifier (so there is a direct mapping between individual words and classifier coefficients). 

### Debugging Best Practices
- classifier assigns high weights to seemingly unrelated words like ‘do’ or ‘my’ -> Remove Stop Words

I think it might be more important to see what the model finds important and try to normalize maybe the top 10-30 tokens our particular model sees as important. Focusing on some of the top features and finding normalizations for that also mixed with some extra feature engineering has helped me to push my score a little bit farther.

## Analyzing Item Description with Eli5

In [16]:
# Defining RMSLE Cross Validation Function
def rmsle_cv(model):
    kf= KFold(shuffle=True, random_state=42).get_n_splits(train['item_description'])
    rmse= np.sqrt(-cross_val_score(model, train['item_description'], target, scoring ='neg_mean_squared_error', cv= kf))
    return(rmse.mean())

## Creating Baseline Model with CountVectorizer

In [17]:
from sklearn.linear_model import Ridge

vec= CountVectorizer()
rid= Ridge(random_state=42)
pipe = make_pipeline(vec,rid)
pipe.fit(train['item_description'], target)

cv_rmsle = rmsle_cv(pipe)
print("The Validation Score is : " + str(cv_rmsle))



The Validation Score is : 0.6678939455483531


In [18]:
#pip install eli5

In [19]:
import eli5
eli5.show_weights(pipe, vec=vec, top=100, feature_filter=lambda x: x != '<BIAS>') 

Weight?,Feature
+1.369,hatchimal
+0.925,substitutions
+0.923,deadstock
+0.916,hatchimals
+0.844,vnds
+0.806,louboutin
+0.804,dustbag
+0.781,64gb
+0.774,médium
+0.742,kitchenaid


In [20]:
eli5.show_prediction(rid, doc=train['item_description'][1297], vec=vec)

Contribution?,Feature
2.815,<BIAS>
0.015,Highlighted in text (sum)


## Baseline Model with CountVectorizer and Stopwords

In [21]:
vec = CountVectorizer(stop_words='english')
rid = Ridge(random_state=42)
pipe = make_pipeline(vec, rid)
pipe.fit(train['item_description'], target)

cv_sw_rmsle = rmsle_cv(pipe)

print("The Validation Score is: " + str(cv_sw_rmsle))



The Validation Score is: 0.6689016603920348


In [22]:
eli5.show_prediction(rid, doc=train['item_description'][1297], vec=vec)

Contribution?,Feature
2.817,<BIAS>
0.039,Highlighted in text (sum)


## Baseline Model with TF-IDF

In [23]:
vec = TfidfVectorizer()
rid = Ridge(random_state=42)
pipe = make_pipeline(vec, rid)
pipe.fit(train['item_description'], target)

tfidf_rmsle = rmsle_cv(pipe)

print("The Validation Score is: " + str(tfidf_rmsle))



The Validation Score is: 0.625677935586742


In [24]:
eli5.show_prediction(rid, doc=train['item_description'][1297], vec=vec)

Contribution?,Feature
2.707,<BIAS>
0.094,Highlighted in text (sum)


## Baseline Model with TFIDF and Stopwords

In [25]:
vec = TfidfVectorizer(stop_words='english')
rid = Ridge(random_state=42)
pipe = make_pipeline(vec, rid)
pipe.fit(train['item_description'], target)

tfidf_sw_rmsle = rmsle_cv(pipe)

print("The Validation Score is: " + str(tfidf_sw_rmsle))



The Validation Score is: 0.6274273429906468


In [26]:
eli5.show_prediction(rid, doc=train['item_description'][1297], vec=vec)

Contribution?,Feature
2.717,<BIAS>
0.12,Highlighted in text (sum)


## Baseline Model with TF-IDF, Stop words and N-Grams 

In [27]:
vec = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
rid = Ridge(random_state=42)
pipe = make_pipeline(vec, rid)
pipe.fit(train['item_description'], target)

tfidf_sw_ng_rmsle = rmsle_cv(pipe)

print("The Validation Score is: " + str(tfidf_sw_ng_rmsle))



The Validation Score is: 0.6152139172906265


In [28]:
eli5.show_prediction(rid, doc=train['item_description'][1297], vec=vec)

Contribution?,Feature
2.747,<BIAS>
0.115,Highlighted in text (sum)


## Summary of RMSLE

TF-IDF + Stop Words + N-Grams will probably work ultimately

In [29]:
print ("RMSLE Score: " + str(cv_rmsle) + " | CountVectorizer")
print ("RMSLE Score: " + str(cv_sw_rmsle) + " | CountVectorizer | Stop Words")
print ("RMSLE Score: " + str(tfidf_rmsle) + " | TF-IDF")
print ("RMSLE Score: " + str(tfidf_sw_rmsle) + " | TF-IDF | Stop Words")
print ("RMSLE Score: " + str(tfidf_sw_ng_rmsle) + " | TF-IDF | Stop Words | N-Grams")

RMSLE Score: 0.6678939455483531 | CountVectorizer
RMSLE Score: 0.6689016603920348 | CountVectorizer | Stop Words
RMSLE Score: 0.625677935586742 | TF-IDF
RMSLE Score: 0.6274273429906468 | TF-IDF | Stop Words
RMSLE Score: 0.6152139172906265 | TF-IDF | Stop Words | N-Grams


## Feature Transformation

It's super modular. So you have an estimator or a transformer, then you have a pipeline, then you connect more than one transformations together

In [30]:
from sklearn.pipeline import FeatureUnion

default_preprocessor = CountVectorizer().build_preprocessor()

def build_preprocessor(field):
    field_idx = list(train.columns).index(field)
    return lambda x: default_preprocessor((x[field_idx]))

vectorizer = FeatureUnion([
    ('name', CountVectorizer(
        ngram_range=(1, 2),
        max_features=50000,
        preprocessor=build_preprocessor('name'))),
    ('category_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('category_name'))),
    ('brand_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor('brand_name'))),
    ('shipping', CountVectorizer(
        token_pattern='\d+',
        preprocessor=build_preprocessor('shipping'))),
    ('item_condition_id', CountVectorizer(
        token_pattern='\d+',
        preprocessor=build_preprocessor('item_condition_id'))),
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=55000,
        stop_words='english',
        preprocessor=build_preprocessor('item_description'))),
])

## Modeling with 
<ul>
    <li>Ridge Regression</li>
    <li>LASSO Regression</li>
    <li>Light GBM</li></ul>

### Creating Transformed Training Set

In [31]:
train_transform = vectorizer.fit_transform(train.values)
train_transform

<148254x108389 sparse matrix of type '<class 'numpy.float64'>'
	with 4628446 stored elements in Compressed Sparse Row format>

### Defining RMSLE Function
It puts more penalty on lower errors.This is used when you want to penalize under estimates more than over estimates.
Lets have a look at the below example

Case a) : Pi = 600, Ai = 1000 RMSE = 400, RMSLE = 0.5108

Case b) : Pi = 1400, Ai = 1000 RMSE = 400, RMSLE = 0.3365

As it is evident, the differences are same between actual and predicted in both the cases. RMSE treated them equally however RMSLE penalized the under estimate more than over estimate.

In [32]:
def rmsle(y,pred): 
    return np.sqrt(mean_squared_error(y,pred))

## Ridge cross-validation

In [37]:
%%time

# Create 3-Fold CV
cv = KFold(n_splits=3, shuffle=True, random_state=42)
for train_ids, valid_ids in cv.split(train_transform):
    # Define Ridge Model
    model_ridge = Ridge(solver = "lsqr", fit_intercept=True, random_state=42)
    
    # Fitting Ridge Model
    model_ridge.fit(train_transform[train_ids], target[train_ids])
    
    # Predict & Evaluate Training Score
    y_pred_train = model_ridge.predict(train_transform[train_ids])
    rmsle_train = rmsle(y_pred_train, target[train_ids])
    
    # Predict & Evaluate Validation Score
    y_pred_valid = model_ridge.predict(train_transform[valid_ids])
    rmsle_valid = rmsle(y_pred_valid, target[valid_ids])
    
    print(f'Ridge Training RMSLE: {rmsle_train:.5f}')
    print(f'Ridge Validation RMSLE: {rmsle_valid:.5f}')



Ridge Training RMSLE: 0.29880
Ridge Validation RMSLE: 0.52480




Ridge Training RMSLE: 0.29764
Ridge Validation RMSLE: 0.52576




Ridge Training RMSLE: 0.29720
Ridge Validation RMSLE: 0.52761
Wall time: 27.7 s


## LASSO Cross Validation

In [34]:
%%time
from sklearn.linear_model import Lasso

# Create 3-Fold CV
cv = KFold(n_splits=3, shuffle=True, random_state=42)
for train_ids, valid_ids in cv.split(train_transform):
    # Defining LASSO Model
    model_LASSO = Lasso(fit_intercept=True, random_state=42)
    
    # Fittin LASSO Model
    model_LASSO.fit(train_transform[train_ids], target[train_ids])
    
    # Predict & Evaluate Training Score
    y_pred_train = model_LASSO.predict(train_transform[train_ids])
    rmsle_train = rmsle(y_pred_train, target[train_ids])
    
    # Predict & Evaluate Validation Score
    y_pred_valid = model_LASSO.predict(train_transform[valid_ids])
    rmsle_valid = rmsle(y_pred_valid, target[valid_ids])
    
    print(f'LASSO Training RMSLE: {rmsle_train:.5f}')
    print(f'LASSO Validation RMSLE: {rmsle_valid:.5f}')

LASSO Training RMSLE: 0.75036
LASSO Validation RMSLE: 0.75253
LASSO Training RMSLE: 0.75201
LASSO Validation RMSLE: 0.74923
LASSO Training RMSLE: 0.75088
LASSO Validation RMSLE: 0.75150
Wall time: 1min


### Comparison between RIDGE and LASSO
Here we observe that LASSO performed worse as compared to Ridge
- Ridge RMSLE: 0.53 
- LASSO RMSLE: 0.74

One reason would be that since LASSO performs automatic feature selection.We have to note that majority of our features are just words, so it'll remove some of our text features and it may not generalize well with new data because our dataset is supposed to capture and use all our words as features.

### LightGBM

Why LightGBM ?
- ‘Light’ because of its high speed. 
- Can handle the large size of data
- Can take lower memory to run
- Focuses on accuracy of results

When not to use LightGBM?
- When you have small data
- Prone to overfitting
- Needs a lot of tuning

**Reference:** https://medium.com/@pushkarmandot/https-medium-com-pushkarmandot-what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-60347819b7fc

## LGBM Cross Validation

In [36]:
%%time
import lightgbm as lgb

# Create 3-Fold CV
cv = KFold(n_splits=3, shuffle=True, random_state=42)
for train_ids, valid_ids in cv.split(train_transform):
    # Defining LGBM Model
    model_lgb = lgb.LGBMRegressor(num_leaves=31, n_jobs=-1, learning_rate=0.1, n_estimators=500, random_state=42)
    
    # Fit LGBM Model
    model_lgb.fit(train_transform[train_ids], target[train_ids])
    
    # Predict & Evaluate Training Score
    y_pred_train = model_lgb.predict(train_transform[train_ids])
    rmsle_train = rmsle(y_pred_train, target[train_ids])
    
    # Predict & Evaluate Validation Score
    y_pred_valid = model_lgb.predict(train_transform[valid_ids])
    rmsle_valid = rmsle(y_pred_valid, target[valid_ids])
    
    print(f'LGBM Training RMSLE: {rmsle_train:.5f}')
    print(f'LGBM Validation RMSLE: {rmsle_valid:.5f}')

LGBM Training RMSLE: 0.46697
LGBM Validation RMSLE: 0.51435
LGBM Training RMSLE: 0.46713
LGBM Validation RMSLE: 0.51549
LGBM Training RMSLE: 0.46521
LGBM Validation RMSLE: 0.51496
Wall time: 8min 11s


## Splitting the data into training and test set

In [41]:
X_train, X_test, y_train, y_test = train_test_split(train_transform, target, test_size=0.2, random_state=144)

## LGBM Model

In [43]:
import lightgbm as lgb
# Define LGBM Model
model_lgb = lgb.LGBMRegressor(num_leaves=31, n_jobs=-1, learning_rate=0.1, n_estimators=500, random_state=42)

# Fit LGBM Model
model_lgb.fit(X_train, y_train)

# Predict with LGBM Model
lgbm_y_pred = model_lgb.predict(X_test)

## Ridge Model

In [44]:
# Define Ridge Model
model_ridge = Ridge(solver = "lsqr", fit_intercept=True, random_state=42)
    
# Fit Ridge Model
model_ridge.fit(X_train, y_train)
    
# Evaluate Training Score
ridge_y_pred = model_ridge.predict(X_test)



## Ensemble Model

In [45]:
ensemble_y_pred = (lgbm_y_pred+ridge_y_pred)/2

ensemble_rmsle = rmsle(ensemble_y_pred, y_test)

print(f'Ensemble RMSLE: {ensemble_rmsle:.5f}')

Ensemble RMSLE: 0.48694


## Predictions

### Ensemble Predicitons without Inverse Log Transformation

In [46]:
ensemble_y_pred[0:20]

array([3.25186164, 2.63367992, 3.07723464, 1.95269428, 2.93990784,
       2.90984555, 2.56292745, 2.52583405, 2.86351405, 3.84182711,
       2.31098767, 2.92186991, 2.55366153, 4.92870263, 2.49677234,
       2.67134654, 3.4255612 , 3.29999116, 3.74340069, 2.44961912])

### Ensemble Predictions with Inverse Log Exponential

In [47]:
ensemble_y = (np.expm1(lgbm_y_pred)+np.expm1(ridge_y_pred))/2
ensemble_y[200:220]

array([25.95900705, 25.82932967,  8.2400863 , 32.36083608, 15.37796625,
       15.51959371,  9.89758971, 49.56851806, 24.44246037, 26.44206035,
       32.96775071, 16.20025029, 33.09597006, 15.33554484, 14.29512655,
        9.72598143, 14.61898409, 53.34967988, 44.5908025 , 11.52589466])

### Test Predictions with Inverse Log Exponential

In [48]:
np.expm1(y_test[200:220])

27389     39.0
15132     19.0
48407     10.0
109582    47.0
21767     24.0
106154    19.0
4959      18.0
101231    31.0
102148    40.0
19708     39.0
127741    56.0
48785     15.0
133352    36.0
72243     15.0
28311     10.0
6026      18.0
21722     13.0
125264    56.0
52726     42.0
39941     22.0
Name: price, dtype: float64