In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nitis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate and create the PyDrive client.
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)

In [0]:
# link = "https://drive.google.com/open?id=1yvfGNLWo3_yQBDkiHmiv9yMeXC2Gxhnn"

In [7]:
# fluff, id = link.split('=')
# print (id) # Verify that you have everything after '='

1yvfGNLWo3_yQBDkiHmiv9yMeXC2Gxhnn


In [0]:
# downloaded = drive.CreateFile({'id':id}) 
# downloaded.GetContentFile('train.tsv')  
# df = pd.read_csv('train.tsv', sep = '\t')
# # Dataset is now stored in a Pandas Dataframe

In [2]:
NUM_BRANDS = 2000
MAX_NUM_WORDS = 50

In [3]:
def rmsle(y_pred, y_test):
  """ root mean square logrithmic error """
  assert len(y_pred) == len(y_test)
  return np.sqrt(np.mean(np.power(y_pred-y_test ,2)))

def readData():
    df = pd.read_csv('..\\mercari-price-suggestion-challenge\\train.tsv', sep='\t', encoding='utf-8')
    df.drop(columns='train_id', inplace=True)
    return df

def fillMissingData(df):
  """ fills the NA values in the dataset """
  df.brand_name.fillna(value='', inplace=True)
  df.category_name.fillna(value='//', inplace=True)
  df.item_description.fillna(value='', inplace=True)
  df.item_description = df.item_description.apply(lambda x : x.replace("No description yet", ''))
  print("*******Filled Missing Data*******")
    
# Creating three new columns to separate the categories 
def splittingCategories(df):
    df['primary_category'] = df.category_name.apply(lambda cat:cat.split('/')[0].strip())
    df['sub_category1'] = df.category_name.apply(lambda cat:cat.split('/')[1].strip())
    df['sub_category2'] = df.category_name.apply(lambda cat:cat.split('/')[2].strip())
    return df
    
def fillBrandNames(x):
    """Fills the brand names using the name column of the dataset"""
    try:
        nameList = []
        for i in [4,3,2,1]:
            temp = [' '.join(n_grams) for n_grams in ngrams(x.split(' '), i) if ' '.join(n_grams) in brands] 
        if len(temp)>0:
            nameList.append(temp)
        if len(nameList)>0:
            return nameList[0]
        else:
            return ''
    except:
        return ''
    
### Cleaning Text fields ###
def cleanText(text):
    try:
        text = ' '.join([w for w in text.split()[:MAX_NUM_WORDS]])
        text = text.lower()
        text = re.sub(u"\u2019", u"'", text)
        text = re.sub(u"\xed", u"i", text)
        text = re.sub(u"w\/", u" with ", text)
        text = re.sub(u"é", u"e", text)
        text = re.sub(u"ē", u"e", text)
        text = re.sub(u"è", u"e", text)
        text = re.sub(u"ê", u"e", text)
        text = re.sub(u"à", u"a", text)
        text = re.sub(u"â", u"a", text)
        text = re.sub(u"ô", u"o", text)
        text = re.sub(u"ō", u"o", text)
        text = re.sub(u"ü", u"u", text)
        text = re.sub(u"ï", u"i", text)
        text = re.sub(u"ç", u"c", text)
        text = re.sub(u"[^a-z0-9]", " ", text)
        text = u" ".join(re.split("(\d+)", text))
        text = re.sub(u"\s+", u" ", text).strip()
        text = "".join(text)
    except:
        text = ""
    return text

def encoding(df, train):
    if train:
        cols = ['brand_name', 'primary_category', 'sub_category1', 'sub_category2']
        for col in cols:
            temp = df[col].unique()
            label_encoder = LabelEncoder()
            df[col] = label_encoder.fit_transform(df[col])
            label_dict[col] = (label_encoder, temp)
    else:
        cols = ['brand_name', 'primary_category', 'sub_category1', 'sub_category2']
        for col in cols:
            idx = label_dict[col][1]
            df.loc[~df[col].isin(idx), col] = ''
            df[col] = label_dict[col][0].transform(df[col])
    print("*******Finished encoding******")
            
def remove_stopWords(text):
    text = ' '.join([word for word in text.split() if not word in stopWords])
    return text

def preprocessData(df, train):
    print("##################Preparing data#############")
    #df = readData()
    fillMissingData(df)
    df.name = df.name.apply(cleanText)
    df.brand_name = df.brand_name.apply(cleanText)
    df.item_description = df.item_description.apply(cleanText)
    df = splittingCategories(df)
    df.loc[df.brand_name == "", 'brand_name'] = df.loc[df.brand_name == "", 'name'].apply(fillBrandNames)
     
    for col in ['name', 'category_name', 'brand_name', 'item_description', 'primary_category', 'sub_category1', 'sub_category2']:
        df[col] = df[col].astype(str)
    
    ### remove stop words from name and description
    for col in ['name', 'item_description']:
        df[col] = df[col].apply(remove_stopWords)
        
    ### create a new col by adding pri_cat + sub_cat1 + sub_cat2 + brandname + name + description[0:5]
    ### remove extra spaces n strip
    df['text_info'] = df.primary_category +" "+ df.sub_category1 +" "+ df.sub_category2 +" "+ df.brand_name +" "+ df.name
    df.text_info = df.text_info +" "+df.item_description.apply(lambda x : " ".join(x.split()[:5]))
    df.text_info = df.text_info.apply(lambda x : re.sub(u"\s+", u" ", x).strip())
    
    # label encode brand name, category name, primary cat, sub cat 1, sub cat 2
    encoding(df, train=True)
    
    ### change item condition id to 0 to 1 range
    df.item_condition_id = df.item_condition_id/5.0
    print("#########Preprocessed Data#########")
    return df


In [4]:
def createOneVocab(df):
    for col in ['name', 'item_description']:
        vTemp = [x.split() for x in df.name]
        vocab = [elem for listOfList in vTemp for elem in listOfList]
    ### remove all words whose freq is less than 3
    freq = Counter(vocab)
    vocab = list(set([elem for elem in vocab if freq[elem]>3]))
    ### delete all the words whose length is less than 3
    print("*********Finished creating 1-gram vocab***************")
    return vocab

In [5]:
########## Experimenting with Bigrams ################
# from itertools import combinations
# def getBiGrams(sentence):
#     try:
#         sentence = np.unique(sentence.split())
#         bigramList = []
#         for comb in combinations(sentence,2):
#             c1 = comb[0] + comb[1]
#             c2 = comb[1] + comb[0]
#             alreadyInList = False
#             if c1 in vocab_onegram:
#               newWord = c1
#             if c2 in vocab_onegram:
#               newWord = c2
#             if alreadyInList == False:
#               newWord = c1
#             if len(c1)>0:
#               bigramList.append(newWord)
#         return ' '.join(bigramList)
#     except:
#         return ' '

In [6]:
originalData = readData()
originalData = shuffle(originalData, random_state=0)
originalData.reset_index(inplace=True, drop=True)
y_original = originalData.price
X_original = originalData
X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(X_original, y_original, test_size=0.3, random_state=42)

brands = X_train_original.groupby('brand_name').size()
stopWords = stopwords.words('english')
label_dict = dict()

X_train_preprocessed = preprocessData(X_train_original, train=True)
# =============================================================================
y_train_scaled = np.log1p(y_train_original)
# =============================================================================
################################Data Preprocessed#############################################
### vocabulary for vectorization
vocab_onegram = createOneVocab(X_train_preprocessed)


# X_train_preprocessed['bigrams'] = X_train_preprocessed['text_info'].apply(getBiGrams)
# vocab_bigram = createBigramVocab(X_train_preprocessed)


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\nitis\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-bfa5df341b45>", line 2, in <module>
    originalData = shuffle(originalData, random_state=0)
  File "C:\Users\nitis\Anaconda3\lib\site-packages\sklearn\utils\__init__.py", line 403, in shuffle
    return resample(*arrays, **options)
  File "C:\Users\nitis\Anaconda3\lib\site-packages\sklearn\utils\__init__.py", line 327, in resample
    resampled_arrays = [safe_indexing(a, indices) for a in arrays]
  File "C:\Users\nitis\Anaconda3\lib\site-packages\sklearn\utils\__init__.py", line 327, in <listcomp>
    resampled_arrays = [safe_indexing(a, indices) for a in arrays]
  File "C:\Users\nitis\Anaconda3\lib\site-packages\sklearn\utils\__init__.py", line 205, in safe_indexing
    return X.iloc[indices]
  File "C:\Users\nitis\Anaconda3\lib\site-packages\pandas\core\indexing.p

KeyboardInterrupt: 

In [None]:
### Caluculate the mean price for each category and brand name and save it in a dictionary
# Can implement the grouping with all the cols combined
meanPriceDict = {}
for col in ['primary_category', 'sub_category1', 'sub_category2', 'brand_name']:
    meanPriceDict[col] = X_train_preprocessed.groupby('primary_category')['price'].mean()
    meanPriceDict[col] /= max(meanPriceDict[col])
    X_train_preprocessed['MeanPrice_'+col] = X_train_preprocessed[col].map(meanPriceDict[col])
    X_train_preprocessed['MeanPrice_'+col].fillna(meanPriceDict[col].mean(), inplace=True)
    
def tokenize(text):
    return [word for word in text.split()]

### Vectorizing the text
nameVectorizer = TfidfVectorizer(vocabulary = vocab_onegram, tokenizer=tokenize)
vectorizedName = nameVectorizer.fit_transform(X_train_preprocessed.name)
print(vectorizedName.shape)
itemVectorizer = TfidfVectorizer(vocabulary=vocab_onegram, tokenizer=tokenize)
vectorizedItem = itemVectorizer.fit_transform(X_train_preprocessed.item_description)

# Text info vectorized
textInfoVectorizer = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1,2), max_features=40000, min_df=3)
vectorizedTextInfo = textInfoVectorizer.fit_transform(X_train_preprocessed.text_info)


In [24]:
###############################################################################
colsToKeep = ['item_condition_id', 'shipping', 'MeanPrice_primary_category',
              'primary_category', 'sub_category1', 'sub_category2', 'brand_name',
              'MeanPrice_sub_category1', 'MeanPrice_sub_category2', 'MeanPrice_brand_name']
# =============================================================================
y_train = y_train_scaled.values
# y_train = y_train_original.values
# =============================================================================
X_train = hstack((X_train_preprocessed[colsToKeep].values, vectorizedName, vectorizedItem, vectorizedTextInfo)).tocsr()
# X_train = hstack((X_train_preprocessed[colsToKeep].values, vectorizedTextInfo)).tocsr()
print("Train matrix size ",X_train.shape)

Train matrix size  (1037774, 86538)


In [25]:
####### Linear Regression #########
print("************Starting Model Training*************")
linReg = LinearRegression()
linReg.fit(X_train, y_train)
y_pred = linReg.predict(X_train)
print("***************Linear Regression****************")
print("RMSLE is ",rmsle(y_pred, y_train))
print("R squared value ",r2_score(y_pred, y_train))


# ridge = Ridge(alpha=1.0)
# ridge.fit(X_train, y_train)
# y_pred = ridge.predict(X_train)
# print("***************Ridge Regression****************")
# print("RMSLE is ",rmsle(y_pred, y_train))


# lasso = Lasso(alpha=1.0)
# lasso.fit(X_train, y_train)
# y_pred = lasso.predict(X_train)
# print("***************Lasso Regression****************")
# print("RMSLE is ",rmsle(y_pred, y_train))




************Starting Model Training*************
***************Linear Regression****************
RMSLE is  0.4231795075211465
R squared value  0.5317033003644911


In [0]:
import lightgbm as lgb
params = {
        'learning_rate': 0.57,
        'application': 'regression',
        'max_depth': 5,
        'num_leaves': 32,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.65,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
}
d_train = lgb.Dataset(X_train, label=y_train)
lgmb = lgb.train(params, train_set=d_train, num_boost_round=3000, verbose_eval=1000)


In [27]:
y_pred = lgmb.predict(X_train)
print("***********Light GBM**********")
print("RMSLE is ",rmsle(y_pred, y_train))


***********Light GBM**********
RMSLE is  0.4359617248095056
R squared value  0.49459254868020686


In [28]:
# Preparing Test Data

X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(X_original, y_original, test_size=0.3, random_state=42)

label_dict = dict()

X_test_preprocessed = preprocessData(X_test_original, train=False)
# =============================================================================
y_test_scaled = np.log1p(y_test_original)
# =============================================================================
################################Data Preprocessed#############################################


for col in ['primary_category', 'sub_category1', 'sub_category2', 'brand_name']:
    X_test_preprocessed['MeanPrice_'+col] = X_test_preprocessed[col].map(meanPriceDict[col])
    X_test_preprocessed['MeanPrice_'+col].fillna(meanPriceDict[col].mean(), inplace=True)


### Vectorizing the text
vectorizedNameTest = nameVectorizer.transform(X_test_preprocessed.name)
print(vectorizedNameTest.shape)
vectorizedItemTest = itemVectorizer.transform(X_test_preprocessed.item_description)
print(vectorizedItemTest.shape)

vectorizedTextInfoTest = textInfoVectorizer.transform(X_test_preprocessed.text_info)
print(vectorizedTextInfoTest.shape)




##################Preparing data#############


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


*******Filled Missing Data*******


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

*******Finished encoding******
#########Preprocessed Data#########


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(444761, 23264)
(444761, 23264)
(444761, 40000)


In [29]:
y_test = y_test_scaled.values
X_test = hstack((X_test_preprocessed[colsToKeep].values, vectorizedNameTest, vectorizedItemTest, vectorizedTextInfoTest)).tocsr()
# X_test = hstack((X_test_preprocessed[colsToKeep].values, vectorizedTextInfoTest)).tocsr()

print("Test matrix size ",X_test.shape)



Test matrix size  (444761, 86538)


In [30]:
####### Linear Regression Test Score #########
print("************Starting Model Training*************")
y_pred = linReg.predict(X_test)
print("***************Linear Regression****************")
print("RMSLE is ",rmsle(y_pred, y_test))


************Starting Model Training*************
***************Linear Regression****************
RMSLE is  0.4667819772888975


In [31]:
####### Light GBM Test Score #########
y_pred = lgmb.predict(X_test)
print("***********Light GBM**********")
print("RMSLE is ",rmsle(y_pred, y_test))


***********Light GBM**********
RMSLE is  0.47105998281877476
