In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import gensim 
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer, PorterStemmer 
import string
import spacy
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
import xgboost as xgb
from collections import defaultdict

In [2]:
# Load the tab-separated train and test splits. Missing cells are replaced by
# empty strings right after reading, so every value in the text columns can be
# handled with string methods further down.
train_df = pd.read_csv('../input/new-575-train-test/new_train.csv', sep="\t").fillna('')
test_df = pd.read_csv('../input/new-575-train-test/new_test.csv', sep="\t").fillna('')

In [3]:
# Stack train on top of test (fresh 0..n-1 index) so both splits go through
# one preprocessing pass; the train rows stay first.
df_all = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [4]:
# Translation table that maps every ASCII punctuation character to None,
# i.e. str.translate(table) deletes punctuation outright (no space inserted).
table = str.maketrans("", "", string.punctuation)
# Shared Porter stemmer instance.
porter = nltk.PorterStemmer()
#nlp = spacy.load('en_core_web_lg')
# Number of training rows; marks where train ends and test begins in df_all.
num_train = len(train_df)
# Not referenced in the visible cells -- presumably a random seed; TODO confirm.
random_st = 1024

In [5]:
def process_3(text):
    """Normalise a raw text field into a space-joined token string.

    The text is lower-cased, every character covered by the module-level
    ``table`` is removed via ``str.translate``, and the result is split with
    NLTK's ``word_tokenize``.

    Args:
        text: The string to normalise.

    Returns:
        The tokens joined by single spaces.
    """
    cleaned = text.lower().translate(table)
    return " ".join(word_tokenize(cleaned))


In [6]:
#df_all = df_all[:10]

In [7]:
# Add a "<field>_tokens" column for each raw text field by running it through
# process_3. Columns are created in the same order as before.
for text_field in ("product_description", "product_title", "search_term", "attribute"):
    df_all[text_field + "_tokens"] = df_all[text_field].apply(process_3)

In [8]:
def _stem_alpha_tokens(token_text):
    """Porter-stem the purely alphabetic tokens of a space-joined token string.

    Tokens that contain any non-letter character (digits, mixed tokens such as
    "12gauge") are dropped; the surviving stems are re-joined with single spaces.
    """
    return " ".join(porter.stem(tok) for tok in token_text.split(" ") if tok.isalpha())


# Add a "<field>_stemmed" column next to every "<field>_tokens" column.
for text_field in ("product_description", "product_title", "search_term", "attribute"):
    df_all[text_field + "_stemmed"] = df_all[text_field + "_tokens"].apply(_stem_alpha_tokens)

In [9]:
# Preview the first five rows to eyeball the new token / stemmed columns.
df_all.head(n=5)

Unnamed: 0,attribute,id,product_description,product_title,product_uid,relevance,search_term,product_description_tokens,product_title_tokens,search_term_tokens,attribute_tokens,product_description_stemmed,product_title_stemmed,search_term_stemmed,attribute_stemmed
0,Versatile connector for various 90 connections...,2,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket,not only do angles make joints stronger they a...,simpson strongtie 12gauge angle,angle bracket,versatile connector for various 90 connections...,not onli do angl make joint stronger they also...,simpson strongti angl,angl bracket,versatil connector for variou connect and home...
1,Versatile connector for various 90 connections...,3,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,not only do angles make joints stronger they a...,simpson strongtie 12gauge angle,l bracket,versatile connector for various 90 connections...,not onli do angl make joint stronger they also...,simpson strongti angl,l bracket,versatil connector for variou connect and home...
2,"Brush,Roller,Spray 6.63 in 7.76 in 6.63 in Rev...",9,BEHR Premium Textured DECKOVER is an innovativ...,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over,behr premium textured deckover is an innovativ...,behr premium textured deckover 1gal sc141 tugb...,deck over,brushrollerspray 663 in 776 in 663 in revives ...,behr premium textur deckov is an innov solid c...,behr premium textur deckov tugboat wood and co...,deck over,brushrollerspray in in in reviv wood and compo...
3,Combo Tub and Shower No Includes the trim kit ...,16,Update your bathroom with the Delta Vero Singl...,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,update your bathroom with the delta vero singl...,delta vero 1handle shower only faucet trim kit...,rain shower head,combo tub and shower no includes the trim kit ...,updat your bathroom with the delta vero single...,delta vero shower onli faucet trim kit in chro...,rain shower head,combo tub and shower no includ the trim kit on...
4,Combo Tub and Shower No Includes the trim kit ...,17,Update your bathroom with the Delta Vero Singl...,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,update your bathroom with the delta vero singl...,delta vero 1handle shower only faucet trim kit...,shower only faucet,combo tub and shower no includes the trim kit ...,updat your bathroom with the delta vero single...,delta vero shower onli faucet trim kit in chro...,shower onli faucet,combo tub and shower no includ the trim kit on...


In [10]:
# Persist the combined, tokenised frame as TSV without the row index.
# `index=False` is the documented way to omit the index (the previous
# `index=None` only worked because None is falsy).
# NOTE(review): the filename is misspelt ("toeknized"); it is kept as-is here
# because other code may read this exact path -- rename in both places together.
df_all.to_csv("df_all_toeknized.csv", index=False, sep="\t")

In [11]:
# Cut the combined frame back into its original splits: the first `num_train`
# rows are the training rows, everything after is test.
train_df = df_all.iloc[:num_train]
test_df = df_all.iloc[num_train:]

# Write both feature tables as TSV without the row index. `index=False` is the
# documented boolean form (the previous `index=None` relied on None being falsy).
train_df.to_csv("feature_train.csv", index=False, sep="\t")
test_df.to_csv("feature_test.csv", index=False, sep="\t")

In [12]:
# For every product, collect the distinct words of the raw search terms that
# were judged relevant to it in the training data:
#   *_3 -> rows with relevance exactly 3
#   *_2 -> rows with 2.67 <= relevance < 3
# NOTE(review): these sets are derived from training labels; confirm that
# whatever consumes them downstream accounts for that.
product_expand_raw_search_3 = defaultdict(set)
product_expand_raw_search_2 = defaultdict(set)

# Iterate the three needed columns directly instead of DataFrame.iterrows(),
# which builds a full Series per row.
for uid, term, score in zip(
    train_df["product_uid"], train_df["search_term"], train_df["relevance"]
):
    # str.split() without an argument skips runs of whitespace, so repeated,
    # leading or trailing spaces no longer add an empty-string "word".
    if score == 3:
        product_expand_raw_search_3[uid].update(term.split())
    elif score >= 2.67:
        product_expand_raw_search_2[uid].update(term.split())

In [13]:
# One row per product: its uid and the relevance-3 search words joined by spaces.
# The words are sorted before joining because set iteration order for strings
# varies between interpreter runs (hash randomisation), which made the
# resulting column -- and the CSV written from it -- non-reproducible.
r_dt = [
    [uid, " ".join(sorted(words))]
    for uid, words in product_expand_raw_search_3.items()
]

product_expand_3 = pd.DataFrame(r_dt, columns=["product_uid", "expand_3"])

In [14]:
# One row per product: its uid and the search words from rows with
# 2.67 <= relevance < 3, joined by spaces.
# The words are sorted before joining because set iteration order for strings
# varies between interpreter runs (hash randomisation), which made the
# resulting column -- and the CSV written from it -- non-reproducible.
r_dt = [
    [uid, " ".join(sorted(words))]
    for uid, words in product_expand_raw_search_2.items()
]

product_expand_2 = pd.DataFrame(r_dt, columns=["product_uid", "expand_2"])

In [15]:
# Outer-join the two expansion tables on product_uid so a product present in
# only one of them is kept; the column it lacks is filled with "".
total_expand = pd.merge(
    product_expand_3,
    product_expand_2,
    on="product_uid",
    how="outer",
).fillna("")

In [16]:
# Save the per-product expansion table (comma-separated, no row index).
total_expand.to_csv(path_or_buf="product_expand.csv", index=False)