In [None]:
import numpy as np
import pandas as pd
import ngram
from nlp_utils import stopwords, english_stemmer, stem_tokens, getTFV
import cPickle
import config

import re
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import decomposition, pipeline, metrics, grid_search
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from nltk.stem.porter import *
from nltk.metrics import edit_distance

from utility import correct_string

# Read dataset

In [None]:
# Load the raw tables from the local input/ directory.
df_train = pd.read_csv('input/train.csv', encoding='ISO-8859-1')
df_test = pd.read_csv('input/test.csv', encoding='ISO-8859-1')
df_pro_desc = pd.read_csv('input/product_descriptions.csv')
df_attr = pd.read_csv('input/attributes.csv')
# Keep only the "MFG Brand Name" attribute rows and expose the value as a
# `brand` column next to product_uid, ready to be merged per product.
df_brand = df_attr[df_attr.name == "MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "brand"})

# Number of training rows; used further down to split the combined frame back
# into train / test by position.
num_train = df_train.shape[0]

In [None]:
# Print the column / dtype / non-null summary of both raw frames.
df_train.info(), df_test.info()

# Feature Engineering

In [None]:
# Single shared Porter stemmer instance used by str_stem().
stemmer = PorterStemmer()

try:
    # Python 2: pandas returns `unicode` objects when read_csv is given an
    # encoding, so both text types must count as "a string".
    _STRING_TYPES = (str, unicode)
except NameError:
    # Python 3: `str` is the only text type.
    _STRING_TYPES = (str,)


def str_stem(s):
    """Normalise one text value: spell-correct, strip symbols, stem, lower-case.

    Parameters
    ----------
    s : object
        Usually a string; anything else (e.g. NaN left by a merge) is treated
        as missing.

    Returns
    -------
    str
        The space-joined stemmed words in lower case, or the literal "null"
        when `s` is not a string.
    """
    if not isinstance(s, _STRING_TYPES):
        return "null"
    s = correct_string(s)
    # Every character outside letters, digits, '-', '.', '/' becomes a space
    # before the word is handed to the stemmer.
    stemmed_words = [stemmer.stem(re.sub('[^A-Za-z0-9-./]', ' ', word))
                     for word in s.split(" ")]
    return " ".join(stemmed_words).lower()

## 1. Common words count

In [None]:
def str_common_word(str1, str2):
    """Count the tokens of `str1` that occur somewhere inside `str2`.

    Both inputs are lower-cased first. `str1` is split on whitespace and each
    token (repeats included) scores one point when it appears in `str2` as a
    substring; it does not have to match a whole word.
    """
    query_text, target_text = str1.lower(), str2.lower()
    return sum(1 for token in query_text.split() if token in target_text)

def str_whole_word(str1, str2, i_):
    """Count non-overlapping occurrences of `str1` inside `str2`.

    Both strings are lower-cased and stripped before matching, and the scan
    starts at offset `i_` of the stripped `str2`.

    Parameters
    ----------
    str1 : str
        Text to look for (the whole query, not individual words).
    str2 : str
        Text to search in.
    i_ : int
        Start offset for the scan.

    Returns
    -------
    int
        Number of non-overlapping matches; 0 when `str1` is empty after
        stripping.
    """
    needle, haystack = str1.lower().strip(), str2.lower().strip()
    if not needle:
        # An empty needle "matches" at every position without advancing; the
        # hand-written find() loop never terminated in that case.
        return 0
    # str.count scans left to right for non-overlapping matches, which is what
    # the manual find/advance loop computed.
    return haystack.count(needle, i_)

In [None]:
# Stack train on top of test (fresh 0..n-1 index) so every feature is computed
# once over both; the first num_train rows are the training rows.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
# Left-join the description text and the brand onto each row by product_uid.
# NOTE(review): the positional train/test split later on assumes these merges
# keep exactly one row per input row, i.e. product_uid is unique in both
# lookup tables — confirm.
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')
df_all = pd.merge(df_all, df_brand, how='left', on="product_uid")

In [None]:
# Shape of the combined frame after the merges.
df_all.shape

In [None]:
# Peek at the raw training rows.
df_train.head()

In [None]:
import time
print("Generate count features...")

start_time = time.time()

# Record which rows really have a description BEFORE stemming. str_stem()
# replaces every non-string value (e.g. NaN from the left merge) with the
# literal "null", so an isnull() check made after stemming can never be true.
has_description = df_all['product_description'].notnull()

# Normalise the text columns in place (spell-correct, stem, lower-case).
df_all['search_term'] = df_all['search_term'].map(str_stem)
df_all['product_title'] = df_all['product_title'].map(str_stem)
df_all['product_description'] = df_all['product_description'].map(str_stem)
df_all['brand'] = df_all['brand'].map(str_stem)

# Whitespace-token counts per text column.
df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_title'] = df_all['product_title'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_description'] = df_all['product_description'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_brand'] = df_all['brand'].map(lambda x:len(x.split())).astype(np.int64)

# Pack query / title / description into one tab-separated string so a single
# map() can compare the parts (index 0 = query, 1 = title, 2 = description).
df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title'] +"\t"+df_all['product_description']
# How many times the whole query occurs in the title / description.
df_all['query_in_title'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[1],0))
df_all['query_in_description'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[2],0))
# How many query tokens occur in the title / description.
df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))
df_all['ratio_title'] = df_all['word_in_title']/df_all['len_of_query']
df_all['ratio_description'] = df_all['word_in_description']/df_all['len_of_query']

# Same trick for query vs. brand.
df_all['attr'] = df_all['search_term']+"\t"+df_all['brand']
df_all['word_in_brand'] = df_all['attr'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
df_all['ratio_brand'] = df_all['word_in_brand']/df_all['len_of_brand']

# Integer id per distinct (stemmed) brand, numbered from 1 in order of first
# appearance. A separate name is used so the df_brand lookup table stays
# intact and the merge cell above can be re-run.
brand_to_id = {brand: idx for idx, brand in enumerate(pd.unique(df_all['brand']), 1)}
df_all['brand_feature'] = df_all['brand'].map(brand_to_id)
# Character length of the stemmed query.
df_all['search_term_feature'] = df_all['search_term'].map(len)

# whether has desc or not (1 = description present in the raw data)
df_all['isdesc'] = has_description.astype(np.int64)

## get product brand name
#brand_names = attribute_data[attribute_data.name == "MFG Brand Name"][['product_uid', 'value']].rename(columns={"value": "brand_name"})
#df_all = pd.merge(df_all, brand_names, how='left', on='product_uid')
#df_all.brand_name.fillna('Unknown', inplace=True)

## indoor/outdoor type
#product_type = attribute_data[attribute_data.name == "Indoor/Outdoor"][['product_uid', 'value']].rename(columns={"value": "product_type"})
#df_all = pd.merge(df_all, product_type, how='left', on='product_uid')
#df_all.product_type.fillna('Unknown', inplace=True)


#df_all.to_csv("df_all2.csv")  #no need to keep reprocessing for further grid searches
#df_all = df_all.drop(['search_term','product_title','product_description','product_info'],axis=1)
#df_all.head()

print("Calculating count feature cost--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Column / dtype / non-null summary after the count features were added.
df_all.info()

In [None]:
# List the columns so the drop list in the next cell can be checked.
df_all.columns

In [None]:
# Reduce df_all to the numeric count features: identifiers, the target and
# every text column are removed.
# NOTE(review): this rebinds df_all, and no later cell restores the text
# columns before extract_basic_distance_feat(df_all) reads 'search_term',
# 'product_title' and 'product_description'. A fresh top-to-bottom run would
# therefore fail there — confirm the intended cell order.
df_all = df_all.drop(['id','search_term','product_title','product_uid', 'relevance','product_description','product_info','attr','brand'],axis=1)

In [None]:
# dump common word feat
# Join the directory and file name with an explicit separator, the same way
# the other feature dumps under config.path_features are built; plain string
# concatenation only works when the configured path ends with a slash.
np.savetxt("%s/common_word_feat.txt" % config.path_features, df_all)

## 2. Jaccard coefficient

In [None]:
def try_divided(x, y, val=0.0):
    """Return `x / y` as a float, or `val` when `y` equals zero."""
    return val if y == 0.0 else float(x) / y

# Set-overlap measures used between the search term and the title / description.
def jaccardCoef(A, B):
    """Jaccard coefficient |A ∩ B| / |A ∪ B| of two iterables treated as sets.

    Returns 0.0 when both inputs are empty.
    """
    set_a, set_b = set(A), set(B)
    return try_divided(len(set_a & set_b), len(set_a | set_b))

def diceDist(A, B):
    """Dice coefficient 2·|A ∩ B| / (|A| + |B|) of two iterables treated as sets.

    Despite the name this is a similarity (1.0 for identical non-empty sets).
    Returns 0.0 when both inputs are empty.
    """
    set_a, set_b = set(A), set(B)
    shared = len(set_a & set_b)
    total_size = len(set_a) + len(set_b)
    return try_divided(2 * shared, total_size)

def compute_dist(A, B, dist="jaccard_coef"):
    """Dispatch to the set-overlap measure named by `dist`.

    Parameters
    ----------
    A, B : iterable
        Token collections to compare.
    dist : str
        "jaccard_coef" or "dice_dist".

    Returns
    -------
    float
        The value computed by jaccardCoef or diceDist.

    Raises
    ------
    ValueError
        If `dist` is not one of the supported names (previously this surfaced
        as an UnboundLocalError on the return statement).
    """
    if dist == "jaccard_coef":
        return jaccardCoef(A, B)
    if dist == "dice_dist":
        return diceDist(A, B)
    raise ValueError("unknown dist %r; expected 'jaccard_coef' or 'dice_dist'" % (dist,))

def pairwise_jaccard_coef(A, B):
    """Matrix of Jaccard coefficients between every item of `A` and of `B`.

    `A` and `B` must expose `.shape[0]` and integer indexing; entry (i, j) of
    the returned float array is jaccardCoef(A[i], B[j]).
    """
    coef = np.zeros((A.shape[0], B.shape[0]), dtype=float)
    for i in range(A.shape[0]):
        for j in range(B.shape[0]):
            coef[i,j] = jaccardCoef(A[i], B[j])
    return coef

# This function was previously also named pairwise_jaccard_coef, which silently
# replaced the Jaccard version above with Dice values.
def pairwise_dice_dist(A, B):
    """Matrix of Dice coefficients between every item of `A` and of `B`.

    Same contract as pairwise_jaccard_coef, with diceDist as the measure.
    """
    d = np.zeros((A.shape[0], B.shape[0]), dtype=float)
    for i in range(A.shape[0]):
        for j in range(B.shape[0]):
            d[i,j] = diceDist(A[i], B[j])
    return d

token_pattern = r"(?u)\b\w\w+\b"
def preprocess_data(line, token_pattern=token_pattern,encode_digit=False):
    """Tokenise one line of text and stem the tokens.

    Parameters
    ----------
    line : str
        Text to tokenise.
    token_pattern : str
        Regular expression whose matches are the tokens; the default keeps
        runs of two or more word characters.
    encode_digit : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    Whatever stem_tokens(tokens, english_stemmer) returns for the lower-cased
    tokens.
    """
    # Only the UNICODE flag is used. Combining it with LOCALE is contradictory:
    # on Python 2 it made \w depend on the process locale, and Python 3
    # rejects LOCALE on str patterns.
    tokenizer = re.compile(token_pattern, flags=re.UNICODE)
    # tokenize
    tokens = [x.lower() for x in tokenizer.findall(line)]
    # stem
    return stem_tokens(tokens, english_stemmer)

In [None]:
# Check which columns df_all currently carries before building distance features.
df_all.head()

In [None]:
def extract_basic_distance_feat(df):
    """Add n-gram columns and their pairwise set-overlap features to `df` in place.

    Reads the 'search_term', 'product_title' and 'product_description' columns.
    Adds '<name>_unigram', '<name>_bigram' and '<name>_trigram' for name in
    term / title / description, then one
    '<dist>_of_<gram>_between_<a>_<b>' column per measure, n-gram size and
    pair of names. Returns None.
    """
    join_str = "_"
    feat_names = ["term", "title", "description"]
    text_columns = ["search_term", "product_title", "product_description"]

    ## unigram: tokenise + stem each raw text column
    print("generate unigram")
    for name, text_column in zip(feat_names, text_columns):
        df[name + "_unigram"] = list(
            df.apply(lambda row: preprocess_data(row[text_column]), axis=1))

    ## bigram: built from the unigram lists
    print("generate bigram")
    for name in feat_names:
        df[name + "_bigram"] = list(
            df.apply(lambda row: ngram.getBigram(row[name + "_unigram"], join_str), axis=1))

    ## trigram: built from the unigram lists
    print("generate trigram")
    for name in feat_names:
        df[name + "_trigram"] = list(
            df.apply(lambda row: ngram.getTrigram(row[name + "_unigram"], join_str), axis=1))

    ## jaccard coef/dice dist of n-gram
    print("generate jaccard coef and dice dist for n-gram")
    # Every unordered pair, in the order (term, title), (term, description),
    # (title, description).
    name_pairs = [(first, second)
                  for pos, first in enumerate(feat_names)
                  for second in feat_names[pos + 1:]]
    for dist in ["jaccard_coef", "dice_dist"]:
        for gram in ["unigram", "bigram", "trigram"]:
            for target_name, obs_name in name_pairs:
                feature = "%s_of_%s_between_%s_%s"%(dist,gram,target_name,obs_name)
                left, right = target_name+"_"+gram, obs_name+"_"+gram
                df[feature] = list(
                    df.apply(lambda row: compute_dist(row[left], row[right], dist), axis=1))

In [None]:
import time
# Call form of print, matching the timing message at the end of this cell.
print("Generate distince features...")

start_time = time.time()
# Adds the n-gram and Jaccard / Dice columns to df_all in place; df_all must
# still contain the 'search_term', 'product_title' and 'product_description'
# text columns at this point.
extract_basic_distance_feat(df_all)

print("Calculating jaccard coef cost--- %s seconds ---" % (time.time() - start_time))

In [None]:
# List the columns after the distance features were added.
df_all.columns

In [None]:
# Keep 'id' and 'relevance': this frame is pickled as the Jaccard / Dice
# feature file, and the cells that load it drop 'relevance' themselves and
# merge on 'id'. Only the text, the n-gram lists and product_uid are removed.
df_all = df_all.drop(['product_title','product_uid','search_term','product_description'
                      ,'term_unigram', 'title_unigram', 'description_unigram'
                      ,'term_bigram', 'title_bigram', 'description_bigram'
                      ,'term_trigram', 'title_trigram', 'description_trigram'],axis=1)

In [None]:
# df_all['term_bigram']
# NOTE(review): this expression is not the last statement of the cell, so its
# value is not displayed.
df_all.shape

# dump df_all file
# Persist the Jaccard / Dice feature frame; it is read back further down as
# `jaccard_features`.
with open('features/jaccard_dice_dist_feat.pkl', 'wb') as f:
    cPickle.dump(df_all, f)

# 1. common word features

In [None]:
# load common word count features
# NOTE(review): no visible cell writes this pickle (the count features above
# are saved with np.savetxt under a different name), so it is presumably
# produced elsewhere — confirm its origin. Unpickling runs code from the file;
# only load trusted files.
with open('features/common_word_count_feat.pkl', 'rb') as f:
    df_all = cPickle.load(f)

In [None]:
# Remove the text columns from the loaded count-feature frame. 'id',
# 'relevance' and 'product_uid' are not in this list; later cells use them.
df_all = df_all.drop(['search_term','product_title','product_description','product_info','attr','brand'],axis=1)

In [None]:
# Columns of the count-feature frame after the text columns were dropped.
df_all.columns

In [None]:
# First rows of the count-feature frame.
df_all.head()

# 2. Jaccard Coef features

In [None]:
# Read back the Jaccard / Dice feature frame dumped earlier in this notebook.
# Unpickling runs code from the file; only load trusted files.
with open('features/jaccard_dice_dist_feat.pkl', 'rb') as f:
    jaccard_features = cPickle.load(f)

In [None]:
# Columns of the loaded distance features; 'id' and 'relevance' must be present
# for the next cells to work.
jaccard_features.columns

In [None]:
# Drop the target from the distance features; only 'id' is needed as the merge
# key and df_all already carries 'relevance'.
# NOTE(review): this raises KeyError if the pickle was written without a
# 'relevance' column — check against the drop list used before the dump.
jaccard_features = jaccard_features.drop(['relevance'], axis=1)

In [None]:
# First rows of the distance features.
jaccard_features.head()

In [None]:
# Compare the shapes of the two frames before merging them.
df_all.shape, jaccard_features.shape

In [None]:
# Attach the distance features to the count features by 'id' (left join).
# NOTE(review): the positional train/test split that follows relies on the
# merge keeping one row per input row — presumably 'id' is unique in
# jaccard_features; verify with the shape printed below.
df_all = pd.merge(df_all, jaccard_features, how='left', on='id')
df_all.shape

In [None]:
# Columns of the merged feature frame.
df_all.columns

In [None]:
# Split the combined frame back into train / test by position: the first
# num_train rows are the training rows. Independent copies are taken because
# both frames are modified in place afterwards (the co-occurrence columns are
# added to them), which on a plain iloc slice triggers pandas'
# SettingWithCopyWarning.
df_train = df_all.iloc[:num_train].copy()
df_test = df_all.iloc[num_train:].copy()
# Test ids, kept for the submission files.
id_test = df_test['id']

In [None]:
# cooccurrence terms column names
column_names = [
    "query_unigram_title_unigram",
    "query_unigram_title_bigram",
    "query_unigram_description_unigram",
    "query_unigram_description_bigram",
    "query_bigram_title_unigram",
    "query_bigram_title_bigram",
    "query_bigram_description_unigram",
    "query_bigram_description_bigram"
]
# feature names
feat_names = [name + "_tfidf" for name in column_names]
ngram_range = (1, 3)
svd_n_component = 100

# Generate co-occurrence tfidf feature
# NOTE(review): extract_cooccurrence_feature is neither defined nor imported
# in the visible part of this notebook; the loop below expects it to have
# added every column in `column_names` to the frame it is given — confirm
# where it comes from.
extract_cooccurrence_feature(df_train)
extract_cooccurrence_feature(df_test)

print("For training and testing...")

for feat_name, column_name in zip(feat_names, column_names):
    print("Generate %s feature" % feat_name)
    # Fit the vectoriser on train only and reuse it for test.
    tfv = getTFV(ngram_range=ngram_range)
    X_tfidf_train = tfv.fit_transform(df_train[column_name])
    X_tfidf_test = tfv.transform(df_test[column_name])
    # Protocol -1 = highest pickle protocol available.
    with open("%s/train_%s_feat.pkl" % (config.path_features, feat_name), "wb") as f:
        cPickle.dump(X_tfidf_train, f, -1)
    with open("%s/test_%s_feat.pkl" % (config.path_features, feat_name), "wb") as f:
        cPickle.dump(X_tfidf_test, f, -1)

    # SVD
    # Reduce each tf-idf matrix to svd_n_component dimensions, fitted on train.
    # NOTE(review): no random_state is passed, so the components are presumably
    # not reproducible between runs — confirm whether that matters.
    svd = TruncatedSVD(n_components=svd_n_component, n_iter=15)
    X_svd_train = svd.fit_transform(X_tfidf_train)
    X_svd_test = svd.transform(X_tfidf_test)
    with open("%s/train_%s_individual_svd%d_feat.pkl" % (config.path_features, feat_name, svd_n_component), "wb") as f:
        cPickle.dump(X_svd_train, f, -1)
    with open("%s/test_%s_individual_svd%d_feat.pkl" % (config.path_features, feat_name, svd_n_component), "wb") as f:
        cPickle.dump(X_svd_test, f, -1)


In [None]:
# First rows of the training feature frame.
df_train.head()

# 4. Cosine Similarity

In [None]:
# load generated cosine features
# These text files are read as-is; nothing in the visible notebook writes them.
X_cosine_feat_train = np.loadtxt('features/X_cosine_feat_train.txt')
X_cosine_feat_test = np.loadtxt('features/X_cosine_feat_test.txt')
# Type and shapes of the loaded matrices, for a quick sanity check.
print type(X_cosine_feat_train)
print X_cosine_feat_train.shape, X_cosine_feat_test.shape

# 5. Word2Vec Similarity

In [None]:
# Load the word2vec similarity features from text files; nothing in the
# visible notebook writes them.
X_w2v_feat_train = np.loadtxt('features/X_word2vec_sim_train.txt')
X_w2v_feat_test = np.loadtxt('features/X_word2vec_sim_test.txt')
# Type and shapes of the loaded matrices, for a quick sanity check.
print type(X_w2v_feat_train)
print X_w2v_feat_train.shape, X_w2v_feat_test.shape

# 6. Extended Query count features

In [None]:
# Load the extended-query count features; note these live under processed/
# rather than features/.
X_extquery_count_feat_train = np.loadtxt('processed/train_ext_counts_top10.txt')
X_extquery_count_feat_test = np.loadtxt('processed/test_ext_counts_top10.txt')
# Type and shapes of the loaded matrices, for a quick sanity check.
print type(X_extquery_count_feat_train)
print X_extquery_count_feat_train.shape, X_extquery_count_feat_test.shape

# Evaluation Method

In [None]:
from sklearn.metrics import mean_squared_error, make_scorer

def fmean_squarded_error(ground_truth, prediction):
    """Root-mean-squared error: the square root of sklearn's mean_squared_error."""
    mse = mean_squared_error(ground_truth, prediction)
    return mse ** 0.5

# Scorer for model selection. greater_is_better=False marks RMSE as a loss, so
# the scorer negates it and scores reported by a grid search are negative RMSE.
RMSE = make_scorer(fmean_squarded_error, greater_is_better=False)

In [None]:
# Shapes of the train / test feature frames.
print df_train.shape, df_test.shape

In [None]:
# Columns available before the model matrices are built.
df_train.columns

In [None]:
# Target vector and feature frames: identifiers and the target are removed,
# every remaining column is used as a feature.
y_train = df_train['relevance'].values
X_train = df_train.drop(['id', 'relevance','product_uid'], axis=1)
X_test = df_test.drop(['id', 'relevance','product_uid'], axis=1)
# Earlier variant that also dropped text / unigram columns:
#X_train = df_train.drop(['id', 'relevance','product_uid', 'product_title', 'search_term', 'term_unigram', 'title_unigram', 'description_unigram'], axis=1)
#X_test = df_test.drop(['id', 'relevance','product_uid', 'product_title', 'search_term', 'term_unigram', 'title_unigram', 'description_unigram'], axis=1)

In [None]:
# Feature columns that go into the model matrix.
X_train.columns

In [None]:
# Load an additional count-feature matrix per split; nothing in the visible
# notebook writes these files.
X_train_count = np.loadtxt('features/train_counts.txt')
X_test_count = np.loadtxt('features/test_counts.txt')

In [None]:
# Shapes of the DataFrame-based feature blocks.
print X_train.shape, X_test.shape

In [None]:
# All training blocks must have the same number of rows to be stacked side by side.
print X_train.shape, X_train_count.shape, X_cosine_feat_train.shape, X_w2v_feat_train.shape, X_extquery_count_feat_train.shape

In [None]:
# Same row-count check for the test blocks.
print X_test.shape, X_test_count.shape, X_cosine_feat_test.shape, X_w2v_feat_test.shape, X_extquery_count_feat_test.shape

In [None]:
# X_train is still a DataFrame here; the next cell converts it to an array.
type(X_train)

In [None]:
# merge features
# Convert the DataFrame features to arrays and append the externally loaded
# blocks column-wise, in the same order for train and test.
# NOTE(review): X_train / X_test are rebound from DataFrame to ndarray, so
# re-running this cell appends the extra blocks a second time.
X_train = np.array(X_train)
X_train = np.hstack((X_train, X_train_count, X_cosine_feat_train, X_w2v_feat_train, X_extquery_count_feat_train))
X_test = np.array(X_test)
X_test = np.hstack((X_test, X_test_count, X_cosine_feat_test, X_w2v_feat_test, X_extquery_count_feat_test))

In [None]:
# Final matrix shapes; the column counts of train and test must match.
X_train.shape, X_test.shape

# Train model: Random Forest

In [None]:
# Random forest wrapped in a one-step pipeline so grid parameters are addressed
# with the 'rfr__' prefix.
# NOTE(review): no random_state is set, so results presumably differ between
# runs — confirm whether a seed is wanted.
rfr = RandomForestRegressor()
clf = pipeline.Pipeline([('rfr', rfr)])
# Wider grid from an earlier search; not used by the GridSearchCV call below.
param_grid_old = {'rfr__n_estimators' : list(range(320, 400 ,1)), 'rfr__max_depth': list(range(8,10,1))}
# Single-point grid: the search below only cross-validates this one setting.
param_grid = {'rfr__n_estimators' : [350],#list(range(109,110,1)), 
              'rfr__max_depth': [8], #list(range(7,8,1))
            }
# 10-fold CV scored with the RMSE scorer defined above.
model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid,
                                 n_jobs = 2, cv = 10, verbose = 1, scoring=RMSE)
model.fit(X_train, y_train)

print("Best parameters found by grid search:")
print(model.best_params_)
# The RMSE scorer is built with greater_is_better=False, so this value is the
# negated RMSE.
print("Best CV score:")
print(model.best_score_)

y_pred = model.predict(X_test)

In [None]:
# Write the random-forest submission. Unlike the later submissions, these
# predictions are written without passing through output().
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission/rf_fe_addCosineW2cExt_20160210_tuned.csv', index=False)

# Train using GradientBoosting

In [None]:
# Gradient boosting with fixed hyper-parameters (no search).
# min_samples_split is 2, not 1: a node needs at least two samples to be split,
# so 2 is the smallest meaningful value, and current scikit-learn rejects
# integers below 2.
params = {'n_estimators': 200, 'max_depth': 7, 'min_samples_split': 2,
          'learning_rate': 0.2, 'loss': 'ls'}
clf = GradientBoostingRegressor(**params)
clf.fit(X_train, y_train)
y_pred_GBM = clf.predict(X_test)

In [None]:
def output(x):
    """Clamp a prediction to the range [1, 3].

    Values below 1 become 1, values above 3 become 3, and anything else is
    returned unchanged.
    """
    return 1 if x < 1 else (3 if x > 3 else x)

In [None]:
# Clamp the GBM predictions to [1, 3] and write the submission file.
result = [output(x) for x in y_pred_GBM]
pd.DataFrame({"id": id_test, "relevance": result}).to_csv('submission/gbm_fe_20160129_tune.csv', index=False)

# Train using Xgboost

In [None]:
# Wrap the matrices in xgboost's native container; only the training set
# carries labels.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)  

# Booster parameters for the native xgboost API.
# NOTE(review): 'n_estimators' is presumably ignored by xgb.train — the number
# of boosting rounds is the 599 passed to the call below; confirm and remove
# one of the two.
params ={
    'colsample_bytree': 0.7,
    'silent': 1,
    'eval_metric': 'rmse',
    'nthread': 8,
    'min_child_weight': 4.0,
    'n_estimators': 380.0,
    'subsample': 0.55,
    'eta': 0.03,
    'objective': 'reg:linear',
    'seed': 10,
    'max_depth': 6,
    'gamma': 0.75}

# Only referenced by the commented-out xgb.cv call below.
cv_nround=1000
cv_nfold=10
#bst_cv = xgb.cv(params, dtrain, nfold=cv_nfold, num_boost_round=cv_nround
#                , early_stopping_rounds=10, show_progress=True)
# Train on the full training set; the third positional argument (599) is the
# number of boosting rounds.
clf = xgb.train(params, dtrain, 599)
y_pred = clf.predict(dtest)

In [None]:
# Largest raw prediction; output() clamps anything above 3 before submission.
y_pred.max()

In [None]:
# Clamp the xgboost predictions to [1, 3] and write the submission file.
result = [output(x) for x in y_pred]
pd.DataFrame({"id": id_test, "relevance": result}).to_csv('submission/xgb_fe_20160210_tune_new.csv', index=False)

# Train using SVM regressor

In [None]:
from sklearn.svm import SVR
# grid_search is already imported at the top of the notebook; this repeat is harmless.
from sklearn import grid_search

# Support-vector regression, searched over kernel and C with 3-fold CV and the
# RMSE scorer.
# NOTE(review): the features are passed unscaled (StandardScaler is imported
# at the top but not applied here) — confirm whether scaling was intended.
svr = SVR()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

clf = grid_search.GridSearchCV(estimator = svr, param_grid = parameters,
                                 n_jobs = -1, cv = 3, verbose = 10, scoring=RMSE)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Clamp the SVR predictions to [1, 3] and write the submission file.
result = [output(x) for x in y_pred]
pd.DataFrame({"id": id_test, "relevance": result}).to_csv('submission/svm_fe_20160129_tune.csv', index=False)