## Start-up -- **Load saved (and preprocessed) data from pickle files**

In [61]:
## Some imports first
import pandas as pd
import numpy as np

### 1. Read saved (serialized) data

In [62]:
PATH = '../input/csc-575-hw5-winter-2024/'
# saved pickle files
pkl_train_x = 'train_x.pkl'
pkl_train_y = 'train_y.pkl'
pkl_test = 'test.pkl'

train_x = pd.read_pickle(f'{PATH}{pkl_train_x}')
train_y = pd.read_pickle(f'{PATH}{pkl_train_y}')
test = pd.read_pickle(f'{PATH}{pkl_test}')

In [63]:
train_x.head(100)

Unnamed: 0,id,product_uid,product_title,search_term,product_description,attributes
0,2,100001,"[simpson, strongti, 12gaug, angl]","[angl, bracket]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
1,3,100001,"[simpson, strongti, 12gaug, angl]","[l, bracket]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
2,9,100002,"[behr, premium, textur, deckov, 1gal, sc141, t...",[deck],"[behr, premium, textur, deckov, innov, solid, ...","[applic, method, brushrollerspray, assembl, de..."
3,16,100005,"[delta, vero, 1handl, shower, faucet, trim, ki...","[rain, shower, head]","[updat, bathroom, delta, vero, singlehandl, sh...","[bath, faucet, type, combo, tub, shower, built..."
4,17,100005,"[delta, vero, 1handl, shower, faucet, trim, ki...","[shower, faucet]","[updat, bathroom, delta, vero, singlehandl, sh...","[bath, faucet, type, combo, tub, shower, built..."
...,...,...,...,...,...,...
95,343,100057,"[owen, corn, r19, kraft, face, insul, batt, 15...","[fiberglass, insul]","[ecotouch, insul, reinvent, fiberglass, insul,...","[bullet01, dimens, 614, x, 15, x, 93, 8, piec,..."
96,346,100057,"[owen, corn, r19, kraft, face, insul, batt, 15...","[owen, corn, 73]","[ecotouch, insul, reinvent, fiberglass, insul,...","[bullet01, dimens, 614, x, 15, x, 93, 8, piec,..."
97,347,100057,"[owen, corn, r19, kraft, face, insul, batt, 15...","[r, 15]","[ecotouch, insul, reinvent, fiberglass, insul,...","[bullet01, dimens, 614, x, 15, x, 93, 8, piec,..."
98,350,100057,"[owen, corn, r19, kraft, face, insul, batt, 15...","[r19, insul]","[ecotouch, insul, reinvent, fiberglass, insul,...","[bullet01, dimens, 614, x, 15, x, 93, 8, piec,..."


In [64]:
train_y.head()

0    3.00
1    2.50
2    3.00
3    2.33
4    2.67
Name: relevance, dtype: float64

In [65]:
test.head()

Unnamed: 0,id,product_uid,product_title,search_term,product_description,attributes
0,4,100001,"[simpson, strongti, 12gaug, angl]","[metal, l, bracket]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
1,5,100001,"[simpson, strongti, 12gaug, angl]","[simpson, sku, abl]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
2,6,100001,"[simpson, strongti, 12gaug, angl]","[simpson, strong, tie]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
3,7,100001,"[simpson, strongti, 12gaug, angl]","[simpson, strong, tie, hcc668]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
4,10,100003,"[sterl, ensembl, 3314, x, 60, x, 7514, bath, s...","[bath, shower, kit]","[classic, architectur, meet, contemporari, des...","[builtin, flang, ye, bullet01, slightli, narro..."


In [66]:
#import 1. Count Vectorizer and tfidf transformer for building tfidf matrices
#       2. cosine_similarity for calculating cosine of documents vectors and search term vectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

#builds a common vocabulary for all documents and search-term for construction of matrices.
def buildVocab(train_x):
    #transform the columns to a list of documents for each field. list_ = a list tokenized documents(each doucment is a list of tokens)
    list_a = train_x["product_title"].tolist()
    list_b = train_x["search_term"].tolist() #searchterm documents
    list_c = train_x["product_description"].tolist()
    list_d = train_x["attributes"].tolist()
    #Access the combined tokens as a single string
    string1 = ' '.join(' '.join(sublist) for sublist in list_a)
    string2 = ' '.join(set(string1.split())) #remove all the duplicates.
    
    string5 = ' '.join(' '.join(sublist) for sublist in list_c)
    string6 = ' '.join(set(string5.split()))

    string7 = ' '.join(' '.join(sublist) for sublist in list_d)
    string8 = ' '.join(set(string7.split()))
    combinedstring = string2  + " " + string6 + " " + string8
    cs = list(set(combinedstring.split())) #the entire vocabulary
    print("size of the vocabulary: ", len(cs))
    
    return cs

#This function constructs a matrix for product-title, description and attributes
# the tfidfTransformer() uses parameter 1.) sublinear_tf = True (for applying 1+log(tf) formula)                                      
def constructTfidfMatForX(cs, list_a):
    list1 = [', '.join(map(str, sublist)) for sublist in list_a] 
    vectorizer = CountVectorizer(vocabulary = cs)
    X = vectorizer.fit_transform(list1) #tf
    tfidf_transformer1 = TfidfTransformer( smooth_idf=False,sublinear_tf=True)
    tfidf_matrix1 = tfidf_transformer1.fit_transform(X)
    return tfidf_matrix1

#This function constructs a matrix for search term
# the tfidfTransformer() uses parameter 1.) sublinear_tf = True (for applying 1+log(tf) formula)   
#                                       2.) use_idf = False (to set idf to 1 for search term)
def constructTfidfMatForY(cs, list_b):
    list3 = [', '.join(map(str, sublist)) for sublist in list_b]
    vectorizer1 = CountVectorizer(vocabulary = cs)
    Y = vectorizer1.fit_transform(list3)
    # print(vectorizer1.get_feature_names_out().size)
    print("Search term:")
    tfidf_transformer = TfidfTransformer(use_idf=False,smooth_idf=False, sublinear_tf=True)
    tfidf_matrix = tfidf_transformer.fit_transform(Y)
    return tfidf_matrix
    
#calculates cosine scores of given matrices between corresponding rows and appends the scores to a list
def calcCosine(tfidf_matrix1, tfidf_matrix):
    
    jo = []
    # print((tfidf_matrix1[1,:]))
    for i in range(tfidf_matrix1.shape[0]): 
        co = cosine_similarity(tfidf_matrix1[i,:],tfidf_matrix[i,:] ).tolist()[0][0]

        jo.append(co)
    return jo    
    



In [67]:
#build the vocabulary for training data
vocab = buildVocab(train_x)

size of the vocabulary:  223878


In [None]:
#calculate tfidf matrix for product title
list_a = train_x["product_title"].tolist()
productTitleTfidf = constructTfidfMatForX(vocab,list_a)


In [69]:
#calculate tfidf matrix for product description
list_c = train_x["product_description"].tolist()
proDescriptionTfidf = constructTfidfMatForX(vocab,list_c)

In [70]:
#calculate tfidf matrix for attributes
list_d = train_x["attributes"].tolist()
proAttrTfidf = constructTfidfMatForX(vocab,list_d)

In [71]:
#calculate tfidf matrix for search term
list_b = train_x["search_term"].tolist() #searchterm documents
searchQueryTfidf = constructTfidfMatForY(vocab, list_b)

Search term:


In [72]:
#calculate cosine scores for product title, description, attributes matrices- each seperately with searchterm matrix 
titleQueryScore = calcCosine(productTitleTfidf, searchQueryTfidf)
print(titleQueryScore[:10])
descriptionQueryScore = calcCosine(proDescriptionTfidf, searchQueryTfidf)
print(descriptionQueryScore[:10])
attrQuerytScore = calcCosine(proAttrTfidf, searchQueryTfidf)
print(attrQuerytScore[:10])

[0.3270574504345332, 0.0, 0.0, 0.15235747590243104, 0.3745711233237903, 0.23644700113695258, 0.24709000370670614, 0.34943803436884213, 0.4349234104983533, 0.5098970256367836]
[0.1402829926234721, 0.0, 0.15706815658556786, 0.05555874872470531, 0.13190831874574774, 0.09759034127977335, 0.0984091571801954, 0.13917156474593798, 0.15757063743545752, 0.20745983271092977]
[0.0968549381449204, 0.0, 0.16113832408067721, 0.1224291426021047, 0.22399579657657176, 0.06086183511223122, 0.13341204346673852, 0.18867312125457045, 0.24650081738147384, 0.17972072439676504]


In [73]:
#calculate jaccard scores, All scores are stored in seperate lists 
def jacScores(lista, listb):
    scores = []
    for i in range(len(lista)):
        st = set(lista[i])
        pt = set(listb[i])
        intersection = st.intersection(pt)
        union = st.union(pt)
        if(len(union) == 0):
            scores.append(0)
        else:
            score = len(intersection)/ len(union)
            scores.append(score)
    return scores

l1 = train_x["product_title"].tolist()
l2 = train_x["search_term"].tolist() #searchterm documents
l3 = train_x["product_description"].tolist()
l4 = train_x["attributes"].tolist()

#product title and searchquery
titleQuery = jacScores(l1, l2)

#product description and search query
descQuery = jacScores(l3, l2)

#product attributes and search query
attrQuery = jacScores(l4, l2)

74067


In [74]:
#calculate overlap scores. All scores are stores in seperate lists
def overlap(lista, listb):
    oscores = []
    for i in range(len(lista)):
        st = set(lista[i])
        pt = set(listb[i])
        intersection = st.intersection(pt)
        if(min(len(st),len(pt)) <= 0):
            oscores.append(0)
        else:
            score = len(intersection)/ min(len(st),len(pt) )
            oscores.append(score)
    return oscores

#product title and searchquery
titleQ = overlap(l1, l2)

#product description and search query
descQ = overlap(l3, l2)

#product attributes and search query
attrQ = overlap(l4, l2)
print(len(titleQuery))

74067


In [75]:
#Finally! load all the features -cosine scores, jaccard scores and overlap into a dataframe.
dataF = {"Product-title": titleQueryScore, "Product-Description": descriptionQueryScore, "Attributes":attrQuerytScore ,
       "product-title-j":titleQuery ,"description-j": descQuery,"Attributes-j":attrQuery, 
        "Product-title-o": titleQ, "Product-Description-o": descQ, "Attributes-o":attrQ}

df1 = pd.DataFrame(dataF)
df1.head()

Unnamed: 0,Product-title,Product-Description,Attributes,product-title-j,description-j,Attributes-j,Product-title-o,Product-Description-o,Attributes-o
0,0.327057,0.140283,0.096855,0.2,0.015385,0.015625,0.5,0.5,0.5
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.157068,0.161138,0.0,0.010526,0.007634,0.0,1.0,1.0
3,0.152357,0.055559,0.122429,0.083333,0.016393,0.020408,0.333333,0.333333,0.666667
4,0.374571,0.131908,0.223996,0.2,0.033898,0.020619,1.0,1.0,1.0


In [76]:
#set new train X set
Xtrain = df1
Xtrain

Unnamed: 0,Product-title,Product-Description,Attributes,product-title-j,description-j,Attributes-j,Product-title-o,Product-Description-o,Attributes-o
0,0.327057,0.140283,0.096855,0.200000,0.015385,0.015625,0.500000,0.500000,0.500000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.157068,0.161138,0.000000,0.010526,0.007634,0.000000,1.000000,1.000000
3,0.152357,0.055559,0.122429,0.083333,0.016393,0.020408,0.333333,0.333333,0.666667
4,0.374571,0.131908,0.223996,0.200000,0.033898,0.020619,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...
74062,0.082219,0.065504,0.044206,0.062500,0.018868,0.012500,0.333333,0.333333,0.333333
74063,0.609000,0.187211,0.243446,0.375000,0.035714,0.032967,1.000000,0.666667,1.000000
74064,0.134252,0.048644,0.118511,0.071429,0.010526,0.020979,0.166667,0.166667,0.500000
74065,0.323270,0.115215,0.156058,0.200000,0.074074,0.023256,0.666667,0.666667,0.333333


In [77]:
#set new train Y set.
Ytrain = pd.DataFrame(train_y)
Ytrain

Unnamed: 0,relevance
0,3.00
1,2.50
2,3.00
3,2.33
4,2.67
...,...
74062,1.00
74063,3.00
74064,2.33
74065,3.00


In [78]:
#we will be using XGBRegressor to model the data
from xgboost import XGBRegressor

In [137]:
#tune the parameters to get the desirable predictions
xgReg = XGBRegressor(objective='reg:squarederror', eval_metric='rmse', n_estimators = 100, 
                     learning_rate=0.05, min_child_weight=1, reg_lambda = 2)

In [138]:
#split the trainng data to train set and test set to perform evaluation of the performance of the model
from sklearn.model_selection import train_test_split
# Ytrain1.head(5)
Xtr, Xte, Ytr, Yte = train_test_split(Xtrain, Ytrain,test_size = 0.25, random_state=100)
Xte

Unnamed: 0,Product-title,Product-Description,Attributes,product-title-j,description-j,Attributes-j,Product-title-o,Product-Description-o,Attributes-o
60512,0.205981,0.192970,0.029373,0.100000,0.031915,0.008333,0.250000,0.750000,0.250000
46968,0.151073,0.069395,0.000000,0.090909,0.023810,0.000000,0.333333,0.333333,0.000000
18098,0.355025,0.162996,0.046485,0.230769,0.033708,0.015385,1.000000,1.000000,0.333333
15047,0.415493,0.163094,0.176405,0.200000,0.021053,0.015748,0.666667,0.666667,0.666667
67134,0.330489,0.219620,0.317159,0.200000,0.062500,0.033898,0.666667,0.666667,0.666667
...,...,...,...,...,...,...,...,...,...
73649,0.348746,0.000000,0.064319,0.200000,0.000000,0.011364,0.500000,0.000000,0.250000
29067,0.000000,0.000000,0.126742,0.000000,0.000000,0.008772,0.000000,0.000000,0.333333
40287,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
30718,0.095214,0.071134,0.091483,0.100000,0.016949,0.009804,0.333333,0.333333,0.333333


In [139]:
#fit the training set
xgReg.fit(Xtr, Ytr)
pred = xgReg.predict(Xte)

#include ids
id = list(range(pred.shape[0]))
print(len(id))

#load the predictions to a data frame
data = {"id": id, "relevance": pred.tolist()}
dffinal = pd.DataFrame(data)
dffinal

18517
18517


Unnamed: 0,id,relevance
0,0,2.280030
1,1,2.181930
2,2,2.559608
3,3,2.369152
4,4,2.290821
...,...,...
18512,18512,2.154439
18513,18513,2.136649
18514,18514,2.022710
18515,18515,2.158283


In [140]:
#calcualte the rmse score of the prediction model
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(pred, Yte)
rms = np.sqrt(rmse) 
print(rms)

0.4823930950303158


In [83]:
#Now that the model is working, repeat the process for test data to predict the scores

#start with convertind the columns to lists
list_a1 = test["product_title"].tolist()
list_b1 = test["search_term"].tolist()
list_c1 = test["product_description"].tolist()
list_d1 = test["attributes"].tolist()

In [None]:
#build vocab
testVocab = buildVocab(test)

#construct tfidf matrices
productTitleTfidfT = constructTfidfMatForX(testVocab,list_a1)
proDescriptionTfidfT = constructTfidfMatForX(testVocab,list_c1)
proAttrTfidfT = constructTfidfMatForX(testVocab,list_d1)
searchQueryTfidfT = constructTfidfMatForY(testVocab, list_b1)

#Calculate Cosine Scores
titleQueryScoreT = calcCosine(productTitleTfidfT, searchQueryTfidfT)
print(titleQueryScoreT[:10])
descriptionQueryScoreT = calcCosine(proDescriptionTfidfT, searchQueryTfidfT)
print(descriptionQueryScoreT[:10])
attrQuerytScoreT = calcCosine(proAttrTfidfT, searchQueryTfidfT)
print(attrQuerytScoreT[:10])

In [85]:
#calculate the jaccard scores
titleQueryT = jacScores(list_a1, list_b1)
descQueryT = jacScores(list_c1, list_b1)
attrQueryT = jacScores(list_d1, list_b1)

#calculate the overlap scores
titleQT = overlap(list_a1, list_b1)
descQT = overlap(list_c1, list_b1)
attrQT = overlap(list_d1, list_b1)


112067


In [86]:
#load all the features to a data frame.
dataTF = {"Product-title": titleQueryScoreT, "Product-Description": descriptionQueryScoreT, "Attributes":attrQuerytScoreT ,
       "product-title-j":titleQueryT ,"description-j": descQueryT,"Attributes-j":attrQueryT ,
         "Product-title-o": titleQT, "Product-Description-o": descQT, "Attributes-o": attrQT}
df2 = pd.DataFrame(dataTF)
df2.head()

Unnamed: 0,Product-title,Product-Description,Attributes,product-title-j,description-j,Attributes-j,Product-title-o,Product-Description-o,Attributes-o
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.268572,0.077439,0.104465,0.166667,0.015152,0.015385,0.333333,0.333333,0.333333
2,0.268572,0.077439,0.183516,0.166667,0.015152,0.03125,0.333333,0.333333,0.666667
3,0.268572,0.077439,0.183516,0.142857,0.014925,0.030769,0.25,0.25,0.5
4,0.340184,0.05257,0.150668,0.25,0.017544,0.020202,1.0,0.333333,0.666667


In [87]:
#set the new testing data
Xtest = df2
Xtest

Unnamed: 0,Product-title,Product-Description,Attributes,product-title-j,description-j,Attributes-j,Product-title-o,Product-Description-o,Attributes-o
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.268572,0.077439,0.104465,0.166667,0.015152,0.015385,0.333333,0.333333,0.333333
2,0.268572,0.077439,0.183516,0.166667,0.015152,0.031250,0.333333,0.333333,0.666667
3,0.268572,0.077439,0.183516,0.142857,0.014925,0.030769,0.250000,0.250000,0.500000
4,0.340184,0.052570,0.150668,0.250000,0.017544,0.020202,1.000000,0.333333,0.666667
...,...,...,...,...,...,...,...,...,...
112062,0.000000,0.192954,0.163716,0.000000,0.024000,0.014085,0.000000,1.000000,0.666667
112063,0.287284,0.124787,0.082035,0.153846,0.048780,0.011765,0.666667,0.666667,0.333333
112064,0.771851,0.144958,0.332266,0.666667,0.114286,0.083333,1.000000,0.666667,0.833333
112065,0.402619,0.112150,0.000000,0.222222,0.029851,0.000000,0.500000,0.500000,0.000000


In [88]:
#predict with the test data
predF = xgReg.predict(Xtest)


In [89]:
#fianlly, convert the predicted scores to a data frame with their respective IDs in seperate columns
idtest = test['id']
data = {"id": idtest, "relevance": predF.tolist()}
dFTest = pd.DataFrame(data)
dFTest


112067


Unnamed: 0,id,relevance
0,4,2.022710
1,5,2.133813
2,6,2.197936
3,7,2.131490
4,10,2.470410
...,...,...
112062,221467,2.264502
112063,221468,2.091426
112064,221469,2.696015
112065,221470,2.368380


In [90]:
#write to a csv file for submission
dFTest.to_csv('submissionfinal3.csv', index=False)

#-----------END-------------END----------------END-----------END