In [1]:
import os

import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Folder holding the competition CSV files. The default is the original
# download location; set the DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get('DATA_DIR', '/home/mert/Downloads')
trainingPath = os.path.join(DATA_DIR, 'train.csv')
testPath = os.path.join(DATA_DIR, 'test.csv')

train = pd.read_csv(trainingPath)
# columns: id, query, product_title, product_description, median_relevance, relevance_variance

test = pd.read_csv(testPath)
# columns: id, query, product_title, product_description

print(train.shape)
print(train)

# Keep the test ids for the submission file before the id column is dropped
# from both frames.
idx = test['id'].values.astype(int)
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

# Labels for the classifier.
y = train['median_relevance'].values

# Remove the label and its variance so they cannot leak into the features.
train = train.drop(['median_relevance', 'relevance_variance'], axis=1)

(10158, 6)
          id                        query  \
0          1    bridal shower decorations   
1          2         led christmas lights   
2          4                    projector   
3          5                    wine rack   
4          7                   light bulb   
5          8       oakley polarized radar   
6         10              boyfriend jeans   
7         13     screen protector samsung   
8         17            pots and pans set   
9         20                 waffle maker   
10        28                 oakley radar   
11        31    workout clothes for women   
12        32           decorative pillows   
13        35                  wall clocks   
14        46       cuisinart coffee maker   
15        52             thomas the train   
16        54              silver necklace   
17        56       bluray hobbit extended   
18        58                    cat grass   
19        59                  soda stream   
20        63                    microwave   

In [3]:
# Look at the training frame again now that the id and label columns are gone:
# shape on the first line, the frame itself underneath.
print(train.shape, train, sep='\n')


(10158, 3)
                             query  \
0        bridal shower decorations   
1             led christmas lights   
2                        projector   
3                        wine rack   
4                       light bulb   
5           oakley polarized radar   
6                  boyfriend jeans   
7         screen protector samsung   
8                pots and pans set   
9                     waffle maker   
10                    oakley radar   
11       workout clothes for women   
12              decorative pillows   
13                     wall clocks   
14          cuisinart coffee maker   
15                thomas the train   
16                 silver necklace   
17          bluray hobbit extended   
18                       cat grass   
19                     soda stream   
20                       microwave   
21                      aqua shoes   
22          leather mens briefcase   
23        girls halloween costumes   
24                knife victorinox   
2

In [4]:
def _format_rows(frame, template, columns):
    """Return one formatted string per row of ``frame``.

    ``template`` is a %-style format string with one ``%s`` per entry in
    ``columns``; the named columns are substituted in the given order.
    Iterating the columns directly avoids building a Series for every row
    (as ``DataFrame.apply(axis=1)`` does) and yields an empty list for an
    empty frame.
    """
    return [template % values for values in zip(*(frame[name] for name in columns))]


# Document set 1: the search query on its own.
trainjoin1 = _format_rows(train, '%s', ['query'])
testjoin1 = _format_rows(test, '%s', ['query'])

# Document set 2: product title followed by the search query.
trainjoin2 = _format_rows(train, '%s %s', ['product_title', 'query'])
testjoin2 = _format_rows(test, '%s %s', ['product_title', 'query'])

In [5]:
#print (trainjoin1)

In [6]:
#print (trainjoin2)

In [7]:
# TF-IDF vectorizer.
# TF-IDF is a numerical statistic intended to reflect how important a word is
# to a document in a collection. It is often used as a weighting factor in
# information retrieval, text mining, and user modeling.
#
# The on/off switches are passed as real booleans rather than the integer 1:
# recent scikit-learn releases validate parameter types and reject integers
# for boolean parameters.
tfv = TfidfVectorizer(
    min_df=2,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',   # a token is any run of one or more word characters
    ngram_range=(1, 5),        # word n-grams from single words up to 5 words
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)


In [8]:
# Query-only features: fit the vectorizer on the training queries, then apply
# those fitted parameters to both the training and the test queries.
tfv.fit(trainjoin1)
X1 = tfv.transform(trainjoin1)
X1_test = tfv.transform(testjoin1)

# Title + query features: the same vectorizer object is refitted here, so the
# query-only matrices above must be produced before this point. After this
# line `tfv` holds the title + query fit.
tfv.fit(trainjoin2)
X2 = tfv.transform(trainjoin2)
X2_test = tfv.transform(testjoin2)

# hstack = horizontal stacking: query-only columns first, then the
# title + query columns, for train and test alike.
X = hstack([X1, X2])
X_test = hstack([X1_test, X2_test])


In [9]:
# Query-only training matrix; entries print as "(row, column)  weight".
print(X1)

  (0, 791)	0.408248290464
  (0, 790)	0.408248290464
  (0, 227)	0.408248290464
  (0, 131)	0.408248290464
  (0, 130)	0.408248290464
  (0, 129)	0.408248290464
  (1, 516)	0.415301206101
  (1, 505)	0.415301206101
  (1, 504)	0.415301206101
  (1, 503)	0.370977817473
  (1, 170)	0.415301206101
  (1, 169)	0.415301206101
  (2, 686)	1.0
  (3, 964)	0.595385811648
  (3, 963)	0.595385811648
  (3, 709)	0.539473327029
  (4, 515)	0.57735026919
  (4, 514)	0.57735026919
  (4, 133)	0.57735026919
  (5, 710)	0.396137403195
  (5, 666)	0.432623615231
  (5, 665)	0.353073275982
  (5, 611)	0.432623615231
  (5, 610)	0.432623615231
  (5, 609)	0.396137403195
  :	:
  (10152, 260)	0.633215797204
  (10152, 259)	0.58612459099
  (10153, 656)	0.412846768639
  (10153, 655)	0.412846768639
  (10153, 654)	0.412846768639
  (10153, 409)	0.384431695002
  (10153, 202)	0.412846768639
  (10153, 201)	0.412846768639
  (10154, 830)	1.0
  (10155, 929)	0.602245667152
  (10155, 659)	0.602245667152
  (10155, 657)	0.524023198717
  (10156, 

In [10]:
# Stacked training matrix: query-only columns followed by title + query columns.
print(X)

  (0, 791)	0.408248290464
  (0, 790)	0.408248290464
  (0, 227)	0.408248290464
  (0, 131)	0.408248290464
  (0, 130)	0.408248290464
  (0, 129)	0.408248290464
  (1, 516)	0.415301206101
  (1, 505)	0.415301206101
  (1, 504)	0.415301206101
  (1, 503)	0.370977817473
  (1, 170)	0.415301206101
  (1, 169)	0.415301206101
  (2, 686)	1.0
  (3, 964)	0.595385811648
  (3, 963)	0.595385811648
  (3, 709)	0.539473327029
  (4, 515)	0.57735026919
  (4, 514)	0.57735026919
  (4, 133)	0.57735026919
  (5, 710)	0.396137403195
  (5, 666)	0.432623615231
  (5, 665)	0.353073275982
  (5, 611)	0.432623615231
  (5, 610)	0.432623615231
  (5, 609)	0.396137403195
  :	:
  (10155, 1898)	0.219092281338
  (10156, 41359)	0.336399474547
  (10156, 36614)	0.30536722776
  (10156, 29751)	0.398735930487
  (10156, 17120)	0.352585606159
  (10156, 17114)	0.328085362551
  (10156, 9442)	0.580329749738
  (10156, 4961)	0.256625552607
  (10157, 39902)	0.187892440897
  (10157, 31633)	0.311125792253
  (10157, 31627)	0.305034366489
  (10157, 

In [11]:
# Stacked test matrix, built with the same two fits as the training matrix.
print(X_test)

  (0, 371)	0.594814986983
  (0, 280)	0.594814986983
  (0, 278)	0.540731229468
  (1, 638)	0.467267422155
  (1, 637)	0.467267422155
  (1, 636)	0.467267422155
  (1, 533)	0.332032760636
  (1, 185)	0.359196458714
  (1, 182)	0.325139383171
  (2, 745)	0.408248290464
  (2, 744)	0.408248290464
  (2, 743)	0.408248290464
  (2, 327)	0.408248290464
  (2, 326)	0.408248290464
  (2, 22)	0.408248290464
  (3, 772)	0.57735026919
  (3, 57)	0.57735026919
  (3, 56)	0.57735026919
  (4, 876)	0.316227766017
  (4, 875)	0.316227766017
  (4, 874)	0.316227766017
  (4, 313)	0.316227766017
  (4, 312)	0.316227766017
  (4, 311)	0.316227766017
  (4, 310)	0.316227766017
  :	:
  (22510, 19422)	0.160690399757
  (22510, 19027)	0.214728945753
  (22510, 18714)	0.160042598786
  (22510, 18713)	0.214728945753
  (22510, 18712)	0.214728945753
  (22510, 18711)	0.214728945753
  (22510, 18708)	0.190156050102
  (22510, 6227)	0.221717058817
  (22510, 6226)	0.221717058817
  (22510, 6223)	0.176879904034
  (22511, 43355)	0.543402187355
 

In [12]:
# Multinomial Naive Bayes is the Naive Bayes variant aimed at text documents.
# It estimates the conditional probability of a term given a class from the
# relative frequency of that term in the training documents of the class,
# counting repeated occurrences. Here the inputs are the stacked TF-IDF
# weights rather than raw counts.
# fit() returns the fitted estimator, so construction and training are chained.
nbmodel = MultinomialNB(alpha=0.0003).fit(X, y)

# Predicted label for every row of the test matrix.
preds = nbmodel.predict(X_test)

In [13]:
# Pair each saved test id with its predicted label and write the result
# to disk without the DataFrame index.
submission_columns = {"id": idx, "prediction": preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv("naive_bayes.csv", index=False)