In [1]:
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv('./BOW_DATA/train_all.txt', header=None, sep='\t')
test = pd.read_csv('./BOW_DATA/test_all.txt', header=None, sep='\t')
train.head(5)    

Unnamed: 0,0,1
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


In [3]:
test.head(5)

Unnamed: 0,0,1
0,trade,asian exporters fear damage from u s japan rif...
1,grain,china daily says vermin eat pct grain stocks a...
2,ship,australian foreign ship ban ends but nsw ports...
3,acq,sumitomo bank aims at quick recovery from merg...
4,earn,amatil proposes two for five bonus share issue...


In [4]:
train.columns = ['label', 'context']
test.columns = ['label', 'context']

train.head(5)

Unnamed: 0,label,context
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


In [5]:
test.head(5)

Unnamed: 0,label,context
0,trade,asian exporters fear damage from u s japan rif...
1,grain,china daily says vermin eat pct grain stocks a...
2,ship,australian foreign ship ban ends but nsw ports...
3,acq,sumitomo bank aims at quick recovery from merg...
4,earn,amatil proposes two for five bonus share issue...


In [6]:
class GloveVectorizer():
    
    def __init__(self):
        
        word2vec = {}
        idx2word = []
        embedding = []
        
        for line in open('./GLOVE/glove.6B.50d.txt'):
            values = line.split()
            word = values[0]
            word2vec[word] = np.asarray(values[1:], dtype=np.float32)
            idx2word.append(word)
            embedding.append(np.asarray(values[1:], dtype=np.float32))
            
        embedding = np.array(embedding)
        self.V, self.D = embedding.shape
        self.embedding = embedding
        self.word2vec = word2vec
        self.idx2word = idx2word
    
    def fit(self, data):
        pass
    
    def transform(self, data):
        X = np.zeros((len(data), self.D))
        n = 0
        empty_count = 0
        for sentence in data:
            words = sentence.lower().split()
            vecs = []
            for word in words:
                if word in self.word2vec:
                    vec = self.word2vec[word]
                    vecs.append(vec)
            if len(vecs) > 0:
                vecs = np.array(vecs)
                X[n] = np.mean(vecs, axis = 0)
            else:
                empty_count += 1
            
            n+= 1
        
        print("Number of empty words found in data : {} / {}".format(empty_count , len(data)))
        return X
    
    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)
    
    
                
        

In [7]:
Xtrain, ytrain = train["context"].values, train["label"].values
Xtest, ytest = test["context"].values, test["label"].values

In [8]:
vectorizer = GloveVectorizer()
Xtrain = vectorizer.fit_transform(Xtrain)

Number of empty words found in data : 0 / 671


In [9]:
model = LogisticRegression()
model.fit(Xtrain, ytrain)
print(">>>>>>>>>>>>>>>>>>>>>>>>>>>ACCURACY<<<<<<<<<<<<<<<<<<<<<<<<<<")
print("Score in training : {:.7f}".format(model.score(Xtrain, ytrain)))
print("Score in testing : {:.7f}".format(model.score(vectorizer.fit_transform(Xtest), ytest)))
print(">>>>>>>>>>>>>>>>>>>>>>>ERROR RATE<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
print("Error rate in training : {:.7f}".format(1 - model.score(Xtrain, ytrain)))
print("Error rate in testing : {:.7f}".format(1 - model.score(vectorizer.fit_transform(Xtest), ytest)))

>>>>>>>>>>>>>>>>>>>>>>>>>>>ACCURACY<<<<<<<<<<<<<<<<<<<<<<<<<<
Score in training : 0.9225037


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Number of empty words found in data : 0 / 2189
Score in testing : 0.8679762
>>>>>>>>>>>>>>>>>>>>>>>ERROR RATE<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
Error rate in training : 0.0774963
Number of empty words found in data : 0 / 2189
Error rate in testing : 0.1320238


In [12]:
model = RandomForestClassifier(n_estimators=200)
model.fit(Xtrain, ytrain)
print(">>>>>>>>>>>>>>>>>>>>>>>>>>>ACCURACY<<<<<<<<<<<<<<<<<<<<<<<<<<")
print("Score in training : {}".format(model.score(Xtrain, ytrain)))
print("Score in testing : {}".format(model.score(vectorizer.fit_transform(Xtest), ytest)))
print(">>>>>>>>>>>>>>>>>>>>>>>ERROR RATE<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
print("Error rate in training : {}".format(1 - model.score(Xtrain, ytrain)))
print("Error rate in testing : {}".format(1 - model.score(vectorizer.fit_transform(Xtest), ytest)))

>>>>>>>>>>>>>>>>>>>>>>>>>>>ACCURACY<<<<<<<<<<<<<<<<<<<<<<<<<<
Score in training : 1.0
Number of empty words found in data : 0 / 2189
Score in testing : 0.860666971219735
>>>>>>>>>>>>>>>>>>>>>>>ERROR RATE<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
Error rate in training : 0.0
Number of empty words found in data : 0 / 2189
Error rate in testing : 0.139333028780265
