In [1]:
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.models import word2vec
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
# Data cleaning for the words in text
def _review_word_tools():
    """Return the shared (regex, stop-word set, lemmatizer, stemmer) tuple.

    The tuple is built on first use and cached on the function object, so the
    four helpers are not re-created for every sentence that is cleaned.
    """
    tools = getattr(_review_word_tools, 'cache', None)
    if tools is None:
        tools = (
            re.compile('[^a-zA-Z]'),
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
            SnowballStemmer('english'),
        )
        _review_word_tools.cache = tools
    return tools


def review_words(review,frequency):
    """Clean one piece of review text into stemmed, stop-word-free tokens.

    Processing order: strip HTML markup, replace every non-letter character
    with a space, lower-case, split on whitespace, drop English stop words,
    lemmatize each word, then stem it.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML markup.
    frequency : bool
        ``False`` returns the tokens as a list; ``True`` returns them joined
        into a single space-separated string.

    Returns
    -------
    list of str or str
        The cleaned tokens, in the form selected by ``frequency``.
    """
    non_letters, stops, lmtzr, snowball = _review_word_tools()

    # Name the parser explicitly. Without it Beautiful Soup guesses and emits
    # the "no parser was explicitly specified" warning seen in this notebook's
    # output; that warning recommends "lxml", i.e. the parser it had picked.
    review_text = BeautifulSoup(review, 'lxml').get_text()

    words = non_letters.sub(' ', review_text).lower().split()
    stemmed_words = [
        snowball.stem(lmtzr.lemmatize(w)) for w in words if w not in stops
    ]

    if not frequency:
        return stemmed_words
    return ' '.join(stemmed_words)

In [3]:
#Divide reviews into sentences
def review_sentence(review,tokenizer,frequency):
    """Split a review into sentences and clean each one.

    The stripped review is passed to ``tokenizer.tokenize``; every non-empty
    piece it returns is cleaned with ``review_words(piece, frequency)``.

    Returns a list with one cleaned entry per non-empty sentence.
    """
    pieces = tokenizer.tokenize(review.strip())
    return [
        review_words(piece, frequency)
        for piece in pieces
        if len(piece) > 0
    ]

In [4]:
#Extract features from reviews 
def review_vector(file,frequency,num_features=200,min_word_count=50,
                  num_workers=6,context=10,downsampling=1e-3,
                  model_name='sentiment_vector'):
    """Clean every review in a TSV file, train word2vec on it and save the model.

    Parameters
    ----------
    file : str
        Path of a tab-separated file with a header row and a ``review`` column.
    frequency : bool
        Forwarded to ``review_sentence``. The caller in this notebook passes
        ``False``, which makes every sentence a list of tokens.
    num_features : int, default 200
        Passed to ``Word2Vec`` as ``size``.
    min_word_count : int, default 50
        Passed to ``Word2Vec`` as ``min_count``.
    num_workers : int, default 6
        Passed to ``Word2Vec`` as ``workers``.
    context : int, default 10
        Passed to ``Word2Vec`` as ``window``.
    downsampling : float, default 1e-3
        Passed to ``Word2Vec`` as ``sample``.
    model_name : str, default 'sentiment_vector'
        Path the trained model is saved to.

    Returns
    -------
    list
        The cleaned sentences of all reviews, flattened into one list.
    """
    tokenizer = PunktSentenceTokenizer()
    data = pd.read_csv(file, header = 0, delimiter = '\t', quoting =3)

    review_all = []
    # Iterate over the values directly instead of indexing by label.
    for i, raw_review in enumerate(data['review']):
        review_all.extend(review_sentence(raw_review,tokenizer,frequency))
        if i%5000 == 0:
            print('review set finished:{}'.format(i))

    # NOTE(review): `size=` and `init_sims` are the spellings this notebook was
    # run with; newer gensim releases appear to rename/deprecate them -- confirm
    # against the installed gensim version before upgrading.
    model = word2vec.Word2Vec(review_all, size = num_features, min_count = min_word_count,workers = num_workers, window = context, sample = downsampling)
    model.init_sims(replace=True)
    model.save(model_name)

    return review_all

In [5]:
# Clean the unlabeled reviews and train/save the word2vec model
# (review_vector writes it to 'sentiment_vector').
review_all = review_vector('unlabeledTrainData.tsv',False)

# The saved model is loaded where it is needed (as `model_new`); the extra
# load into an unused `data` variable that used to sit here was removed.

# Dump the cleaned sentences, one sentence per CSV row.
df = pd.DataFrame(review_all)
df.to_csv('words.csv',index=False,header=False)
print('review file finished')



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


review set finished:0


  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


review set finished:5000
review set finished:10000
review set finished:15000
review set finished:20000


  ' that document to Beautiful Soup.' % decoded_markup


review set finished:25000
review set finished:30000
review set finished:35000


  ' that document to Beautiful Soup.' % decoded_markup


review set finished:40000
review set finished:45000


  ' that document to Beautiful Soup.' % decoded_markup


review file finished


In [6]:
# The sentences have different lengths, so ask for an object array explicitly:
# current NumPy raises a ValueError when asked to build a ragged array
# implicitly. The resulting shape is still (number of sentences,).
review_all = np.array(review_all, dtype=object)
print(review_all.shape)

(532928,)


In [7]:
# Reload the word2vec model saved under 'sentiment_vector' and take its
# word-vector matrix; the printed shape is (vocabulary size, vector length).
model_new = word2vec.Word2Vec.load('sentiment_vector')
# NOTE(review): `wv.syn0` looks like an older gensim attribute name for the
# vector array -- confirm it still exists in the installed gensim version.
matrix = model_new.wv.syn0
print(matrix.shape)

(8360, 200)


In [8]:
# Number of entries in the loaded model's vocabulary.
print(len(model_new.wv.vocab))

8360


In [9]:
# Sanity check of the embedding: which word is the odd one out?
# Query through `.wv` (the keyed vectors) -- the same attribute the rest of
# this notebook already uses -- rather than the deprecated method on the model.
model_new.wv.doesnt_match('man woman watch school money'.split())

'watch'

In [10]:
# Nearest neighbours of 'money' in the embedding space. Tokens are stemmed, so
# neighbours appear as stems. Query through `.wv` rather than the deprecated
# method on the model object.
model_new.wv.most_similar('money')

[('buck', 0.6363558769226074),
 ('dollar', 0.6173630952835083),
 ('cash', 0.579927921295166),
 ('fund', 0.5651991963386536),
 ('debt', 0.5647455453872681),
 ('fee', 0.552809476852417),
 ('profit', 0.5428676605224609),
 ('salari', 0.5420151352882385),
 ('ticket', 0.5404407382011414),
 ('expens', 0.5176376104354858)]

In [11]:
# Same vocabulary-size check as the earlier print(len(model_new.wv.vocab))
# cell, shown here as the cell's output value.
len(model_new.wv.vocab)

8360

In [12]:
#Clean every review of a TSV file, keeping the sentences grouped per review
def review_bag(file,frequency):
    """Read a tab-separated review file and clean each review.

    ``file`` must have a header row and a ``review`` column. Each review is
    split and cleaned by ``review_sentence`` (``frequency`` is forwarded to
    it), and progress is printed every 5000 reviews.

    Returns a list with one entry per review; each entry is the list of
    cleaned sentences of that review.
    """
    sentence_splitter = PunktSentenceTokenizer()
    frame = pd.read_csv(file, header=0, delimiter='\t', quoting=3)

    cleaned = []
    for position, text in enumerate(frame['review']):
        cleaned.append(review_sentence(text, sentence_splitter, frequency))
        if position % 5000 == 0:
            print('review set finished:{}'.format(position))

    return cleaned

In [13]:
#Find cluster center and form the features of the reviews
def word_center(matrix,model,num_centers):
    """Cluster the word vectors and map every vocabulary word to its cluster.

    A ``KMeans`` with ``num_centers`` clusters and ``random_state=0`` is fitted
    on ``matrix`` and then used to predict a cluster id for each row. The ids
    are paired, in order, with the words in ``model.wv.index2word``.

    Returns a dict of ``word -> cluster id``.
    """
    clusterer = KMeans(n_clusters=num_centers, random_state=0)
    clusterer.fit(matrix)
    labels = clusterer.predict(matrix)
    vocabulary = model.wv.index2word
    return {word: label for word, label in zip(vocabulary, labels)}


def center_vector(words,wordscenter,num_centers):
    """Count, per cluster, how many words of one review fall into it.

    ``words`` is an iterable of whitespace-separated strings (the cleaned
    sentences of one review). Each token found in ``wordscenter``
    (``word -> cluster id``) adds one to that cluster's slot; unknown tokens
    are ignored.

    Returns a float32 array of length ``num_centers``.
    """
    counts = np.zeros(num_centers, dtype='float32')
    tokens = (token for sentence in words for token in sentence.split())
    for token in tokens:
        if token in wordscenter:
            counts[wordscenter[token]] += 1
    return counts

def review_center(review,matrix,model,num_centers=1000):
    """Convert cleaned reviews into per-cluster word-count vectors.

    Parameters
    ----------
    review : iterable
        One entry per review; each entry is the collection of cleaned
        sentence strings accepted by ``center_vector``.
    matrix : array-like
        Word-vector matrix handed to ``word_center`` for clustering.
    model : word2vec model
        Model handed to ``word_center`` to supply the vocabulary.
    num_centers : int, default 1000
        Number of k-means clusters, i.e. the length of each feature vector.
        The default is the value the original code always used; the earlier
        ``int(len(matrix)/10)`` assignment was dead code, overwritten on the
        next line.

    Returns
    -------
    list
        One count vector (from ``center_vector``) per review.
    """
    # The clustering is recomputed on every call; word_center fixes
    # random_state=0 for its KMeans.
    wordscenter = word_center(matrix,model,num_centers)
    return [center_vector(words,wordscenter,num_centers) for words in review]

In [14]:
# Clean the labeled training reviews. frequency=True makes review_words return
# each sentence as a space-joined string, which center_vector later splits.
reviews = review_bag('labeledTrainData.tsv',True)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


review set finished:0


  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


review set finished:5000
review set finished:10000
review set finished:15000
review set finished:20000


In [17]:
# Turn each labeled review into a vector of per-cluster word counts:
# review_center clusters the word vectors in `matrix`, then counts, per review,
# how many of its words fall into each cluster.
review_cluster = review_center(reviews,matrix,model_new)
#print(review_cluster[:100])

In [18]:
# Stack the per-review count vectors into one array; the printed shape is
# (number of reviews, number of clusters).
review_cluster = np.array(review_cluster)
print(review_cluster.shape)
#print(review_cluster)
# NOTE(review): review_new is not referenced by any later cell in this
# notebook -- the classifiers are fitted on the unnormalized review_cluster.
# Confirm which of the two was intended.
review_new = normalize(review_cluster)
#print(review_new)

(25000, 1000)


In [19]:
# Re-read the labeled TSV to get the 'sentiment' column used as the training
# target. Rows are in file order, the same order review_bag processed them.
sentiment_p = pd.read_csv('labeledTrainData.tsv',header = 0, delimiter = '\t', quoting =3)
sentiment = sentiment_p['sentiment']

In [None]:
# Hold out 40% of the labeled reviews and report the accuracy of a linear SVM.
# train_test_split and accuracy_score come from the notebook's import cell:
# the old `sklearn.cross_validation` module no longer exists in current
# scikit-learn (train_test_split lives in `sklearn.model_selection`).
X_train, X_validate, y_train, y_validate = train_test_split(review_cluster, sentiment, test_size=0.4, random_state=0)
svc = SVC(kernel = 'linear')
svc.fit(X_train,y_train)
#bayes = GaussianNB()
#bayes = bayes.fit(X_train,y_train)
pred = svc.predict(X_validate)
score = accuracy_score(y_validate,pred)
print(score)

In [None]:
# Fit the final classifier on all labeled reviews. `bayes` is the model the
# prediction cell uses; the commented-out lines are alternative classifiers.
#forest = RandomForestClassifier(n_estimators = 100)
#forest = forest.fit(review_cluster,sentiment)
bayes = GaussianNB()
bayes = bayes.fit(review_cluster,sentiment)
#SVM= SVC(kernel='linear')
#SVM = SVM.fit(review_cluster,sentiment)

In [21]:
# Clean the test reviews and build their per-cluster count vectors the same
# way as for the training reviews.
test = review_bag('testData.tsv',True)
# NOTE(review): review_center fits k-means again here on the same `matrix`
# with random_state=0; presumably this reproduces the cluster ids used for the
# training features -- confirm, or reuse the mapping computed for training.
test_cluster = review_center(test,matrix,model_new)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


review set finished:0


  'Beautiful Soup.' % markup)


review set finished:5000
review set finished:10000
review set finished:15000
review set finished:20000


In [22]:
# NOTE(review): test_new is not referenced by any later cell in this notebook
# -- the prediction cell passes the unnormalized test_cluster to the model.
test_new = normalize(test_cluster)

In [25]:
# Predict a sentiment for every test review with the fitted `bayes` model and
# write the (id, sentiment) pairs to the submission CSV.
# `k` is the test TSV, re-read only to obtain its 'id' column.
k = pd.read_csv('testData.tsv',header = 0, delimiter = '\t', quoting =3)
result = bayes.predict(test_cluster)
output = pd.DataFrame({'id':k['id'],'sentiment':result})
output.to_csv('Bag_of_Words_model_l.csv', index=False,quoting=3)