In [6]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Data cleaning for the words in text
def review_words(review):
    lmtzr = WordNetLemmatizer()
    snowball = SnowballStemmer('english')
    review_text = BeautifulSoup(review).get_text()
    #print(review_text)
    regex = re.compile('[^a-zA-Z]')
    letters = regex.sub(' ',review_text)
    #print(letters)
    words = letters.lower().split()
    #print(words)
    stops = set(stopwords.words('english'))
    meaningful_words = [w for w in words if w not in stops]
    lemmatized_words = [lmtzr.lemmatize(w) for w in meaningful_words]
    #print(lemmatized_words[0])
    stemmed_words = [snowball.stem(w) for w in lemmatized_words]
    output_words = ' '.join(stemmed_words)
    
    return output_words


In [2]:
#Send all the reviews in a list
def review_set(file):
    data = pd.read_csv(file, header = 0, delimiter = '\t', quoting = 3)
    raw_review = data['review']
    size = len(data['review'])
    review_all = []
    for i in range(size):
        review_clean = review_words(raw_review[i])
        review_all.append(review_clean)
    if 'sentiment' in data.columns.values:        
        raw_sentiment = data['sentiment']
        print(review_all[:3])
        return review_all, raw_sentiment
    else:
        return review_all

In [3]:
#Count words frequency for each review
def feature_set(data):
    vectorizer = CountVectorizer(max_features = 50)
    data_features = vectorizer.fit_transform(data)
    name = vectorizer.get_feature_names()
    features = data_features.toarray()
    return features, name

In [None]:
#Read data from labeledTrainData
text,sentiment = review_set('labeledTrainData.tsv')
text = np.array(text)
features,names = feature_set(text)

In [4]:
#Read data from labeledTrainData
text,sentiment = review_set('labeledTrainData.tsv')
text = np.array(text)
features,names = feature_set(text)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


['stuff go moment mj start listen music watch odd documentari watch wiz watch moonwalk mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innoc moonwalk part biographi part featur film rememb go see cinema origin releas subtl messag mj feel toward press also obvious messag drug bad kay visual impress cours michael jackson unless remot like mj anyway go hate find bore may call mj egotist consent make movi mj fan would say made fan true realli nice actual featur film bit final start minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord want mj dead bad beyond mj overheard plan nah joe pesci charact rant want peopl know suppli drug etc dunno mayb hate mj music lot cool thing like mj turn car robot whole speed demon sequenc also director must patienc saint came film kiddi bad sequenc usual director hate work one kid let alon whole bunch perform complex danc scene bottom line movi peopl like mj one level anoth think peopl sta

In [14]:
#Train classifier
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(features,sentiment)
#svc = SVC(kernel = 'linear')
#svc.fit(features,sentiment)
#bayes = GaussianNB()
#bayes = bayes.fit(features,sentiment)

In [9]:
#Read Test Data and extract features
data = review_set('testData.tsv')
test,names = feature_set(data)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [15]:
#Make predictions on test data
k = pd.read_csv('testData.tsv',header = 0, delimiter = '\t', quoting = 3)
result = forest.predict(test)
#result = bayes.predict(test)
output = pd.DataFrame({'id':k['id'],'sentiment':result})
output.to_csv('Bag_of_Words_model_x.csv', index=False,quoting=3)