In [23]:
import pandas as pd
from pandas import DataFrame
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

# function to create dataframes for text and sentiment
def sentimentDataFrame(filename, negative_repeats=7):
    """Build a DataFrame of review texts and binary sentiment labels.

    The first line of the file is treated as a header and skipped.  Every
    other non-blank line is parsed by fixed character positions (no CSV
    parser is involved):

    * the rating is the single digit found three characters before the end
      of the line (line terminator excluded);
    * the review text starts two characters after the first comma and ends
      six characters before the end of the line.

    NOTE(review): these offsets presumably correspond to rows shaped like
    ``<id>,"<text>", <d>.<d>`` -- confirm against the real data files.

    Rows rated 3 are dropped, rows rated 4 or more are labelled 1, and all
    remaining rows are labelled 0.

    Args:
        filename: path of the text file to read.
        negative_repeats: number of copies emitted for every negative
            review.  Negative reviews are scarce in the data set, so they
            are repeated to rebalance the classes.  Defaults to 7.

    Returns:
        A DataFrame with the columns ``reviews`` (str) and ``sentiments``
        (1 = positive, 0 = negative).  It is empty when the file holds no
        usable rows.

    Raises:
        ValueError: if the character at the rating position is not a digit.
        IndexError: if a non-blank line is shorter than three characters.
    """
    reviews = []
    sentiments = []

    # The context manager closes the file even if reading or parsing fails.
    with open(filename, "r") as handle:
        # Skip the header; the default keeps an empty file from raising.
        next(handle, None)

        for raw_line in handle:
            # Drop the terminator so the fixed offsets below are the same
            # for every row, including a last row without a final newline.
            row = raw_line[:-1] if raw_line.endswith("\n") else raw_line

            # Blank lines (e.g. trailing ones) carry no review.
            if not row.strip():
                continue

            # score of the text, later turned into pos/neg
            score = int(row[-3])

            # neutral reviews are not used at all
            if score == 3:
                continue

            # The text begins two characters after the first comma; a row
            # without any comma yields an empty review.
            comma = row.find(",")
            review = row[comma + 2:-6] if comma != -1 else ""

            if score >= 4:
                reviews.append(review)
                sentiments.append(1)
            else:
                # append negative reviews more since they are scarce
                # in the data set
                reviews.extend([review] * negative_repeats)
                sentiments.extend([0] * negative_repeats)

    # creating DataFrame out of text and sentiment
    return DataFrame({'reviews': reviews, 'sentiments': sentiments})



# TFIDF Vectorizer - used to convert reviews from text to features.
# `stopset` stays a set for any later membership checks, but the vectorizer
# is handed a list: recent scikit-learn releases validate `stop_words` and
# only accept 'english', a list or None (a set is rejected).
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True,
                             strip_accents='ascii',
                             stop_words=sorted(stopset))

# dataframes for train and test dataset
# The vocabulary and IDF weights are fitted on the training reviews only.
train_df = sentimentDataFrame("lab_train.txt")
X_train, y_train = vectorizer.fit_transform(train_df.reviews), train_df.sentiments

# The test reviews are only transformed with the already-fitted vectorizer.
# NOTE: sentimentDataFrame repeats negative reviews, so the test frame
# contains those repeated rows as well.
test_df = sentimentDataFrame("lab_test.txt")
X_test, y_test = vectorizer.transform(test_df.reviews), test_df.sentiments

# train using naive bayes classifier
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

# test the model
# NOTE: despite the "Model Accuracy" label, the printed value is the ROC AUC
# computed from the predicted probability of the positive class (column 1).
print("Model Accuracy: ", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

# import bookings.com comments
# The sheet has no header row; its single column is named 'reviews'.
comments_df = pd.read_excel("evaluation_dataset.xlsx", header=None, names=['reviews'])
comments_vector = vectorizer.transform(comments_df['reviews'])
comments_df['sentiments'] = clf.predict(comments_vector)

# Last expression of the notebook cell: displays the labelled comments.
comments_df

Model Accuracy:  0.7172995780590717


Unnamed: 0,reviews,sentiments
0,The check in staff were very friendly and coul...,0
1,The room was great - modern & clean. Robes & s...,0
2,This is a great hotel. The staff are very frie...,1
3,The price of the room which we stayed in was £...,0
4,The parking facilities are excellent but did f...,0
5,I come to this hotel regularly and I find the ...,0
6,Rooms starting to feel their age a little but ...,1
7,The place was well located and we were very ha...,0
8,Stayed here a few times and it can be a little...,0
9,Lovely beds and complimentary water in the roo...,0
