In [35]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import csv
import string
from sentistrength import PySentiStr
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import LabelPowerset
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Separator-like symbols that clean_text turns into a single space.
# Raw string: the pattern contains regex escapes (\[ \] \|) that are not valid
# Python string escapes and would otherwise emit an invalid-escape warning.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character that is not a digit, lowercase ASCII letter, space, '#', '+'
# or '_' is matched here; clean_text deletes every match.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop-word list from the NLTK corpus; get_verbs and process_text use
# it to filter tokens.
stopword = stopwords.words('english')
# Single lemmatizer instance reused by process_text for every review.
lemmatizer = WordNetLemmatizer()

# SentiStrength wrapper used by get_senti_score. Both the jar and the language
# data folder are given as relative paths, so they are resolved against the
# current working directory.
# NOTE(review): the sentistrength package documentation recommends absolute
# paths here - confirm relative paths work in the target environment.
senti = PySentiStr()
senti.setSentiStrengthPath('SentiStrength.jar')
senti.setSentiStrengthLanguageFolderPath('SentiStrength_Data/')

# Extra stop words, one per line, used by process_text on top of the NLTK list.
# The context manager closes the file even if read() raises.
with open("stop_word_list.txt", "r") as f1:
	stop_data = f1.read()
# Split on '\n' only; a trailing newline in the file yields a final '' entry,
# which never matches a real token.
stop_list = stop_data.split('\n')

def clean_text(text):
	"""Normalise a raw review string before feature extraction.

	Steps, in order:
	  1. every symbol matched by REPLACE_BY_SPACE_RE becomes a space;
	  2. digits and all ``string.punctuation`` characters are dropped;
	  3. the text is lower-cased;
	  4. every character matched by BAD_SYMBOLS_RE is deleted.

	Args:
		text: the review text (str).

	Returns:
		The cleaned string. It may contain runs of spaces where separator
		symbols used to be.
	"""
	# The separator substitution has to run BEFORE punctuation is stripped:
	# every symbol in REPLACE_BY_SPACE_RE is also in string.punctuation, so in
	# the opposite order the substitution never matches and words joined by
	# '/', ',' or ';' are fused ("good,bad" -> "goodbad").
	text = REPLACE_BY_SPACE_RE.sub(' ', text)
	text = ''.join(
		c for c in text
		if not c.isdigit() and c not in string.punctuation
	)
	text = text.lower()
	# Removes whatever is still outside the allowed set (e.g. non-ASCII letters).
	text = BAD_SYMBOLS_RE.sub('', text)
	return text

def get_verbs(text):
	"""Return the share of verbs in ``text`` per tense bucket.

	The text is tokenised, tokens found in ``stopword`` are dropped and the
	remainder is POS-tagged. Tags are bucketed as:
	  present: VBG, VBP, VBZ
	  past:    VBD, VBN
	  future:  VB
	Other tags are ignored.

	Returns:
		A ``(present, past, future)`` tuple of fractions of the counted verbs,
		or ``(0, 0, 0)`` when no token carries one of the tags above.
	"""
	tense_of_tag = {
		'VBG': 'present', 'VBP': 'present', 'VBZ': 'present',
		'VBD': 'past', 'VBN': 'past',
		'VB': 'future',
	}
	tokens = [tok for tok in nltk.word_tokenize(text) if tok not in stopword]
	counts = {'present': 0, 'past': 0, 'future': 0}
	for _token, tag in nltk.pos_tag(tokens):
		tense = tense_of_tag.get(tag)
		if tense is not None:
			counts[tense] += 1
	total_verbs = sum(counts.values())
	if total_verbs == 0:
		# No verbs counted: avoid dividing by zero.
		return 0, 0, 0
	return (
		counts['present'] / total_verbs,
		counts['past'] / total_verbs,
		counts['future'] / total_verbs,
	)

def get_present(rev_str):
	"""Return the first element of ``get_verbs(rev_str)`` (the present-tense share)."""
	present_share, _past_share, _future_share = get_verbs(rev_str)
	return present_share

def get_past(rev_str):
	"""Return the second element of ``get_verbs(rev_str)`` (the past-tense share)."""
	_present_share, past_share, _future_share = get_verbs(rev_str)
	return past_share

def get_future(rev_str):
	"""Return the third element of ``get_verbs(rev_str)`` (the share of 'VB' tags)."""
	_present_share, _past_share, future_share = get_verbs(rev_str)
	return future_share

def get_senti_score(text):
	"""Run SentiStrength on ``text`` in 'dual' mode and return its result unchanged."""
	return senti.getSentiment(text, score='dual')

def get_pos_senti_score(rev_str):
	"""Return component 0 of the first 'dual' SentiStrength entry for ``rev_str``.

	The caller stores this value in the 'Positive' column.
	"""
	first_entry = get_senti_score(rev_str)[0]
	return first_entry[0]

def get_neg_senti_score(rev_str):
	"""Return component 1 of the first 'dual' SentiStrength entry for ``rev_str``.

	The caller stores this value in the 'Negative' column.
	"""
	first_entry = get_senti_score(rev_str)[0]
	return first_entry[1]

def length_text(text):
	"""Return ``len(text)``; for a string this is its character count."""
	return len(text)

def process_text(text):
	"""Tokenise ``text``, drop stop words, lemmatise, and re-join with spaces.

	A token is dropped when it appears in either ``stopword`` (NLTK list) or
	``stop_list`` (loaded from stop_word_list.txt). Surviving tokens are passed
	through ``lemmatizer.lemmatize`` and joined with single spaces.
	"""
	kept_tokens = [
		token
		for token in nltk.word_tokenize(text)
		if token not in stopword and token not in stop_list
	]
	return ' '.join(str(lemmatizer.lemmatize(token)) for token in kept_tokens)

In [3]:
# Load the labelled reviews, keep only the columns used below and drop rows
# with a missing review text.
dataset = pd.read_csv("labelled.csv")
col = ['Review', 'UpVotes', 'Rating', 'Label']
# .copy() detaches the filtered frame from the one read from disk, so the
# column assignments below do not raise SettingWithCopyWarning.
dataset = dataset.loc[pd.notnull(dataset['Review']), col].copy()

# Features are computed on the cleaned (not yet stop-word-filtered) text.
dataset['Review'] = dataset['Review'].apply(clean_text)
dataset['Length'] = dataset['Review'].apply(length_text)

# Run SentiStrength once per review and split the first 'dual' entry into the
# two score columns, instead of calling it separately for each column.
senti_pairs = [get_senti_score(review)[0] for review in dataset['Review']]
dataset['Positive'] = [pair[0] for pair in senti_pairs]
dataset['Negative'] = [pair[1] for pair in senti_pairs]

# Likewise tokenise and POS-tag each review once; get_verbs returns
# (present, past, future).
tense_shares = [get_verbs(review) for review in dataset['Review']]
dataset['Present'] = [shares[0] for shares in tense_shares]
dataset['Past'] = [shares[1] for shares in tense_shares]
dataset['Future'] = [shares[2] for shares in tense_shares]

# Stop-word removal and lemmatisation come last so the features above see the
# full cleaned text.
dataset['Review'] = dataset['Review'].apply(process_text)

In [4]:
multilabel_binarizer = MultiLabelBinarizer()
# Fit and transform in one pass; the previous code discarded the fit_transform
# result and then ran transform over the same column a second time.
# NOTE(review): MultiLabelBinarizer iterates over each entry of 'Label'. If the
# CSV stores labels as plain strings, every *character* becomes its own class
# (including separators such as ','). Confirm the label format, and split or
# parse the column first if labels are multi-character.
y = multilabel_binarizer.fit_transform(dataset['Label'])

# One indicator column per discovered class, named after the class itself.
for idx, label in enumerate(multilabel_binarizer.classes_):
    dataset[label] = y[:, idx]

# The DataFrame index is written too (as the first, unnamed CSV column).
dataset.to_csv('dataset_new.csv')