In [1]:

import re
import csv

# Open the labelled dataset for reading. newline="" is what the csv module
# asks for on file objects handed to csv.reader, so that line endings inside
# quoted fields are left for the csv parser to interpret.
fh=open("dataset.csv","r",newline="")

In [2]:
# Rows are '+'-separated rather than comma-separated, because the sentences in the dataset contain commas themselves.
reader = csv.reader(fh, delimiter='+')

# Per-label word counts:
# { label (positive/negative) : { word : number of occurrences of the word under that label } }
dataset = dict()

# Per-label record counts:
# { label : number of records carrying that label }
no_of_items = dict()

# Per-word label counts (the same numbers as `dataset`, keyed the other way round):
# { word : { label : number of occurrences of the word under that label } }
feature_set = dict()

In [9]:
# Build the count tables from the labelled sentences.
# A usable row has the sentence in column 0 and its label in column 1.
# NOTE(review): words are counted per token (a word repeated inside one
# sentence is counted every time), while calc_prob divides by the number of
# records, so that ratio can exceed 1 - confirm this is intended.
for row in reader:
	# csv.reader yields [] for blank lines; a row without both a sentence and
	# a label cannot be used and would raise IndexError on row[1]
	if len(row) < 2:
		continue
	sentence, label = row[0], row[1]
	# One more record carries this label
	no_of_items[label] = no_of_items.get(label, 0) + 1
	# Make sure the label has a word table even if none of its words qualify
	label_counts = dataset.setdefault(label, {})
	# Split on anything that is not a letter or an apostrophe
	for token in re.split('[^a-zA-Z\']', sentence):
		# Crude stop-word removal: ignore tokens shorter than 3 characters
		if len(token) <= 2:
			continue
		# Lower-case once and reuse it for both tables
		word = token.lower()
		# Occurrences of the word under this label
		label_counts[word] = label_counts.get(word, 0) + 1
		# The same count, indexed by word first and label second
		per_label = feature_set.setdefault(word, {})
		per_label[label] = per_label.get(label, 0) + 1

	

	
# Basic probability of a word for a category
def calc_prob(word,category):
	"""Return the count of `word` under `category` divided by the number of records in `category`.

	Reads the module-level tables `feature_set`, `dataset` and `no_of_items`.

	Returns the integer 0 when the word is absent from `feature_set` or from
	the category's word table; otherwise returns a float.
	Raises KeyError when the word is present in `feature_set` but `category`
	is not a key of `dataset`.
	"""
	# feature_set is tested first so that a completely unknown word never
	# touches dataset[category]
	if word in feature_set and word in dataset[category]:
		occurrences = float(dataset[category][word])
		return occurrences / no_of_items[category]

	return 0


# Weighted probability of a word for a category
def weighted_prob(word,category,weight=1.0,assumed_prob=0.5):
	"""Blend the basic probability of `word` in `category` with an assumed prior.

	The result is
		(weight*assumed_prob + total*basic_prob) / (weight + total)
	where `basic_prob` is calc_prob(word, category) and `total` is the number
	of appearances of the word across all categories (0 for a word that is
	not in `feature_set`, which makes the result equal to `assumed_prob`).

	Parameters:
		word         - the word to score
		category     - the label to score it against
		weight       - how many observations the assumed probability is worth
		               (default 1.0, the value that used to be hard-coded)
		assumed_prob - probability assumed before any evidence is seen
		               (default 0.5, the value that used to be hard-coded)

	Returns the weighted probability as a float.
	Raises ZeroDivisionError if `weight` is 0 and the word has never been seen.
	"""
	# basic probability of a word - calculated by calc_prob
	basic_prob=calc_prob(word,category)

	# total_no_of_appearances - in all the categories; an unknown word has none
	tot=sum(feature_set.get(word,{}).values())

	return ((weight*assumed_prob)+(tot*basic_prob))/(weight+tot)


# To get probability of the test data for the given category
def test_prob(test,category):
	# Split the test data
	split_data=re.split('[^a-zA-Z][\'][ ]',test)
	
	data=[]
	for i in split_data:
		if ' ' in i:
			i=i.split(' ')
			for j in i:
				if j not in data:
					data.append(j.lower())
		elif len(i) > 2 and i not in data:
			data.append(i.lower())

	p=1
	for i in data:
		p*=weighted_prob(i,category)
	return p

# Naive Bayes implementation
def naive_bayes(test):
	'''
		Score the test data against every category found in `dataset`.

		Bayes' rule: p(A|B) = p(B|A) * p(A) / p(B)

			A - Category
			B - Test data

		p(B) is the same for every category, so it is left out and each
		score is p(B|A) * p(A).

		Returns a dictionary { category : score }.
	'''
	# Total number of records over all categories
	total_items=sum(no_of_items.values())

	# p(A): number of items in the category / total number of items
	priors={label: float(no_of_items[label])/total_items for label in dataset}

	# p(B|A) * p(A), where p(B|A) comes from test_prob
	return {label: test_prob(test,label)*prior for label,prior in priors.items()}

# Read one sentence from the user and score it against every label
print('Enter the sentence')
text=input()
result=naive_bayes(text)

# Label '1' is reported as positive only when it strictly beats label '-1';
# a tie is reported as negative
print('positive' if result['1'] > result['-1'] else 'negative')



Enter the sentence
drinking is bad habbit
negative
