In [1]:
# files.
import os
import codecs 

import sys

def read_directory(directory):
	read = []

	# Files counter.
	file_count = 0

	# Loop for files.
	for filename in os.listdir(directory):
		
		file = os.path.join(directory, filename)
		if os.path.isfile(file):
			with codecs.open(file, "r", encoding="utf-8") as f:
				
				# File name parsing to get id and rating.
				split_extension = filename.split(".")
				split_id_rating = split_extension[0].split("_")
				id_str = split_id_rating[0]
				rating_str = split_id_rating[1]
				rating = -1

				try:
					rating = int(rating_str)
				except ValueError:
					sys.exit("Error casting rating to int")

				text = f.read()
				read.append((text, rating))
				file_count += 1
				
	print("file_count : {}".format(file_count))
	return read

In [2]:
from pprint import pprint

pos = read_directory("./data/train/pos/")
neg = read_directory("./data/train/neg/")
train = pos + neg
# pprint(train)

file_count : 12500
file_count : 12500


In [3]:
from bs4 import BeautifulSoup

def remove_html_tags(liste):
	liste_tmp = []
	for text, rating in liste:
		text_tmp = BeautifulSoup(text, "html.parser").text
		liste_tmp.append((text_tmp, rating))
	return liste_tmp

In [4]:
train_tmp = remove_html_tags(train)



In [5]:
pprint(train_tmp[:1])

[('Bromwell High is a cartoon comedy. It ran at the same time as some other '
  'programs about school life, such as "Teachers". My 35 years in the teaching '
  "profession lead me to believe that Bromwell High's satire is much closer to "
  'reality than is "Teachers". The scramble to survive financially, the '
  "insightful students who can see right through their pathetic teachers' "
  'pomp, the pettiness of the whole situation, all remind me of the schools I '
  'knew and their students. When I saw the episode in which a student '
  'repeatedly tried to burn down the school, I immediately recalled ......... '
  "at .......... High. A classic line: INSPECTOR: I'm here to sack one of your "
  'teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of '
  "my age think that Bromwell High is far fetched. What a pity that it isn't!",
  9)]


In [6]:
from nltk.tokenize import word_tokenize

def tokenize(liste):
	liste_tmp = []
	for text, rating in liste:
		text_tmp = word_tokenize(text)
		liste_tmp.append((text_tmp, rating))
	return liste_tmp

In [7]:
train_tmp = tokenize(train_tmp)

In [8]:
pprint(train_tmp[:1])

[(['Bromwell',
   'High',
   'is',
   'a',
   'cartoon',
   'comedy',
   '.',
   'It',
   'ran',
   'at',
   'the',
   'same',
   'time',
   'as',
   'some',
   'other',
   'programs',
   'about',
   'school',
   'life',
   ',',
   'such',
   'as',
   '``',
   'Teachers',
   "''",
   '.',
   'My',
   '35',
   'years',
   'in',
   'the',
   'teaching',
   'profession',
   'lead',
   'me',
   'to',
   'believe',
   'that',
   'Bromwell',
   'High',
   "'s",
   'satire',
   'is',
   'much',
   'closer',
   'to',
   'reality',
   'than',
   'is',
   '``',
   'Teachers',
   "''",
   '.',
   'The',
   'scramble',
   'to',
   'survive',
   'financially',
   ',',
   'the',
   'insightful',
   'students',
   'who',
   'can',
   'see',
   'right',
   'through',
   'their',
   'pathetic',
   'teachers',
   "'",
   'pomp',
   ',',
   'the',
   'pettiness',
   'of',
   'the',
   'whole',
   'situation',
   ',',
   'all',
   'remind',
   'me',
   'of',
   'the',
   'schools',
   'I',
   'knew',
   '

In [9]:
import nltk
from nltk.tag import pos_tag

def get_tags(liste):
	liste_tmp = []
	for text, rating in liste:
		text_tmp = [(word, tag) for word, tag in pos_tag(text)]
		liste_tmp.append((text_tmp, rating))
	return liste_tmp

In [10]:
train_tmp = get_tags(train_tmp)

In [11]:
pprint(train_tmp[:1])

[([('Bromwell', 'NNP'),
   ('High', 'NNP'),
   ('is', 'VBZ'),
   ('a', 'DT'),
   ('cartoon', 'NN'),
   ('comedy', 'NN'),
   ('.', '.'),
   ('It', 'PRP'),
   ('ran', 'VBD'),
   ('at', 'IN'),
   ('the', 'DT'),
   ('same', 'JJ'),
   ('time', 'NN'),
   ('as', 'IN'),
   ('some', 'DT'),
   ('other', 'JJ'),
   ('programs', 'NNS'),
   ('about', 'IN'),
   ('school', 'NN'),
   ('life', 'NN'),
   (',', ','),
   ('such', 'JJ'),
   ('as', 'IN'),
   ('``', '``'),
   ('Teachers', 'NNPS'),
   ("''", "''"),
   ('.', '.'),
   ('My', 'PRP$'),
   ('35', 'CD'),
   ('years', 'NNS'),
   ('in', 'IN'),
   ('the', 'DT'),
   ('teaching', 'NN'),
   ('profession', 'NN'),
   ('lead', 'VB'),
   ('me', 'PRP'),
   ('to', 'TO'),
   ('believe', 'VB'),
   ('that', 'IN'),
   ('Bromwell', 'NNP'),
   ('High', 'NNP'),
   ("'s", 'POS'),
   ('satire', 'NN'),
   ('is', 'VBZ'),
   ('much', 'RB'),
   ('closer', 'RBR'),
   ('to', 'TO'),
   ('reality', 'NN'),
   ('than', 'IN'),
   ('is', 'VBZ'),
   ('``', '``'),
   ('Teachers', 'NN

In [12]:
import copy
train_tmp2 = copy.deepcopy(train_tmp)

In [13]:
import string

def remove_ponctuation(liste):
	liste_tmp = []
	for text, rating in liste:
		text_tmp = [(word, tag) for word, tag in text if word not in ["?", ",", ".", ";", "!", "/", ":", "\"\"", "\'\'", "``"]]
		liste_tmp.append((text_tmp, rating))
	return liste_tmp

In [14]:
train_tmp2 = remove_ponctuation(train_tmp2)

In [15]:
pprint(train_tmp2[:1])

[([('Bromwell', 'NNP'),
   ('High', 'NNP'),
   ('is', 'VBZ'),
   ('a', 'DT'),
   ('cartoon', 'NN'),
   ('comedy', 'NN'),
   ('It', 'PRP'),
   ('ran', 'VBD'),
   ('at', 'IN'),
   ('the', 'DT'),
   ('same', 'JJ'),
   ('time', 'NN'),
   ('as', 'IN'),
   ('some', 'DT'),
   ('other', 'JJ'),
   ('programs', 'NNS'),
   ('about', 'IN'),
   ('school', 'NN'),
   ('life', 'NN'),
   ('such', 'JJ'),
   ('as', 'IN'),
   ('Teachers', 'NNPS'),
   ('My', 'PRP$'),
   ('35', 'CD'),
   ('years', 'NNS'),
   ('in', 'IN'),
   ('the', 'DT'),
   ('teaching', 'NN'),
   ('profession', 'NN'),
   ('lead', 'VB'),
   ('me', 'PRP'),
   ('to', 'TO'),
   ('believe', 'VB'),
   ('that', 'IN'),
   ('Bromwell', 'NNP'),
   ('High', 'NNP'),
   ("'s", 'POS'),
   ('satire', 'NN'),
   ('is', 'VBZ'),
   ('much', 'RB'),
   ('closer', 'RBR'),
   ('to', 'TO'),
   ('reality', 'NN'),
   ('than', 'IN'),
   ('is', 'VBZ'),
   ('Teachers', 'NNPS'),
   ('The', 'DT'),
   ('scramble', 'JJ'),
   ('to', 'TO'),
   ('survive', 'VB'),
   ('finan

In [16]:
def remove_numbers(liste):
	liste_tmp = []
	for text, rating in liste:
		text_tmp = [(word, tag) for word, tag in text if tag != "CD"]
		liste_tmp.append((text_tmp, rating))
	return liste_tmp

In [17]:
train_tmp2 = remove_numbers(train_tmp2)

In [18]:
pprint(train_tmp2[:1])

[([('Bromwell', 'NNP'),
   ('High', 'NNP'),
   ('is', 'VBZ'),
   ('a', 'DT'),
   ('cartoon', 'NN'),
   ('comedy', 'NN'),
   ('It', 'PRP'),
   ('ran', 'VBD'),
   ('at', 'IN'),
   ('the', 'DT'),
   ('same', 'JJ'),
   ('time', 'NN'),
   ('as', 'IN'),
   ('some', 'DT'),
   ('other', 'JJ'),
   ('programs', 'NNS'),
   ('about', 'IN'),
   ('school', 'NN'),
   ('life', 'NN'),
   ('such', 'JJ'),
   ('as', 'IN'),
   ('Teachers', 'NNPS'),
   ('My', 'PRP$'),
   ('years', 'NNS'),
   ('in', 'IN'),
   ('the', 'DT'),
   ('teaching', 'NN'),
   ('profession', 'NN'),
   ('lead', 'VB'),
   ('me', 'PRP'),
   ('to', 'TO'),
   ('believe', 'VB'),
   ('that', 'IN'),
   ('Bromwell', 'NNP'),
   ('High', 'NNP'),
   ("'s", 'POS'),
   ('satire', 'NN'),
   ('is', 'VBZ'),
   ('much', 'RB'),
   ('closer', 'RBR'),
   ('to', 'TO'),
   ('reality', 'NN'),
   ('than', 'IN'),
   ('is', 'VBZ'),
   ('Teachers', 'NNPS'),
   ('The', 'DT'),
   ('scramble', 'JJ'),
   ('to', 'TO'),
   ('survive', 'VB'),
   ('financially', 'RB'),
 

In [19]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stop_words(liste):
	liste_tmp = []
	for text, rating in liste:
		text_tmp = [(word, tag) for word, tag in text if word not in stop_words]
		liste_tmp.append((text_tmp, rating))
	return liste_tmp

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
train_tmp2 = remove_stop_words(train_tmp2)

In [21]:
pprint(train_tmp2[:1])

[([('Bromwell', 'NNP'),
   ('High', 'NNP'),
   ('cartoon', 'NN'),
   ('comedy', 'NN'),
   ('It', 'PRP'),
   ('ran', 'VBD'),
   ('time', 'NN'),
   ('programs', 'NNS'),
   ('school', 'NN'),
   ('life', 'NN'),
   ('Teachers', 'NNPS'),
   ('My', 'PRP$'),
   ('years', 'NNS'),
   ('teaching', 'NN'),
   ('profession', 'NN'),
   ('lead', 'VB'),
   ('believe', 'VB'),
   ('Bromwell', 'NNP'),
   ('High', 'NNP'),
   ("'s", 'POS'),
   ('satire', 'NN'),
   ('much', 'RB'),
   ('closer', 'RBR'),
   ('reality', 'NN'),
   ('Teachers', 'NNPS'),
   ('The', 'DT'),
   ('scramble', 'JJ'),
   ('survive', 'VB'),
   ('financially', 'RB'),
   ('insightful', 'JJ'),
   ('students', 'NNS'),
   ('see', 'VB'),
   ('right', 'RB'),
   ('pathetic', 'JJ'),
   ('teachers', 'NNS'),
   ("'", 'POS'),
   ('pomp', 'NN'),
   ('pettiness', 'NN'),
   ('whole', 'JJ'),
   ('situation', 'NN'),
   ('remind', 'VBP'),
   ('schools', 'NNS'),
   ('I', 'PRP'),
   ('knew', 'VBD'),
   ('students', 'NNS'),
   ('When', 'WRB'),
   ('I', 'PRP')

In [22]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize(liste):
	liste_tmp = []
	verbs = ["VBP", "VBN", "VBG", "VBD", "VB"]
	for text, rating in liste:
		text_tmp = []
		for word, tag in text:
			if tag in verbs:
				word_tmp = lemmatizer.lemmatize(word, pos = "v")
				tag_tmp = "VBZ"
			else:
				word_tmp = lemmatizer.lemmatize(word)
				tag_tmp = tag
			text_tmp.append((word_tmp, tag_tmp))
		liste_tmp.append((text_tmp, rating))
	return liste_tmp

In [23]:
train_tmp2 = lemmatize(train_tmp2)

In [24]:
pprint(train_tmp2[:1])

[([('Bromwell', 'NNP'),
   ('High', 'NNP'),
   ('cartoon', 'NN'),
   ('comedy', 'NN'),
   ('It', 'PRP'),
   ('run', 'VBZ'),
   ('time', 'NN'),
   ('program', 'NNS'),
   ('school', 'NN'),
   ('life', 'NN'),
   ('Teachers', 'NNPS'),
   ('My', 'PRP$'),
   ('year', 'NNS'),
   ('teaching', 'NN'),
   ('profession', 'NN'),
   ('lead', 'VBZ'),
   ('believe', 'VBZ'),
   ('Bromwell', 'NNP'),
   ('High', 'NNP'),
   ("'s", 'POS'),
   ('satire', 'NN'),
   ('much', 'RB'),
   ('closer', 'RBR'),
   ('reality', 'NN'),
   ('Teachers', 'NNPS'),
   ('The', 'DT'),
   ('scramble', 'JJ'),
   ('survive', 'VBZ'),
   ('financially', 'RB'),
   ('insightful', 'JJ'),
   ('student', 'NNS'),
   ('see', 'VBZ'),
   ('right', 'RB'),
   ('pathetic', 'JJ'),
   ('teacher', 'NNS'),
   ("'", 'POS'),
   ('pomp', 'NN'),
   ('pettiness', 'NN'),
   ('whole', 'JJ'),
   ('situation', 'NN'),
   ('remind', 'VBZ'),
   ('school', 'NNS'),
   ('I', 'PRP'),
   ('know', 'VBZ'),
   ('student', 'NNS'),
   ('When', 'WRB'),
   ('I', 'PRP'),


In [25]:
def remove_capital(liste):
	liste_tmp = []
	for text, rating in liste:
		text_tmp = [(word.lower(), tag) for word, tag in text]
		liste_tmp.append((text_tmp, rating))
	return liste_tmp

In [26]:
train_tmp2 = remove_capital(train_tmp2)

In [27]:
pprint(train_tmp2[:1])

[([('bromwell', 'NNP'),
   ('high', 'NNP'),
   ('cartoon', 'NN'),
   ('comedy', 'NN'),
   ('it', 'PRP'),
   ('run', 'VBZ'),
   ('time', 'NN'),
   ('program', 'NNS'),
   ('school', 'NN'),
   ('life', 'NN'),
   ('teachers', 'NNPS'),
   ('my', 'PRP$'),
   ('year', 'NNS'),
   ('teaching', 'NN'),
   ('profession', 'NN'),
   ('lead', 'VBZ'),
   ('believe', 'VBZ'),
   ('bromwell', 'NNP'),
   ('high', 'NNP'),
   ("'s", 'POS'),
   ('satire', 'NN'),
   ('much', 'RB'),
   ('closer', 'RBR'),
   ('reality', 'NN'),
   ('teachers', 'NNPS'),
   ('the', 'DT'),
   ('scramble', 'JJ'),
   ('survive', 'VBZ'),
   ('financially', 'RB'),
   ('insightful', 'JJ'),
   ('student', 'NNS'),
   ('see', 'VBZ'),
   ('right', 'RB'),
   ('pathetic', 'JJ'),
   ('teacher', 'NNS'),
   ("'", 'POS'),
   ('pomp', 'NN'),
   ('pettiness', 'NN'),
   ('whole', 'JJ'),
   ('situation', 'NN'),
   ('remind', 'VBZ'),
   ('school', 'NNS'),
   ('i', 'PRP'),
   ('know', 'VBZ'),
   ('student', 'NNS'),
   ('when', 'WRB'),
   ('i', 'PRP'),
