# Data Mining Project.

https://ai.stanford.edu/~amaas/data/sentiment/

# Importations.

In [1]:
import numpy as np # array, ...
import pandas as pd # DataFrame, ...
import matplotlib as plt # countplot, barplot, ...

In [None]:
import re # regex.
import os # files and directories manipulation.
import codecs # open files.
import string # translate, maketrans
import collections
import copy # deepcopy, ...
from enum import Enum
import sys
from statistics import mean
from random import shuffle

In [3]:
# Remove HTML tags.
from bs4 import BeautifulSoup

In [4]:
#
import nltk

# Frequency distribution.
from nltk import FreqDist

# Wordnet.
from nltk.corpus import wordnet as wn
nltk.download('omw-1.4')
# from PyDictionary import PyDictionary
# dictionary=PyDictionary()

# nltk stopwords.
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# lemmatization.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Tokenization.
from nltk.tokenize import WordPunctTokenizer, word_tokenize

# Words nature.
from nltk.tag import pos_tag

# Sentiment.
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\micka\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\micka\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Sklearn classifiers.
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.model_selection import train_test_split

# Classes.

## Clean_arg.

In [6]:
class Clean_arg(Enum):
	"""
	#! Documentation
	"""

	HTML = 1
	PUNCTUATION = 2
	NUMBER = 3
	STOPWORDS = 4
	USLESS = 5
	LEMMA = 6
	LOWERCASE = 7

## Document.

In [7]:
class Document():
	"""
	#! Documentation
	"""
	
	def __init__(self, id: int, compound, rating: int, text: str):
		self.id = id
		self.compound = compound
		self.rating = rating
		self.text = text

		self.words = []
		self.tags = []

	def str(self, arg: str):
		if arg == "token":
			words = ", ".join([str(word) for word in self.words])
			return "id : {}, rating : {}, words : {}".format(self.id, self.rating, words)
		elif arg == "tag":
			tags = ", ".join(["({}, {})".format(word, tag) for word, tag in zip(self.words, self.tags)])
			return "id : {}, rating : {}, tags : {}".format(self.id, self.rating, tags)
		elif arg == "text":
			return "id : {}, rating : {}, text : {}".format(self.id, self.rating, self.text)
		else:
			return "Unknown argument"

	def tokenize(self):
		self.words = word_tokenize(self.text)

	def get_tag(self):
		self.tags = [tag for _, tag in pos_tag(self.words)]

# Functions.

## clean_text.

In [8]:
def clean_text(_set : list, clean_arg : Clean_arg):
	"""
	#! Documentation.
	"""

	#		
	texts_cleaned = 0

	# Loop for texts.
	for i in range(len(_set)):
		
		# Saving informations.
		doc = _set[i]
		text = old_text = doc.text
		words = doc.words
		tags = doc.tags

		# Remove HTML tags.
		if clean_arg == Clean_arg.HTML:
			text = BeautifulSoup(text, "html.parser").text
		
		# Remove punctuations.
		elif clean_arg == Clean_arg.PUNCTUATION:
			text = text.translate(str.maketrans('', '', string.punctuation))
		
		# Remove numbers.
		elif clean_arg == Clean_arg.NUMBER:
			words = text.split()
			text = ' '.join([word for word in words if not word.isnumeric()])

		# Remove stop words.
		elif clean_arg == Clean_arg.STOPWORDS:
			words_tmp = []
			tags_tmp = []

			#
			for index, word in enumerate(words):
				if word.lower() not in stop_words:
					words_tmp.append(word)
					try:
						tag = tags[index]
						tags_tmp.append(tag)
					except IndexError:
						print("ERROR INDEX : id : {}, word : {}, index : {}, len(tags) : {}".format(id, word, index, len(tags)))
			
			words = words_tmp
			tags = tags_tmp

		#
		elif clean_arg == Clean_arg.USLESS:
			words_tmp = []
			tags_tmp = []
			usless_tags = ["NNS", "NNP", "NNPS"]
			usless_words = ["story", "movie", "film"]

			#
			for index, (word, tag) in enumerate(zip(words, tags)):
				if tag not in usless_tags and word.lower() not in usless_words:
					words_tmp.append(word)
					tags_tmp.append(tag)

			words = words_tmp
			tags = tags_tmp
		
		#
		elif clean_arg == Clean_arg.LOWERCASE:
			words = [word.lower() for word in words]

		# Remove LEMMA.
		elif clean_arg == Clean_arg.LEMMA:
			words_tmp = []
			tags_tmp = []
			verbes_forms = ["VBP", "VBN", "VBG", "VBD", "VB"]

			for index, (word, tag) in enumerate(zip(words, tags)):
				# lemmetize verbes.
				if tag in verbes_forms:
					verb = lemmatizer.lemmatize(word, pos = "v")
					words_tmp.append(verb)
					tags_tmp.append("VBZ")
				else:
					words_tmp.append(lemmatizer.lemmatize(word))
					tags_tmp.append(tag)

			words = words_tmp
			tags = tags_tmp			

		# Update value.
		_set[i] = Document(doc.id, doc.rating, doc.compound, text)
		_set[i].words = copy.deepcopy(words)
		_set[i].tags = copy.deepcopy(tags)


		# Counting modifications.
		if old_text != text:
			texts_cleaned += 1

	# Printing.
	if clean_arg not in [Clean_arg.STOPWORDS, Clean_arg.LEMMA, Clean_arg.LOWERCASE, Clean_arg.USLESS]:
		print("{} texts cleaned".format(texts_cleaned))
	return _set

In [9]:
def is_positive(rating: int):
	return rating >= 7

In [10]:
def is_negative(rating: int):
	return rating <= 4

In [11]:
def extract_features(doc, top_100_positive):
    features = {}
    wordcount = 0
    compound_scores = []
    positive_scores = []

    for word in doc.words:
        if word in top_100_positive:
            wordcount += 1
    
    compound_scores.append(doc.compound)
    positive_scores.append(doc.rating)

    # Adding 1 to the final compound score to always have positive numbers
    # since some classifiers you'll use later don't work with negative numbers.
    features["mean_compound"] = mean(compound_scores) + 1
    features["mean_positive"] = mean(positive_scores)
    features["wordcount"] = wordcount

    return features

# Files reading.

In [12]:
    # Stock trainset and testset texts.
    docs_train = []
    docs_test = []

    # Directories.
    sets = ["train", "test"]
    str_ratings = ["neg", "pos"]

    # Loop for training and testing directories.
    for _set in sets:
        print("set: {}".format(_set))

        # Loop for "neg" and "pos" directories.
        for str_rating in str_ratings:

            # Creaeting path.
            path = f"./data/{_set}/{str_rating}"

            # 
            print("str_rating: {}".format(str_rating))
            print("path: {}".format(path))

            # Files counter.
            file_count = 0

            # Loop for files.
            for filename in os.listdir(path):
                
                file = os.path.join(path, filename)
                if os.path.isfile(file):
                    with codecs.open(file, "r", encoding="utf-8") as f:
                        
                        # File name parsing to get id and rating.
                        split_extension = filename.split(".")
                        split_id_rating = split_extension[0].split("_")
                        id_str = split_id_rating[0]
                        rating_str = split_id_rating[1]
                        id = rating = -1

                        try:
                            id = int(id_str)
                        except ValueError:
                            sys.exit("Error casting id to int")

                        try:
                            rating = int(rating_str)
                        except ValueError:
                            sys.exit("Error casting rating to int")

                        text = f.read()
                        doc = Document(id, rating, sia.polarity_scores(text)["compound"], text)
                        if _set == "train":
                            docs_train.append(doc)
                        else:
                            docs_test.append(doc)
                        file_count += 1
                        
            print("file_count : {}".format(file_count))

set: train
str_rating: neg
path: ./data/train/neg
file_count : 12500
str_rating: pos
path: ./data/train/pos
file_count : 12500
set: test
str_rating: neg
path: ./data/test/neg
file_count : 12500
str_rating: pos
path: ./data/test/pos
file_count : 12500


In [13]:
"train", [word.str("text") for word in docs_train][0], "test", [word.str("text") for word in docs_test][0]

('train',
 "id : 0, rating : 0.7003, text : Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.",
 'test',
 "id : 0, rating : -0.5349, text : Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's c

# Text mining process.

# Text cleanup.

In [14]:
docs_train_cleaned, docs_test_cleaned = copy.deepcopy(docs_train), copy.deepcopy(docs_test)

## Remove HTML tags.

In [15]:
docs_train_cleaned = clean_text(docs_train_cleaned, Clean_arg.HTML)



14669 texts cleaned


In [16]:
docs_test_cleaned = clean_text(docs_test_cleaned, Clean_arg.HTML)

14537 texts cleaned


In [17]:
"train", [word.str("text") for word in docs_train_cleaned][0], "test", [word.str("text") for word in docs_test_cleaned][0]

('train',
 "id : 0, rating : 3, text : Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.",
 'test',
 "id : 0, rating : 2, text : Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character ar

## Remove punctuation.

In [18]:
docs_train_cleaned = clean_text(docs_train_cleaned, Clean_arg.PUNCTUATION)

24999 texts cleaned


In [19]:
docs_test_cleaned = clean_text(docs_test_cleaned, Clean_arg.PUNCTUATION)

24996 texts cleaned


In [20]:
"train", [word.str("text") for word in docs_train_cleaned][0], "test", [word.str("text") for word in docs_test_cleaned][0]

('train',
 'id : 0, rating : 0.7003, text : Story of a man who has unnatural feelings for a pig Starts out with a opening scene that is a terrific example of absurd comedy A formal orchestra audience is turned into an insane violent mob by the crazy chantings of its singers Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting Even those from the era should be turned off The cryptic dialogue would make Shakespeare seem easy to a third grader On a technical level its better than you might think with some good cinematography by future great Vilmos Zsigmond Future stars Sally Kirkland and Frederic Forrest can be seen briefly',
 'test',
 'id : 0, rating : -0.5349, text : Once again Mr Costner has dragged out a movie for far longer than necessary Aside from the terrific sea rescue sequences of which there are very few I just did not care about any of the characters Most of us have ghosts in the closet and Costners character are real

## Remove numbers.

In [21]:
docs_train_cleaned = clean_text(docs_train_cleaned, Clean_arg.NUMBER)

15331 texts cleaned


In [22]:
docs_test_cleaned = clean_text(docs_test_cleaned, Clean_arg.NUMBER)

14991 texts cleaned


In [23]:
"train", [word.str("text") for word in docs_train_cleaned][0], "test", [word.str("text") for word in docs_test_cleaned][0]

('train',
 'id : 0, rating : 3, text : Story of a man who has unnatural feelings for a pig Starts out with a opening scene that is a terrific example of absurd comedy A formal orchestra audience is turned into an insane violent mob by the crazy chantings of its singers Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting Even those from the era should be turned off The cryptic dialogue would make Shakespeare seem easy to a third grader On a technical level its better than you might think with some good cinematography by future great Vilmos Zsigmond Future stars Sally Kirkland and Frederic Forrest can be seen briefly',
 'test',
 'id : 0, rating : 2, text : Once again Mr Costner has dragged out a movie for far longer than necessary Aside from the terrific sea rescue sequences of which there are very few I just did not care about any of the characters Most of us have ghosts in the closet and Costners character are realized early 

# Pre-processing.

## Tokenization.

In [24]:
docs_train_prepro_token, docs_test_prepro_token = copy.deepcopy(docs_train_cleaned), copy.deepcopy(docs_test_cleaned)

In [25]:
for doc in docs_train_prepro_token:
	doc.tokenize()

In [26]:
for doc in docs_test_prepro_token:
	doc.tokenize()

In [27]:
docs_train, docs_test = docs_train_prepro_token, docs_test_prepro_token

In [28]:
"train", [word.str("token") for word in docs_train][0], "test", [word.str("token") for word in docs_test][0]

('train',
 'id : 0, rating : 3, words : Story, of, a, man, who, has, unnatural, feelings, for, a, pig, Starts, out, with, a, opening, scene, that, is, a, terrific, example, of, absurd, comedy, A, formal, orchestra, audience, is, turned, into, an, insane, violent, mob, by, the, crazy, chantings, of, its, singers, Unfortunately, it, stays, absurd, the, WHOLE, time, with, no, general, narrative, eventually, making, it, just, too, off, putting, Even, those, from, the, era, should, be, turned, off, The, cryptic, dialogue, would, make, Shakespeare, seem, easy, to, a, third, grader, On, a, technical, level, its, better, than, you, might, think, with, some, good, cinematography, by, future, great, Vilmos, Zsigmond, Future, stars, Sally, Kirkland, and, Frederic, Forrest, can, be, seen, briefly',
 'test',
 'id : 0, rating : 2, words : Once, again, Mr, Costner, has, dragged, out, a, movie, for, far, longer, than, necessary, Aside, from, the, terrific, sea, rescue, sequences, of, which, there, are

## Get words tag.

In [29]:
docs_train_tags, docs_test_tags = copy.deepcopy(docs_train), copy.deepcopy(docs_test)

In [30]:
for doc in docs_train_tags:
	doc.get_tag()

In [31]:
for doc in docs_test_tags:
	doc.get_tag()

KeyboardInterrupt: 

In [None]:
"train", [word.str("tag") for word in docs_train_tags][0], "test", [word.str("tag") for word in docs_test_tags][0]

## Stop words removal.

In [None]:
docs_train_stop_words = copy.deepcopy(docs_train_tags)
docs_test_stop_words = copy.deepcopy(docs_test_tags)

In [None]:
docs_train_stop_words = clean_text(docs_train_stop_words, Clean_arg.STOPWORDS)

In [None]:
docs_test_stop_words = clean_text(docs_test_stop_words, Clean_arg.STOPWORDS)

In [None]:
"train", [word.str("token") for word in docs_train_stop_words][0], "test", [word.str("token") for word in docs_test_stop_words][0]

## Remove usless tags / word.

In [None]:
docs_train_rem_usless = copy.deepcopy(docs_train_stop_words)
docs_test_rem_usless = copy.deepcopy(docs_test_stop_words)

In [None]:
docs_train_rem_usless = clean_text(docs_train_rem_usless, Clean_arg.USLESS)

In [None]:
docs_test_rem_usless = clean_text(docs_test_rem_usless, Clean_arg.USLESS)

In [None]:
"train", [word.str("token") for word in docs_train_rem_usless][0], "test", [word.str("token") for word in docs_test_rem_usless][0]

## Lemmatization.

In [None]:
docs_train_lemma = copy.deepcopy(docs_train_rem_usless)
docs_test_lemma = copy.deepcopy(docs_test_rem_usless)

In [None]:
docs_train_lemma = clean_text(docs_train_lemma, Clean_arg.LEMMA)

In [None]:
docs_test_lemma = clean_text(docs_test_lemma, Clean_arg.LEMMA)

In [None]:
"train", [word.str("tag") for word in docs_train_lemma][0], "test", [word.str("tag") for word in docs_test_lemma][0]

## Normalization.

### Lowercase.

In [None]:
docs_train_lower = copy.deepcopy(docs_train_lemma)
docs_test_lower = copy.deepcopy(docs_test_lemma)

In [None]:
docs_train_lower = clean_text(docs_train_lower, Clean_arg.LOWERCASE)

In [None]:
docs_test_lower = clean_text(docs_test_lower, Clean_arg.LOWERCASE)

In [None]:
"train", [word.str("token") for word in docs_train_lower][0], "test", [word.str("token") for word in docs_test_lower][0]

# Text transformation.

In [None]:
docs_train, docs_test = docs_train_lower, docs_test_lower

In [None]:
positive_words_train = [doc.words for doc in docs_train if is_positive(doc.rating)]
negative_words_train = [doc.words for doc in docs_train if is_negative(doc.rating)]

In [None]:
positive_words_train = [item for l in positive_words_train for item in l]
negative_words_train = [item for l in negative_words_train for item in l]

In [None]:
positive_words_train[:10], negative_words_train[:10]

## Frenquency distribution.

In [None]:
positive_fd_train = nltk.FreqDist(positive_words_train)
negative_fd_train = nltk.FreqDist(negative_words_train)

In [None]:
positive_fd_train, negative_fd_train

# Remove common words.

In [None]:
common_set = set(positive_fd_train).intersection(negative_fd_train)

for word in common_set:
    del positive_fd_train[word]
    del negative_fd_train[word]

In [None]:
positive_fd_train, negative_fd_train

In [None]:
top_100_positive_train = {word for word, count in positive_fd_train.most_common(100)}
top_100_negative_train = {word for word, count in negative_fd_train.most_common(100)}

In [None]:
top_100_positive_train, top_100_negative_train

# Classifications.

In [None]:
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

In [None]:
suf
trinset, testset = train_test_split(doc, test_size=0.2, random_state=1)