# Text mining project
## Goals
• Gain practical experience with the complete data mining process

• Get to know additional problem-specific

• Pre-processing methods

• data mining methods
## Expectation
• Select an interesting data mining problem of your choice

• Solve the problem using

• the data mining methods that we have learned so far, including

• proper parameter optimization

• problem-specific pre-processing and smart feature creation

• additional data mining methods which might be helpful for solving the problem

# Importations.

## NLTK

In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from nltk.tag import pos_tag

from nltk.tokenize import word_tokenize

from nltk.probability import FreqDist

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Sklearn

In [2]:
# Sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import learning_curve, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

## Classifier
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report

## Autre

In [3]:
import os # manipulation des fichiers et des dossiers.
import codecs # lire les fichiers.
import sys # quitter le programme en cas d'erreur.
import re # utilisation de regex.
import string # suppression de la ponctuation.
from bs4 import BeautifulSoup # suppression des balises html.
import contractions
from tqdm import tqdm
import string
import copy

In [4]:
import pandas as pd # DataFrame, ...
import numpy as np # array, ...

In [5]:
# Visualistion
import seaborn as sns
from matplotlib import pyplot as plt
from wordcloud import WordCloud

# Variables globales.

In [6]:
DEBUG = False
TEST = 3000
MAX_FEATURES = 1500
OUT_CSV = "./out/csv/"
OUT_IMG = "./out/img/"

# Fonctions.

## Outils.

### Lecture des fichiers et création des *datasets*.

In [7]:
def create_dataset(directory):
	"""
	Renvoie un dataframe avec les données lu dans le fichier pointe par "directory".
	@param directory: le repertoire ou se trouve les fichiers à lire.
	@return dataset: contenant les données du fichier. 
	"""
	data = []

	# Files counter.
	file_count = 0

	# Loop for files.
	for filename in os.listdir(directory):
		
		file = os.path.join(directory, filename)
		if os.path.isfile(file):
			with codecs.open(file, "r", encoding="utf-8") as f:
				
				# File name parsing to get id and rating.
				split_extension = filename.split(".")
				split_id_rating = split_extension[0].split("_")
				id_str = split_id_rating[0]
				rating_str = split_id_rating[1]
				rating = -1

				try:
					rating = int(rating_str)
				except ValueError:
					sys.exit("Error casting rating to int")

				review = f.read()
				sentiment = 1 if directory.__contains__("pos") else 0
				data.append([review, rating, sentiment])
				file_count += 1
				
	print("file_count : {}".format(file_count))
	return pd.DataFrame(data, columns=["review", "rating", "sentiment"])

### *Preprocessing*.

### Suppression des mots communs entre les *positive reviews* et *negative reviews*.

In [8]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [9]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [10]:
def lemmatize(text):
	# lemmatisation des mots.
	## si c'est un verbre on le mets à l'infinif.
	## si c'est un nom propre on le supprime.
	## pos_tag donne le type de chaque mot.
	verbs = ["VBP", "VBN", "VBG", "VBD", "VB", "VBZ"]
	proper_noun = ["NNS", "NNP", "NNPS"]
	text = ' '.join(
		[lemmatizer.lemmatize(word, pos = "v") if tag in verbs 
		else lemmatizer.lemmatize(word) 
		for word, tag in pos_tag(word_tokenize(text)) 
		if tag not in proper_noun])
	return text

In [11]:
def remove_stopwords(text):
	tokens = word_tokenize(text)
	filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
	filtered_text = ' '.join(filtered_tokens)  
	return filtered_text

In [12]:
def remove_special_characters(text):
    pattern = r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [13]:
def remove_contraction(text):
	return contractions.fix(text)

In [14]:
def remove_usless_word(text):
	tokens = word_tokenize(text)
	usless_word = ["movie", "film", "one", "story"]
	filtered_tokens = [token for token in tokens if token.lower() not in usless_word]
	filtered_text = ' '.join(filtered_tokens)
	return filtered_text

In [15]:
def remove_common_word(df):
	"""
	Supprime les mots communs entre les textes positifs et les textes negatifs.
	@param df: un Dataframe contenant les textes.
	@return Dataframe contenant les textes sans les mots communs.
	"""

	# decoupage des textes positifs et des textes negatifs.
	df_positive_words = df[df["sentiment"] == 1]
	df_negative_words = df[df["sentiment"] == 0]
	positive_reviews = df_positive_words["review"].values
	negative_reviews = df_negative_words["review"].values

	#
	positive_words = [word_tokenize(review) for review in positive_reviews]
	negative_words = [word_tokenize(review) for review in negative_reviews]
	positive_words_flatten = [word for word in positive_words for word in word]
	negative_words_flatten = [word for word in negative_words for word in word]

	# construction de set avec chaque mot et leur frequence.
	positive_fd = set(FreqDist(positive_words_flatten))
	negative_fd = set(FreqDist(negative_words_flatten))

	# calcul des mots communs.
	common_set = positive_fd.intersection(negative_fd)

	# suppression des mots communs.
	old_reviews = df["review"].values
	new_reviews = []
	new_sentiment = []
	words_removed = 0
	for index, review in enumerate(old_reviews):
		new_review = []
		for word in review.split():
			if word not in common_set:
				new_review.append(word)
			else:
				words_removed += 1
		if len(new_review) != 0:
			new_reviews.append(" ".join(word for word in new_review))
			new_sentiment.append(df.iloc[index]["sentiment"])

	# affichage.
	print("{} words removed !".format(words_removed))
	
	data = {"review": new_reviews, "sentiment": new_sentiment}
	new_df = pd.DataFrame(data)
	return new_df

## *Data visualization*

In [16]:
def plot_most_common_words(df):
	all_words = []
	for comment in df['review']:
		words = word_tokenize(comment)
		all_words.extend(words)

	fdist = FreqDist(all_words)
	nb_common_word = 50

	words = [word[0] for word in fdist.most_common(nb_common_word)]
	counts = [word[1] for word in fdist.most_common(nb_common_word)]

	plt.figure(figsize=(15,5))
	plt.bar(words, counts)
	plt.xlabel('Words')
	plt.ylabel('Counts')
	plt.title(f'{nb_common_word} Most Common Words')
	plt.xticks(rotation=90)
	plt.show()

In [17]:
def my_word_cloud(df):
	text = " ".join(review for review in df.review)

	#
	wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                min_font_size = 10).generate(text)
	
	# plot the WordCloud image                      
	plt.figure(figsize = (5, 5), facecolor = None)
	plt.imshow(wordcloud)
	plt.axis("off")
	plt.tight_layout(pad = 0)
	plt.show()

## Entrainement.

In [18]:
def entrainement(classifiers, X_train, X_test, y_train, y_test):
	"""
	Entraine les modeles et predits les classes.
	@param classifiers: dictionnaire contenant le nom et un modele de classifications.
	@param X_train: les donnees du train set.
	@param X_test: les donnees du test set.
	@pamam y_train: les classes du train set.
	@param y_test: les classes du test set.
	@return liste_predictions: une liste contenant le nom, le modele entraine et les predictions.
	"""
	liste_predictions = []
	
	for nom, classifier in classifiers.items():
		print(nom)

		# entrainement des modeles.
		print("fitting...", end="")
		pipeline_classifier = Pipeline([
			("vectorize", CountVectorizer(max_features=MAX_FEATURES)),
			("tfidf", TfidfTransformer()),
			("classifier", classifier),
		])
		pipeline_classifier = pipeline_classifier.fit(X_train, y_train)
		print("Done")

		# prediction des classes du test set.
		print("predicting labels...", end="")
		y_pred_test = pipeline_classifier.predict(X_test)
		print("Done")
	
		liste_predictions.append((nom, pipeline_classifier, y_pred_test))
		print("")
	
	return liste_predictions

## Evaluation.

### Rapport de classification.

In [19]:
def my_classification_report_plot(model_name, df, class_name, pred, y_test, figsize):
	"""
	Affiche le rapport de classification.
	@param model_name: le nom du modele.
	@param df: le Dataframe avec les textes.
	@param class_name: le nom de la classe.
	@param pred: le tableau de prediction.
	@param y_test: les classes du test set.
	@param figsize: la taille de la figure.
	"""
	
	labels = np.unique(df[class_name].values)
	class_report = classification_report(y_true=y_test, y_pred=pred, target_names=labels, output_dict=True)
	plt.figure(figsize=figsize)
	sns.heatmap(pd.DataFrame(class_report).iloc[:-1, :].T, annot=True)
	plt.title(f"Rapport de classification pour {model_name}")
	plt.savefig(f"{OUT_IMG}classification_raport_{model_name}.png", bbox_inches='tight')
	plt.show()

In [20]:
def my_classification_report(model_name, pred, y_test):
	"""
	Affiche le rapport de classification.
	@param model_name: le nom du modele.
	@param pred: le tableau de prediction.
	@param y_test: les classes du test set.
	"""
	
	print("Rapport de classification pour {}".format(model_name))
	print(classification_report(y_true=y_test, y_pred=pred))

### Matrice de confusion.

In [21]:
def my_confusion_matrix(model_name, df, class_name, pred, y_test, figsize):
    """
	Affiche le rapport de classification.
	@param model_name: le nom du modele.
	@param df: le Dataframe avec les textes.
	@param class_name: le nom de la classe.
	@param pred: le tableau de prediction.
	@param y_test: les classes du test set.
	@param figsize: la taille de la figure.
	"""
    
    labels = np.unique(df[class_name].values)
    conf_matrix = confusion_matrix(y_test, pred, labels=labels)
    df_conf_matrix = pd.DataFrame(conf_matrix, columns=labels)
    df_conf_matrix["index"] = labels
    df_conf_matrix = df_conf_matrix.set_index("index")

    plt.figure(figsize=figsize)
    sns.heatmap(df_conf_matrix, annot=True, fmt="d", cmap="coolwarm")
    plt.title(f"Matrice de confusion pour {model_name}")
    plt.savefig(f"{OUT_IMG}confusion_matrix_{model_name}.png", bbox_inches='tight')
    plt.show()

### Courbe d'apprentissage.

In [22]:
def my_learning_curve(model_name, model, x_train, y_train, figsize):
    """
	Affiche le rapport de classification.
	@param model_name: le nom du modele.
	@param model: le modele de classification.
    @param x_train: les donnees du train set.
    @param y_train: les classes du train set.
    @param figsize: la taille de la figure.
	"""

    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model, X=x_train, y=y_train, 
        train_sizes=np.linspace(0.1, 1.0, 4),
        n_jobs=10, verbose=0, random_state=42)

    #
    # Calculate training and test mean and std
    #
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    #
    # Plot the learning curve
    #
    plt.figure(figsize=figsize)
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training Accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', marker='+', markersize=5, linestyle='--', label='Validation Accuracy')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
    plt.title("Learning curve pour {}".format(model_name))
    plt.xlabel('Training Data Size')
    plt.ylabel('Model accuracy')
    plt.grid()
    plt.legend(loc='lower right')
    plt.savefig(f"{OUT_IMG}learning_curve_{model_name}.png", bbox_inches='tight')
    plt.show()

### *Roc curve*.

In [23]:
def plot_roc_curve(model_name, model_fit, X_test, y_test, figsize): 
	"""
	Affiche le rapport de classification.
	@param model_name: le nom du modele.
	@param model_fit: le modele de classification.
	@param X_test: les donnees du test set.
	@param y_test: les classes du test set.
	@param figsize: la taille de la figure.
	"""

	probs = model_fit.predict_proba(X_test)  
	probs = probs[:, 1]
	fper, tper, thresholds = roc_curve(y_test, probs) 

	plt.figure(figsize=figsize)
	plt.plot(fper, tper, color='orange', label='ROC')
	plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
	plt.xlabel('False Positive Rate')
	plt.ylabel('True Positive Rate')
	plt.title("ROC curve pour {}".format(model_name))
	plt.savefig(f"{OUT_IMG}roc_curve_{model_name}.png", bbox_inches='tight')
	plt.legend()
	plt.show()

### Evaluation global.

In [24]:
def evaluation(liste_prediction, df, class_name, X_train, X_test, y_train, y_test):
	"""
	Affiche le rapport de classification.
	@param liste_prediction: la liste contenant les modeles.
	@param df: le Dataframe avec les textes.
	@param class_name: le nom de la classe.
	@param X_train: les donnees du train set.
	@param X_test: les donnees du test set.
	@param y_train: les classes du train set.
	@param y_test: les classes du test set.
	"""
	for nom, model_fit, y_pred_test in liste_prediction:
		figsize = (4,3)
		my_classification_report(nom, y_pred_test, y_test)

		my_confusion_matrix(nom, df, class_name, y_pred_test, y_test, figsize)

		my_learning_curve(nom, model_fit, X_train, y_train, figsize)

		plot_roc_curve(nom, model_fit, X_test, y_test, figsize)

# Lecture des fichiers et création des *datasets*.

## *Trainset*.

### *Positive reviews*

In [25]:
df_train_pos = create_dataset("./data/train/pos/")
df_train_pos

file_count : 12500


Unnamed: 0,review,rating,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,9,1
1,Homelessness (or Houselessness as George Carli...,8,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,10,1
3,This is easily the most underrated film inn th...,7,1
4,This is not the typical Mel Brooks film. It wa...,8,1
...,...,...,...
12495,"Seeing as the vote average was pretty low, and...",9,1
12496,"The plot had some wretched, unbelievable twist...",8,1
12497,I am amazed at how this movie(and most others ...,10,1
12498,A Christmas Together actually came before my t...,8,1


### *Negative reviews*

In [26]:
df_train_neg = create_dataset("./data/train/neg/")
df_train_neg

file_count : 12500


Unnamed: 0,review,rating,sentiment
0,Story of a man who has unnatural feelings for ...,3,0
1,Airport '77 starts as a brand new luxury 747 p...,4,0
2,This film lacked something I couldn't put my f...,4,0
3,"Sorry everyone,,, I know this is supposed to b...",1,0
4,When I was little my parents took me along to ...,1,0
...,...,...,...
12495,"Towards the end of the movie, I felt it was to...",4,0
12496,This is the kind of movie that my enemies cont...,3,0
12497,I saw 'Descent' last night at the Stockholm Fi...,3,0
12498,Some films that you pick up for a pound turn o...,1,0


### Création du *trainset*.

In [27]:
df_train = pd.concat([df_train_pos, df_train_neg], axis=0)
df_train.reset_index(drop=True, inplace=True)
df_train

Unnamed: 0,review,rating,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,9,1
1,Homelessness (or Houselessness as George Carli...,8,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,10,1
3,This is easily the most underrated film inn th...,7,1
4,This is not the typical Mel Brooks film. It wa...,8,1
...,...,...,...
24995,"Towards the end of the movie, I felt it was to...",4,0
24996,This is the kind of movie that my enemies cont...,3,0
24997,I saw 'Descent' last night at the Stockholm Fi...,3,0
24998,Some films that you pick up for a pound turn o...,1,0


### Ecriture dans un fichier.

In [28]:
df_train.to_csv(f"{OUT_CSV}df_train.csv", index=False)

## *Testset*.

### *Positive reviews*

In [29]:
df_test_pos = create_dataset("./data/test/pos/")
df_test_pos

file_count : 12500


Unnamed: 0,review,rating,sentiment
0,I went and saw this movie last night after bei...,10,1
1,Actor turned director Bill Paxton follows up h...,7,1
2,As a recreational golfer with some knowledge o...,9,1
3,"I saw this film in a sneak preview, and it is ...",8,1
4,Bill Paxton has taken the true story of the 19...,8,1
...,...,...,...
12495,I was extraordinarily impressed by this film. ...,8,1
12496,"Although I'm not a golf fan, I attended a snea...",10,1
12497,"From the start of ""The Edge Of Love"", the view...",8,1
12498,"This movie, with all its complexity and subtle...",10,1


### *Negative reviews*

In [30]:
df_test_neg = create_dataset("./data/test/neg/")
df_train_neg

file_count : 12500


Unnamed: 0,review,rating,sentiment
0,Story of a man who has unnatural feelings for ...,3,0
1,Airport '77 starts as a brand new luxury 747 p...,4,0
2,This film lacked something I couldn't put my f...,4,0
3,"Sorry everyone,,, I know this is supposed to b...",1,0
4,When I was little my parents took me along to ...,1,0
...,...,...,...
12495,"Towards the end of the movie, I felt it was to...",4,0
12496,This is the kind of movie that my enemies cont...,3,0
12497,I saw 'Descent' last night at the Stockholm Fi...,3,0
12498,Some films that you pick up for a pound turn o...,1,0


### Création du *testset*.

In [31]:
df_test = pd.concat([df_test_pos, df_test_neg], axis=0)
df_test.reset_index(drop=True, inplace=True)
df_test

Unnamed: 0,review,rating,sentiment
0,I went and saw this movie last night after bei...,10,1
1,Actor turned director Bill Paxton follows up h...,7,1
2,As a recreational golfer with some knowledge o...,9,1
3,"I saw this film in a sneak preview, and it is ...",8,1
4,Bill Paxton has taken the true story of the 19...,8,1
...,...,...,...
24995,I occasionally let my kids watch this garbage ...,1,0
24996,When all we have anymore is pretty much realit...,1,0
24997,The basic genre is a thriller intercut with an...,3,0
24998,Four things intrigued me as to this film - fir...,3,0


### Ecriture dans un fichier externe.

In [32]:
df_test.to_csv(f"{OUT_CSV}df_test.csv", index=False)

# *Preprocessing*.

In [33]:
df_train_prepro = copy.deepcopy(df_train)
df_test_prepro = copy.deepcopy(df_test)

## *Trainset*.

In [34]:
if DEBUG:
	df_train_prepro = df_train.sample(TEST)

In [35]:
df_train_prepro["review"] = df_train_prepro["review"].apply(strip_html)
df_train_prepro.head()



Unnamed: 0,review,rating,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,9,1
1,Homelessness (or Houselessness as George Carli...,8,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,10,1
3,This is easily the most underrated film inn th...,7,1
4,This is not the typical Mel Brooks film. It wa...,8,1


In [36]:
df_train_prepro["review"] = df_train_prepro["review"].apply(remove_contraction)
df_train_prepro.head()

Unnamed: 0,review,rating,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,9,1
1,Homelessness (or Houselessness as George Carli...,8,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,10,1
3,This is easily the most underrated film inn th...,7,1
4,This is not the typical Mel Brooks film. It wa...,8,1


In [37]:
df_train_prepro["review"] = df_train_prepro["review"].apply(remove_punctuation)
df_train_prepro.head()

Unnamed: 0,review,rating,sentiment
0,Bromwell High is a cartoon comedy It ran at th...,9,1
1,Homelessness or Houselessness as George Carlin...,8,1
2,Brilliant overacting by Lesley Ann Warren Best...,10,1
3,This is easily the most underrated film inn th...,7,1
4,This is not the typical Mel Brooks film It was...,8,1


In [38]:
df_train_prepro["review"] = df_train_prepro["review"].apply(lemmatize)
df_train_prepro.head()

In [None]:
df_train_prepro["review"] = df_train_prepro["review"].apply(remove_stopwords)
df_train_prepro.head()

In [None]:
df_train_prepro["review"] = df_train_prepro["review"].apply(remove_special_characters)
df_train_prepro.head()

In [None]:
df_train_prepro["review"] = df_train_prepro["review"].apply(remove_usless_word)
df_train_prepro.head()

### Ecriture dans un fichier.

In [None]:
df_train_prepro.to_csv(f"{OUT_CSV}df_train_prepro.csv", index=False)

## *Testset*.

In [None]:
if DEBUG:
	df_test_prepro = df_test_prepro.sample(TEST)

In [None]:
df_test_prepro["review"] = df_test_prepro["review"].apply(strip_html)
df_test_prepro.head()

In [None]:
df_test_prepro["review"] = df_test_prepro["review"].apply(remove_contraction)
df_test_prepro.head()

In [None]:
df_test_prepro["review"] = df_test_prepro["review"].apply(remove_punctuation)
df_test_prepro.head()

In [None]:
df_test_prepro["review"] = df_test_prepro["review"].apply(lemmatize)
df_test_prepro.head()

In [None]:
df_test_prepro["review"] = df_test_prepro["review"].apply(remove_stopwords)
df_test_prepro.head()

In [None]:
df_test_prepro["review"] = df_test_prepro["review"].apply(remove_special_characters)
df_test_prepro.head()

In [None]:
df_test_prepro["review"] = df_test_prepro["review"].apply(remove_usless_word)
df_test_prepro.head()

### Ecriture dans un fichier.

In [None]:
df_test_prepro.to_csv(f"{OUT_CSV}df_test_prepro.csv", index=False)

## Suppression des mots communs.

In [None]:
# df_train_prepro2 = pd.read_csv(f"{OUT_CSV}df_train_prepro.csv")
# df_test_prepro2 = pd.read_csv(f"{OUT_CSV}df_test_prepro.csv")

### *Trainset*.

In [None]:
# df_train_prepro2 = remove_common_word(df_train_prepro2)

In [None]:
# df_train_prepro2.to_csv(f"{OUT_CSV}df_train_prepro2.csv", index=False)

### *Testset*.

In [None]:
# df_test_prepro2 = remove_common_word(df_test_prepro2)

In [None]:
# df_test_prepro2.to_csv(f"{OUT_CSV}df_test_prepro2.csv", index=False)

# *Data visualization*.

In [None]:
df_train_visu = pd.read_csv(f"{OUT_CSV}df_train_prepro.csv")
df_test_visu = pd.read_csv(f"{OUT_CSV}df_test_prepro.csv")

## Mots plus communs.

In [None]:
plot_most_common_words(df_train_visu)

In [None]:
plot_most_common_words(df_test_visu)

## Nuage de mots.

In [None]:
my_word_cloud(df_train_visu)

In [None]:
my_word_cloud(df_test_visu)

# Classification.

In [None]:
df_train_classify = pd.read_csv(f"{OUT_CSV}df_train_prepro.csv")
df_test_classify = pd.read_csv(f"{OUT_CSV}df_test_prepro.csv")

In [None]:
classifiers = {
    "LogisticRegression": LogisticRegression(), # 87%
    "MultinomialNB": MultinomialNB(), # 85%
    "ComplementNB": ComplementNB(), # 85%
    "BernoulliNB": BernoulliNB(), # 84%
    # "RandomForestClassifier": RandomForestClassifier(), # 83%
    # "AdaBoostClassifier": AdaBoostClassifier(), # 80%
    # "KNeighborsClassifier": KNeighborsClassifier(), # 71%
    # "DecisionTreeClassifier": DecisionTreeClassifier(), # 70%
}

## Découpage en X et y.

In [None]:
X_train, y_train = df_train_classify["review"].values, df_train_classify["sentiment"].values
X_test, y_test = df_test_classify["review"].values, df_test_classify["sentiment"].values

## Paramètre par défaut.

### Entrainement.

In [None]:
liste_predictions = entrainement(classifiers, X_train, X_test, y_train, y_test)

### Evaluation.

In [None]:
evaluation(liste_predictions, df_test_classify, "sentiment", X_train, X_test, y_train, y_test)

## Amélioration des paramètres.

In [None]:
model = Pipeline([
	("vectorize", CountVectorizer()),
	("tfidf", TfidfTransformer()),
	("logistic", LogisticRegression(max_iter=1000)),
])

max_features = [1000, 1500, 3000]
tdidf = ["l1","l2"]

param_grid_lbfgs = {
	"vectorize__max_features": max_features,
	"tfidf__norm": tdidf,
	"logistic__penalty": ["l2"],
	"logistic__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
	"logistic__solver": ["lbfgs"]
}

param_grid_sag= {
	"vectorize__max_features": max_features,
	"tfidf__norm": tdidf,
	"logistic__penalty": ["l2"],
	"logistic__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
	"logistic__solver": ["sag"]
}

param_grid_saga= {
	"vectorize__max_features": max_features,
	"tfidf__norm": tdidf,
	"logistic__penalty": ["elasticnet", "l1", "l2"],
	"logistic__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
	"logistic__solver": ["saga"]
}

param_grid = [param_grid_lbfgs, param_grid_sag, param_grid_sag]
grid = RandomizedSearchCV(model, param_grid, n_iter=50)
# grid = GridSearchCV(model, param_grid)
grid.fit(X_train, y_train)
grid.best_estimator_

### Entrainement.

In [None]:
# grid.best_estimator_
# Pipeline(steps=[('vectorize', CountVectorizer(max_features=3000)),
#                 ('tfidf', TfidfTransformer()),
#                 ('logistic', LogisticRegression(C=10, max_iter=1000, solver='sag'))])

best_model = grid.best_estimator_

print("fitting...", end="")
best_model.fit(X_train, y_train)
print("Done")

print("predicting labels...", end="")
y_pred = best_model.predict(X_test)
print("Done")

### Evaluation.

In [None]:
my_classification_report("LogisticRegression", y_pred, y_test)