# Text mining project
## Goals
• Gain practical experience with the complete data mining process

• Get to know additional problem-specific

    • pre-processing methods

    • data mining methods
## Expectation
• Select an interesting data mining problem of your choice

• Solve the problem using

• the data mining methods that we have learned so far, including

• proper parameter optimization

• problem-specific pre-processing and smart feature creation

• additional data mining methods which might be helpful for solving the problem

# Importations.

## NLTK

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# Download every NLTK resource this notebook relies on, not only the stop-word
# list: word_tokenize needs the punkt models, pos_tag needs the perceptron
# tagger and WordNetLemmatizer needs WordNet. Both the historical and the
# recent resource names are requested; nltk.download only reports an error
# (it does not raise) for a name that the installed version does not know.
for _resource in (
    "stopwords",
    "punkt",
    "punkt_tab",
    "wordnet",
    "omw-1.4",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(_resource)

# English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Shared lemmatizer instance used by lemmatize().
lemmatizer = WordNetLemmatizer()

## Sklearn

In [None]:
# Sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import learning_curve, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

## Classifier
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.inspection import DecisionBoundaryDisplay

## Autre

In [None]:
from bs4 import BeautifulSoup # suppression des balises html.
import codecs # lire les fichiers.
import contractions
import copy
import os # manipulation des fichiers et des dossiers.
import re # utilisation de regex.
import sys # quitter le programme en cas d'erreur.
import string # suppression de la ponctuation.
from tqdm import tqdm

In [None]:
import pandas as pd # DataFrame, ...
import numpy as np # array, ...

In [None]:
# Visualisation
import seaborn as sns
from matplotlib import pyplot as plt
from wordcloud import WordCloud

# Variables globales.

In [None]:
# Debug switch: when True, the preprocessing cells work on a random sample of
# TEST rows instead of the full datasets.
DEBUG = False
# Number of rows sampled from each dataset when DEBUG is True.
TEST = 3000
# Vocabulary size passed to CountVectorizer(max_features=...) in entrainement().
MAX_FEATURES = 1500
# Path prefix of the CSV files written with to_csv (used by plain string
# concatenation, so the trailing slash matters).
OUT_CSV = "./out/csv/"
# Path prefix of the figures written with plt.savefig (same convention).
OUT_IMG = "./out/img/"

# Fonctions.

## Outils.

### Lecture des fichiers et création des *datasets*.

In [None]:
def create_dataset(directory):
	"""
	Build a DataFrame from the review files stored in "directory".

	Every regular file must be named "<id>_<rating>.<extension>"; the rating is
	read from the file name and the review text from the file content (UTF-8).
	The sentiment is 1 when the last component of "directory" contains "pos",
	0 otherwise, so a parent folder whose name happens to contain "pos" no
	longer turns a negative folder into a positive one.
	Files are read in sorted name order so that the row order is reproducible.
	The program exits (sys.exit) when a file name cannot be parsed.

	@param directory: the directory holding the files to read.
	@return dataset: DataFrame with the columns "review", "rating", "sentiment".
	"""
	# Only the final folder name decides the label.
	folder_name = os.path.basename(os.path.normpath(directory))
	sentiment = 1 if "pos" in folder_name else 0

	data = []

	# Loop for files (os.listdir returns them in arbitrary order, hence sorted).
	for filename in sorted(os.listdir(directory)):
		file = os.path.join(directory, filename)
		if not os.path.isfile(file):
			continue

		# File name parsing to get the rating: "<id>_<rating>.<extension>".
		name_parts = filename.split(".")[0].split("_")
		if len(name_parts) < 2:
			sys.exit("Error parsing file name: {}".format(filename))
		try:
			rating = int(name_parts[1])
		except ValueError:
			sys.exit("Error casting rating to int")

		with codecs.open(file, "r", encoding="utf-8") as f:
			review = f.read()
		data.append([review, rating, sentiment])

	print("file_count : {}".format(len(data)))
	return pd.DataFrame(data, columns=["review", "rating", "sentiment"])

## *Preprocessing*.

Remplace les contractions anglaises.

In [None]:
def remove_contraction(text):
	"""
	Return "text" after passing it through contractions.fix.
	@param text: the text to process.
	@return the value returned by contractions.fix(text).
	"""
	expanded = contractions.fix(text)
	return expanded

Suppression des balises html.

In [None]:
def strip_html(text):
    """
    Return the textual content of "text", parsed with BeautifulSoup's
    "html.parser" and extracted with get_text().
    @param text: the (possibly HTML) text to process.
    @return the text extracted from the parsed document.
    """
    return BeautifulSoup(text, "html.parser").get_text()

Suppression de la ponctuation.

In [None]:
def remove_punctuation(text):
    """
    Delete every character listed in string.punctuation from "text".
    @param text: the text to process.
    @return the text without punctuation characters.
    """
    # A translation table mapping a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

Lemmatisation.

In [None]:
def lemmatize(text):
	"""
	Lemmatize every word of "text" and drop the proper nouns.

	The text is tokenized with word_tokenize and tagged with pos_tag:
	- words tagged as verbs are lemmatized as verbs (pos="v");
	- words tagged as proper nouns (NNP, NNPS) are removed;
	- every other word is lemmatized with the lemmatizer's default setting.

	"NNS" is deliberately NOT in the removal list: in the Penn Treebank tag
	set it marks plural common nouns, so listing it deleted every plural noun
	from the reviews instead of only the proper nouns.

	@param text: the text to process.
	@return the remaining lemmas joined by single spaces.
	"""
	verbs = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}
	proper_noun = {"NNP", "NNPS"}

	lemmas = []
	for word, tag in pos_tag(word_tokenize(text)):
		if tag in proper_noun:
			continue
		if tag in verbs:
			lemmas.append(lemmatizer.lemmatize(word, pos="v"))
		else:
			lemmas.append(lemmatizer.lemmatize(word))
	return ' '.join(lemmas)

Suppression des *stop words*.

In [None]:
def remove_stopwords(text):
	"""
	Remove the tokens of "text" whose lowercase form is in stop_words.
	@param text: the text to process.
	@return the remaining tokens joined by single spaces.
	"""
	kept = []
	for token in word_tokenize(text):
		if token.lower() in stop_words:
			continue
		kept.append(token)
	return ' '.join(kept)

Suppression de caractères spéciaux et des chiffres.

In [None]:
def remove_special_characters(text):
    """
    Remove every character that is neither an ASCII letter nor whitespace
    (digits, symbols, accented characters, ...).
    @param text: the text to process.
    @return the text containing only ASCII letters and whitespace.
    """
    # The upper-case range must be "A-Z": "A-z" also spans the characters
    # located between "Z" and "a" in ASCII ([ \ ] ^ _ `), which were kept.
    pattern = r'[^a-zA-Z\s]'
    return re.sub(pattern, '', text)

Suppression des mots inutiles.

In [None]:
def remove_usless_word(text):
	"""
	Remove a fixed list of words ("movie", "film", "one", "story") from "text",
	comparing tokens in lowercase.
	@param text: the text to process.
	@return the remaining tokens joined by single spaces.
	"""
	usless_word = ["movie", "film", "one", "story"]
	kept = []
	for token in word_tokenize(text):
		if token.lower() not in usless_word:
			kept.append(token)
	return ' '.join(kept)

Suppression des mots communs entre les *positive reviews* et *negative reviews*.

In [None]:
def remove_common_word(df):
	"""
	Remove the words shared by the positive and the negative reviews.

	The vocabulary of each class is built with word_tokenize; the reviews are
	then filtered on whitespace-separated words. Reviews left without any word
	are dropped, and only the "review" and "sentiment" columns are returned.
	The number of removed words is printed.

	@param df: a DataFrame with the columns "review" and "sentiment".
	@return a new DataFrame holding the filtered reviews and their sentiment.
	"""

	def vocabulary(label):
		# Set of the tokens found in the reviews carrying the given sentiment.
		words = set()
		for review in df[df["sentiment"] == label]["review"].values:
			words.update(word_tokenize(review))
		return words

	# Words present in both classes.
	common_set = vocabulary(1) & vocabulary(0)

	new_reviews = []
	new_sentiment = []
	words_removed = 0
	for index, review in enumerate(df["review"].values):
		candidates = review.split()
		kept = [word for word in candidates if word not in common_set]
		words_removed += len(candidates) - len(kept)
		# A review emptied by the filter is dropped together with its label.
		if kept:
			new_reviews.append(" ".join(kept))
			new_sentiment.append(df.iloc[index]["sentiment"])

	# Display.
	print("{} words removed !".format(words_removed))

	return pd.DataFrame({"review": new_reviews, "sentiment": new_sentiment})

## *Data visualization*

In [None]:
def plot_most_common_words(df, name):
	"""
	Plot, save and show a bar chart of the 50 most frequent words of the reviews.
	@param df: a DataFrame with a "review" column.
	@param name: suffix used in the name of the saved image.
	"""
	nb_common_word = 50

	# Frequency of every token over all the reviews.
	tokens = [word for comment in df['review'] for word in word_tokenize(comment)]
	top_words = FreqDist(tokens).most_common(nb_common_word)

	words = [word for word, _ in top_words]
	counts = [count for _, count in top_words]

	plt.figure(figsize=(15,5))
	plt.bar(words, counts)
	plt.xlabel('Words')
	plt.ylabel('Counts')
	plt.title(f'{nb_common_word} Most Common Words')
	plt.xticks(rotation=90)
	plt.savefig(f"{OUT_IMG}plot_most_common_word_{name}.png", bbox_inches='tight')
	plt.show()

In [None]:
def my_word_cloud(df, name):
	"""
	Build, save and show a word cloud of all the reviews.
	@param df: a DataFrame with a "review" column.
	@param name: suffix used in the name of the saved image.
	"""
	# All the reviews as one space-separated text.
	corpus = " ".join(df.review)

	# Word cloud creation.
	cloud_options = {
		"width": 800,
		"height": 800,
		"background_color": 'white',
		"min_font_size": 10,
	}
	wordcloud = WordCloud(**cloud_options).generate(corpus)

	# Image display.
	plt.figure(figsize = (5, 5), facecolor = None)
	plt.imshow(wordcloud)
	plt.axis("off")
	plt.tight_layout(pad = 0)
	plt.savefig(f"{OUT_IMG}word_cloud_{name}.png", bbox_inches='tight')
	plt.show()

## Entrainement.

In [None]:
def entrainement(classifiers, X_train, X_test, y_train, y_test):
	"""
	Train one pipeline per classifier and predict the classes of the test set.

	Each pipeline chains CountVectorizer (limited to MAX_FEATURES features),
	TfidfTransformer and the classifier. Progress is printed along the way.

	@param classifiers: dictionary mapping a name to a classification model.
	@param X_train: the data of the train set.
	@param X_test: the data of the test set.
	@param y_train: the classes of the train set.
	@param y_test: the classes of the test set (not used by this function).
	@return liste_predictions: a list of tuples (name, trained pipeline, predictions).
	"""
	liste_predictions = []

	for nom in classifiers:
		print(nom)

		# Model training.
		print("fitting...", end="")
		steps = [
			("vectorize", CountVectorizer(max_features=MAX_FEATURES)),
			("tfidf", TfidfTransformer()),
			("classifier", classifiers[nom]),
		]
		fitted_pipeline = Pipeline(steps).fit(X_train, y_train)
		print("Done")

		# Prediction of the classes of the test set.
		print("predicting labels...", end="")
		predictions = fitted_pipeline.predict(X_test)
		print("Done")

		liste_predictions.append((nom, fitted_pipeline, predictions))
		print("")

	return liste_predictions

## Evaluation.

### Rapport de classification.

In [None]:
def my_classification_report_plot(model_name, df, class_name, pred, y_test, figsize):
	"""
	Draw the classification report as a heatmap, save it and show it.
	@param model_name: the name of the model.
	@param df: the DataFrame holding the texts; its distinct class values are used as target names.
	@param class_name: the name of the class column.
	@param pred: the array of predictions.
	@param y_test: the classes of the test set.
	@param figsize: the size of the figure.
	"""
	class_labels = np.unique(df[class_name].values)
	report = classification_report(
		y_true=y_test,
		y_pred=pred,
		target_names=class_labels,
		output_dict=True,
	)
	# The last row of the report table is dropped before transposing
	# (presumably the "support" row - TODO confirm).
	report_table = pd.DataFrame(report).iloc[:-1, :].T

	plt.figure(figsize=figsize)
	sns.heatmap(report_table, annot=True)
	plt.title(f"Rapport de classification pour {model_name}")
	plt.savefig(f"{OUT_IMG}classification_raport_{model_name}.png", bbox_inches='tight')
	plt.show()

In [None]:
def my_classification_report(model_name, pred, y_test):
	"""
	Print the textual classification report of a model.
	@param model_name: the name of the model.
	@param pred: the array of predictions.
	@param y_test: the classes of the test set.
	"""
	report = classification_report(y_true=y_test, y_pred=pred)
	print(f"Rapport de classification pour {model_name}")
	print(report)

### Matrice de confusion.

In [None]:
def my_confusion_matrix(model_name, df, class_name, pred, y_test, figsize):
    """
    Draw the confusion matrix as a heatmap, save it and show it.
    @param model_name: the name of the model.
    @param df: the DataFrame holding the texts; its distinct class values define the matrix labels.
    @param class_name: the name of the class column.
    @param pred: the array of predictions.
    @param y_test: the classes of the test set.
    @param figsize: the size of the figure.
    """
    class_labels = np.unique(df[class_name].values)
    matrix = confusion_matrix(y_test, pred, labels=class_labels)

    # Rows and columns are both labelled with the class values; the row index
    # is named "index".
    df_conf_matrix = pd.DataFrame(
        matrix,
        columns=class_labels,
        index=pd.Index(class_labels, name="index"),
    )

    plt.figure(figsize=figsize)
    sns.heatmap(df_conf_matrix, annot=True, fmt="d", cmap="coolwarm")
    plt.title(f"Matrice de confusion pour {model_name}")
    plt.savefig(f"{OUT_IMG}confusion_matrix_{model_name}.png", bbox_inches='tight')
    plt.show()

### Courbe d'apprentissage.

In [None]:
def my_learning_curve(model_name, model, x_train, y_train, figsize):
    """
    Compute the learning curve of a model, then plot, save and show it.

    The curve is evaluated on 4 training sizes evenly spaced between 10% and
    100% of the data. The mean score is drawn as a line and the band around it
    spans one standard deviation on each side.

    @param model_name: the name of the model.
    @param model: the classification model.
    @param x_train: the data of the train set.
    @param y_train: the classes of the train set.
    @param figsize: the size of the figure.
    """
    sizes, fit_scores, validation_scores = learning_curve(
        estimator=model,
        X=x_train,
        y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 4),
        n_jobs=10,
        verbose=0,
        random_state=42,
    )

    # One entry per curve: the scores and the line options.
    curves = (
        (fit_scores,
         dict(color='blue', marker='o', markersize=5, label='Training Accuracy')),
        (validation_scores,
         dict(color='green', marker='+', markersize=5, linestyle='--', label='Validation Accuracy')),
    )

    plt.figure(figsize=figsize)
    for scores, line_options in curves:
        # Mean and standard deviation of the scores for each training size.
        mean = np.mean(scores, axis=1)
        std = np.std(scores, axis=1)
        plt.plot(sizes, mean, **line_options)
        plt.fill_between(sizes, mean + std, mean - std, alpha=0.15, color=line_options['color'])

    plt.title(f"Learning curve pour {model_name}")
    plt.xlabel('Training Data Size')
    plt.ylabel('Model accuracy')
    plt.grid()
    plt.legend(loc='lower right')
    plt.savefig(f"{OUT_IMG}learning_curve_{model_name}.png", bbox_inches='tight')
    plt.show()

### *Roc curve*.

In [None]:
def plot_roc_curve(model_name, model_fit, X_test, y_test, figsize): 
	"""
	Plot, save and show the ROC curve of a trained model.
	@param model_name: the name of the model.
	@param model_fit: the trained classification model (must provide predict_proba).
	@param X_test: the data of the test set.
	@param y_test: the classes of the test set.
	@param figsize: the size of the figure.
	"""
	# The second column of predict_proba is used as the score of the positive class.
	probs = model_fit.predict_proba(X_test)[:, 1]
	fper, tper, thresholds = roc_curve(y_test, probs)

	plt.figure(figsize=figsize)
	plt.plot(fper, tper, color='orange', label='ROC')
	plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
	plt.xlabel('False Positive Rate')
	plt.ylabel('True Positive Rate')
	plt.title("ROC curve pour {}".format(model_name))
	# The legend must be added before saving, otherwise the image written to
	# disk does not contain it.
	plt.legend()
	plt.savefig(f"{OUT_IMG}roc_curve_{model_name}.png", bbox_inches='tight')
	plt.show()

### Evaluation globale.

In [None]:
def evaluation(liste_prediction, df, class_name, X_train, X_test, y_train, y_test):
	"""
	Run every evaluation step on each trained model: textual classification
	report, confusion matrix, learning curve and ROC curve.
	@param liste_prediction: list of tuples (name, trained model, predictions on the test set).
	@param df: the DataFrame holding the texts.
	@param class_name: the name of the class column.
	@param X_train: the data of the train set.
	@param X_test: the data of the test set.
	@param y_train: the classes of the train set.
	@param y_test: the classes of the test set.
	"""
	# Every figure uses the same size.
	figsize = (4,3)

	for entry in liste_prediction:
		nom, model_fit, y_pred_test = entry

		my_classification_report(nom, y_pred_test, y_test)
		my_confusion_matrix(nom, df, class_name, y_pred_test, y_test, figsize)
		my_learning_curve(nom, model_fit, X_train, y_train, figsize)
		plot_roc_curve(nom, model_fit, X_test, y_test, figsize)

# Lecture des fichiers et création des *datasets*.

## *Trainset*.

### *Positive reviews*

In [None]:
# Load the positive training reviews; the trailing expression displays the DataFrame.
df_train_pos = create_dataset("./data/train/pos/")
df_train_pos

### *Negative reviews*

In [None]:
# Load the negative training reviews; the trailing expression displays the DataFrame.
df_train_neg = create_dataset("./data/train/neg/")
df_train_neg

### Création du *trainset*.

In [None]:
# Stack the positive and negative training reviews and renumber the rows from 0.
df_train = pd.concat([df_train_pos, df_train_neg], axis=0).reset_index(drop=True)
df_train

### Ecriture dans un fichier.

In [None]:
# Save the raw train set as CSV, without the index column.
df_train.to_csv(f"{OUT_CSV}df_train.csv", index=False)

## *Testset*.

### *Positive reviews*

In [None]:
# Load the positive test reviews; the trailing expression displays the DataFrame.
df_test_pos = create_dataset("./data/test/pos/")
df_test_pos

### *Negative reviews*

In [None]:
# Load the negative test reviews and display them (the cell used to display
# the negative TRAIN reviews by mistake).
df_test_neg = create_dataset("./data/test/neg/")
df_test_neg

### Création du *testset*.

In [None]:
# Stack the positive and negative test reviews and renumber the rows from 0.
df_test = pd.concat([df_test_pos, df_test_neg], axis=0).reset_index(drop=True)
df_test

### Ecriture dans un fichier externe.

In [None]:
# Save the raw test set as CSV, without the index column.
df_test.to_csv(f"{OUT_CSV}df_test.csv", index=False)

# *Preprocessing*.

In [None]:
# Work on independent deep copies so that the raw datasets stay untouched.
df_train_prepro = df_train.copy(deep=True)
df_test_prepro = df_test.copy(deep=True)

## *Trainset*.

In [None]:
# Debug mode: keep only a random sample of TEST rows of the train set.
# The sample is drawn from the raw df_train, without a fixed random seed.
if DEBUG:
	df_train_prepro = df_train.sample(TEST)

In [None]:
# Step 1 (train): strip the HTML markup; head() displays the first rows.
df_train_prepro["review"] = df_train_prepro["review"].apply(strip_html)
df_train_prepro.head()

In [None]:
# Step 2 (train): apply remove_contraction to every review.
df_train_prepro["review"] = df_train_prepro["review"].apply(remove_contraction)
df_train_prepro.head()

In [None]:
# Step 3 (train): remove the punctuation characters.
df_train_prepro["review"] = df_train_prepro["review"].apply(remove_punctuation)
df_train_prepro.head()

In [None]:
# Step 4 (train): lemmatize the words (runs before the stop-word removal).
df_train_prepro["review"] = df_train_prepro["review"].apply(lemmatize)
df_train_prepro.head()

In [None]:
# Step 5 (train): remove the English stop words.
df_train_prepro["review"] = df_train_prepro["review"].apply(remove_stopwords)
df_train_prepro.head()

In [None]:
# Step 6 (train): remove the special characters and the digits.
df_train_prepro["review"] = df_train_prepro["review"].apply(remove_special_characters)
df_train_prepro.head()

In [None]:
# Step 7 (train): remove the fixed list of words handled by remove_usless_word.
df_train_prepro["review"] = df_train_prepro["review"].apply(remove_usless_word)
df_train_prepro.head()

### Ecriture dans un fichier.

In [None]:
# Save the preprocessed train set; the visualization and classification
# sections read this file back.
df_train_prepro.to_csv(f"{OUT_CSV}df_train_prepro.csv", index=False)

## *Testset*.

In [None]:
# Debug mode: keep only a random sample of TEST rows of the test set
# (no fixed random seed).
if DEBUG:
	df_test_prepro = df_test_prepro.sample(TEST)

In [None]:
# Step 1 (test): strip the HTML markup; head() displays the first rows.
df_test_prepro["review"] = df_test_prepro["review"].apply(strip_html)
df_test_prepro.head()

In [None]:
# Step 2 (test): apply remove_contraction to every review.
df_test_prepro["review"] = df_test_prepro["review"].apply(remove_contraction)
df_test_prepro.head()

In [None]:
# Step 3 (test): remove the punctuation characters.
df_test_prepro["review"] = df_test_prepro["review"].apply(remove_punctuation)
df_test_prepro.head()

In [None]:
# Step 4 (test): lemmatize the words (runs before the stop-word removal).
df_test_prepro["review"] = df_test_prepro["review"].apply(lemmatize)
df_test_prepro.head()

In [None]:
# Step 5 (test): remove the English stop words.
df_test_prepro["review"] = df_test_prepro["review"].apply(remove_stopwords)
df_test_prepro.head()

In [None]:
# Step 6 (test): remove the special characters and the digits.
df_test_prepro["review"] = df_test_prepro["review"].apply(remove_special_characters)
df_test_prepro.head()

In [None]:
# Step 7 (test): remove the fixed list of words handled by remove_usless_word.
df_test_prepro["review"] = df_test_prepro["review"].apply(remove_usless_word)
df_test_prepro.head()

### Ecriture dans un fichier.

In [None]:
# Save the preprocessed test set; the visualization and classification
# sections read this file back.
df_test_prepro.to_csv(f"{OUT_CSV}df_test_prepro.csv", index=False)

# *Data visualization*.

In [None]:
# Reload the preprocessed datasets from disk for the visualizations.
df_train_visu = pd.read_csv(f"{OUT_CSV}df_train_prepro.csv")
df_test_visu = pd.read_csv(f"{OUT_CSV}df_test_prepro.csv")

## Mots les plus communs.

In [None]:
# Most frequent words of the whole preprocessed train set.
plot_most_common_words(df_train_visu, "train")

In [None]:
# Train set without the words shared by the positive and the negative reviews;
# remove_common_word receives a deep copy of the visualization DataFrame.
df_train_visu2 = remove_common_word(df_train_visu.copy(deep=True))

In [None]:
# Most frequent words found only in the positive training reviews.
df_train_pos_visu = df_train_visu2[df_train_visu2["sentiment"] == 1]
plot_most_common_words(df_train_pos_visu, "train_pos")

In [None]:
# Most frequent words found only in the negative training reviews.
df_train_neg_visu = df_train_visu2[df_train_visu2["sentiment"] == 0]
plot_most_common_words(df_train_neg_visu, "train_neg")

In [None]:
# Most frequent words of the whole preprocessed test set.
plot_most_common_words(df_test_visu, "test")

## Nuage de mots.

In [None]:
# Word cloud of the preprocessed train set.
my_word_cloud(df_train_visu, "train")

In [None]:
# Word cloud of the preprocessed test set.
my_word_cloud(df_test_visu, "test")

# Classification.

In [None]:
# Reload the preprocessed datasets from disk for the classification.
# NOTE(review): a review left empty by the preprocessing would presumably be
# read back as NaN instead of "" - confirm that none exists before vectorizing.
df_train_classify = pd.read_csv(f"{OUT_CSV}df_train_prepro.csv")
df_test_classify = pd.read_csv(f"{OUT_CSV}df_test_prepro.csv")

In [None]:
# Models compared in this section, keyed by display name, all with their
# default parameters. The commented-out entries are disabled.
# NOTE(review): the trailing percentages look like scores recorded by the
# author on a previous run - confirm before relying on them.
classifiers = {
    "LogisticRegression": LogisticRegression(), # 87%
    "MultinomialNB": MultinomialNB(), # 85%
    "ComplementNB": ComplementNB(), # 85%
    "BernoulliNB": BernoulliNB(), # 84%
    # "RandomForestClassifier": RandomForestClassifier(), # 83%
    # "AdaBoostClassifier": AdaBoostClassifier(), # 80%
    # "KNeighborsClassifier": KNeighborsClassifier(), # 71%
    # "DecisionTreeClassifier": DecisionTreeClassifier(), # 70%
}

## Découpage en X et y.

In [None]:
# Features are the review texts, targets are the sentiment labels.
X_train = df_train_classify["review"].values
y_train = df_train_classify["sentiment"].values
X_test = df_test_classify["review"].values
y_test = df_test_classify["sentiment"].values

## Paramètres par défaut.

### Entrainement.

In [None]:
# Train every classifier of the dictionary and collect, for each one, the
# tuple (name, trained pipeline, predictions on the test set).
liste_predictions = entrainement(classifiers, X_train, X_test, y_train, y_test)

### Evaluation.

In [None]:
# Report, confusion matrix, learning curve and ROC curve of every trained model.
evaluation(liste_predictions, df_test_classify, "sentiment", X_train, X_test, y_train, y_test)

In [None]:
# import eli5
# from eli5.lime import TextExplainer

# text = df_test_classify.review.to_list()[0]
# pipeline = liste_predictions[0][1]
# prediction = pipeline.predict([text])[0]

# explainer = TextExplainer(random_state=0)
# explainer.fit(text, pipeline.predict_proba)

# print("Prediction:", df_test_classify.sentiment[prediction])
# print("Explanation:")
# explainer.show_prediction(target_names=df_test_classify.sentiment.tolist())

## Comparaison.

### Matrice de confusion.

In [None]:
# Compare the number of true positives and true negatives of every model.
noms = []
vrai_positif = []
vrai_negatif = []
# The labels do not depend on the model: compute them once.
labels = np.unique(df_train_classify["sentiment"].values)
for nom, _, y_pred_test in liste_predictions:
	conf_matrix = confusion_matrix(y_test, y_pred_test, labels=labels)

	noms.append(nom)
	# Rows are the true classes and columns the predicted ones, both in the
	# order of "labels" (0 = negative, then 1 = positive): [0][0] counts the
	# true negatives and [1][1] the true positives (they used to be swapped).
	vrai_negatif.append(conf_matrix[0][0])
	vrai_positif.append(conf_matrix[1][1])

data = {"nom": noms, "vrai_positif": vrai_positif, "vrai_negatif": vrai_negatif}
df_matric_confu = pd.DataFrame(data, columns=["nom", "vrai_positif", "vrai_negatif"])
df_matric_confu.plot.bar(x="nom", rot=30, title="Comparaison des vrais positifs et des vrais negatifs de chaque algorithme")
plt.legend(loc="lower right")
plt.savefig(f"{OUT_IMG}comparaison_matrice_confusion.png", bbox_inches='tight')
plt.show()

### *ROC curve*.

In [None]:
# Draw the ROC curve of every trained model on the same figure.
plt.figure(figsize=(5,5))
colors = ["blue", "red", "green", "yellow"]
for index, (nom, model_fit, _) in enumerate(liste_predictions):
	# The second column of predict_proba is used as the score of the positive class.
	probs = model_fit.predict_proba(X_test)
	probs = probs[:, 1]
	fper, tper, thresholds = roc_curve(y_test, probs)
	# Cycle through the colours so that enabling more than four classifiers
	# does not raise an IndexError.
	plt.plot(fper, tper, color=colors[index % len(colors)], label=f"{nom}")

plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("Comparaison des ROC curve de chaque algorithme")
plt.legend()
plt.savefig(f"{OUT_IMG}comparaison_roc_curve.png", bbox_inches='tight')
plt.show()

## Amélioration des paramètres.

In [None]:
# Randomized search of the best vectorizer / tf-idf / logistic regression settings.
model = Pipeline([
	("vectorize", CountVectorizer()),
	("tfidf", TfidfTransformer()),
	("logistic", LogisticRegression(max_iter=1000)),
])

max_features = [1000, 1500, 3000]
c = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
tdidf = ["l1","l2"]

# lbfgs and sag are only searched with the l2 penalty.
param_grid_lbfgs_sag = {
	"vectorize__max_features": max_features,
	"tfidf__norm": tdidf,
	"logistic__solver": ["lbfgs", "sag"],
	"logistic__penalty": ["l2"],
	"logistic__C": c
}

# saga with the plain l1 / l2 penalties.
param_grid_saga= {
	"vectorize__max_features": max_features,
	"tfidf__norm": tdidf,
	"logistic__solver": ["saga"],
	"logistic__penalty": ["l1", "l2"],
	"logistic__C": c
}

# The elasticnet penalty needs an explicit l1_ratio: without it every fit
# fails and the candidate only yields a NaN score. It therefore gets its own
# grid, so that l1_ratio is never set for the l1 / l2 penalties.
param_grid_saga_elasticnet = {
	"vectorize__max_features": max_features,
	"tfidf__norm": tdidf,
	"logistic__solver": ["saga"],
	"logistic__penalty": ["elasticnet"],
	"logistic__l1_ratio": [0.25, 0.5, 0.75],
	"logistic__C": c
}

param_grid = [param_grid_lbfgs_sag, param_grid_saga, param_grid_saga_elasticnet]
# random_state makes the sampled candidates reproducible from one run to the next.
grid = RandomizedSearchCV(model, param_grid, n_iter=100, random_state=42)
# grid = GridSearchCV(model, param_grid)
grid.fit(X_train, y_train)
grid.best_estimator_

### Entrainement.

In [None]:
# Best estimator reported by a previous run of the search, kept for reference:
# grid.best_estimator_
# Pipeline(steps=[('vectorize', CountVectorizer(max_features=3000)),
#                 ('tfidf', TfidfTransformer()),
#                 ('logistic', LogisticRegression(C=10, max_iter=1000, solver='sag'))])

# Pipeline selected by the search above.
best_model = grid.best_estimator_

# NOTE(review): with the default refit behaviour of the search, best_estimator_
# is presumably already fitted on the whole train set, which would make this
# fit redundant - confirm before removing it.
print("fitting...", end="")
best_model.fit(X_train, y_train)
print("Done")

# Prediction of the classes of the test set.
print("predicting labels...", end="")
y_pred = best_model.predict(X_test)
print("Done")

### Evaluation.

In [None]:
# Textual classification report of the tuned pipeline on the test set.
my_classification_report("LogisticRegression", y_pred, y_test)