# TP3_GROUPE-7 : Classification des courriers indésirables avec KNN


### Téléchargement du corpus stopwords du module nltk s'il n'est pas déjà installé

In [None]:
# Make sure the NLTK stop-word corpus is available before the cells that use it.
# The download is only attempted when the corpus cannot be found locally, so the
# notebook can be re-run without network access once the corpus is installed.
import nltk

try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/njonou65/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import os 
import string 
from nltk.corpus import stopwords 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score #pour calculer le taux de generalisation de l'algorithme 
import numpy as np 

In [None]:
def load_data():
    """Load every ham and spam e-mail found under the ``dataset/`` directory.

    Files are read from ``dataset/ham/`` and ``dataset/spam/`` (relative to the
    current working directory) as UTF-8 text; undecodable bytes are ignored.

    Returns:
        numpy.ndarray: one row per e-mail, ``[text, label]``, where ``label`` is
        ``"ham"`` or ``"spam"``. Ham rows come first, then spam rows; within a
        class the rows follow the sorted file names.
    """
    print("Loading data...")

    data = []
    for label in ("ham", "spam"):
        folder = os.path.join("dataset", label)
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order stable so that any seeded split done later is reproducible.
        for file_name in sorted(os.listdir(folder)):
            # The context manager closes each handle as soon as it has been
            # read instead of leaving one open file per e-mail.
            with open(os.path.join(folder, file_name), "r",
                      encoding="utf-8", errors="ignore") as f:
                data.append([f.read(), label])

    data = np.array(data)

    print("flag 1: loaded data")
    return data


## Prétraitement des données

### Suppression du bruit

In [None]:
def preprocess_data(data, stop_words=None):
    """Strip punctuation, lower-case and remove stop words from every e-mail.

    The first field of each record (the e-mail body) is rewritten in place, so
    the result is a cleaner dataset containing only the meaningful words.

    Args:
        data: sequence of mutable ``[text, label]`` records, e.g. a 2-D NumPy
            array of strings or a list of two-item lists.
        stop_words: optional iterable of lower-case words to drop. When omitted,
            NLTK's English stop-word list is used.

    Returns:
        The same ``data`` object, with each text replaced by its remaining
        words joined by single spaces.
    """
    print("Preprocessing data...")

    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests; a list would be scanned once
    # per word of every e-mail.
    stop_words = set(stop_words)
    # One translation table removes every punctuation character in a single pass.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    for record in data:
        # Lower-case BEFORE the stop-word test: the stop words are lower-case,
        # so capitalised occurrences ("The", "And") would otherwise be kept.
        words = record[0].translate(strip_punctuation).lower().split()
        # No leading separator is added, so a text made only of kept words does
        # not grow by one character (a fixed-width NumPy string array would
        # silently truncate the extra character on assignment).
        record[0] = " ".join(word for word in words if word not in stop_words)

    print("flag 2: preprocessed data")
    return data

## Division du jeu de données en deux parties : entraînement + test

In [None]:
def split_data(data, test_size=0.27, random_state=42):
    """Split the records into a training set and a test set.

    Args:
        data: 2-D array whose column 0 holds the e-mail texts and column 1 the
            corresponding labels.
        test_size: value forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.27, the value this notebook has always used.
        random_state: seed forwarded to ``train_test_split``. Defaults to 42 so
            that the split is the same on every run.

    Returns:
        tuple: ``(training_data, test_data, training_labels, test_labels)``.
    """
    print("Splitting data...")

    features = data[:, 0]   # e-mail bodies
    labels = data[:, 1]     # e-mail labels
    print(labels)
    training_data, test_data, training_labels, test_labels = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    print("flag 3: splitted data")
    return training_data, test_data, training_labels, test_labels

## **Algorithme KNN**

In [None]:
def get_count(text):
    """Count how many times each whitespace-separated word occurs in ``text``.

    Returns a dictionary mapping each word to its number of occurrences, with
    keys in order of first appearance.
    """
    frequencies = {}
    for token in text.split():
        frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies

In [None]:
def euclidean_difference(test_WordCounts, training_WordCounts):
    """Return the Euclidean distance between two word-count dictionaries.

    Each dictionary maps a word to its number of occurrences in one e-mail. A
    word that is missing from one dictionary counts as 0 occurrences there.

    Neither argument is modified, so the same training dictionary can safely be
    compared against many test e-mails.

    Args:
        test_WordCounts: word counts of the e-mail being classified.
        training_WordCounts: word counts of one training e-mail.

    Returns:
        float: square root of the sum of squared count differences.
    """
    squared_sum = 0
    # Words present in the test e-mail (shared or not): a missing training
    # count is treated as 0.
    for word, count in test_WordCounts.items():
        squared_sum += (count - training_WordCounts.get(word, 0)) ** 2
    # Words that only appear in the training e-mail. They are skipped by lookup
    # rather than deleted from the dictionary, which belongs to the caller.
    for word, count in training_WordCounts.items():
        if word not in test_WordCounts:
            squared_sum += count ** 2
    return squared_sum ** 0.5

In [None]:
def get_class(selected_Kvalues):
    """Pick the label of an e-mail by majority vote among its nearest neighbours.

    Each entry of ``selected_Kvalues`` is indexable and carries the neighbour's
    label at position 0. Any label other than "spam" is counted as a ham vote.
    "spam" is returned only on a strict majority; ties (and an empty selection)
    yield "ham".
    """
    is_spam_vote = [neighbour[0] == "spam" for neighbour in selected_Kvalues]
    spam_votes = sum(1 for vote in is_spam_vote if vote)
    ham_votes = len(is_spam_vote) - spam_votes
    return "spam" if spam_votes > ham_votes else "ham"

In [None]:
def knn_classifier(training_data, training_labels, test_data, K, tsize):
    """Classify each test e-mail by majority vote among its K nearest training e-mails.

    Distances are computed with ``euclidean_difference`` on the word counts
    produced by ``get_count``; the vote is delegated to ``get_class``.

    Args:
        training_data: sequence of training e-mail texts.
        training_labels: labels aligned index-by-index with ``training_data``.
        test_data: sequence of e-mail texts to classify.
        K: number of nearest neighbours that vote; must be at least 1. When it
            exceeds the number of training e-mails, all of them vote.
        tsize: not used by the classifier; kept so existing callers keep working.

    Returns:
        list: one predicted label per entry of ``test_data``, in the same order.

    Raises:
        ValueError: if ``K`` is smaller than 1, or if ``training_data`` and
            ``training_labels`` do not have the same length.
    """
    # Validate up front so that a bad argument fails immediately rather than
    # after the distance computations have started.
    if K < 1:
        raise ValueError("K must be at least 1, got " + str(K))
    if len(training_data) != len(training_labels):
        raise ValueError("training_data and training_labels must have the same length")

    print("Running KNN Classifier...")

    # Word counts of the training e-mails are computed once and reused for
    # every test e-mail.
    training_WordCounts = [get_count(training_text) for training_text in training_data]

    result = []
    for test_text in test_data:
        test_WordCounts = get_count(test_text)

        # [label, distance] for every training e-mail.
        similarity = [
            [label, euclidean_difference(test_WordCounts, counts)]
            for label, counts in zip(training_labels, training_WordCounts)
        ]
        # Ascending distance; the sort is stable, so equally distant e-mails
        # keep their training order.
        similarity.sort(key=lambda pair: pair[1])

        # Slicing (instead of indexing K times) cannot run past the end of the
        # list when K is larger than the training set.
        result.append(get_class(similarity[:K]))
    return result

# **Fonction principale -- exécution de notre programme**

In [None]:
def main(K):
    """Run the whole pipeline and print a summary of the classifier's results.

    Loads the e-mails, preprocesses them, splits them into training and test
    sets, classifies every test e-mail with ``knn_classifier`` and prints the
    dataset sizes, K, the accuracy and the number of right and wrong answers.

    Args:
        K: number of nearest neighbours passed to ``knn_classifier``.
    """
    data = load_data()
    data = preprocess_data(data)
    training_data, test_data, training_labels, test_labels = split_data(data)
    tsize = len(test_data)
    result = knn_classifier(training_data, training_labels, test_data, K, tsize)
    accuracy = accuracy_score(test_labels, result)
    # accuracy * tsize is a float that can land just below the true integer
    # count, so it is rounded rather than truncated. The error count is derived
    # from it so that the two figures always add up to tsize.
    correct = int(round(accuracy * tsize))
    wrong = tsize - correct
    print("taille des données d'entraînement\t : " + str(len(training_data)))
    print("test data size\t\t : " + str(len(test_data)))
    # The original literal contained "\ t", which printed a stray backslash.
    print("valeur K\t\t\t\t : " + str(K))
    print("Échantillons testés\t\t : " + str(tsize))
    print("% précision\t\t\t : " + str(accuracy * 100))
    print("Nombre correct\t\t : " + str(correct))
    print("Nombre erroné\t\t : " + str(wrong))

In [None]:
# Run the full pipeline with K = 11 nearest neighbours.
main(11) 

Loading data...
flag 1: loaded data
Preprocessing data...
flag 2: preprocessed data
Splitting data...
['ham' 'ham' 'ham' ... 'spam' 'spam' 'spam']
flag 3: splitted data
Running KNN Classifier...
taille des données d'entraînement	 : 4275
test data size		 : 1582
valeur K		\ t	 : 11
Échantillons testés		 : 1582
% précision			 : 76.23261694058155
Nombre correct		 : 1206
Nombre erroné		 : 376
