# Lib

In [21]:
import pandas as pd
import chardet
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Detection de l'encodage

In [2]:


with open('spam.csv', 'rb') as fichier:
    contenu_brut = fichier.read()
    resultat = chardet.detect(contenu_brut)
    print(resultat)  # Affiche l'encodage détecté

{'encoding': 'Windows-1252', 'confidence': 0.7257971165545478, 'language': ''}


# Lecture du fichier

In [3]:
path = 'spam.csv'

data=pd.read_csv(path,sep=',',encoding='Windows-1252')

data.info

<bound method DataFrame.info of         v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN

# Préparation des Données

## Chargement des Données

In [4]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


##  Nettoyage des Données

### Gestion des valeurs manquantes

In [5]:
data.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [6]:
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])


### Normalisation des textes

#### Convertir en minuscules

In [8]:
data[['v1', 'v2']] = data[['v1', 'v2']].apply(lambda x: x.str.lower())


#### Supprimer les ponctuations et caractères spéciaux

In [11]:
data[['v1', 'v2']] = data[['v1', 'v2']].applymap(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)

  data[['v1', 'v2']] = data[['v1', 'v2']].applymap(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)


#### Supprimer les chiffres

In [13]:
data[['v1', 'v2']] = data[['v1', 'v2']].applymap(lambda x: re.sub(r'\d+', '', x) if isinstance(x, str) else x)

  data[['v1', 'v2']] = data[['v1', 'v2']].applymap(lambda x: re.sub(r'\d+', '', x) if isinstance(x, str) else x)


#### Supprimer les stopwords

In [24]:
nltk.download('stopwords')

# Use French stopwords instead of English, if needed
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ludovic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
data['v2'] = data['v2'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

#### Lemmatization


In [26]:
# Download necessary resources
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /home/ludovic/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ludovic/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [27]:
data['v2'] = data['v2'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))


###  Extraction des Caractéristiques