<a href="https://colab.research.google.com/github/mrzebest/MachineDeepLearning/blob/main/DetecteurDeSpamYT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import des libraires
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam

# Chargement des données

In [None]:
# Load the YouTube spam dataset (the path points at the Colab sample_data folder)
file_path = "/content/sample_data/Youtube_Spam_Dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Print column names, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1956 entries, 0 to 1955
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   COMMENT_ID  1956 non-null   object
 1   AUTHOR      1956 non-null   object
 2   DATE        1711 non-null   object
 3   CONTENT     1956 non-null   object
 4   VIDEO_NAME  1956 non-null   object
 5   CLASS       1956 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 91.8+ KB


In [None]:
# Show the first five rows of the raw dataset
df.head(n=5)

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,VIDEO_NAME,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",PSY - GANGNAM STYLE(?????) M/V,1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,PSY - GANGNAM STYLE(?????) M/V,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,PSY - GANGNAM STYLE(?????) M/V,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,PSY - GANGNAM STYLE(?????) M/V,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,PSY - GANGNAM STYLE(?????) M/V,1


# Analyse et nettoyage

In [None]:
# Keep only the useful columns. The explicit .copy() makes `df` an independent
# DataFrame, so adding a column to it afterwards does not trigger pandas'
# SettingWithCopyWarning (the slice would otherwise be treated as a possible view).
df = df[['CONTENT', 'CLASS']].copy()

# Nettoyage du texte : suppression des balises, liens, caractères spéciaux, etc.
def clean_text(text):
    """Normalise a raw comment into lowercase, letters-only text.

    Steps, in order: lowercase, drop URLs (tokens starting with ``http`` or
    ``www``), drop text enclosed in square brackets, drop HTML-like tags,
    delete every character that is not ``a-z`` or whitespace, then collapse
    runs of whitespace and trim both ends.

    Removed characters are deleted, not replaced by a space, so
    ``"murdev.com"`` becomes ``"murdevcom"``.

    Args:
        text: The comment to clean. ``None`` and float NaN (pandas' missing
            value) are treated as empty text; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # "http\S+" already covers https links, so no separate alternative is needed
    text = re.sub(r"http\S+|www\S+", '', text)  # URLs
    text = re.sub(r'\[.*?\]', '', text)  # text between square brackets
    text = re.sub(r'<.*?>+', '', text)  # HTML tags
    text = re.sub(r'[^a-z\s]', '', text)  # digits, punctuation, non-ASCII characters
    return re.sub(r'\s+', ' ', text).strip()  # collapse whitespace

# Clean every comment and store the result next to the original text
cleaned_comments = df['CONTENT'].map(clean_text)
df['CLEAN_CONTENT'] = cleaned_comments

# Preview the raw text, the cleaned text and the label side by side
df.loc[:, ['CONTENT', 'CLEAN_CONTENT', 'CLASS']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CLEAN_CONTENT'] = df['CONTENT'].apply(clean_text)


Unnamed: 0,CONTENT,CLEAN_CONTENT,CLASS
0,"Huh, anyway check out this you[tube] channel: ...",huh anyway check out this you channel kobyoshi,1
1,Hey guys check out my new channel and our firs...,hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,just for test i have to say murdevcom,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,me shaking my sexy ass on my channel enjoy,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,watchvvtarggvgtwq check this out,1


In [None]:
# Preprocessing: turn the cleaned comments into fixed-length integer sequences
vocab_size = 5000  # upper bound on the word indices the tokenizer will emit
max_len = 100      # every sequence is padded / truncated to this many tokens

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['CLASS'])

# Split row positions first so the tokenizer vocabulary is learned from the
# training comments only; fitting it on every comment would leak test-set
# vocabulary into training. test_size and random_state are unchanged, so the
# same rows end up in the train and test sets as when X itself was split.
row_positions = np.arange(len(df))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_positions, y, test_size=0.2, random_state=42
)

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df['CLEAN_CONTENT'].iloc[train_idx])

# Encode all comments with the train-only vocabulary; unseen words map to <OOV>
sequences = tokenizer.texts_to_sequences(df['CLEAN_CONTENT'])
X = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

X_train, X_test = X[train_idx], X[test_idx]


# Modèle CNN

In [None]:
# Model definition (CNN)
# Seed the Python, NumPy and TensorFlow random generators so the weight
# initialisation is repeatable across runs.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Declare the input shape explicitly: recent Keras versions deprecate the
    # Embedding `input_length` argument, and without a known input shape
    # model.summary() cannot report output shapes or parameter counts.
    tf.keras.Input(shape=(max_len,)),
    Embedding(input_dim=vocab_size, output_dim=128),
    Conv1D(128, 5, activation='relu'),      # 128 filters over windows of 5 tokens
    GlobalMaxPooling1D(),                   # strongest response of each filter
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),         # single probability output
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [None]:
# Train the model; 10% of the training set is held out for validation
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
)

Epoch 1/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 74ms/step - accuracy: 0.6936 - loss: 0.6200 - val_accuracy: 0.8280 - val_loss: 0.4375
Epoch 2/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.8784 - loss: 0.2965 - val_accuracy: 0.8917 - val_loss: 0.2805
Epoch 3/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.9520 - loss: 0.1433 - val_accuracy: 0.8917 - val_loss: 0.2424
Epoch 4/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.9786 - loss: 0.0591 - val_accuracy: 0.9172 - val_loss: 0.2427
Epoch 5/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - accuracy: 0.9841 - loss: 0.0393 - val_accuracy: 0.9299 - val_loss: 0.2759


In [None]:
# Save the current `model` to an HDF5 file (.h5)
model.save('Dl_on_text_classification_CNN.h5')



In [None]:
# Save the tokenizer (presumably so the same word-to-index mapping can be reloaded later)
# NOTE(review): `json` is not referenced anywhere in this cell — to_json() is written out as-is
import json

# Serialise the tokenizer to JSON
tokenizer_json = tokenizer.to_json()

# Write it to a UTF-8 text file
with open("tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tokenizer_json)

In [None]:
# Evaluation: threshold the sigmoid outputs at 0.5 and print per-class metrics
predicted_probabilities = model.predict(X_test)
y_pred = (predicted_probabilities > 0.5).astype("int32")
print(classification_report(y_test, y_pred))

In [None]:
# Performance curve: training vs validation accuracy per epoch.
# Assumes `history` comes from the CNN training cell above (cells run in order).
plt.plot(history.history['accuracy'], label='train acc')
plt.plot(history.history['val_accuracy'], label='val acc')
# Name the model and label the axes so the figure can be read on its own
plt.title("Accuracy (CNN)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Prediction on a new message
new_message = ["Congratulations! You've won a $1000 Walmart gift card."]

# Clean the text with the same function used for the training data
cleaned_message = [clean_text(msg) for msg in new_message]

# Convert to integer sequences with the fitted tokenizer
new_sequence = tokenizer.texts_to_sequences(cleaned_message)

# Pad exactly like the training data. truncating='post' matters for messages
# longer than max_len: the Keras default ('pre') would drop the beginning of
# the message, whereas the training sequences were cut at the end.
new_sequence_padded = pad_sequences(
    new_sequence, maxlen=max_len, padding='post', truncating='post'
)

# Predict: one row per message, one sigmoid output per row
prediction = model.predict(new_sequence_padded)
spam_probability = prediction[0][0]

# Interpret with the same 0.5 threshold used for evaluation
print("Probabilité de spam:", spam_probability)
print("Spam détecté" if spam_probability > 0.5 else "Non spam")

# Modèle MLP

In [None]:
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense


# Parameters (same values as in the preprocessing cell)
vocab_size = 5000
max_len = 100

# Seed the Python, NumPy and TensorFlow random generators so the weight
# initialisation is repeatable across runs.
tf.keras.utils.set_random_seed(42)

# MLP model: averaged word embeddings followed by dense layers
model = Sequential([
    # Declare the input shape explicitly: recent Keras versions deprecate the
    # Embedding `input_length` argument, and without a known input shape
    # model.summary() cannot report output shapes or parameter counts.
    tf.keras.Input(shape=(max_len,)),
    Embedding(input_dim=vocab_size, output_dim=64),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Show the architecture
model.summary()

In [None]:
# Train the model for 10 epochs; 10% of the training set is held out for validation
training_options = {"epochs": 10, "batch_size": 32, "validation_split": 0.1}
history = model.fit(X_train, y_train, **training_options)

In [None]:
# Evaluation: convert sigmoid outputs to 0/1 labels, then print per-class metrics
test_scores = model.predict(X_test)
y_pred = (test_scores > 0.5).astype("int32")
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Performance curve: training vs validation accuracy per epoch.
# Assumes `history` comes from the MLP training cell above (cells run in order).
plt.plot(history.history['accuracy'], label='train acc')
plt.plot(history.history['val_accuracy'], label='val acc')
# Name the model and label the axes so the figure can be read on its own
plt.title("Accuracy (MLP)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Prediction on a new message
new_message = ["Congratulations! You've won a $1000 Walmart gift card."]

# Clean the text with the same function used for the training data
cleaned_message = [clean_text(msg) for msg in new_message]

# Convert to integer sequences with the fitted tokenizer
new_sequence = tokenizer.texts_to_sequences(cleaned_message)

# Pad exactly like the training data. truncating='post' matters for messages
# longer than max_len: the Keras default ('pre') would drop the beginning of
# the message, whereas the training sequences were cut at the end.
new_sequence_padded = pad_sequences(
    new_sequence, maxlen=max_len, padding='post', truncating='post'
)

# Predict: one row per message, one sigmoid output per row
prediction = model.predict(new_sequence_padded)
spam_probability = prediction[0][0]

# Interpret with the same 0.5 threshold used for evaluation
print("Probabilité de spam:", spam_probability)
print("Spam détecté" if spam_probability > 0.5 else "Non spam")

# Modèle LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Parameters (same values as in the preprocessing cell)
vocab_size = 5000
max_len = 100

# Seed the Python, NumPy and TensorFlow random generators so the weight
# initialisation is repeatable across runs.
tf.keras.utils.set_random_seed(42)

# LSTM model for binary classification (spam / non-spam)
model = Sequential([
    # Declare the input shape with an Input object: recent Keras versions
    # deprecate passing `input_shape` to a layer of a Sequential model.
    tf.keras.Input(shape=(max_len,)),

    # Embedding layer
    Embedding(input_dim=vocab_size, output_dim=128),

    # LSTM layer; return_sequences=False keeps only the final output
    LSTM(
        units=128,
        return_sequences=False,
        dropout=0.2,
        recurrent_dropout=0.2
    ),

    # Output layer: one sigmoid unit for binary classification
    Dense(units=1, activation='sigmoid'),
])

# Compile the model
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Show the model summary
model.summary()


In [None]:
# Evaluation on the test data
# NOTE(review): this cell comes before the LSTM training cell, so when the notebook
# is run top to bottom it scores the freshly built, untrained model. The same
# evaluation is repeated after training — confirm whether this one is an
# intentional baseline or a leftover.
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))


In [None]:
# Train the model for 5 epochs; 10% of the training set is held out for validation
history = model.fit(x=X_train, y=y_train, epochs=5, batch_size=32, validation_split=0.1)


In [None]:
# Evaluation on the test data: threshold the sigmoid outputs at 0.5, then report
predicted_probabilities = model.predict(X_test)
y_pred = (predicted_probabilities > 0.5).astype("int32")
print(classification_report(y_test, y_pred))

In [None]:
# Performance curve: training vs validation accuracy per epoch.
# Assumes `history` comes from the LSTM training cell above (cells run in order).
plt.plot(history.history['accuracy'], label='train acc')
plt.plot(history.history['val_accuracy'], label='val acc')
# Name the model and label the axes so the figure can be read on its own
plt.title("Accuracy (LSTM)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# New comment to classify
new_message = ["I love your video, wonderfull"]

# Clean the text with the same function used for the training data
cleaned_message = [clean_text(msg) for msg in new_message]

# Tokenise and pad exactly like the training data. truncating='post' matters
# for messages longer than max_len: the Keras default ('pre') would drop the
# beginning of the message, whereas the training sequences were cut at the end.
new_sequence = tokenizer.texts_to_sequences(cleaned_message)
new_sequence_padded = pad_sequences(
    new_sequence, maxlen=max_len, padding='post', truncating='post'
)

# Predict and interpret with the same 0.5 threshold used for evaluation
prediction = model.predict(new_sequence_padded)
spam_probability = prediction[0][0]
print("Probabilité de spam :", spam_probability)
print("Spam détecté" if spam_probability > 0.5 else "Non spam")


# Modèle RNN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam

# Parameters (same values as in the preprocessing cell)
vocab_size = 5000  # vocabulary size given to the tokenizer
max_len = 100      # maximum sequence length

# Seed the Python, NumPy and TensorFlow random generators so the weight
# initialisation is repeatable across runs.
tf.keras.utils.set_random_seed(42)

# RNN model definition
model = Sequential([
    # Declare the input shape with an Input object: recent Keras versions
    # deprecate passing `input_shape` to a layer of a Sequential model.
    tf.keras.Input(shape=(max_len,)),
    Embedding(input_dim=vocab_size, output_dim=64),
    SimpleRNN(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Show the summary
model.summary()


In [None]:
# Train on the YouTube comments; 10% of the training set is held out for validation
fit_settings = dict(epochs=5, batch_size=32, validation_split=0.1)
history = model.fit(X_train, y_train, **fit_settings)


In [None]:
# Accuracy per epoch: draw the training and validation curves in turn
for metric_key, curve_label in (('accuracy', 'Train'), ('val_accuracy', 'Validation')):
    plt.plot(history.history[metric_key], label=curve_label)

plt.title('Accuracy (RNN)')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


In [None]:
# Evaluation on the test data: turn sigmoid outputs into 0/1 labels, then report
test_scores = model.predict(X_test)
y_pred = (test_scores > 0.5).astype("int32")
report = classification_report(y_test, y_pred)
print(report)


In [None]:
# New comment to classify
new_message = ["Subscribe to my channel for free iPhones!!!"]

# Clean and tokenise with the same function and tokenizer used for training
cleaned = [clean_text(msg) for msg in new_message]
seq = tokenizer.texts_to_sequences(cleaned)
# Pad exactly like the training data. truncating='post' matters for messages
# longer than max_len: the Keras default ('pre') would drop the beginning of
# the message, whereas the training sequences were cut at the end.
padded = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')

# Predict and interpret with the same 0.5 threshold used for evaluation
pred = model.predict(padded)
spam_probability = pred[0][0]
print("Probabilité de spam :", spam_probability)
print("Spam détecté" if spam_probability > 0.5 else "Non spam")
