In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd

# Fetch the NLTK resources used by clean_text: the stop-word lists and the
# Punkt tokenizer models. Recent NLTK releases make word_tokenize load
# 'punkt_tab' instead of 'punkt', so both are requested to keep this cell
# working on a fresh kernel whichever NLTK version is installed.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the data
df = pd.read_csv('/content/youtube_data_vf.csv')

# Text cleaning
_STOPWORD_CACHE = {}  # language name -> frozenset of its NLTK stop words, filled lazily


def _stopword_set(language):
    """Return the NLTK stop-word list for ``language`` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        # stopwords.words() rebuilds the word list on every call, so keep the
        # result; a frozenset also makes each membership test O(1).
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clean_text(text, language='french'):
    """Normalize a piece of free text and drop its stop words.

    Steps, in order: lowercase, remove URLs, remove @mentions and '#' signs,
    remove punctuation/symbols, remove digits, tokenize with NLTK's
    ``word_tokenize``, drop stop words, and re-join the tokens with spaces.

    Args:
        text: The string to clean. ``None`` is treated as empty text.
        language: Name of the NLTK stop-word list to apply (default 'french',
            matching the original behaviour).

    Returns:
        The cleaned text as a single space-separated string ("" if nothing
        is left or ``text`` is ``None``).
    """
    if text is None:
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtag signs
    # Remove special characters. \w is Unicode-aware for str patterns, so
    # accented letters (é, à, ç, ...) survive and can still match the stop-word
    # list; '_' is removed explicitly because \w would otherwise keep it.
    text = re.sub(r'[^\w\s]|_', '', text)
    text = re.sub(r'\d+', '', text)  # Remove digits
    tokens = word_tokenize(text)  # Tokenize
    stop_words = _stopword_set(language)  # Looked up once, not once per token
    return " ".join(token for token in tokens if token not in stop_words)

# Build a cleaned_<name> column for each free-text field; missing values become "".
for source_column in ('title', 'description', 'comments'):
    df[f'cleaned_{source_column}'] = df[source_column].apply(
        lambda value: "" if pd.isnull(value) else clean_text(value)
    )

# Show the first rows of the DataFrame after cleaning
print(df.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                 channel_id               playlist_id     video_id  \
0  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  B4U7vNtXBTI   
1  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  JuCIU6FpQLs   
2  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  tsOtjNdMMaI   
3  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  jritw4L1A_4   
4  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  TmuDsq4m4Ts   

                                               title  \
0        7 Truths I Know at 30 But Wish I Knew at 20   
1          This is how journaling can beat stress ✍️   
2                 Positive emotions create energy ⚡️   
3      This is why writers track their word count ✍️   
4  How I Made $10m - An Ultimate Guide to Online ...   

                                         description  \
0  Sign up to Morning Brew for free today! https:...   
1  Check out my New York Times bestselling book a...   
2  Check out my New York Times bestselling book a...   
3  Check out my Ne

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd

# Fetch the NLTK resources used by clean_text: the stop-word lists and the
# Punkt tokenizer models. Recent NLTK releases make word_tokenize load
# 'punkt_tab' instead of 'punkt', so both are requested to keep this cell
# working on a fresh kernel whichever NLTK version is installed.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the data
df = pd.read_csv('/content/youtube_data_vf.csv')

# Show the first rows of the DataFrame before cleaning
print("Data before cleaning:")
print(df.head())

# Text cleaning
_STOPWORD_CACHE = {}  # language name -> frozenset of its NLTK stop words, filled lazily


def _stopword_set(language):
    """Return the NLTK stop-word list for ``language`` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        # stopwords.words() rebuilds the word list on every call, so keep the
        # result; a frozenset also makes each membership test O(1).
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clean_text(text, language='french'):
    """Normalize a piece of free text and drop its stop words.

    Steps, in order: lowercase, remove URLs, remove @mentions and '#' signs,
    remove punctuation/symbols, remove digits, tokenize with NLTK's
    ``word_tokenize``, drop stop words, and re-join the tokens with spaces.

    Args:
        text: The string to clean. ``None`` is treated as empty text.
        language: Name of the NLTK stop-word list to apply (default 'french',
            matching the original behaviour).

    Returns:
        The cleaned text as a single space-separated string ("" if nothing
        is left or ``text`` is ``None``).
    """
    if text is None:
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtag signs
    # Remove special characters. \w is Unicode-aware for str patterns, so
    # accented letters (é, à, ç, ...) survive and can still match the stop-word
    # list; '_' is removed explicitly because \w would otherwise keep it.
    text = re.sub(r'[^\w\s]|_', '', text)
    text = re.sub(r'\d+', '', text)  # Remove digits
    tokens = word_tokenize(text)  # Tokenize
    stop_words = _stopword_set(language)  # Looked up once, not once per token
    return " ".join(token for token in tokens if token not in stop_words)

# Build a cleaned_<name> column for each free-text field; missing values become "".
for source_column in ('title', 'description', 'comments'):
    df[f'cleaned_{source_column}'] = df[source_column].apply(
        lambda value: "" if pd.isnull(value) else clean_text(value)
    )

# Show the first rows of the DataFrame after cleaning
print("Data after cleaning:")
print(df.head())


Data before cleaning:
                 channel_id               playlist_id     video_id  \
0  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  B4U7vNtXBTI   
1  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  JuCIU6FpQLs   
2  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  tsOtjNdMMaI   
3  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  jritw4L1A_4   
4  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  TmuDsq4m4Ts   

                                               title  \
0        7 Truths I Know at 30 But Wish I Knew at 20   
1          This is how journaling can beat stress ✍️   
2                 Positive emotions create energy ⚡️   
3      This is why writers track their word count ✍️   
4  How I Made $10m - An Ultimate Guide to Online ...   

                                         description  \
0  Sign up to Morning Brew for free today! https:...   
1  Check out my New York Times bestselling book a...   
2  Check out my New York Times bestselling book a...

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Data after cleaning:
                 channel_id               playlist_id     video_id  \
0  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  B4U7vNtXBTI   
1  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  JuCIU6FpQLs   
2  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  tsOtjNdMMaI   
3  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  jritw4L1A_4   
4  UCoOae5nYA7VqaXzerajD0lg  UUoOae5nYA7VqaXzerajD0lg  TmuDsq4m4Ts   

                                               title  \
0        7 Truths I Know at 30 But Wish I Knew at 20   
1          This is how journaling can beat stress ✍️   
2                 Positive emotions create energy ⚡️   
3      This is why writers track their word count ✍️   
4  How I Made $10m - An Ultimate Guide to Online ...   

                                         description  \
0  Sign up to Morning Brew for free today! https:...   
1  Check out my New York Times bestselling book a...   
2  Check out my New York Times bestselling book a... 

In [None]:
# Identifier columns plus the three cleaned text columns are exported
columns_to_save = [
    'channel_id',
    'playlist_id',
    'video_id',
    'cleaned_title',
    'cleaned_description',
    'cleaned_comments',
]

# Write the selected columns to a CSV file, without the index column
csv_filename = 'youtube_data_prepared_for_huggingface.csv'
df.to_csv(csv_filename, columns=columns_to_save, index=False)

print(f"Data saved to {csv_filename}")


Data saved to youtube_data_prepared_for_huggingface.csv
