In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK resources this cell relies on (tokenizer models,
# stop-word lists and the WordNet corpus), in the same order as before.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load the data from the CSV file into a DataFrame.
df = pd.read_csv('sample.csv')

# Show the first rows of the DataFrame as a quick sanity check.
print("Premières lignes du DataFrame :")
print(df.head())

# Text-cleaning function and the shared resources it reuses across calls.

# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# Lazily filled cache so the stop-word set, stemmer and lemmatizer are built
# once instead of once per row when clean_text is applied to a whole column.
_NLP_CACHE = {}


def _nlp_resources():
    """Return the cached stop-word set, stemmer and lemmatizer (built on first use)."""
    if not _NLP_CACHE:
        # All values are constructed before update() runs, so a failure
        # (e.g. a missing NLTK corpus) never leaves the cache half-filled.
        _NLP_CACHE.update(
            stop_words=frozenset(stopwords.words('english')),
            stemmer=PorterStemmer(),
            lemmatizer=WordNetLemmatizer(),
        )
    return _NLP_CACHE


def clean_text(text):
    """Normalize a piece of English text for downstream processing.

    Steps, in order: lowercase, strip punctuation, tokenize, drop English
    stop words, Porter-stem each token, then WordNet-lemmatize each stem.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when the input is
        not a string or contains nothing but punctuation/whitespace.
    """
    # Missing cells arrive as non-strings; calling .lower() on them would
    # raise AttributeError and abort the whole DataFrame.apply.
    if not isinstance(text, str):
        return ''

    # Lowercase, then remove punctuation.
    text = _PUNCTUATION_RE.sub('', text.lower())
    if not text.strip():
        # Nothing left to tokenize; same result the tokenizer path would give.
        return ''

    resources = _nlp_resources()
    stop_words = resources['stop_words']
    stem = resources['stemmer'].stem
    lemmatize = resources['lemmatizer'].lemmatize

    # Remove stop words.
    # NOTE(review): punctuation is stripped before this filter, so
    # contractions such as "don't" become "dont" — confirm whether the
    # stop-word list is expected to catch those.
    kept_tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # Stemming followed by lemmatization, token by token (same order as the
    # original two-pass implementation).
    return ' '.join(lemmatize(stem(word)) for word in kept_tokens)

# Clean every entry of the 'text' column and store the result in a new
# 'text_cleaned' column.
df['text_cleaned'] = df['text'].apply(clean_text)

# Keep only the identifier, the raw text and the cleaned text, then preview.
output_columns = ['tweet_id', 'text', 'text_cleaned']
cleaned_df = df[output_columns]
print("\nDataFrame avec les colonnes 'tweet_id', 'text' et 'text_cleaned' :")
print(cleaned_df.head())

# Save the cleaned DataFrame to a new CSV file, without the index column.
cleaned_df.to_csv(
    'cleaned_customer_support_tweets.csv',
    index=False,
)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oumaima\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oumaima\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oumaima\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Premières lignes du DataFrame :
   tweet_id     author_id  inbound                      created_at  \
0    119237        105834     True  Wed Oct 11 06:55:44 +0000 2017   
1    119238  ChaseSupport    False  Wed Oct 11 13:25:49 +0000 2017   
2    119239        105835     True  Wed Oct 11 13:00:09 +0000 2017   
3    119240  VirginTrains    False  Tue Oct 10 15:16:08 +0000 2017   
4    119241        105836     True  Tue Oct 10 15:17:21 +0000 2017   

                                                text response_tweet_id  \
0  @AppleSupport causing the reply to be disregar...            119236   
1  @105835 Your business means a lot to us. Pleas...               NaN   
2  @76328 I really hope you all change but I'm su...            119238   
3  @105836 LiveChat is online at the moment - htt...            119241   
4  @VirginTrains see attached error message. I've...            119243   

   in_response_to_tweet_id  
0                      NaN  
1                 119239.0  
2              