## Importation des bibliothèques nécessaires

In [2]:
# Librairies de manipulation de données et de visualisation
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Librairie pour la classification de texte
import gensim
import langid

# Librairies de traitement de texte
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import ftfy
import contractions

#




# Librairies d'apprentissage automatique
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, classification_report, roc_curve, auc
from sklearn.dummy import DummyClassifier


# Librairies spécifiques à MLFlow
import mlflow
from mlflow.models.signature import infer_signature
import mlflow.keras

# Librairies spécifiques à TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
import tensorflow_hub as hub
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, Input, GRU, Lambda, SimpleRNN, LSTM, Bidirectional
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC
from transformers import TFBertModel, BertTokenizer


# Librairies systéme
import os
import joblib
import numpy as np
import threading
import time
import psutil

# Librairie pour l'optimisation de modèles Keras
import keras_tuner as kt

# Librairie pour le traitement des émojis
import emoji




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Librairie pour la gestion des erreurs
import traceback



## Chargement des données

In [6]:

# Chargement des données brutes
data = pd.read_csv("data/training.1600000.processed.noemoticon.csv",names=["target", "id", "date", "flag", "user", "text"], encoding='latin-1')
    

In [8]:
data

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


The file contains 1,600,000 user tweets extracted using Twitter's API. The tweets have been annotated (0 = negative, 4 = positive) and can be used to detect the sentiment of a tweet.

The DataFrame is characterized by 6 variables:

1. target: the sentiment of the tweet (0 = negative, 4 = positive)
2. id: tweet id (2087)
3. date: tweet date (Sat May 16 23:58:44 UTC 2009)
4. flag: The query (LyX). If there is no query then the value will be NO_QUERY
5. user: the user who tweeted (robotickilldozr)
6. text: the text of the tweet (with LyX)

## Prétraitement des données

### Suppression des tweets en doublons

In [10]:
# Suppression des tweets en doublons
data = data.drop_duplicates(subset=['id'])
data = data.drop_duplicates(subset=['date', 'user','text'])

data

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


### Suppression des colonnes inutiles

In [12]:

data = data.drop(columns=['id','flag'])
display(data)

Unnamed: 0,target,date,user,text
0,0,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...
2,0,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire
4,0,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...
1599995,4,Tue Jun 16 08:40:49 PDT 2009,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,Tue Jun 16 08:40:49 PDT 2009,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Tue Jun 16 08:40:49 PDT 2009,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Tue Jun 16 08:40:49 PDT 2009,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


The nltk library allows us to remove stopwords. These are words that recur in a language but do not provide additional information for understanding the text.

In [14]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize

nltk.download('punkt')

data['text_token'] = data['text'].apply(word_tokenize)


[nltk_data] Downloading package punkt to C:\Users\devil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
print(data.iloc[0, :])

In [19]:
def remove_stopwords(word_list):
    stop_words = set(stopwords.words('english'))
    filtered_words = []
    for word in word_list:
        if word not in stop_words:
            filtered_words.append(word)
    return filtered_words

In [21]:
data['text_token'] = data['text_token'].apply(remove_stopwords)

In [22]:
print(data.iloc[0, :])

target                                                        0
date                               Mon Apr 06 22:19:45 PDT 2009
user                                            _TheSpecialOne_
text          @switchfoot http://twitpic.com/2y1zl - Awww, t...
text_token    [@, switchfoot, http, :, //twitpic.com/2y1zl, ...
Name: 0, dtype: object


In [23]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('punkt')# segmentation phrases
nltk.download('averaged_perceptron_tagger') # étiquettes grammaticales
nltk.download('wordnet')# synonymes

[nltk_data] Downloading package punkt to C:\Users\devil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\devil/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\devil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
def lemmatize_words(word_list):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, get_pos(word)) for word in word_list]
    return lemmatized_words

def get_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV, "J": wordnet.ADJ}
    return tag_dict.get(tag, wordnet.NOUN)

In [25]:
data['text_token'] = data['text_token'].apply(lemmatize_words)

In [26]:
print(data.iloc[0, :])

target                                                        0
date                               Mon Apr 06 22:19:45 PDT 2009
user                                            _TheSpecialOne_
text          @switchfoot http://twitpic.com/2y1zl - Awww, t...
text_token    [@, switchfoot, http, :, //twitpic.com/2y1zl, ...
Name: 0, dtype: object


In [27]:
word_dataset = 'data/normalized_dataset.csv'
data.to_csv(word_dataset, index=False)

In [3]:
df = pd.read_csv("data/normalized_dataset.csv")

In [4]:
df


Unnamed: 0,target,date,user,text,text_token
0,0,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","['@', 'switchfoot', 'http', ':', '//twitpic.co..."
1,0,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...,"['upset', 'ca', ""n't"", 'update', 'Facebook', '..."
2,0,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...,"['@', 'Kenichan', 'I', 'dive', 'many', 'time',..."
3,0,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire,"['whole', 'body', 'feel', 'itchy', 'like', 'fi..."
4,0,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all....","['@', 'nationwideclass', ',', ""'s"", 'behaving'..."
...,...,...,...,...,...
1598122,4,Tue Jun 16 08:40:49 PDT 2009,AmandaMarie1028,Just woke up. Having no school is the best fee...,"['Just', 'woke', '.', 'Having', 'school', 'bes..."
1598123,4,Tue Jun 16 08:40:49 PDT 2009,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,"['TheWDB.com', '-', 'Very', 'cool', 'hear', 'o..."
1598124,4,Tue Jun 16 08:40:49 PDT 2009,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,"['Are', 'ready', 'MoJo', 'Makeover', '?', 'Ask..."
1598125,4,Tue Jun 16 08:40:49 PDT 2009,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,"['Happy', '38th', 'Birthday', 'boo', 'alll', '..."


In [5]:
import ast
import re

array_list = df['text_token'].values
data_list = []
for item in array_list:
    data_list.append(ast.literal_eval(item))

df_list = pd.DataFrame({'text_token': data_list})
df = df.drop(columns=['text_token'])
df['text_token'] = df_list['text_token']
df['words'] = df['text_token'].apply(lambda x: ' '.join(x))
df = df.drop(columns=['text_token'])

KeyboardInterrupt: 

In [30]:
print(df.iloc[0, :])

target                                                    0
date                           Mon Apr 06 22:19:45 PDT 2009
user                                        _TheSpecialOne_
text      @switchfoot http://twitpic.com/2y1zl - Awww, t...
words     @ switchfoot http : //twitpic.com/2y1zl - Awww...
Name: 0, dtype: object


In [31]:
word_dataset = 'data/cleaned_dataset.csv'
df.to_csv(word_dataset, index=False)

In [6]:
df = pd.read_csv("data/cleaned_dataset.csv")

In [7]:
df

Unnamed: 0,target,date,user,text,words
0,0,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",@ switchfoot http : //twitpic.com/2y1zl - Awww...
1,0,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...,upset ca n't update Facebook texting ... might...
2,0,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...,@ Kenichan I dive many time ball . Managed sav...
3,0,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all....","@ nationwideclass , 's behaving . 'm mad . ? I..."
...,...,...,...,...,...
1598122,4,Tue Jun 16 08:40:49 PDT 2009,AmandaMarie1028,Just woke up. Having no school is the best fee...,Just woke . Having school best feel ever
1598123,4,Tue Jun 16 08:40:49 PDT 2009,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,TheWDB.com - Very cool hear old Walt interview...
1598124,4,Tue Jun 16 08:40:49 PDT 2009,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,Are ready MoJo Makeover ? Ask detail
1598125,4,Tue Jun 16 08:40:49 PDT 2009,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,Happy 38th Birthday boo alll time ! ! ! Tupac ...


In [9]:
df.value_counts('target')
df.value_counts('target', normalize=True) * 100

target
0    50.053281
4    49.946719
Name: proportion, dtype: float64

In [10]:
df_neg = df[df['target']== 0].sample(8000)
df_pos = df[df['target']== 4].sample(8000)
df_pos['target'] = 1
liste_concat = [df_neg, df_pos]
df_sample = pd.concat([df_neg, df_pos], ignore_index=True)
df_sample = df_sample.sample(frac=1).reset_index(drop=True)
df_sample

Unnamed: 0,target,date,user,text,words
0,0,Sat Jun 20 13:59:08 PDT 2009,MonsieurCharles,at home watching Weeds I really want to make m...,home watch Weeds I really want make believe I ...
1,0,Sat Jun 20 04:38:36 PDT 2009,ms_dr_marc,only because of the rain i couldn't play tenni...,rain could n't play tennis today ! !
2,0,Thu Jun 18 22:03:34 PDT 2009,glamonicaa,@tinamarelina Thanks darlin'! They were soaked...,@ tinamarelina Thanks darlin ' ! They soak I g...
3,1,Sat Jun 06 12:12:05 PDT 2009,LynnPiccoli,"Listening to Nat on Perry &amp; Price , singin...","Listening Nat Perry & amp ; Price , sing fav s..."
4,0,Tue Jun 16 16:43:16 PDT 2009,lo_fye,Any #wordpress gurus out there? I upgraded to ...,Any # wordpress guru ? I upgraded 2.8 /feed/ r...
...,...,...,...,...,...
15995,1,Sat May 30 06:08:46 PDT 2009,adrence,@CateP36 Nope. I skipped them. Can't waste my ...,@ CateP36 Nope . I skip . Ca n't waste time .
15996,0,Fri Jun 19 00:24:20 PDT 2009,allyeezy,"so bored. they're drinkinn, im not.","bore . 're drinkinn , im ."
15997,1,Mon Jun 01 02:18:36 PDT 2009,tania_lx,@mileycyrus you earned it!!!! enjoy....,@ mileycyrus earn ! ! ! ! enjoy ....
15998,1,Fri May 29 07:54:02 PDT 2009,notoriousPIG,@RagnarTornquist Think about your poor weapons...,@ RagnarTornquist Think poor weapon guy . He m...


In [11]:
sample_df = 'data/sample_dataset.csv'
df_sample.to_csv(sample_df, index=False)