<a href="https://colab.research.google.com/github/yahia-kplr/Google_AI/blob/main/Sentiment_analysis_twitter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0.Dependencies

In [1]:
pip install nltk==3.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import nltk
# NLTK's Twitter corpus: per the original note, a sample of about 20k tweets
# (named "twitter_samples") collected from the Twitter Streaming API.
nltk.download('twitter_samples')
# Punkt sentence tokenizer: splits a text into sentences using an unsupervised
# model of abbreviations, collocations and sentence-starting words; it has to
# be trained on a large plain-text collection in the target language first.
nltk.download('punkt')
# WordNet: an English lexical database created at Princeton and shipped with
# the NLTK corpora; usable to look up word senses, synonyms, antonyms, etc.
nltk.download('wordnet')
# The averaged_perceptron_tagger archive contains the pre-trained English
# part-of-speech tagger (https://en.wikipedia.org/wiki/Part_of_speech).
nltk.download('averaged_perceptron_tagger')
# Stop-word lists (the English list is loaded later in this notebook).
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
ls /root/nltk_data/corpora

[0m[01;34mstopwords[0m/     [01;34mtwitter_samples[0m/     [01;34mwordnet[0m/
stopwords.zip  twitter_samples.zip  wordnet.zip


"wordnet" doit être décompressé manuellement

In [4]:
from zipfile import ZipFile

# WordNet has to be unpacked manually: extract the downloaded archive into
# the corpora directory it was downloaded to.
file_loc = '/root/nltk_data/corpora/wordnet.zip'
with ZipFile(file=file_loc, mode='r') as z:
  z.extractall(path='/root/nltk_data/corpora/')

In [5]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier

import re, string, random

## 1.Définition des fonctions de traitement de texte

In [6]:
# Compiled once when the cell runs. Raw strings keep the exact same patterns
# as before while avoiding the invalid "\(" escape warning that the non-raw
# literals trigger on recent Python versions.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_MENTION_PATTERN = re.compile(r"(@[A-Za-z0-9_]+)")


def remove_noise(tweet_tokens, stop_words = ()):
    """Clean and normalise the tokens of a single tweet.

    Each token is POS-tagged, stripped of URLs and @mentions, lemmatized with
    a WordNet part of speech derived from its tag, and lower-cased.

    Args:
        tweet_tokens: Sequence of token strings for one tweet.
        stop_words: Container of lower-case words to drop; tested with ``in``.

    Returns:
        A list of lower-cased, lemmatized tokens, excluding empty tokens,
        punctuation and stop words.
    """
    # Loop-invariant: one lemmatizer is enough for the whole tweet.
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = _URL_PATTERN.sub('', token)
        token = _MENTION_PATTERN.sub('', token)

        # Map the tagger's tag prefix onto the POS codes passed to the
        # lemmatizer: noun, verb, or adjective for everything else.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        # `token not in string.punctuation` is a substring test, so it also
        # rejects the empty string and any token that is a contiguous run of
        # characters inside string.punctuation.
        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [7]:
def get_all_words(cleaned_tokens_list):
    """Lazily flatten an iterable of token lists into a single token stream.

    Args:
        cleaned_tokens_list: Iterable whose items are iterables of tokens.

    Yields:
        Every token, in order, one tweet after another.
    """
    for tweet in cleaned_tokens_list:
        yield from tweet

In [8]:
def get_tweets_for_model(cleaned_tokens_list):
    """Turn each token list into a ``{token: True}`` feature dictionary.

    Args:
        cleaned_tokens_list: Iterable of token lists, one per tweet.

    Yields:
        One dict per tweet mapping every distinct token to ``True``.
    """
    for tweet_tokens in cleaned_tokens_list:
        yield dict.fromkeys(tweet_tokens, True)

## 2.Construire le dataset à partir du corpus propre

In [9]:
# Raw tweet texts of the positive and negative sample files, loaded in that order.
positive_tweets, negative_tweets = (
    twitter_samples.strings(sample_file)
    for sample_file in ('positive_tweets.json', 'negative_tweets.json')
)

# Texts of the third sample file.
text = twitter_samples.strings('tweets.20150430-223406.json')
# Tokens of the first positive tweet, kept as a single example.
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

In [10]:
# English stop words; passed to remove_noise below so that they are dropped
# from the cleaned tokens.
stop_words = stopwords.words('english')

In [11]:
# Tokenized versions of the positive and negative sample files, in that order.
positive_tweet_tokens, negative_tweet_tokens = map(
    twitter_samples.tokenized,
    ('positive_tweets.json', 'negative_tweets.json'),
)

In [12]:
# Run every tokenized tweet through remove_noise, dropping English stop words.
positive_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in positive_tweet_tokens
]
negative_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in negative_tweet_tokens
]

In [13]:
# get_all_words is a generator function, so this is a one-shot stream of the
# tokens from all cleaned positive tweets.
all_pos_words = get_all_words(positive_cleaned_tokens_list)

# NOTE(review): building the FreqDist presumably exhausts the generator above;
# re-create all_pos_words before rebuilding the distribution - confirm.
freq_dist_pos = FreqDist(all_pos_words)
# Show the ten most frequent tokens among the positive tweets.
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [14]:
# Fixed seed so that the shuffle, and therefore the train/test split and the
# reported accuracy, are identical on every run of the notebook.
SHUFFLE_SEED = 42

# One {token: True} feature dict per cleaned tweet (lazy generators).
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

# Attach the class label to every feature dict.
positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# Shuffle so both labels are mixed before the data is sliced into splits.
# A private Random instance gives a reproducible order without reseeding the
# module-level generator used elsewhere.
random.Random(SHUFFLE_SEED).shuffle(dataset)



## 3.Le modèle Naive Bayes classifier

In [15]:
# Number of shuffled examples used for training; everything after that index
# is held out for testing.
TRAIN_SIZE = 7000

train_data, test_data = dataset[:TRAIN_SIZE], dataset[TRAIN_SIZE:]

In [16]:
# Train the Naive Bayes classifier on the training split.
classifier = NaiveBayesClassifier.train(train_data)

# Accuracy is measured on the held-out split only.
print("Accuracy is:", classify.accuracy(classifier, test_data))
# show_most_informative_features prints the table itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
classifier.show_most_informative_features(20)

Accuracy is: 0.9936666666666667
Most Informative Features
                follower = True           Positi : Negati =     33.1 : 1.0
                     sad = True           Negati : Positi =     25.0 : 1.0
                     x15 = True           Negati : Positi =     19.0 : 1.0
                    damn = True           Negati : Positi =     15.0 : 1.0
                      aw = True           Negati : Positi =     14.3 : 1.0
                   cream = True           Negati : Positi =     13.6 : 1.0
                     ice = True           Negati : Positi =     13.0 : 1.0
                 welcome = True           Positi : Negati =     11.9 : 1.0
                  arrive = True           Positi : Negati =     11.9 : 1.0
                   sorry = True           Negati : Positi =     11.8 : 1.0
                 awesome = True           Positi : Negati =     11.8 : 1.0
                     ugh = True           Negati : Positi =     11.6 : 1.0
                   didnt = True           

## 4.Utiliser le modèle pour classer son tweet

In [17]:
custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

# Clean the tweet with the same stop-word list that was used for the training
# data, so inference-time features are produced the same way as training ones.
# NOTE(review): the corpus tokens come from twitter_samples.tokenized while
# this uses word_tokenize; the two may split emoticons differently - confirm.
custom_tokens = remove_noise(word_tokenize(custom_tweet), stop_words)

# Same {token: True} feature format as the training examples.
print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))

I ordered just once from TerribleCo, they screwed up, never used the app again. Negative


C'est à vous maintenant de tester et challenger l'algorithme!