# étape 1: Installation NLTK et librairies de visualisations

In [None]:
import sys
# 1... install NLTK if you have not already done so
!{sys.executable} -m pip install nltk

In [None]:
#1... import the NLTK library
# A new window should open, showing the NLTK Downloader. Click on the File menu and select Change Download Directory. 
# For central installation, set this to C:\nltk_data (Windows), /usr/local/share/nltk_data (Mac), or /usr/share/nltk_data (Unix). 
# Next, download all.
import nltk
#nltk.download()

In [None]:
#2....installer visualisation libraries
!{sys.executable} -m pip install networkx[default]
!{sys.executable} -m pip install pyvis
!{sys.executable} -m pip install textblob
!{sys.executable} -m pip install wordcloud

# Corpus - example 1 : Wordnet 

In [None]:
# Sample usage for wordnet
# WordNet is just a NLTK corpus reader, and can be imported like this:
from nltk.corpus import wordnet as wn

In [None]:
word="lemme"
synset = wn.synsets(word, lang='fra')
print('The test word is : ', word)
print('Word and Type : ' + synset[0].name())
print('Synonym is: ' + synset[0].lemmas()[0].name())
print('The meaning of the word : ' + synset[0].definition())
print('Example : ' + str(synset[0].examples()))

In [None]:
from textblob import Word
noeud = Word("word")
print (noeud.synsets[:10])
print (noeud.definitions[:10])


In [None]:
import networkx as nx
import matplotlib.pyplot as plt
# Build a small graph centred on the first WordNet synset of `noeud`:
# the synset is connected to each of its hypernyms and each of its hyponyms.
G = nx.Graph()

w = noeud.synsets[0]

G.add_node(w.name())
for h in w.hypernyms() + w.hyponyms():
    G.add_node(h.name())
    G.add_edge(w.name(), h.name())

print(G.nodes(data=True))

# Configure the figure size before nx.draw creates the figure.
plt.rcParams['figure.figsize'] = [21, 5]
nx.draw(G, width=1, with_labels=True, node_color="#007ed9")
# Save first, then show: calling plt.show() before anything is drawn displays
# nothing, and with most backends the figure is no longer available afterwards.
plt.savefig("path.png")
plt.show()

In [None]:
# to see all other layouts: https://networkx.org/documentation/stable/reference/generated/networkx.drawing.layout.random_layout.html
nx.draw(G, pos=nx.spiral_layout(G))
plt.show()

# Corpus - example 2 : Gutenberg

In [None]:
# Let's explore the Gutenberg corpus in NLTK!
nltk.download('gutenberg')
nltk.corpus.gutenberg.fileids()

In [None]:
moby_dick = nltk.corpus.gutenberg.words( 'melville-moby_dick.txt')
len(moby_dick)

In [None]:
for text in nltk.corpus.gutenberg.fileids():
    print('# of words in ',text,'is: ', len(nltk.corpus.gutenberg.words( text)))

## a- Télécharger un livre du site

In [None]:
# Let's load a specific book from the Project Gutenberg website
# you will need to leverage the requests package
import requests
#choose a book in Gutenberg project website the https://www.gutenberg.org/ebooks/5258 and get the reference number of the book, here 5258 !
# Download the plain-text edition of the book (Project Gutenberg ebook #5258).
# The timeout keeps the cell from hanging forever if the server does not answer.
r = requests.get('https://www.gutenberg.org/cache/epub/5258/pg5258.txt', timeout=30)
# Fail loudly on an HTTP error rather than analysing an error page as if it were the book.
r.raise_for_status()
Zarathoustra_Nietzsche = r.text

# Replace newline, carriage-return and tab characters with a space, one for one,
# so the text length (and any character offset into it) is unchanged.
for char in ["\n", "\r", "\t"]:
    Zarathoustra_Nietzsche = Zarathoustra_Nietzsche.replace(char, " ")
# Print the number of characters in the book
print(len(Zarathoustra_Nietzsche))

In [None]:
# To see the Project Gutenberg introduction and footnotes
print(Zarathoustra_Nietzsche[0:910]) 
print('-------------------------------------------------') 
print(Zarathoustra_Nietzsche[637986:639986]) 

In [None]:
# you can also subset for the book text
# (removing the Project Gutenberg introduction/footnotes)
Zarathoustra_Nietzsche = Zarathoustra_Nietzsche[911:637986]
#print(Zarathoustra_Nietzsche)

## b- Explorer le corpus

In [None]:
# Lets Tokenize the Text
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
import re

# Choose your Text
text = Zarathoustra_Nietzsche

In [None]:
tokens =word_tokenize(text, language="french")

In [None]:
# print the 40 most common tokens
from collections import Counter
print(Counter(tokens).most_common(40))

In [None]:
# Let's remove punctuation!
# Each punctuation character is replaced by a space rather than deleted, so that
# French elisions are split ("l'homme" -> "l homme") instead of being fused into
# a single pseudo-word ("lhomme").
remove = re.sub(r'[^\w\s]', ' ', text)
#print("updated text with no punctuations :", remove)
tokens = word_tokenize(remove, language="french")
print(Counter(tokens).most_common(40))

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
wordcloud = WordCloud(width= 1000, height = 600, max_words=100,
                      random_state=1, background_color='White', colormap='cubehelix',
                      collocations=False, stopwords = STOPWORDS).generate(text)
plt.figure(figsize=(5, 5))
plt.imshow(wordcloud) 
plt.axis("off")
plt.show()

In [None]:
# Lets remove Stopwords !
# Keep the French stopwords in a set; it is looked up once per token below.
french_stopwords = set(stopwords.words('french'))


def filtre_stopfr(text):
    """Return the items of `text` that are not French stopwords.

    `text` is an iterable of tokens (strings). Each token is lower-cased for
    the comparison only; the returned list keeps the original casing and order.
    The stopword set is read when the function is called, so words added to
    `french_stopwords` afterwards are taken into account.
    """
    kept = []
    for token in text:
        if token.lower() in french_stopwords:
            continue
        kept.append(token)
    return kept


tokens_Filtered = filtre_stopfr(tokens)
print(Counter(tokens_Filtered).most_common(20))

In [None]:
wordcloud = WordCloud(width= 1000, height = 600, max_words=100,
                      random_state=1, background_color='Black', colormap='Paired',
                      collocations=False, stopwords = french_stopwords).generate(text)
plt.figure(figsize=(7, 5))
plt.imshow(wordcloud) 
plt.axis("off")
plt.show()

In [None]:
import numpy as np
from PIL import Image
mask = np.array(Image.open("Nietzsche.jpg"))
mask.shape
# Generating colors from image
image_colors = ImageColorGenerator(mask)

In [None]:
wordcloud = WordCloud(width= 1000, height = 800, max_words=700,
                      random_state=1, background_color='Lightblue', colormap='winter_r',
                      collocations=False, stopwords = french_stopwords, mask = mask).generate(text)
plt.figure(figsize=(5, 13))
plt.imshow(wordcloud) 
#plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation ='bilinear') # Using the color function to use the image colors
plt.axis("off")
plt.show()

#Pour afficher l'image mask...
#plt.imshow(mask) 
#plt.axis("off")
#plt.show()

In [None]:
# Do you want to exclude anything else?
Stop_words=['plus']
for x in Stop_words:
    french_stopwords.add(x)

In [None]:
tokens_Filtered=filtre_stopfr( tokens)
print(Counter(tokens_Filtered).most_common(20))

# Corpus example 4 : Twitter

In [None]:
# snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts. 
!{sys.executable} -m pip install snscrape

## Web scraping : Récupérer les tweets directement d'internet
snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts, from various services: Facebook, Instagram, Twitter...

#https://github.com/JustAnotherArchivist/snscrape

In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import time

query = "(from:elonmusk) until:2023-02-17 since:2017-01-01"
# Alternative query:
# query = "Bitcoins since:2018-01-01"

# Start the timer
start = time.time()

tweets = []
# Maximum number of tweets to collect -- always set a limit!
limit = 7000

# Use snscrape to scrape the tweets and collect one row per tweet
for tweet in sntwitter.TwitterSearchScraper(query).get_items():
    if len(tweets) >= limit:
        break
    tweets.append([tweet.date, tweet.id, tweet.username, tweet.content, tweet.lang,
                   tweet.hashtags, tweet.replyCount, tweet.retweetCount, tweet.likeCount,
                   tweet.quoteCount, tweet.media, tweet.sourceLabel, tweet.coordinates, tweet.place])

# Build a DataFrame holding all the results
df0 = pd.DataFrame(tweets, columns=['Date', 'TweetID', 'User', 'Tweet', 'Langue', 'Hashtags','ReplyCount','RetweetCount',
                                  'LikeCount','Quotecount','Media', 'Source','coordinates','place'])

# Report the execution time. time.time() differences are in seconds, and the
# fixed-point format avoids scientific notation for long runs.
end = time.time()
elapsed = end - start
print(f"Temps d'exécution : {elapsed:.2f} s")

In [None]:
df0.info()

In [None]:
df0.shape

In [None]:
df0.head(5)

In [None]:
df0['Source'].unique()

In [None]:
df0['Langue'].unique()

In [None]:
df0.tail(5)

In [None]:
# to save to csv
#df.to_csv('tweets_ElonMusk_Loubna_SERRAR.csv')

In [None]:
# Work on an independent copy of the three columns of interest: new columns are
# added to `df` later, which on a plain slice of `df0` triggers pandas'
# SettingWithCopyWarning.
df = df0[['Date', 'User', 'Tweet']].copy()

In [None]:
df.head()

In [None]:
print(df['Tweet'][1])

In [None]:
import numpy as np
import regex as re
from collections import Counter

In [None]:
#Tokenization
words = df['Tweet'].apply(lambda x:str(x).split())
top=Counter([item for sublist in words for item in sublist])

In [None]:
words[1]

In [None]:
#Token les plus fréquents
top.most_common(21)

# Pre-Processing Twitter Corpus

## étape 0: Load necessary packages 

In [None]:
# punkt package
import nltk
nltk.download('punkt')

In [None]:
# for expanding contraction words e.g. isn't --> is not
!pip install contractions

In [None]:
# stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [None]:
# stopwords
stopwords.words('english')[0:10]

In [None]:
# stopwords
stopwords.words('french')[0:10]

In [None]:
# wordnet lemmatizer
import nltk
nltk.download('punkt')
nltk.download('wordnet')

## étape 1 : Basic Cleaning - à adapter à votre corpus/imagination !

- Remove Unicode Strings and Noise
- Remove/Replace URLs, User Mentions and Hashtags
- Non-Letter characters: numbers, emojis, or hash marks.
- Remove/Replace Slang and Abbreviations
- Remove/Replace Contractions
- Remove/Replace Numbers
- Remove/Replace Repetitions of Punctuation
- Remove Punctuation
- Handling Capitalized Words / Lowercase
- Replace Elongated Words (ex: hahahaaaa, ‘Duuuuude, that's awful,’”)

https://pynative.com/python-regex-replace-re-sub/

In [None]:
#the following pre-tokenization receives string as input parameter
#and returns string as output
import re
import contractions

def remove_links(tweet):
    """Return `tweet` with web links removed.

    Removes, in order: anything starting with "http" up to the next
    whitespace, "bit.ly/..." short links, the literal placeholder "[link]",
    and "pic.twitter..." media links. Surrounding whitespace is left as is.
    """
    tweet = re.sub(r'http\S+', '', tweet)   # remove http(s) links
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bitly links
    # str.strip('[link]') would strip any of the characters "[", "l", "i", "n",
    # "k", "]" from both ends of the tweet; the placeholder itself is removed here.
    tweet = tweet.replace('[link]', '')
    tweet = re.sub(r'pic\.twitter\S+', '', tweet)  # remove Twitter media links
    return tweet

def remove_tags(tweet):
    """Return `tweet` without media labels, hashtags and the retweet label.

    Removes every occurrence of "VIDEO:" and "AUDIO:", hashtags made of an
    ASCII letter followed by any number of ASCII letters, digits, "-" or "_"
    (so one-letter hashtags are removed too), and a "RT @user:" prefix.
    """
    tweet = re.sub(r'VIDEO:', '', tweet)  # remove 'VIDEO:' labels
    tweet = re.sub(r'AUDIO:', '', tweet)  # remove 'AUDIO:' labels
    tweet = re.sub(r'#[A-Za-z][A-Za-z0-9_-]*', '', tweet)  # remove hashtags
    tweet = re.sub(r'RT @\w+:', '', tweet)  # remove retweet label
    return tweet

def remove_users(tweet):
    """Return `tweet` without @user mentions and "RT @user" markers.

    A mention is "@" followed by an ASCII letter, digit or underscore and then
    any number of ASCII letters, digits, "_" or "-". When the mention is
    preceded by "RT" and one whitespace character, that prefix is removed too.
    """
    # One pattern covers both the retweet marker and the plain mention, and
    # accepts handles that start with a digit/underscore or are one character long.
    return re.sub(r'(?:RT\s)?@[A-Za-z0-9_][A-Za-z0-9_-]*', '', tweet)

# Characters treated as punctuation noise. The string is spliced into a regex
# character class, hence the backslashes before "’" and "]". Note that the
# straight quotes (" and ') and "#" are not part of this set.
# NOTE(review): keeping the straight apostrophe presumably lets the contraction
# expansion step still see forms like "isn't" -- confirm before adding it here.
punctuation = '!”$%&\\’()*+,-./:;<=>?[\\]^_`{|}~•@'
def remove_nonText(tweet):
    """Return `tweet` with punctuation, digits and extra whitespace removed.

    Runs of characters from `punctuation` become a single space, every run of
    whitespace (including newlines) is collapsed to one space, digits are
    deleted, and the sample pictogram sequence "📝 …" is dropped. Leading and
    trailing spaces are not stripped.
    """
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet)  # collapse whitespace, newlines included
    tweet = re.sub(r'[0-9]+', '', tweet)  # remove numbers
    tweet = re.sub('📝 …', '', tweet)  # sample emoji sequence to drop; extend as needed
    # Removing digits or emoji can leave two spaces side by side; collapse them.
    tweet = re.sub(r' {2,}', ' ', tweet)

    return tweet

def remove_contraction(text):
    """Expand contractions in `text`, one whitespace-separated word at a time.

    Each word is passed to `contractions.fix` and the results are joined back
    with single spaces, so any run of whitespace in the input becomes one space.
    """
    expanded = map(contractions.fix, text.split())
    return ' '.join(expanded)

def pretokenization_cleaning(tweet):
    """Clean a raw tweet string without tokenizing it or removing stopwords.

    Applies, in this order: link removal, tag/hashtag removal, user-mention
    removal, non-text removal and contraction expansion, then lower-cases the
    result. Takes a string and returns a string.
    """
    cleaning_steps = (
        remove_links,
        remove_tags,
        remove_users,
        remove_nonText,
        remove_contraction,
    )
    for step in cleaning_steps:
        tweet = step(tweet)
    return tweet.lower()

In [None]:
Text=df['Tweet'][3636]
Text

In [None]:
pretokenization_cleaning(Text)

In [None]:
df.head()

In [None]:
#calling pretokenization_cleaning
df['Clean']=[pretokenization_cleaning(sentence) for sentence in df['Tweet']]
df.head()

## étape 2 : Normalising data  - à adapter à votre corpus 
- Spelling Correction
- Replace Negations with Antonyms
- Handling Capitalized Words
- Lowercase
- Tokenization
- Remove Stopwords (ex: the, and….)
- Stemming
- Lemmatizing

In [None]:
from nltk.tokenize import TweetTokenizer
def tokenize(text):
    """Split `text` into a list of tokens with NLTK's TweetTokenizer (reduce_len=True)."""
    return TweetTokenizer(reduce_len=True).tokenize(text)
def remove_stopwords(text):
    """Drop stopwords from a sequence of tokens and return the rest as one string.

    `text` is an iterable of tokens. A token is dropped when its lower-cased
    form is in the module-level `stop_words` set. The surviving tokens are
    joined with single spaces (a string is returned, not a list).
    """
    kept = []
    for token in text:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
def preprocess_tweet(tweet):
    """Clean and normalize a raw tweet: cleaning, tokenization, stopword removal.

    The tweet is first cleaned with `pretokenization_cleaning` (links, tags,
    mentions, non-text, contractions, lower-casing), then tokenized with
    `tokenize`, and finally filtered with `remove_stopwords`, which returns the
    remaining tokens joined into a single string. No lemmatization is done here.
    """
    # Reuse the shared cleaning pipeline so both entry points stay in sync.
    cleaned = pretokenization_cleaning(tweet)
    tokens = tokenize(cleaned)  # apply tokenization
    return remove_stopwords(tokens)

In [None]:
#calling preprocess_tweet
df['Normalized']=[preprocess_tweet(sentence) for sentence in df['Tweet']]
df.head()

In [None]:
#Create a list of all words
all_words = ' '.join([word for word in df['Tweet']])
all_Clean_words = ' '.join([word for word in df['Clean']])
all_Normalized_words = ' '.join([word for word in df['Normalized']])

In [None]:
#Tokenize all_words
tokenized_words = nltk.tokenize.word_tokenize(all_words)
tokenized_Clean_words = nltk.tokenize.word_tokenize(all_Clean_words)
tokenized_Normalized_words = nltk.tokenize.word_tokenize(all_Normalized_words)

In [None]:
# lets find the most frequent words
from nltk.probability import FreqDist
fdist_all = FreqDist(tokenized_words)
fdist_clean = FreqDist(tokenized_Clean_words)
fdist_normalized = FreqDist(tokenized_Normalized_words)

In [None]:
print("The most frequent words in the corpus:")
print(fdist_all.most_common(20))
print("The most frequent words in the corpus---- after cleaning the data:")
print(fdist_clean.most_common(20))
print("The most frequent words in the corpus---- after normalizing the data!")
print(fdist_normalized.most_common(20))

In [None]:
fdist_normalized.plot(50)

In [None]:
pip install seaborn

In [None]:
# Import seaborn
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
## Creating FreqDist for whole BoW, keeping the 20 most common tokens
all_fdist = fdist_normalized.most_common(20)

## Conversion to Pandas series via Python Dictionary for easier plotting
all_fdist = pd.Series(dict(all_fdist))

## Setting figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))

## Seaborn plotting using Pandas attributes + xtick rotation for ease of viewing
all_plot = sns.barplot(x=all_fdist.index, y=all_fdist.values, ax=ax)
plt.xticks(rotation=30);

In [None]:
wordcloud = WordCloud(width= 1000, height = 600, max_words=200,
                      random_state=1, background_color='White',
                      collocations=False, stopwords = stop_words).generate(all_Normalized_words)
plt.figure(figsize=(5, 5))
plt.imshow(wordcloud) 
plt.axis("off")
plt.show()

In [None]:
#Lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
from nltk.stem import WordNetLemmatizer
def lemmatize(sentence):
    """Return the list of lemmas of the tokens in `sentence`.

    `sentence` is a string; it is tokenized with NLTK's `word_tokenize` and
    each token is lemmatized as a verb (pos='v') with WordNetLemmatizer.
    """
    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.tokenize.word_tokenize(sentence)
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens]

#WordNetLemmatizer().lemmatize(token, pos='v')

In [None]:
df['Lemmatized']=[lemmatize(sentence) for sentence in df['Normalized']]

In [None]:
 df.head()

In [None]:
# Join each row's list of lemmas back into one space-separated string.
# The whole column is assigned at once: indexing df['Lemmatized_bis'][i] before
# the column exists raises KeyError, and chained assignment is unreliable anyway.
df['Lemmatized_bis'] = [' '.join(lemmas) for lemmas in df['Lemmatized']]

In [None]:
df.head()

In [None]:
all_Lemmatized_words = ' '.join(words for words in df['Lemmatized_bis'])
tokenized_Lemmatized_words = nltk.tokenize.word_tokenize(all_Lemmatized_words)
fdist_Lemmatized = FreqDist(tokenized_Lemmatized_words)
print("The most frequent lemma in the corpus---- after normalizing the data!")
print(fdist_Lemmatized.most_common(20))

In [None]:
import seaborn as sns
sns.set_theme(style="ticks")

# Obtain top 20 words
top_20 = fdist_Lemmatized.most_common(20)

# Create pandas series to make plotting easier
fdist = pd.Series(dict(top_20))

sns.barplot(y=fdist.index, x=fdist.values, color='blue');