In [None]:
#import modules
import pandas as pd
import numpy as np
import gensim
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
import re
import simplemma
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')



In [None]:
#Upload the TwIT Dataset
# NOTE(review): this is a machine-specific absolute path; the notebook will only
# run elsewhere after pointing it at a local copy of TwIT.csv.
twit_csv_path = r"C:\Users\ridol\OneDrive\Desktop\Emotion Recognition\Dataset\TwIT.csv"

# Read the semicolon-separated file, use the 'Id' column as the row index and
# rename the Italian column 'Emozione' to 'Emotion' -- all in one chain, so the
# frame is built in a single expression instead of being mutated in place.
twit = (
    pd.read_csv(twit_csv_path,
                sep=';', header=0, encoding='utf8',
                dtype={'Text':'str','Emozione':'str'})
    .set_index('Id', drop=True)
    .rename(mapper={'Emozione':'Emotion'}, axis='columns')
)


##Data Exploration
twit.info()
null_counts = twit.isnull().sum()
print(null_counts) # the author recorded that there are no null values
freq = twit.groupby(['Emotion']).count() # number of rows per emotion label
print(freq)
#Result --> Happiness(0): 549;  Trust(1):504; Sadness(2):479; Anger(3):513; Fear(4):518; Disgust(5):545;


In [None]:
#Preprocessing data
# Split the frame into its text and label columns.
twit_text, twit_label = twit['Text'], twit['Emotion']
# Italian stop words from NLTK, held in a set for fast membership tests.
italian_stopword_list = stopwords.words('italian')
it_stop = set(italian_stopword_list)


def preprocessing(text, vocab=None):
    """Clean, tokenize and lemmatize a single raw text.

    The text is stripped of non-word characters, isolated single letters and
    digits, lower-cased, split on whitespace, filtered (stop words and short
    words removed) and finally lemmatized with simplemma.

    Parameters
    ----------
    text : str
        One raw document (e.g. one tweet). Any other object is converted
        with ``str()`` first.
    vocab : container of str, optional
        When given, only tokens contained in ``vocab`` are kept. The original
        code filtered against an undefined global ``wwv`` (presumably the
        vocabulary of a pre-trained embedding -- TODO confirm), which raised
        ``NameError`` on every call; the filter is now opt-in.

    Returns
    -------
    list
        The lemma returned by ``simplemma.lemmatize`` for each surviving
        token, in order of appearance.
    """
    # --- Normalisation of the raw text ---
    text = re.sub(r'\W', ' ', str(text))         # every non-word character becomes a space
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # drop isolated single letters inside the text
    # Drop a single letter at the very start. The caret must NOT be escaped:
    # r'\^' matches a literal '^', which can no longer occur after the step above.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
    text = text.lower()
    text = re.sub(r'\d', ' ', text)              # digits become spaces
    text = re.sub(r'^b\s+', '', text)            # leading 'b' left over from a bytes repr

    # --- Token filtering ---
    # str.split() already collapses runs of whitespace, so no explicit
    # whitespace-squeezing substitution is needed.
    # Keep words that are not Italian stop words and are longer than 3 characters.
    tokens = [word for word in text.split()
              if word not in it_stop and len(word) > 3]
    if vocab is not None:
        tokens = [word for word in tokens if word in vocab]

    # --- Lemmatization (Italian first, English as fallback language) ---
    return [simplemma.lemmatize(word, lang=('it', 'en')) for word in tokens]

## Tokenize text
# preprocessing() works on ONE string, so it is applied tweet by tweet; passing
# the whole Series would only process the truncated text repr of the Series.
# Blank tweets are skipped, so this list can be shorter than twit_label.
twit_text_tokenized = [preprocessing(sentence) for sentence in twit_text if sentence.strip() != '']

# Show a small sample instead of dumping the whole corpus into the output.
print(len(twit_text_tokenized))
print(twit_text_tokenized[:5])


######## ALTERNATIVE METHODS #######
## Alternative preprocessing with the Gensim library (easier, but less accurate)
# Read the stop-word list once; a set makes the per-token lookup cheap.
italian_stopwords = set(stopwords.words('italian'))
#italian_stopwords = set(get_stop_words('italian')) #with stop-words package

# simple_preprocess turns each tweet into a list of tokens. Stop words must be
# removed token by token: comparing a whole token list against the stop-word
# strings (as the previous version did) never matches, so nothing was filtered.
twit_text = [
    [word for word in tokens if word not in italian_stopwords]
    for tokens in twit_text.apply(gensim.utils.simple_preprocess)
]
print(twit_text[:5])  # sample only

In [None]:
# Word2Vec model
# The model is created WITHOUT a corpus. Passing `sentences=` to the constructor
# already builds the vocabulary and trains for `epochs` passes, so the explicit
# build_vocab()/train() calls below would train a second time and waste the
# first run.
model = Word2Vec(vector_size=128,   # dimensionality of the word vectors
                 window=10,         # context window size
                 min_count=1,       # keep every word, even if it occurs once
                 sg=0,              # 0 = CBOW, 1 = skip-gram
                 hs=1,              # hierarchical softmax enabled ...
                 negative=5,        # ... together with negative sampling (needed by predict_output_word)
                 epochs=10,
                 sample=1e-3,
                 seed=42,           # fixed seed for repeatable runs
                 workers=1)         # a single worker avoids thread-ordering jitter between runs

# Build the vocabulary, then train exactly once.
model.build_vocab(twit_text_tokenized, progress_per=3000)
model.train(twit_text_tokenized, total_examples=model.corpus_count, epochs=model.epochs)

#Print some results
# NOTE: these lookups raise KeyError if the word is not in the trained vocabulary.
print(model.wv.most_similar('buongiorno',topn=3))
print(model.wv.most_similar('dispiacere',topn=3))
print(model.wv.most_similar('vomitare',topn=3))
# Cosine similarity between two words (previously computed but never displayed)
print(model.wv.similarity(w1='bravo', w2='complimento'))
print(model.wv.doesnt_match(['ospedale','felicità','amore'])) # returns the word that fits least with the others
vector = model.wv['computer'] # the embedding vector of a single word
print(vector)
# Most probable centre words for the given context words
# NOTE(review): 'ti' and 'amo' have <= 3 characters, so preprocessing() removes them
# from the corpus; presumably only 'amore' contributes here -- confirm.
print(model.predict_output_word(['ti','amo','amore'], topn=3))

In [None]:
#Word Embedding Visualization
vocab = list(model.wv.key_to_index)   # every word known to the model
X = model.wv[vocab]                   # matrix with one embedding row per word

#### You can use TSNE or PCA

# TSNE: t-distributed Stochastic Neighbor Embedding
# Reduce the embeddings to 2 dimensions with t-SNE.
tsne = TSNE(n_components=2, metric='cosine', random_state=42)  # seeded: t-SNE is stochastic
X_tsne = tsne.fit_transform(X)
# The columns must be named: the plotting code below reads 'x' and 'y'.
df = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])


# PCA: Principal Component Analysis
# Reduce the embeddings to 2 dimensions with PCA (2 components, not 50, so the
# result can be plotted directly as x/y coordinates).
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)
df1 = pd.DataFrame(X_pca, index=vocab, columns=['x', 'y'])

#### You can use MATPLOTLIB or PLOTLY to visualize the word embedding in a 2-dimensional space

## Plot the words with matplotlib (t-SNE coordinates)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df['x'], df['y'])
ax.set_title('Word embedding (t-SNE)')
ax.set_xlabel('x')
ax.set_ylabel('y')

# Label every point with its word
for word, pos in df.iterrows():
    ax.annotate(word, (pos['x'], pos['y']))

plt.show()


## Plot the words with plotly (PCA coordinates)
# No log_x here: PCA coordinates are centred on zero, so a log axis cannot
# represent the negative half of the points.
fig = px.scatter(df1, x='x', y='y', text=vocab, size_max=500)
fig.update_traces(textposition='top center')
fig.update_layout(height=500,
                  title_text='Word embedding chart')
fig.write_html('Word Embedding.html', auto_open=True )


In [None]:
#Save the model in csv file
# word -> row-index mapping and the embedding matrix of the trained model
vocabolary = model.wv.key_to_index
vectors = model.wv.vectors
# Separate the mapping into the words and the matching row numbers
words = list(vocabolary.keys())
row_numbers = [vocabolary[word] for word in words]
# One row per word (its embedding vector), labelled with the word itself
df = pd.DataFrame(vectors[row_numbers], index=words)
df.to_csv("word_embedding.csv")