# * *PRÁCTICA FINAL SOTO ANÁLISIS DE SENTIMIENTOS* *

In [1]:
import os
import re
import sys
import warnings
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import spacy
import numpy as np
import pandas as pd
import emoji
import spacymoji
import codecs
import collections
from spacymoji import Emoji
from spacy import displacy
import demoji 
  

sys.path.append(os.path.realpath('../'))
# from scripts.preprocess_data import Preprocess

warnings.filterwarnings('ignore')

PATH_DATA = "../"
DEV_FILE = "dev/dev.tsv"
TRAIN_FILE = "Corpus/train.tsv"

In [2]:
#cuenta la frecuencia de aparicion  de cada tipo de entidades nombradas que aparecen en los tweets
def entidades_nombradas(doc,nlp):
    
    labels = [e.label_ for e in doc.ents]
    number = collections.Counter(labels)
    print("Entity categories: ",number)
    
    #imprime las entidades nombradas que encuentra en el texto
    #for sent in doc.sents:
    #    displacy.render(nlp(sent.text),style='ent',jupyter=True)
    

In [3]:
#función donde podremos ver las entidades nombradas,las palabras más frecuentes y
#tipo de palabras más frecuentes presentes en nuestros datos.

def analisis_tweets(tweets):
    
    nlp = spacy.load("es_core_news_sm")
    
    for pos,tweet in enumerate(tweets):
        #con decode cambiamos los datos de binario a string
        tweets[pos]=tweet.decode('UTF-8')
    
    
    #pasamos de una lista de tweets un string de tweets
    sentence = ""
    for pos,tweet in enumerate(tweets):
        sentence += str(tweet.lower()) + "."
    
   
    doc = nlp(sentence)
    
    #obtenemos entidades nombradas que hay en el texto
    entidades_nombradas(doc,nlp)

    #obtenemos la frecuencia de tipos de palabras en el texto
     # elimina stop words y signos de puntuación
    tipo_words = [token.pos_ for token in doc if not token.is_stop and not token.is_punct]

    frecuencias = []
    for w in tipo_words:
        frecuencias.append(tipo_words.count(w))

    pairList = list(zip(tipo_words, frecuencias))
    
    setPairList = set(pairList)

    print("*********************************")
    unique_words = [word for (word, freq) in setPairList if freq > 50]
    print("Tipo de palabras mas frecuentes: ", unique_words)
    print("*********************************")
    
    #obtenemos la frecuencia de las palabras en el texto
    # elimina stop words y signos de puntuación
    words = [token.text for token in doc if not token.is_stop and not token.is_punct]

    frecuencias = []
    for w in words:
        frecuencias.append(words.count(w))

    pairList = list(zip(words, frecuencias))
    
    setPairList = set(pairList)

    print("*********************************")
    unique_words = [word for (word, freq) in setPairList if freq > 30]
    print("Palabras mas frecuentes: ", unique_words)
    print("*********************************")
    

In [4]:
#eliminamos los emojis que aparen en los tweets
def give_emoji_free_text(text):
    #return emoji.get_emoji_regexp().sub(u'', text) # requiere que no sea string
    text2 = text.encode('ascii', 'ignore').decode()
    return text2

In [5]:
def basic_processing(X):
    
    #lista de tweets limpios
    tweets_clean = []
    
    nlp = spacy.load("es_core_news_sm")

    for pos, sen in enumerate(X):
        
        tweet = re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', sen)  # remove hashtags + espacio at (hashtag con espacio)
        tweet = re.sub('(#\s+[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove hashtags at (hashtag)
        tweet = re.sub('@', '', tweet)  # remove tweeted at (menciones)
        #eliminamos emojis de los tweets
        tweet = give_emoji_free_text(tweet)
        
        
        
        doc = nlp(tweet)

        
        #tokenizamos y hacemos pos y eliminamos urls
        lista_tweet=[]
        newSentence=''
        for token in doc:
            if (
                #not token.is_punct
                #not token.is_emoji
                #and not token.like_num
                #and not token.is_stop
                not token.like_url
                # and not token.is_space
                and not token.pos_ == "SYM"
                and not token.pos_ == "DET"
                and not token.pos_ == "X"
                #and not token.pos_ == "SCONJ"
                #and not token.pos_ == "CONJ"
                #and not token.pos_ == "CCONJ"
                #and not token.pos_ == "ADP"
                and not token.pos_ == "NUM"
                #and not token.pos_ == "AUX"
                and not token.pos_ == "PRON"
                #and not token.pos_ == "PROPN"
            ):
                #texto lematizado y en minusculas
                #newSentence = ' '.join([newSentence,token.lemma_.lower()])
                #texto sin lematizar y en minusculas
                newSentence = ' '.join([newSentence,token.text.lower()])
                
        tweets_clean.append(newSentence)
    #print(tweets_clean)
    return tweets_clean

In [6]:
def processTweetsTSV(corpus_path, trainingFile):
    if (not os.path.exists(corpus_path)):
        os.mkdir(corpus_path)

    df_dev = pd.read_csv(trainingFile, sep="\t", usecols = ['id', 'tweet','emotion'])

    anger_path = corpus_path+'//ANGER'
    disgust_path = corpus_path+"//DISGUST"
    fear_path = corpus_path+"//FEAR"
    joy_path = corpus_path+"//JOY"
    sadness_path = corpus_path + "//SADNESS"
    surprise_path = corpus_path + "//SURPRISE"
    others_path = corpus_path + "//OTHERS"

    if (not os.path.exists(anger_path)):
            os.makedirs(anger_path)
            os.makedirs(disgust_path)
            os.makedirs(fear_path)
            os.makedirs(joy_path)
            os.makedirs(sadness_path)
            os.makedirs(surprise_path)
            os.makedirs(others_path)

    #print(df_dev)
    df_dev.to_csv("filterDataTrain.csv")

    with open('filterDataTrain.csv', mode='r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file, quotechar='"', delimiter=',',
                                    quoting=csv.QUOTE_ALL, skipinitialspace=True)
        path = ''
        for row in csv_reader:
            #print(row)
            if row["emotion"] == "others":
                path= others_path
            elif row["emotion"] == "anger":
                path= anger_path
            elif row["emotion"] == "disgust":
                path= disgust_path
            elif row["emotion"] == "fear":
                path= fear_path
            elif row["emotion"] == "joy":
                path = joy_path
            elif row["emotion"] == "sadness":
                path = sadness_path
            elif row["emotion"] == "surprise":
                path = surprise_path

            # Creating new file
            f=open(path+"//"+row['id']+".txt","w", encoding='utf-8')
            f.write(row['tweet'])
            f.close()

In [7]:
def classifier(corpus_path):

    # CorpusTrain and subfolders (categories) must exist
    tweets_data = load_files(corpus_path)
    X, y = tweets_data.data, tweets_data.target
    
    #en esta funcion se hace un analisis exploratorio de los datos
    analisis_tweets(X)
    
    #en esta funcion hacemos el procesado de datos y los limpiamos
    documents = basic_processing(X)

    #probamos las diferentes combinaciones de vectorizacion
    
    #1_BINARIO
    #vectorizer = CountVectorizer(binary=True, ngram_range=(1,3))
    #X = vectorizer.fit_transform(documents).toarray()
    
    
    #2_TF-IDF
    vectorizer = CountVectorizer()
    #vectorizer = CountVectorizer(ngram_range=(1,2))
    X = vectorizer.fit_transform(documents).toarray()

    tfidfconverter = TfidfTransformer()
    X = tfidfconverter.fit_transform(X).toarray()
    
    
    #3_TF
    #vectorizer = CountVectorizer()
    #vectorizer = CountVectorizer(ngram_range=(1, 3))
    #X = vectorizer.fit_transform(documents).toarray()

    # The data is divided into 20% test set and 80% training set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    #probamos distintos algoritmos de clasificacion 
    #clf = MultinomialNB().fit(X_train, y_train)
    #clf = BernoulliNB().fit(X_train, y_train)
    #clf = DecisionTreeClassifier().fit(X_train, y_train)
    #clf = ExtraTreeClassifier().fit(X_train, y_train)
    clf = LinearSVC().fit(X_train, y_train)
    #clf = KNeighborsClassifier().fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(confusion_matrix(y_test,y_pred))
    print("------------------------------------------")
    print(classification_report(y_test,y_pred))
    print("------------------------------------------")
    print("accuracy",accuracy_score(y_test, y_pred))

In [8]:
if __name__ == "__main__":
    trainingFile = "Corpus/train.tsv"
    corpus_path = "CorpusTraining"

    
    processTweetsTSV(corpus_path, trainingFile)
    # entrenar el clasificador y evaluar el rendimiento
    classifier(corpus_path)

Entity categories:  Counter({'MISC': 454, 'LOC': 413, 'PER': 318, 'ORG': 152})
*********************************
Tipo de palabras mas frecuentes:  ['PROPN', 'NUM', 'CCONJ', 'ADV', 'ADJ', 'AUX', 'PRON', 'VERB', 'PUNCT', 'NOUN', 'ADP']
*********************************
*********************************
Palabras mas frecuentes:  ['barcelona', '️', 'venezuela', 'libro', 'notredame', 'campeón', 'incendio', '|', 'mundo', 'liverpool', 'historia', 'capítulo', 'laliga', 'gente', 'y', 'elecccionesgenerales28a', 'años', 'messi', '🏆', 'gracias', '⚽', 'o', 'díadellibro', 'user', 'juegodetronos', 'gretathunberg', 'diadellibro', 'championsleague', 'españa', 'a', 'gameofthrones', 'libros']
*********************************
[[  2   0   0   4  22   1   2]
 [  1   0   0   1   5   0   0]
 [  0   0   2   0   3   0   0]
 [  1   0   0  26  31   1   0]
 [  0   0   0  12 116   2   1]
 [  1   0   0   0  10  11   0]
 [  0   0   0   1   6   0   1]]
------------------------------------------
              precisio

# PRUEBAS

In [19]:
s1 = "Hi \xF0\x9F\x98\x81 How is your 🙈 and 😌. Have a nice weekend 💕👭👙"

In [18]:
def give_emoji_free_text(text):
    return emoji.get_emoji_regexp().sub(u'', text)

print (give_emoji_free_text(s1))

Hi ð How is your  and . Have a nice weekend 


In [13]:
import re
#s2 = " b'os presentar a santa arya stark . \\xf0\\x9f\\x99\\x8f"
s2 = b'USER Que mal eres chavista? ! Por pensar as\xc3\xad no vas a salir de pobre ajajaj. #VenezuelaLibre #MaduroNoEsNadie'

In [4]:
def give_emoji_free_text(text):
 #   return emoji.get_emoji_regexp().sub(u'', text)
    emoji_pattern = re.compile(pattern = "["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

print (give_emoji_free_text(s2))

 b'os presentar a santa arya stark . ð


In [14]:
text = s2.encode('ascii', 'ignore').decode()

print(text)

AttributeError: 'bytes' object has no attribute 'encode'

In [3]:
text2 = text.encode('ascii', 'ignore').decode()

print(text2)

 b'os presentar a santa arya stark . \xf0\x9f\x99\x8f
