# * *PRÁCTICA FINAL SOTO ANÁLISIS DE SENTIMIENTOS* *

In [1]:
import os
import re
import sys
import warnings
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


import spacy
import numpy as np
import pandas as pd

sys.path.append(os.path.realpath('../'))
# from scripts.preprocess_data import Preprocess

warnings.filterwarnings('ignore')

PATH_DATA = "../"
DEV_FILE = "dev/dev.tsv"
TRAIN_FILE = "Corpus/train.tsv"

In [2]:
def basic_processing(X):
    tweets_clean = []
    
    nlp = spacy.load("es_core_news_sm")

    # El primer For trata cada tuit y lo pasa a string
    for sen in range(0, len(X)):
        tweet = str(X[sen])
        doc = nlp(tweet) # cada tuit lo pasamos a documento y ya podríamos tokenizarlo
        tweets = []
        for token in doc: # sacamos cada token de ese documento que es un tuit
            if (
                not token.is_punct
                and not token.is_stop
                and not token.like_url
                and not token.is_space
                and not token.pos_ == "CONJ"
            ):
                tweets.append(token.lemma_)
                
        for palabra in tweets:
            tweet_entero = palabra + ' '
        tweets_clean.append(tweet_entero)
        #tweets.append(tweet)
        
        #print(tweets_clean)

    return tweets_clean

In [3]:
def processTweetsTSV(corpus_path, trainingFile):
    if (not os.path.exists(corpus_path)):
        os.mkdir(corpus_path)

    df_dev = pd.read_csv(trainingFile, sep="\t", usecols = ['id', 'tweet','emotion'])

    anger_path = corpus_path+'\\ANGER'
    disgust_path = corpus_path+"\\DISGUST"
    fear_path = corpus_path+"\\FEAR"
    joy_path = corpus_path+"\\JOY"
    sadness_path = corpus_path + "\\SADNESS"
    surprise_path = corpus_path + "\\SURPRISE"
    others_path = corpus_path + "\\OTHERS"

    if (not os.path.exists(anger_path)):
            os.makedirs(anger_path)
            os.makedirs(disgust_path)
            os.makedirs(fear_path)
            os.makedirs(joy_path)
            os.makedirs(sadness_path)
            os.makedirs(surprise_path)
            os.makedirs(others_path)

    #print(df_dev)
    df_dev.to_csv("filterDataTrain.csv")

    with open('filterDataTrain.csv', mode='r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file, quotechar='"', delimiter=',',
                                    quoting=csv.QUOTE_ALL, skipinitialspace=True)
        path = ''
        for row in csv_reader:
            print(row)
            if row["emotion"] == "others":
                path= others_path
            elif row["emotion"] == "anger":
                path= anger_path
            elif row["emotion"] == "disgust":
                path= disgust_path
            elif row["emotion"] == "fear":
                path= fear_path
            elif row["emotion"] == "joy":
                path = joy_path
            elif row["emotion"] == "sadness":
                path = sadness_path
            elif row["emotion"] == "surprise":
                path = surprise_path

            # Creating new file
            f=open(path+"\\"+row['id']+".txt","w", encoding='utf-8')
            f.write(row['tweet'])
            f.close()

In [4]:
def classifier(corpus_path):

    # CorpusTrain and subfolders (categories) must exist
    tweets_data = load_files(corpus_path)
    X, y = tweets_data.data, tweets_data.target

    documents = basic_processing(X)

    vectorizer = CountVectorizer()
    #vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))
    X = vectorizer.fit_transform(documents).toarray()

    tfidfconverter = TfidfTransformer()
    X = tfidfconverter.fit_transform(X).toarray()

    # The data is divided into 20% test set and 80% training set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    #clf = MultinomialNB().fit(X_train, y_train)
    clf = BernoulliNB().fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(confusion_matrix(y_test,y_pred))
    print("------------------------------------------")
    print(classification_report(y_test,y_pred))
    print("------------------------------------------")
    print("accuracy",accuracy_score(y_test, y_pred))

In [None]:
if __name__ == "__main__":
    trainingFile = "Corpus/train.tsv"
    corpus_path = "CorpusTraining"

    processTweetsTSV(corpus_path, trainingFile)
    # entrenar el clasificador y evaluar el rendimiento
    classifier(corpus_path)

{'': '0', 'id': 'a0c1a858-a9b8-4cb1-8a81-1602736ff5b8', 'tweet': 'La Gran Guerra de #JuegoDeTronos nos ha dejado muchos momentos para el recuerdo y unas cuantas dudas https://t.co/UE7lCgLSez https://t.co/OO3tjUR34c', 'emotion': 'others'}
{'': '1', 'id': '9b272817-a231-4f68-bdf4-3350d4919330', 'tweet': 'El golpe de Estado en #Venezuela está más lejos que el final de Elif. #VenezuelaEnBatallaYVictoria #VenezuelaNoSeRinde #VenezuelaHoy', 'emotion': 'others'}
{'': '2', 'id': '4bd5b1e5-4b74-440a-82f4-c2567a241011', 'tweet': 'No tengo una foto en la catedral de #NotreDame pero tengo esta secuencia hermosa... https://t.co/cwcdHASGIu', 'emotion': 'sadness'}
{'': '3', 'id': '0bb9d7c9-d781-4684-890e-a94bfb50acc0', 'tweet': '#NotreDame nunca llegue a visitar tan grandiosa construcción. Solo vista en imágenes y videos.. 😭', 'emotion': 'sadness'}
{'': '4', 'id': '88749098-d539-4500-9209-0bbfae2b109c', 'tweet': 'A tomar por culo mi crush 😭😭😭😭😭#JuegoDeTronos', 'emotion': 'sadness'}
{'': '5', 'id': '2