# Classifier
> This is the class that creates the dictionary model

In [1]:
#| default_exp Classifier

In [None]:
#| hide
from nbdev.showdoc import *


In [50]:
#| export
import pandas as pd
import numpy as np

In [93]:
#negative_words file
df = pd.read_csv('negative_words.csv')
df = df.drop(columns=['original']) #drop it since it contains the concept with an * in it

neg_words = [] 
for _, row in df.iterrows():
    for item in row:
        if pd.notnull(item):
            neg_words.append(item)


In [94]:
#Drop duplicates in neg_words:
neg_words = list(set(neg_words))

len(neg_words)

1599

In [95]:
#moral_foundations_dictionary file:
df = pd.read_csv('moral_foundations_dictionary_1.0.csv')
df = df[['categories','word_examples']]

virtue = []
vice = []
for _, row in df.iterrows():
    if pd.notnull(row['word_examples']):
        words = row['word_examples'].split(',')
        if 'Virtue' in row['categories']:
            virtue.extend(words)
        elif 'Vice' in row['categories']:
            vice.extend(words)
        else:
            print(words)




['moralidad']
['moral']
['etica']
['etico']
['principios']
['valores']
['bueno', ' buena']
['bondadoso']
['correcto']
['erroneo', ' equivocado', ' incorrecto', ' malo']
['justicia']
['fechoria', ' pecado', ' indebido']
['virtud']
['vicioso']
['moralidad']
['etica']


In [96]:
virtue.extend(['moralidad','moral','etica','etico','principios','valores','bueno','buena','bondadoso','correcto','justicia','virtud','moralidad','etica'])
vice.extend(['erroneo','equivocado','incorrecto','malo','fechoria','pecado','indebido','vicioso'])
#Drop duplicates in virtue and vice:
virtue = list(set(virtue))
vice = list(set(vice))

print('vice examples: ', vice[:10])
print('virtue examples: ', virtue[:10])

vice examples:  ['incorrecto', 'destruccion', ' perjudicar', 'perjudicar', ' ataque', 'explota', ' abusivo', 'abandonar', ' matar', 'fanatismo']
virtue examples:  [' justicia', 'comunal', 'miembro', 'justicia', 'permitir', ' simpatizar', 'modestia', 'posicion', 'diferir', ' sometido']


In [99]:
#Load the vectors data frame:
def load_embeddings(file_path):
    word_to_vec = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            word_to_vec[word] = vector
    return word_to_vec

word_to_vec = load_embeddings('vectorspol.txt')

In [97]:
one = set(vice)
two = set(neg_words)

vice = list(one.union(two))

In [98]:
print(len(vice)+len(virtue))

1936


In [None]:
#| export
class Classifier:
    def __init__(self):
        self.embeddings = None
        self.vice = None
        self.virtue = None

    def load_embeddings(file_path):
        '''Loads the embeddings from a file and returns a dictionary with the words as keys and the vectors as values'''
        word_to_vec = {}
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                values = line.split()
                word = values[0]
                vector = np.array(values[1:], dtype='float32')
                print(vector.shape)
                word_to_vec[word] = vector
        self.embeddings = word_to_vec
        return word_to_vec
    
    
    def list_to_vec(self,
                    vice:list,  #list of vice words
                    virtue:list):   #list of virtue words
        '''Returns the vector representation of a list of words in a df'''
        df = pd.DataFrame(columns=['word','vector','category'])
        for word in vice:
            if word in word_to_vec.keys():
                df = df.append({'word':word,'vector':word_to_vec[word],'category':'vice'},ignore_index=True)
            else:
                df = df.append({'word':word,'vector':np.zeros(300),'category':'vice'},ignore_index=True)
        for word in virtue:
            if word in word_to_vec.keys():
                df = df.append({'word':word,'vector':np.nan,'category':'virtue'},ignore_index=True)
            else:
                df = df.append({'word':word,'vector':np.nan,'category':'virtue'},ignore_index=True)
        
        return df

In [116]:
def list_to_vec(vice, virtue, word_to_vec):
    '''Returns the vector representation of a list of words in a df'''
    data = {'word': [], 'vector': [], 'category': []}
    for word in vice:
        if word in word_to_vec.keys():
            data['word'].append(word)
            data['vector'].append(word_to_vec[word])
            data['category'].append(0.0) #vice will be represented as 0
        else:
            data['word'].append(word)
            data['vector'].append(np.nan)
            data['category'].append(1.0) #virtue will be represented as 1
    
    for word in virtue:
        if word in word_to_vec.keys():
            data['word'].append(word)
            data['vector'].append(word_to_vec[word])
            data['category'].append('virtue')
        else:
            data['word'].append(word)
            data['vector'].append(np.nan)
            data['category'].append('virtue')

    df = pd.DataFrame(data)
    return df

df = list_to_vec(vice,virtue,word_to_vec)
total = len(df)
print('Total words: ',total)
df.dropna(inplace=True)
df.reset_index(inplace=True,drop=True)
print('Total words after dropping NaN: ',len(df))
print('∆: ',total-len(df))
print('∆%:', round((total-len(df))/total*100,2) )
df

Total words:  1936
Total words after dropping NaN:  1015
∆:  921
∆%: 47.57


Unnamed: 0,word,vector,category
0,perdimos,"[-0.105455, -0.209274, -0.443211, -0.021088, 0...",0.0
1,fracaso,"[-0.031448, -0.209139, 0.20089, 0.257588, -0.4...",0.0
2,peligro,"[0.117094, 0.134832, 0.638222, 0.580737, -0.06...",0.0
3,interrupcion,"[0.319177, -0.141361, -0.873159, 0.273652, -0....",0.0
4,defectuoso,"[0.087096, -0.162107, 0.13108, -0.164852, 0.52...",0.0
...,...,...,...
1010,cumplir,"[0.346516, -0.015141, 0.138794, -0.021297, -0....",virtue
1011,correcto,"[0.071845, -0.322979, -0.259893, 0.400886, 0.3...",virtue
1012,bueno,"[0.717693, 0.01057, 0.502308, 0.411575, -0.427...",virtue
1013,comunidad,"[0.044601, 0.356975, 0.119808, -0.21727, 0.386...",virtue
