# Código do TCC de Paulo de Tarso, da EMAp FGV

In [1]:
import nltk
#Se for preciso, executar o comando abaixo
#nltk.download()

import unicodedata
import string
import os
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer as CV

#Ignorar warnings
import warnings
warnings.filterwarnings('ignore')

## Códigos auxiliares
### Código para remover símbolos das strings

In [2]:
def clean_string(s):
    """Return *s* reduced to lowercase ASCII letters and spaces.

    Newlines are replaced by spaces and the text is NFKD-normalized, so an
    accented letter decomposes into its base letter plus combining marks.
    Every character that is not an ASCII letter or a space (digits,
    punctuation, combining marks, ...) is then dropped, and the result is
    lowercased. Spaces are not collapsed, so removing a token such as a
    number can leave two consecutive spaces.
    """
    # Build the allowed-character set once instead of re-concatenating the
    # string for every character of the input.
    allowed = frozenset(string.ascii_letters + " ")
    decomposed = unicodedata.normalize('NFKD', s.replace("\n", " "))
    return ''.join(ch for ch in decomposed if ch in allowed).lower()

## Parâmetros

In [3]:
# Parameters
# First and last minutes numbers to use (minutes from 2005 onwards).
inicio_atas, final_atas = 104, 206

# Inclusive sequence of minutes numbers, and how many of them there are.
atas = range(inicio_atas, final_atas + 1)
total_atas = len(atas)

# Folder that holds the minutes text files.
path = "D:/Users/paulotarsosantos/Documents/Documentos/TCC/Atas/"

## Tokenização dos textos

### Obter textos dos arquivos

In [45]:
texts = []

# Move into the folder that holds the minutes files.
os.chdir(path)

# Read and clean the text of each minutes file. The context manager closes
# the handle even if reading or cleaning raises.
for n in atas:
    with open("COPOM_" + str(n) + ".txt") as ata_file:
        texts.append(clean_string(ata_file.read()))

# Go back up one folder; the stop-word and rate-change files are opened
# further down by relative path.
os.chdir("..")

### Vetorizar os textos obtidos

#### Stopwords

In [29]:
# Load the stop-word list, skipping the file's first line, and normalize each
# word with clean_string so it is comparable with the cleaned texts.
# The context manager makes sure the file handle is closed.
with open("stopwords.txt", encoding="utf-8") as stopwords_file:
    sw = [clean_string(word)
          for word in stopwords_file.read().splitlines()[1:]]

#### Vetorização

In [39]:
# Example: vectorize three short sentences.
textos = ["COPOM decide cortar a meta da Selic em 100 bps.",
         "COPOM acredita no aceleramento da economia do país e reduz a meta em 200 bps.",
         "COPOM defende que uma taxa de juros menor incentiva o investimento no país."]
textos = [clean_string(texto) for texto in textos]

from sklearn.feature_extraction.text import CountVectorizer as CV

# Ignore stop words and keep only terms present in at least 2 documents.
vetorizador = CV(stop_words=sw,min_df=2)
vetores = vetorizador.fit_transform(textos).toarray()

print("Vocabulário:\n")
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old method on versions that predate it. list() keeps
# the printed form a plain Python list in both cases.
if hasattr(vetorizador, "get_feature_names_out"):
    print(list(vetorizador.get_feature_names_out()))
else:
    print(vetorizador.get_feature_names())

print("\nVetores de cada texto:")
# Last expression of the cell: shows each cleaned sentence with its vector.
dict(zip(textos,[str(elt) for elt in vetores]))

Vocabulário:

['copom', 'meta', 'pais']

Vetores de cada texto:


{'copom acredita no aceleramento da economia do pais e reduz a meta em  bps': '[1 1 1]',
 'copom decide cortar a meta da selic em  bps': '[1 1 0]',
 'copom defende que uma taxa de juros menor incentiva o investimento no pais': '[1 0 1]'}

In [47]:
# Vectorize the minutes: ignore stop words and keep only terms present in at
# least 10 documents, then convert the resulting counts with toarray().
vectorizer = CV(min_df=10, stop_words=sw)
arrays = vectorizer.fit_transform(texts).toarray()
# Optional lookup from minutes number to its vector (left disabled):
# d_arrays = dict(zip(range(inicio_atas,final_atas+1),arrays))

## Obter os valores das variações na meta da Selic em cada reunião do Copom

In [56]:
# Read the third ";"-separated field of every line after the first one as an
# integer. Blank lines are skipped so a trailing newline cannot break int().
# The context manager closes the file even if parsing raises.
with open("Cortes.txt") as cortes_file:
    cortes = [int(line.split(";")[2])
              for line in cortes_file.read().splitlines()[1:]
              if line.strip()]
# Keep only the direction of each change: -1, 0 or +1.
cortes = [(n > 0) - (n < 0) for n in cortes]
# Reverse the order in which the file lists the values.
# NOTE(review): presumably the file lists the most recent meeting first - confirm.
cortes.reverse()

## Métodos de classificação
Me basearei nos modelos contidos na biblioteca Scikit Learn. Os modelos estão em http://scikit-learn.org/stable/supervised_learning.html.

#### Função que recebe o modelo desejado e retorna o score médio e o valor predito médio para a última reunião (não presente na amostra)

In [49]:
def RunModel(model,n_executions,next_copom = False):
    """Fit *model* on repeated random splits and average the outcomes.

    Each execution splits the module-level data (``arrays[:-2]`` as features,
    ``cortes[:-1]`` as labels) into 90% train / 10% test, fits *model* on the
    training part, scores it on the test part and predicts one held-out row.
    The split stops at the antepenultimate meeting; the penultimate row is
    the one used to predict the outcome of the last meeting.

    Parameters:
        model: estimator exposing ``fit``, ``score`` and ``predict``. The
            same instance is refitted on every execution.
        n_executions: number of random splits to attempt.
        next_copom: when False (default) the prediction is made for
            ``arrays[-2]``; when True it is made for ``arrays[-1]``.

    Returns:
        ``[mean_score, mean_prediction]`` averaged over the executions that
        completed without raising.

    Raises:
        RuntimeError: if no execution succeeded (including
            ``n_executions == 0``); the last failure is chained as the cause.
    """
    score = 0
    last_prediction = 0
    succeeded = 0
    last_error = None

    # predict() expects a 2-D (n_samples, n_features) input, so the row is
    # selected with a slice (shape (1, n_features)) rather than an integer
    # index, which would produce a 1-D vector.
    target = arrays[-1:] if next_copom else arrays[-2:-1]

    for _ in range(n_executions):
        try:
            # Split into training and test data for this execution.
            X_train,X_test,y_train,y_test = tts(arrays[:-2],cortes[:-1],train_size=0.9)

            # Train the model on the data.
            model.fit(X_train,y_train)

            # Compute both results before accumulating either, so a failure
            # in predict() cannot leave a score counted for a discarded run.
            run_score = model.score(X_test,y_test)
            run_prediction = model.predict(target)[0]
        except Exception as error:
            # Best effort: a failed execution is simply left out of the
            # averages. KeyboardInterrupt/SystemExit still propagate.
            last_error = error
            continue

        score += run_score
        last_prediction += run_prediction
        succeeded += 1

    if succeeded == 0:
        raise RuntimeError(
            "RunModel: none of the %d executions succeeded" % n_executions
        ) from last_error

    return [score / succeeded, last_prediction / succeeded]

In [50]:
from sklearn.linear_model import LinearRegression as LR
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.kernel_ridge import KernelRidge as KRR
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier as SGDC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.gaussian_process import GaussianProcessClassifier as GPC
from sklearn.cross_decomposition import PLSRegression as PLSR
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.naive_bayes import BernoulliNB as BNB
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import BaggingClassifier as BC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import ExtraTreesClassifier as ETC
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.neural_network import MLPClassifier as MLPC
from sklearn.ensemble import VotingClassifier as VC

# Each display name is written next to the estimator it describes; the pairs
# are then transposed into the two parallel lists `names` and `models`.
names, models = map(list, zip(
    ("Generalized Linear Model", LR()),
    ("Linear Discriminant Analysis", LDA()),
    ("Quadratic Discriminant Analysis", QDA()),
    ("Kernel Ridge Regression", KRR()),
    ("Support Vector Machine Classifier", SVC()),
    ("Stochastic Gradient Descent", SGDC()),
    ("K Nearest Neighbors", KNC()),
    ("Gaussian Process", GPC()),
    ("Partial Least Squares Regressors", PLSR()),
    ("Gaussian Naive Bayes", GNB()),
    ("Multinomial Naive Bayes", MNB()),
    ("Bernoulli Naive Bayes", BNB()),
    ("Decision Tree Classifier", DTC()),
    ("Bagging meta-estimator", BC()),
    ("Random Forest Classifier", RFC()),
    ("Extremely Randomized Trees", ETC()),
    ("AdaBoost Classifier", ABC()),
    ("Gradient Tree Boosting", GBC()),
    ("Multi-layer Perceptron Classifier", MLPC()),
    # Hard-voting ensemble of three of the models above.
    ("Voting Classifier",
     VC(estimators=[('LDA', LDA()), ('ETC', ETC()), ('GBC', GBC())],
        voting='hard')),
))

Rodar os modelos

In [57]:
# Evaluate every model in `models`, 100 executions each, keeping the
# [score, last_prediction] pair returned for each one in the same order.
results = [RunModel(model, 100) for model in models]

Ordenar e mostrar os resultados

In [58]:
# Split each [score, last_prediction] result into two parallel lists.
scores = [score for score, _ in results]
last_predictions = [prediction for _, prediction in results]

# Rank the models from highest to lowest score: ascending sort on the score
# column, then reversed.
model_results = sorted(zip(names, scores, last_predictions),
                       key=lambda entry: entry[1])[::-1]

In [59]:
model_results

[('Voting Classifier', 0.75454545454545441, -0.91000000000000003),
 ('Linear Discriminant Analysis', 0.73999999999999955, -0.69999999999999996),
 ('Extremely Randomized Trees', 0.73090909090909084, -0.94999999999999996),
 ('Gradient Tree Boosting', 0.69636363636363607, -0.77000000000000002),
 ('Gaussian Naive Bayes', 0.68181818181818177, 0.0),
 ('Bagging meta-estimator', 0.67909090909090875, -0.81000000000000005),
 ('Random Forest Classifier', 0.66909090909090896, -0.93000000000000005),
 ('Decision Tree Classifier', 0.66272727272727283, -0.5),
 ('Kernel Ridge Regression', 0.60476794064129424, -0.80932512970619075),
 ('AdaBoost Classifier', 0.58454545454545437, -0.56000000000000005),
 ('Stochastic Gradient Descent', 0.56727272727272704, -0.90000000000000002),
 ('Generalized Linear Model', 0.56341630791835073, -0.79791251233093607),
 ('Multinomial Naive Bayes', 0.5536363636363637, -0.01),
 ('Bernoulli Naive Bayes', 0.47181818181818158, -0.40000000000000002),
 ('Multi-layer Perceptron Cla

In [61]:
# Save one "name;score;last_prediction" line per model. The context manager
# closes (and flushes) the file even if a write fails, and a plain loop
# replaces the list comprehension that was used only for its side effects.
with open("Results.csv","w") as results_file:
    for name, score, prediction in zip(names, scores, last_predictions):
        results_file.write(name + ";" + str(score) + ";" +
                           str(prediction) + "\n")

### Generalized Linear Model

In [51]:
from sklearn.linear_model import LinearRegression as LR
RunModel(LR(),1000)

[0.2503604459268276, -52.179634749641636]

### Linear Discriminant Analysis

In [43]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
RunModel(LDA(),1000)

[0.8111999999999947, -3.6000000000000001]

### Quadratic Discriminant Analysis

In [17]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
RunModel(QDA(),1000)

[0.38063636363636399, -0.48499999999999999]

### Kernel Ridge Regression

In [18]:
from sklearn.kernel_ridge import KernelRidge as KRR
RunModel(KRR(),10000)

NameError: name 'KR' is not defined

### Support Vector Machine

In [44]:
from sklearn.svm import SVC
RunModel(SVC(),100)

[0.86399999999999966, 0.0]

### Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDClassifier as SGDC
RunModel(SGDC(),1000)

### Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNC
RunModel(KNC(),1000)

### Gaussian Process

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier as GPC
RunModel(GPC(),1000)

### Cross Decomposition

In [None]:
from sklearn.cross_decomposition import PLSRegression as PLSR
RunModel(PLSR(),1000)

### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB as GNB
RunModel(GNB(),1000)

### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB as MNB
RunModel(MNB(),1000)

### Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB as BNB
RunModel(BNB(),1000)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC
RunModel(DTC(),1000)

### Bagging meta-estimator

In [None]:
from sklearn.ensemble import BaggingClassifier as BC
RunModel(BC(),100)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC
RunModel(RFC(),1000)

### Extremely Randomized Trees

In [46]:
from sklearn.ensemble import ExtraTreesClassifier as ETC
RunModel(ETC(),1000)

[0.83418181818182346, -66.900000000000006]

### AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier as ABC
RunModel(ABC(),100)

### Gradient Tree Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier as GBC
RunModel(GBC(),10)

### Multi-layer Perceptron Classifier
Neural Network with backpropagation

In [None]:
from sklearn.neural_network import MLPClassifier as MLPC
RunModel(MLPC(hidden_layer_sizes=(20, 5)),100)

### Grid Search Voting Classifier
Utiliza múltiplos modelos combinados por voto majoritário (`voting='hard'`); a busca dos pesos para cada modelo ainda não está implementada no código abaixo.

In [None]:
from sklearn.ensemble import VotingClassifier
# Hard-voting ensemble of LDA, Extra Trees and Gradient Boosting, evaluated
# over 10 executions and predicting for the last row (next_copom=True).
# The class is referenced by the name imported above, so this cell no longer
# depends on the `VC` alias created in another cell.
RunModel(VotingClassifier(estimators=[('LDA', LDA()), ('ETC', ETC()),
                                      ('GBC', GBC())], voting='hard'),
         10,next_copom=True)