# Código do TCC de Paulo de Tarso, da EMAp FGV

In [1]:
import nltk
#Se for preciso, executar o comando abaixo
#nltk.download()
import unicodedata
import string
import os
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer as CV
#Ignorar warnings
import warnings
warnings.filterwarnings('ignore')

## Códigos auxiliares
### Código para remover símbolos das strings

In [2]:
def clean_string(s):
    """Normalize text to lowercase ASCII letters and spaces.

    Newlines become spaces, the string is NFKD-decomposed (so accented
    letters split into a base letter plus combining marks), every character
    that is not an ASCII letter or a space is dropped, and the result is
    lowercased.
    """
    allowed = set(string.ascii_letters + " ")
    decomposed = unicodedata.normalize('NFKD', s.replace("\n", " "))
    kept = [ch for ch in decomposed if ch in allowed]
    return ''.join(kept).lower()

## Parâmetros

In [12]:
# Parameters
inicio_atas = 104  # first minutes number used (minutes from 2005 onwards)
final_atas = 206   # last minutes number used (inclusive)

# Inclusive range of minutes numbers, and how many of them there are
atas = range(inicio_atas, final_atas + 1)
total_atas = len(atas)

# Folder holding the minutes text files
path = "D:/Users/paulotarsosantos/Documents/Documentos/TCC/Atas/"

## Tokenização dos textos

### Obter textos dos arquivos

In [13]:
texts = []

# Move into the folder that holds the minutes files
os.chdir(path)

# Read and clean each COPOM_<n>.txt file
for n in atas:
    # The context manager closes the handle even if read() or
    # clean_string() raises, so no file descriptor is leaked.
    with open("COPOM_" + str(n) + ".txt") as file:
        texts.append(clean_string(file.read()))

# Step back one folder; later cells open files by relative path from there
os.chdir("..")

### Vetorizar os textos obtidos

#### Stopwords

In [14]:
# Load the stopword list, cleaned the same way as the documents so the two
# can match. The context manager closes the file, which the original
# one-liner never did.
with open("stopwords.txt", encoding="utf-8") as stopwords_file:
    # The first line is skipped (presumably a header — confirm against the file)
    sw = [clean_string(word)
          for word in stopwords_file.read().splitlines()[1:]]

#### Vetorização

In [257]:
# Bag-of-words counts over the cleaned texts: the stopword list is excluded
# and terms appearing in fewer than 10 documents are dropped (min_df=10).
vectorizer = CV(min_df=10, stop_words=sw)
term_counts = vectorizer.fit_transform(texts)
# Dense array with one row per document
arrays = term_counts.toarray()

## Obter os valores das variações na meta da Selic em cada reunião do Copom

In [258]:
# Read the rate changes: the third ';'-separated field of every line after
# the first, parsed as an integer. The context manager closes the file even
# if a line fails to parse.
with open("Cortes.txt") as file:
    cortes = [int(line.split(";")[2]) for line in file.read().splitlines()[1:]]

# Keep only the direction of each change: -1, 0 or 1
cortes = [(n > 0) - (n < 0) for n in cortes]

# Reverse the file order (presumably newest-first in the file — confirm)
cortes.reverse()

## Métodos de classificação
Me basearei nos modelos contidos na biblioteca Scikit Learn. Os modelos estão em http://scikit-learn.org/stable/supervised_learning.html.

#### Função que recebe o modelo desejado e retorna o score médio e o valor predito médio para a última reunião (não presente na amostra)

In [259]:
def RunModel(model,n_executions,next_copom = False):
    """Fit `model` on repeated random splits and average its results.

    Each execution draws a fresh 90/10 train/test split of `arrays[:-2]`
    against `cortes[:-1]`, refits the model, accumulates its `score` on the
    held-out part and its prediction for one extra row of `arrays`.

    Parameters
    ----------
    model : estimator exposing `fit`, `score` and `predict`
        The same object is refit on every execution.
    n_executions : int
        Number of random splits to average over.
    next_copom : bool, default False
        If False, predict from the second-to-last row of `arrays`;
        if True, predict from the last row.

    Returns
    -------
    list
        `[mean score, mean prediction]`.
    """
    classifier = model

    # Row to predict from, sliced so it stays 2-D (1 x n_features).
    # scikit-learn's predict() expects a 2-D array; indexing with a bare
    # integer yields a 1-D sample, which current versions reject.
    target_row = arrays[-1:] if next_copom else arrays[-2:-1]

    score = 0
    last_prediction = 0
    for _ in range(n_executions):

        # Random train/test split. The feature rows stop before the last two
        # documents, which are reserved for the out-of-sample prediction.
        X_train,X_test,y_train,y_test = tts(arrays[:-2],cortes[:-1],train_size=0.9)

        # Fit on the training part
        classifier.fit(X_train,y_train)

        # Accumulate the held-out score and the prediction for the target row
        score += classifier.score(X_test,y_test)
        last_prediction += classifier.predict(target_row)[0]

    score /= n_executions
    last_prediction /= n_executions

    return [score,last_prediction]

In [260]:
from sklearn.linear_model import LinearRegression as LR
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.kernel_ridge import KernelRidge as KRR
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier as SGDC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.gaussian_process import GaussianProcessClassifier as GPC
from sklearn.cross_decomposition import PLSRegression as PLSR
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.naive_bayes import BernoulliNB as BNB
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import BaggingClassifier as BC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import ExtraTreesClassifier as ETC
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.neural_network import MLPClassifier as MLPC
from sklearn.ensemble import VotingClassifier as VC

# (display name, estimator) pairs. `names` and `models` are derived from this
# single list so the two stay aligned by position.
_model_catalog = [
    ("Generalized Linear Model", LR()),
    ("Linear Discriminant Analysis", LDA()),
    ("Quadratic Discriminant Analysis", QDA()),
    ("Kernel Ridge Regression", KRR()),
    ("Support Vector Machine Classifier", SVC()),
    ("Stochastic Gradient Descent", SGDC()),
    ("K Nearest Neighbors", KNC()),
    ("Gaussian Process", GPC()),
    ("Partial Least Squares Regressors", PLSR()),
    ("Gaussian Naive Bayes", GNB()),
    ("Multinomial Naive Bayes", MNB()),
    ("Bernoulli Naive Bayes", BNB()),
    ("Decision Tree Classifier", DTC()),
    ("Bagging meta-estimator", BC()),
    ("Random Forest Classifier", RFC()),
    ("Extremely Randomized Trees", ETC()),
    ("AdaBoost Classifier", ABC()),
    ("Gradient Tree Boosting", GBC()),
    ("Multi-layer Perceptron Classifier", MLPC()),
    ("Voting Classifier",
     VC(estimators=[('LDA', LDA()), ('ETC', ETC()), ('GBC', GBC())],
        voting='hard')),
]

models = [estimator for _label, estimator in _model_catalog]

names = [label for label, _estimator in _model_catalog]

Rodar os modelos

In [261]:
# Evaluate every model over 100 random splits, keeping the order of `models`
results = [RunModel(model, 100) for model in models]

Ordenar e mostrar os resultados

In [262]:
# Split each [score, prediction] pair into two parallel lists
scores = [score for score, _prediction in results]
last_predictions = [prediction for _score, prediction in results]

# One (name, score, prediction) tuple per model, ranked best score first.
# Sort ascending and then reverse, which keeps the same tie ordering as an
# in-place sort followed by reverse.
model_results = sorted(zip(names, scores, last_predictions),
                       key=lambda entry: entry[1])
model_results.reverse()

In [263]:
model_results

[('Voting Classifier', 0.75636363636363624, -0.96999999999999997),
 ('Extremely Randomized Trees', 0.73181818181818192, -0.97999999999999998),
 ('Linear Discriminant Analysis', 0.72090909090909061, -0.69999999999999996),
 ('Gradient Tree Boosting', 0.71181818181818157, -0.84999999999999998),
 ('Bagging meta-estimator', 0.69090909090909081, -0.92000000000000004),
 ('Random Forest Classifier', 0.67818181818181755, -0.93000000000000005),
 ('Gaussian Naive Bayes', 0.67636363636363572, 0.0),
 ('AdaBoost Classifier', 0.60454545454545427, -0.64000000000000001),
 ('Decision Tree Classifier', 0.59727272727272696, -0.55000000000000004),
 ('Kernel Ridge Regression', 0.59521960265791451, -0.79468290667362451),
 ('Stochastic Gradient Descent', 0.56272727272727263, -0.89000000000000001),
 ('Generalized Linear Model', 0.55408335156891042, -0.79637770349873416),
 ('Multinomial Naive Bayes', 0.54727272727272713, 0.0),
 ('Multi-layer Perceptron Classifier',
  0.49636363636363634,
  -0.60999999999999999)

### Generalized Linear Model

In [158]:
from sklearn.linear_model import LinearRegression as LR
RunModel(LR(),1000)

'Score: 0.530687283602 Predição da última reunião: -0.829286163544'

### Linear Discriminant Analysis

In [198]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
RunModel(LDA(),1000,next_copom=True)

'Score: 0.765 Predição da próxima reunião: -0.997'

### Quadratic Discriminant Analysis

In [122]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
RunModel(QDA(),1000)

Score: 0.384090909091 Predição da última reunião: -0.558


### Kernel Ridge Regression

In [123]:
from sklearn.kernel_ridge import KernelRidge as KRR
# The import above binds KernelRidge as `KRR`; `KR` is not defined anywhere
# and would raise NameError on a fresh kernel.
RunModel(KRR(),10000)

Score: 0.526121517809 Predição da última reunião: -0.840371189118


### Support Vector Machine

In [124]:
from sklearn.svm import SVC
RunModel(SVC(),100)

Score: 0.374545454545 Predição da última reunião: -1.0


### Stochastic Gradient Descent

In [125]:
from sklearn.linear_model import SGDClassifier as SGDC
RunModel(SGDC(),1000)

Score: 0.539727272727 Predição da última reunião: -0.918


### Nearest Neighbors

In [126]:
from sklearn.neighbors import KNeighborsClassifier as KNC
RunModel(KNC(),1000)

Score: 0.413181818182 Predição da última reunião: -0.991


### Gaussian Process

In [127]:
from sklearn.gaussian_process import GaussianProcessClassifier as GPC
RunModel(GPC(),1000)

Score: 0.322090909091 Predição da última reunião: 1.0


### Cross Decomposition

In [131]:
from sklearn.cross_decomposition import PLSRegression as PLSR
RunModel(PLSR(),1000)

Score: 0.0218190452938 Predição da última reunião: [-2.66567714]


### Gaussian Naive Bayes

In [137]:
from sklearn.naive_bayes import GaussianNB as GNB
RunModel(GNB(),1000)

Score: 0.637636363636 Predição da última reunião: -0.998


### Multinomial Naive Bayes

In [138]:
from sklearn.naive_bayes import MultinomialNB as MNB
RunModel(MNB(),1000)

Score: 0.534909090909 Predição da última reunião: 0.0


### Bernoulli Naive Bayes

In [139]:
from sklearn.naive_bayes import BernoulliNB as BNB
RunModel(BNB(),1000)

Score: 0.483454545455 Predição da última reunião: -0.759


### Decision Tree Classifier

In [149]:
from sklearn.tree import DecisionTreeClassifier as DTC
RunModel(DTC(),1000)

'Score: 0.616909090909 Predição da última reunião: -0.729'

### Bagging meta-estimator

In [152]:
from sklearn.ensemble import BaggingClassifier as BC
RunModel(BC(),100)

'Score: 0.674545454545 Predição da última reunião: -0.86'

### Random Forest Classifier

In [154]:
from sklearn.ensemble import RandomForestClassifier as RFC
RunModel(RFC(),1000)

'Score: 0.679363636364 Predição da última reunião: -0.971'

### Extremely Randomized Trees

In [162]:
from sklearn.ensemble import ExtraTreesClassifier as ETC
RunModel(ETC(),1000)

'Score: 0.718181818182 Predição da última reunião: -0.985'

### AdaBoost Classifier

In [165]:
from sklearn.ensemble import AdaBoostClassifier as ABC
RunModel(ABC(),100)

'Score: 0.553636363636 Predição da última reunião: -0.77'

### Gradient Tree Boosting

In [167]:
from sklearn.ensemble import GradientBoostingClassifier as GBC
RunModel(GBC(),10)

'Score: 0.690909090909 Predição da última reunião: -0.9'

### Multi-layer Perceptron Classifier
Neural Network with backpropagation

In [179]:
from sklearn.neural_network import MLPClassifier as MLPC
RunModel(MLPC(hidden_layer_sizes=(20, 5)),100)

'Score: 0.448181818182 Predição da última reunião: -0.53'

### Grid Search Voting Classifier
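Utiliza múltiplos modelos (LDA, Extra Trees e Gradient Boosting) e combina as predições por voto majoritário (`voting='hard'`); nesta versão os pesos de cada modelo não são ajustados.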

In [209]:
from sklearn.ensemble import VotingClassifier
# Hard-voting ensemble of LDA, Extra Trees and Gradient Boosting, averaged
# over 10 random splits and predicting from the last row of `arrays`.
# Uses the VotingClassifier name imported in this cell rather than the `VC`
# alias, which only exists if the earlier model-list cell has been run.
RunModel(VotingClassifier(estimators=[('LDA', LDA()), ('ETC', ETC()),
                                      ('GBC', GBC())], voting='hard'),10,next_copom=True)

'Score: 0.781818181818 Predição da próxima reunião: -1.0'