# <center><img src="figs/LogoUFSCar.jpg" alt="Logo UFSCar" width="110" align="left"/><br/>Universidade Federal de São Carlos (UFSCar)<br/><font size="4">Departamento de Computação, campus Sorocaba</font></center>

<font size="4"><center><b>Disciplina: Aprendizado de Máquina</b></center></font>
  
<font size="3"><center>Prof. Dr. Tiago A. Almeida</center></font>

## <center>Projeto Final</center>

**Aluno**: Luiza Gandolfi Barioto

**RA**: 793247


In [None]:
# Caminho dos arquivos
#FILES_DIRECTORY = "dados"

import fnmatch
import glob
import os

import catboost as ctb
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scipy.sparse
import seaborn as sns
import xgboost as xgb
from scipy.stats import randint, uniform
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import learning_curve, cross_val_score, train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS
# Imported after sklearn on purpose: as in the original cell, this rebinds the
# name `learning_curve` to yellowbrick's implementation.
from yellowbrick.model_selection import learning_curve

# NLTK resources: download the stop-word list and the WordNet corpus.
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
# English stop words (a list); read by CreateDataset.clean_text when filtering tokens.
stop_words = stopwords.words('english')

# Shared lemmatizer instance; read by CreateDataset.lemmatize_words.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
# Extracts the WordNet corpus archive next to the zip file.
# NOTE(review): presumably a Kaggle workaround so WordNetLemmatizer finds the unpacked corpus — confirm it is still needed.
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

---
### Pré-processamento

Nesta seção, as funções da etapa de pré-processamento dos dados devem ser implementadas e aplicadas (se necessário).

In [None]:
class CreateDataset():
    """Helpers that assemble, balance and clean the news datasets.

    Every helper is stateless and is called on the class itself, e.g.
    ``CreateDataset.clean_text(df)``.
    """

    def __init__(self, datasets):
        self.datasets = datasets

    @staticmethod
    def all_data_dataset(path):
        """Load every ``news_data*2020.csv`` file under ``path`` into one frame.

        Args:
            path: directory that contains the yearly ``news_data`` CSV files.

        Returns:
            A DataFrame with the rows of all matching files and a fresh
            0..n-1 index.

        Raises:
            FileNotFoundError: if no matching file exists in ``path``.
        """
        # Sorted so the row order does not depend on the file-system listing order.
        files_csv = sorted(glob.glob(os.path.join(path, 'news_data*2020.csv')))
        if not files_csv:
            raise FileNotFoundError(f"No 'news_data*2020.csv' files found in {path!r}")
        return pd.concat([pd.read_csv(file) for file in tqdm(files_csv)], ignore_index=True)

    @staticmethod
    def all_data_train_test(df_original, df_test, df_train):
        """Select the labelled training rows and the test rows from the full data.

        Args:
            df_original: full news frame with ``id``, ``date``, ``title`` and
                ``content`` columns.
            df_test: frame whose ``id`` column lists the test articles.
            df_train: frame with ``id`` and ``label`` columns; rows without a
                label are ignored.

        Returns:
            ``(df_train_final, df_test_final)``. The training frame carries the
            ``label`` column and has no missing title/content; the test frame is
            an independent copy, so callers can modify it in place.
        """
        labelled = df_train.dropna(subset=['label'])
        in_train = df_original['id'].isin(labelled['id'])
        # NOTE(review): the bounds are compared against the raw `date` values; if
        # dates are strings with a time part, rows on 2020-12-31 are excluded — confirm.
        in_2020 = df_original['date'].between('2020-01-01', '2020-12-31')

        df_train_final = pd.merge(df_original[in_train & in_2020], labelled[['id', 'label']], on='id', how='left')
        df_train_final = df_train_final.dropna(subset=['title', 'content'])

        df_test_final = df_original[df_original['id'].isin(df_test['id'])].copy()

        return df_train_final, df_test_final

    @staticmethod
    def _clean_column(series, punctuation_replacement, stopword_set):
        """Return a lower-cased, regex-cleaned, stop-word-free copy of a text column."""
        # (progress message or None, regex, replacement), applied in this order.
        # URLs and "www" go first: once punctuation is stripped a URL is no longer
        # one whitespace-free token and `http\S+` would only remove its first piece.
        steps = (
            ("Cleaning https", r"http\S+", ""),
            ("Cleaning www", r"w{3}", ""),
            ("Cleaning characters", r"[^a-zA-Z0-9 ]", punctuation_replacement),
            ("Cleaning digits", r"\d+", ""),
            ("Cleaning spaces", r"\s+", " "),
            # Isolated single letters become one space so the neighbouring
            # words are not glued together.
            (None, r"\s+[a-zA-Z]\s+", " "),
        )
        print("Lower")
        cleaned = series.astype(str).str.lower()
        for message, pattern, replacement in steps:
            if message is not None:
                print(message)
            cleaned = cleaned.str.replace(pattern, replacement, regex=True)
        print("Cleaning stopwords")
        return cleaned.apply(lambda text: ' '.join(word for word in text.split() if word not in stopword_set))

    @staticmethod
    def clean_text(df, stopword_list=None):
        """Normalise the ``title`` and ``content`` columns of ``df`` in place.

        Missing values in the whole frame are filled with ``'empty'``; both text
        columns are lower-cased, stripped of URLs, "www", non-alphanumeric
        characters, digits, isolated single letters and stop words. Removed
        characters are deleted in ``title`` and replaced by a space in
        ``content``, as in the original pipeline.

        Args:
            df: frame with ``title`` and ``content`` columns; it is modified in
                place.
            stopword_list: iterable of words to drop. Defaults to the
                module-level ``stop_words``.

        Returns:
            The same ``df`` object, for call chaining.
        """
        if stopword_list is None:
            stopword_list = stop_words
        stopword_set = set(stopword_list)  # set membership instead of scanning a list per word

        for column, punctuation_replacement in (('title', ''), ('content', ' ')):
            print(column.upper())
            print("Cleaning empty values")
            df.fillna('empty', inplace=True)
            df[column] = CreateDataset._clean_column(df[column], punctuation_replacement, stopword_set)
        return df

    @staticmethod
    def balance_df(df_true, df_false, random_state=None):
        """Down-sample the larger of two frames so both have the same row count.

        Args:
            df_true: rows of one class.
            df_false: rows of the other class.
            random_state: optional seed forwarded to ``DataFrame.sample`` to
                make the down-sampling reproducible.

        Returns:
            The concatenation of the (possibly down-sampled) ``df_true`` rows
            followed by the (possibly down-sampled) ``df_false`` rows.
        """
        if len(df_true) >= len(df_false):
            df_true = df_true.sample(n=len(df_false), random_state=random_state)
        else:
            # Previously this case raised because more rows were requested than exist.
            df_false = df_false.sample(n=len(df_true), random_state=random_state)
        df_balanced = pd.concat([df_true, df_false])

        print("Dataframe balanced")
        return df_balanced

    @staticmethod
    def concat_title_content(df):
        """Add a ``content_title`` column (title, a space, content) to ``df`` and return it."""
        df["content_title"] = df["title"] + " " + df["content"]
        return df

    @staticmethod
    def lemmatize_words(text):
        """Lemmatize every whitespace-separated word of ``text`` as a verb.

        Uses the module-level ``lemmatizer`` and returns the words re-joined
        with single spaces.
        """
        return ' '.join(lemmatizer.lemmatize(word, pos='v') for word in text.split())

In [None]:
if __name__ == '__main__':
    # Every competition input file lives under this Kaggle directory.
    path_all_data = '/kaggle/input/ufscar-am-2023-2-projeto-final'

    print(" *** Creating the datasets ***")
    df_original = CreateDataset.all_data_dataset(path=path_all_data)
    df_test_original = pd.read_csv(os.path.join(path_all_data, 'test.csv'))
    df_train_original = pd.read_csv(os.path.join(path_all_data, 'train.csv'))
    df_train, df_test = CreateDataset.all_data_train_test(df_original, df_test=df_test_original, df_train=df_train_original)
    # Independent copy: the cleaning below modifies the frame in place and must
    # not write into a slice of df_original.
    df_test = df_test.copy()

    print("*** Balancing the datasets ***")
    # balance_df's `df_true` argument receives label 0 and `df_false` label 1.
    df_train = CreateDataset.balance_df(df_true=df_train[df_train['label'] == 0], df_false=df_train[df_train['label'] == 1])

    print("*** Cleaning the datasets ***")
    df_train = df_train.dropna()
    df_test = df_test.fillna('empty')
    df_train = CreateDataset.clean_text(df_train)
    # Rebind explicitly instead of relying on clean_text's in-place side effect.
    df_test = CreateDataset.clean_text(df_test)

    print("*** Lemmatizing the datasets ***")
    print("Lemmatizing the train content")
    df_train["content"] = df_train["content"].astype(str).apply(CreateDataset.lemmatize_words)
    print("Lemmatizing the train title")
    df_train["title"] = df_train["title"].astype(str).apply(CreateDataset.lemmatize_words)

    print("Lemmatizing the test content")
    df_test["content"] = df_test["content"].astype(str).apply(CreateDataset.lemmatize_words)
    print("Lemmatizing the test title")
    df_test["title"] = df_test["title"].astype(str).apply(CreateDataset.lemmatize_words)

    print("*** Concatenating the datasets title and content ***")
    df_test = CreateDataset.concat_title_content(df_test)
    df_train = CreateDataset.concat_title_content(df_train)

    print("*** Ordering the datasets ***")
    column_train_order = ['id', 'label', 'title', 'content', 'content_title','date','day','month','year']
    column_test_order = ['id', 'title', 'content', 'content_title','date','day','month','year']

    # Reorder only when the frame has exactly the expected columns; otherwise
    # the current column order is kept.
    if set(df_train.columns) == set(column_train_order):
        df_train = df_train[column_train_order]

    if set(df_test.columns) == set(column_test_order):
        df_test = df_test[column_test_order]

    print("*** Saving the datasets ***")
    df_train = df_train.dropna()
    df_test = df_test.fillna('empty')
    df_train.to_csv('TRAIN.csv', index=False)
    df_test.to_csv('TEST.csv', index=False)

In [None]:
print("**** IF DATASETS ALREADY CREATED, JUST LOAD THEM ****")
# Reload the cached CSVs: training rows with any missing value are dropped,
# missing test values are filled with the 'empty' placeholder.
df_train = pd.read_csv('/kaggle/working/TRAIN.csv').dropna()
df_test = pd.read_csv('/kaggle/working/TEST.csv').fillna('empty')
print("**** DATASETS LOADED ****")

---
### Análise exploratória

Nesta seção, deve ser feita a leitura da base de dados e todas as análises necessárias para interpretar e analisar os dados, tais como:
* Significado de cada atributo
* Medidas descritivas
* Gráficos

In [None]:
class Analisys():
    """Plotting helpers for the exploratory analysis and the model evaluation.

    The helpers are stateless and are called on the class itself, e.g.
    ``Analisys.wordcloud(df, 1)``. Label 1 is treated as fake news and
    label 0 as true news.
    """

    def __init__(self,df,label,X,y,estimator,num_trainings):
        self.df = df
        self.label = label
        self.X = X
        self.y = y
        self.estimator = estimator
        self.num_trainings = num_trainings

    # *** DATA ***

    @staticmethod
    def pizza_graph(df):
        """Show one pie chart per year with the share of each label.

        Side effect: (re)creates the ``year`` column of ``df`` from its
        ``date`` column.
        """
        df['year'] = pd.to_datetime(df['date']).dt.year
        df_agrupado = df.groupby(['year', 'label']).size().unstack()

        for ano in df_agrupado.index:
            df_agrupado.loc[ano].plot.pie(autopct='%1.1f%%', startangle=90, title=f'Distribuição das Classes em {ano}')
            plt.ylabel('')  # hide the y-axis label
            plt.show()

    @staticmethod
    def wordcloud(df, label):
        """Show a word cloud built from the ``content`` of the rows with ``label``.

        Args:
            df: frame with ``label`` and ``content`` columns.
            label: class to plot; 1 is titled as fake news, anything else as
                true news.
        """
        # Filter the frame that was passed in, not a global of the notebook.
        df_cloud = df[df['label'] == label]
        if df_cloud.empty:
            print(f"No rows with label {label}; nothing to plot")
            return

        comment_words = " ".join(
            token.lower() for text in df_cloud['content'] for token in str(text).split()
        )

        cloud = WordCloud(width = 800, height = 800,
                        background_color ='white',
                        stopwords = set(STOPWORDS),
                        min_font_size = 10).generate(comment_words)

        # Create the figure first so the title lands on the word-cloud figure.
        plt.figure(figsize = (8, 8), facecolor = None)
        plt.imshow(cloud)
        plt.axis("off")
        if label == 1:
            plt.title('Word Cloud for Fake News',fontsize=17)
        else:
            plt.title('Word Cloud for True News',fontsize=17)
        plt.tight_layout(pad = 0)

        plt.show()

    @staticmethod
    def _daily_counts(data, label):
        """Return a one-column frame with the number of ``label`` rows per date."""
        return data[data['label'] == label].groupby(['date'])['label'].count().to_frame()

    @staticmethod
    def time_series(data):
        """Plot the number of true and fake news per date as an interactive chart."""
        fake = Analisys._daily_counts(data, 1)
        true = Analisys._daily_counts(data, 0)

        fig = go.Figure()
        for counts, trace_name, colour in ((true, 'True', 'blue'), (fake, 'Fake', 'red')):
            fig.add_trace(go.Scatter(
                     x=counts.index,
                     y=counts['label'],
                     name=trace_name,
                line=dict(color=colour),
                opacity=0.8))

        fig.update_xaxes(
            rangeslider_visible=True,
            rangeselector=dict(
                buttons=list([
                    dict(count=1, label="1m", step="month", stepmode="backward"),
                    dict(count=6, label="6m", step="month", stepmode="backward"),
                    dict(count=1, label="YTD", step="year", stepmode="todate"),
                    dict(count=1, label="1y", step="year", stepmode="backward"),
                    dict(step="all")
                ])
            )
        )

        fig.update_layout(title_text='True and Fake News',plot_bgcolor='rgb(248, 248, 255)',yaxis_title='Value')
        fig.show()

    # *** MODEL ***
    @staticmethod
    def confusion_matrix(actual_label, predicted_label):
        """Show the confusion matrix of ``predicted_label`` against ``actual_label``.

        Rows (y axis) are the actual classes and columns (x axis) the predicted
        ones, which is how scikit-learn lays out the matrix.
        """
        actual = np.asarray(actual_label)
        predicted = np.asarray(predicted_label)

        # Qualified call: the bare name is also the name of this method.
        cm = metrics.confusion_matrix(actual, predicted)
        # NOTE(review): the tick labels presumably mean "is fake: False/True" for classes 0/1 — confirm.
        sns.heatmap(cm,
                    annot=True,
                    fmt='g',
                    xticklabels=['False','True'],
                    yticklabels=['False','True'])
        plt.ylabel('Actual',fontsize=13)
        plt.xlabel('Prediction',fontsize=13)
        plt.title('Confusion Matrix',fontsize=17)
        plt.show()

    @staticmethod
    def roc_curve(model,X_test,y_test):
        """Plot the ROC curve of ``model`` on ``(X_test, y_test)`` with its AUC.

        ``model`` must expose ``predict_proba``; the second probability column
        is used as the score.
        """
        preds = model.predict_proba(X_test)[:,1]
        fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
        roc_auc = metrics.auc(fpr, tpr)

        plt.title('Receiver Operating Characteristic')
        plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
        plt.legend(loc = 'lower right')
        plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()
        

In [None]:
# -*- coding: utf-8 -*-

if __name__ == '__main__':
    print('*** PIZZA GRAPH ***')
    # Pie chart of the class distribution
    Analisys.pizza_graph(df=df_train)
    # Word clouds: fake news (label 1) first, then true news (label 0)
    for news_label in (1, 0):
        Analisys.wordcloud(df_train, news_label)
    # Frequency of fake and true news over time
    Analisys.time_series(data=df_train)


---
### Experimento

Nesta seção, o experimento deve ser conduzido, utilizando os protocolos experimentais padrões e testando diferentes modelos.

In [None]:
# Stratified 90/10 hold-out split. The fixed seed keeps the split, and every
# metric computed from it, identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['content_title'],
    df_train['label'],
    test_size=0.1,
    stratify=df_train['label'],
    random_state=42,
)

In [None]:
# Fit the TF-IDF vocabulary on the training split only, then reuse the fitted
# vectorizer for the hold-out split so no hold-out text influences the features.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

### Logistic Regression

In [None]:
# FINE TUNING
# The search runs on the first 10,000 training rows only.
X = tfidf_train_vectors[:10000]
y = y_train[:10000]

# C is drawn from scipy's uniform(loc=0.1, scale=2.0); the rest are discrete choices.
params = {
    'C': uniform(0.1, 2.0),
    'max_iter': [1500, 2000, 5000],
    'tol': [0.00001, 0.0001, 0.01, 0.1],
}

clf = LogisticRegression()
search_cv_lr = RandomizedSearchCV(
    estimator=clf,
    param_distributions=params,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    return_train_score=True,
)
search_cv_lr.fit(X, y)
search_cv_lr.best_params_

In [None]:
# Refit on the whole training split with the tuned hyper-parameters.
LR = LogisticRegression(**search_cv_lr.best_params_)
LR.fit(tfidf_train_vectors, y_train)

predicted_LR = LR.predict(tfidf_test_vectors)

# Hold-out accuracy, precision and recall.
for metric_label, metric_fn in (
    ("Logistic Regression Accuracy:", metrics.accuracy_score),
    ("Logistic Regression Precision:", metrics.precision_score),
    ("Logistic Regression Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, predicted_LR))

### Naive-Bayes

In [None]:
# FINE TUNING
# GaussianNB is trained on dense arrays here, so the sparse TF-IDF matrices are densified.
tfidf_array = tfidf_train_vectors.toarray()
tfidf_test = tfidf_test_vectors.toarray()

# Use the dense arrays directly: the former ndarray -> list -> ndarray round trip
# produced the same values at the cost of a huge temporary Python list.
X = tfidf_array
y = y_train.to_numpy()

# Ten log-spaced smoothing values from 1 down to 1e-9.
params = {'var_smoothing':np.logspace(0,-9, num=10)}

clf = GaussianNB()
search_cv_nb = RandomizedSearchCV(clf, params, n_iter=10, cv=5,  n_jobs=-1, scoring='accuracy', return_train_score=True)
search_cv_nb.fit(X,y)
search_cv_nb.best_params_

In [None]:
# Dense copies of the TF-IDF matrices for GaussianNB.
tfidf_array = tfidf_train_vectors.toarray()
tfidf_test = tfidf_test_vectors.toarray()

# The dense arrays are used as they are; converting them to Python lists and
# back yielded identical arrays while multiplying peak memory.
X = tfidf_array
test = tfidf_test
y = y_train.to_numpy()

In [None]:
# Fit on the dense training array (X, y) with the tuned smoothing value.
NB = GaussianNB(**search_cv_nb.best_params_)
NB.fit(X, y)

predicted_NB = NB.predict(test)

# Hold-out accuracy, precision and recall.
for metric_label, metric_fn in (
    ("NB Accuracy:", metrics.accuracy_score),
    ("NB Precision:", metrics.precision_score),
    ("NB Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, predicted_NB))

### Rede Neural - MLP

In [None]:
# FINE TUNING
# The search runs on the first 10,000 training rows only.
X = tfidf_train_vectors[:10000]
y = y_train[:10000]

# Discrete choices for architecture, activation, optimiser, L2 penalty and
# learning-rate schedule.
params = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

clf = MLPClassifier()
search_cv_mlp = RandomizedSearchCV(
    estimator=clf,
    param_distributions=params,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    return_train_score=True,
)
search_cv_mlp.fit(X, y)
search_cv_mlp.best_params_

In [None]:
# Refit on the whole training split with the tuned hyper-parameters.
MLP = MLPClassifier(**search_cv_mlp.best_params_)
MLP.fit(tfidf_train_vectors, y_train)

predicted_MLP = MLP.predict(tfidf_test_vectors)

# Hold-out accuracy, precision and recall.
for metric_label, metric_fn in (
    ("MLP Accuracy:", metrics.accuracy_score),
    ("MLP Precision:", metrics.precision_score),
    ("MLP Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, predicted_MLP))

### SUPPORT VECTOR MACHINES

In [None]:
# FINE TUNING
# The search runs on the first 10,000 training rows only.
X = tfidf_train_vectors[:10000]
y = y_train[:10000]

# Candidate values for the regularisation strength C.
params = {'C': [0.1, 1, 10, 100, 1000]}

clf = LinearSVC()
search_cv_svm = RandomizedSearchCV(
    estimator=clf,
    param_distributions=params,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    return_train_score=True,
)
search_cv_svm.fit(X, y)
search_cv_svm.best_params_

In [None]:
# The tuned LinearSVC is wrapped in CalibratedClassifierCV; the ROC cell later
# calls predict_proba on SVM.
model = LinearSVC(**search_cv_svm.best_params_)
SVM = CalibratedClassifierCV(model)
SVM.fit(tfidf_train_vectors, y_train)

predicted_SVM = SVM.predict(tfidf_test_vectors)

# Hold-out accuracy, precision and recall.
for metric_label, metric_fn in (
    ("SVM Accuracy:", metrics.accuracy_score),
    ("SVM Precision:", metrics.precision_score),
    ("SVM Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, predicted_SVM))

### RANDOM FOREST

In [None]:
# FINE TUNING
# The search runs on the first 10,000 training rows only.
X = tfidf_train_vectors[:10000]
y = y_train[:10000]

# 'auto' is no longer listed for max_features: scikit-learn 1.3 removed it, so
# every sampled candidate using it failed to fit. For classifiers it was
# equivalent to 'sqrt', which stays in the grid.
params = {'bootstrap': [True, False],
          'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
          'max_features': ['sqrt','log2'],
          'min_samples_leaf': [1, 2, 4],
          'min_samples_split': [2, 5, 10],
          'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

clf = RandomForestClassifier()
search_cv_rf =  RandomizedSearchCV(clf, params, n_iter=10, cv=5,  n_jobs=-1, scoring='accuracy', return_train_score=True)
search_cv_rf.fit(X,y)
search_cv_rf.best_params_

In [None]:
# Refit on the whole training split with the tuned hyper-parameters.
RF = RandomForestClassifier(**search_cv_rf.best_params_)
RF.fit(tfidf_train_vectors, y_train)

predicted_RF = RF.predict(tfidf_test_vectors)

# Hold-out accuracy, precision and recall.
for metric_label, metric_fn in (
    ("Random Forest Accuracy:", metrics.accuracy_score),
    ("Random Forest Precision:", metrics.precision_score),
    ("Random Forest Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, predicted_RF))


### XGBOOST

In [None]:
# FINE TUNING
# The search runs on the first 10,000 training rows only.
X = tfidf_train_vectors[:10000]
y = y_train[:10000]

# Discrete choices for the boosting and sub-sampling hyper-parameters.
params = {
    'learning_rate': [0.01,0.03,0.05,0.1,0.15,0.2],
    'n_estimators': [100,200,500,1000,2000],
    'max_depth': [3,5,10],
    'colsample_bytree': [0.1,0.3,0.5,1],
    'subsample': [0.1,0.3,0.5,1],
}

clf = xgb.XGBClassifier()
search_cv_xgb = RandomizedSearchCV(
    estimator=clf,
    param_distributions=params,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    return_train_score=True,
    verbose=10,
)
search_cv_xgb.fit(X, y)
search_cv_xgb.best_params_

In [None]:
# Refit on the whole training split with the tuned hyper-parameters.
xgb_model = xgb.XGBClassifier(**search_cv_xgb.best_params_)
xgb_model.fit(tfidf_train_vectors, y_train)

predicted_XG = xgb_model.predict(tfidf_test_vectors)

# Hold-out accuracy, precision and recall.
for metric_label, metric_fn in (
    ("XGB Accuracy:", metrics.accuracy_score),
    ("XGB Precision:", metrics.precision_score),
    ("XGB Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, predicted_XG))

### CATBOOST

In [None]:
# FINE TUNING
# The search runs on the first 10,000 training rows only.
X = tfidf_train_vectors[:10000]
y = y_train[:10000]

# learning_rate keeps the former non-zero grid points (0.05, 0.10, 0.15, 0.20);
# the value 0 was dropped because a zero learning rate cannot train a model.
# max_depth is drawn from scipy.stats.randint(3, 10), i.e. the integers 3..9
# (randint comes from the notebook's import cell).
params = { "learning_rate": np.linspace(0.05,0.2,4),
          "max_depth": randint(3, 10)}

clf = ctb.CatBoostClassifier()
search_cv_ctb =  RandomizedSearchCV(clf, params, n_iter=10, cv=5,  n_jobs=-1, scoring='accuracy', return_train_score=True, verbose=10)
search_cv_ctb.fit(X,y)
search_cv_ctb.best_params_

In [None]:
# Refit on the whole training split with the tuned hyper-parameters.
model_CBC = ctb.CatBoostClassifier(**search_cv_ctb.best_params_)
model_CBC.fit(tfidf_train_vectors, y_train)

predicted_CTB = model_CBC.predict(tfidf_test_vectors)

# Hold-out accuracy, precision and recall.
for metric_label, metric_fn in (
    ("CTB Accuracy:", metrics.accuracy_score),
    ("CTB Precision:", metrics.precision_score),
    ("CTB Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, predicted_CTB))

### ENSEMBLE - LR + RF

In [None]:
# Soft-voting ensemble of a default LogisticRegression and a default RandomForest.
model1 = LogisticRegression()
model2 = RandomForestClassifier()

model_rf_lr = VotingClassifier(
    estimators=[('lr', model1), ('rf', model2)],
    voting='soft',
)
model_rf_lr.fit(tfidf_train_vectors, y_train)

predicted_rf_lr = model_rf_lr.predict(tfidf_test_vectors)

# Hold-out accuracy, precision and recall.
for metric_label, metric_fn in (
    ("ENSEMBLE Accuracy:", metrics.accuracy_score),
    ("ENSEMBLE Precision:", metrics.precision_score),
    ("ENSEMBLE Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, predicted_rf_lr))

### ENSEMBLE - SVM + RF

In [None]:
# Soft-voting ensemble of a default RandomForest and a calibrated default LinearSVC.
model_svm = LinearSVC()
# Cell-local name on purpose: rebinding `SVM` here would replace the fitted SVM
# model with an unfitted wrapper (VotingClassifier fits clones of its
# estimators), and the SVM ROC cell in the results section would then fail.
svm_calibrated = CalibratedClassifierCV(model_svm)

model1 = RandomForestClassifier()
model2 = svm_calibrated

model_rf_svm = VotingClassifier(estimators=[('rf', model1),('svm', model2)], voting='soft')
model_rf_svm.fit(tfidf_train_vectors,y_train)

predicted_svm_rf = model_rf_svm.predict(tfidf_test_vectors)

print("ENSEMBLE Accuracy:",metrics.accuracy_score(y_test, predicted_svm_rf))
print("ENSEMBLE Precision:",metrics.precision_score(y_test, predicted_svm_rf))
print("ENSEMBLE Recall:",metrics.recall_score(y_test, predicted_svm_rf))

### ENSEMBLE - LR + SVM + XGB + RF

In [None]:
# Soft-voting ensemble of default XGBoost, LogisticRegression, calibrated
# LinearSVC and RandomForest models.
model_svm = LinearSVC()
# Cell-local name on purpose: rebinding `SVM` here would replace the fitted SVM
# model with an unfitted wrapper (VotingClassifier fits clones of its
# estimators), and the SVM ROC cell in the results section would then fail.
svm_calibrated = CalibratedClassifierCV(model_svm)

model1 = xgb.XGBClassifier()
model2 = LogisticRegression()
model3 = svm_calibrated
model4 = RandomForestClassifier()

model_svm_rf_lr_xgb = VotingClassifier(estimators=[('xgb', model1), ('lr', model2),('svm', model3), ('rf', model4)], voting='soft')
model_svm_rf_lr_xgb.fit(tfidf_train_vectors,y_train)

predicted_ENS = model_svm_rf_lr_xgb.predict(tfidf_test_vectors)

print("ENSEMBLE Accuracy:",metrics.accuracy_score(y_test, predicted_ENS))
print("ENSEMBLE Precision:",metrics.precision_score(y_test, predicted_ENS))
print("ENSEMBLE Recall:",metrics.recall_score(y_test, predicted_ENS))

---
### Análise dos Resultados

Nesta seção, os resultados devem ser exibidos através de tabelas e gráficos, comparados e profundamente analisados.

### Logistic Regression

In [None]:
# Confusion matrix of the hold-out predictions
Analisys.confusion_matrix(y_test, predicted_LR)
# ROC curve and AUC from the predicted probabilities
Analisys.roc_curve(model=LR, X_test=tfidf_test_vectors, y_test=y_test)

### Naive-Bayes

In [None]:
# Confusion matrix of the hold-out predictions
Analisys.confusion_matrix(y_test, predicted_NB)
# ROC curve and AUC; `test` is the dense hold-out array built for GaussianNB
Analisys.roc_curve(model=NB, X_test=test, y_test=y_test)

### Rede Neural - MLP

In [None]:
# Confusion matrix of the hold-out predictions
Analisys.confusion_matrix(y_test, predicted_MLP)
# ROC curve and AUC from the predicted probabilities
Analisys.roc_curve(model=MLP, X_test=tfidf_test_vectors, y_test=y_test)

### SUPPORT VECTOR MACHINES

In [None]:
# Confusion matrix of the hold-out predictions
Analisys.confusion_matrix(y_test, predicted_SVM)
# ROC curve and AUC from the predicted probabilities
Analisys.roc_curve(model=SVM, X_test=tfidf_test_vectors, y_test=y_test)

### RANDOM FOREST

In [None]:
# Confusion matrix of the hold-out predictions
Analisys.confusion_matrix(y_test, predicted_RF)
# ROC curve and AUC from the predicted probabilities
Analisys.roc_curve(model=RF, X_test=tfidf_test_vectors, y_test=y_test)

### XGBOOST

In [None]:
# Confusion matrix of the hold-out predictions
Analisys.confusion_matrix(y_test, predicted_XG)
# ROC curve and AUC from the predicted probabilities
Analisys.roc_curve(model=xgb_model, X_test=tfidf_test_vectors, y_test=y_test)

### CATBOOST

In [None]:
# Confusion matrix of the hold-out predictions
Analisys.confusion_matrix(y_test, predicted_CTB)
# ROC curve and AUC from the predicted probabilities
Analisys.roc_curve(model=model_CBC, X_test=tfidf_test_vectors, y_test=y_test)

### ENSEMBLE - LR + RF

In [None]:
# Confusion matrix of the hold-out predictions
Analisys.confusion_matrix(y_test, predicted_rf_lr)
# ROC curve and AUC from the predicted probabilities
Analisys.roc_curve(model=model_rf_lr, X_test=tfidf_test_vectors, y_test=y_test)

### ENSEMBLE - SVM + RF

In [None]:
# Confusion matrix of the hold-out predictions
Analisys.confusion_matrix(y_test, predicted_svm_rf)
# ROC curve and AUC from the predicted probabilities
Analisys.roc_curve(model=model_rf_svm, X_test=tfidf_test_vectors, y_test=y_test)

### ENSEMBLE - LR + SVM + XGB + RF

In [None]:
# Confusion matrix of the hold-out predictions
Analisys.confusion_matrix(y_test, predicted_ENS)
# ROC curve and AUC from the predicted probabilities
Analisys.roc_curve(model=model_svm_rf_lr_xgb, X_test=tfidf_test_vectors, y_test=y_test)

### SALVAR OS RESULTADOS

In [None]:
# Vectorize the competition test set with the vectorizer fitted on the training
# split and score it with the CatBoost model.
tfidf_df_test_vectors = tfidf_vectorizer.transform(df_test['content_title'])
model_prob = model_CBC.predict_proba(tfidf_df_test_vectors)

# NOTE(review): the column names assume predict_proba returns class 0 first and
# class 1 second — confirm against model_CBC.classes_.
prob = pd.DataFrame(model_prob, columns=['pred_0', 'pred_1'])
prob['output'] = prob['pred_1']

# Pair ids and probabilities by position. Column assignment between frames
# aligns on the index, which silently yields NaN or mismatched labels whenever
# df_test does not carry a plain 0..n-1 index.
df_final = pd.DataFrame({
    'id': df_test['id'].to_numpy(),
    'label': prob['output'].to_numpy(),
})

df_final.to_csv('result.csv', index=False)