# Entrenamiento del modelo para identificación de Noticias falsas y Noticias reales

In [None]:
# Importar las librerías con el código desarrollado
import sys
import pathlib
import pandas as pd
import plotly.express as px
from statistics import mean

In [None]:
# Obtener la ruta de las librerías personalizadas requeridas para cargar los datos
def get_directory(subfolder):
    pathtest= pathlib.Path()
    current_path = str(pathtest.parent.absolute())
    name_directory = "ProyectoUNAL"
    index_path = current_path.find(name_directory)
    path_data = current_path[:index_path+len(name_directory)]
    path_data += '\\scripts\\'+ subfolder
    return path_data

In [None]:
# Adcionar la ruta de las librerías  personalizadas requeridas para cargar los datos
path_source = get_directory('data_acquisition')
sys.path.append(path_source)

In [None]:
# Librerías a usar en el proyecto
from download_data import get_data

In [None]:
# Obtención de los archivos
news_true, news_false = get_data()

In [None]:
# Validación de la información
news_true.count()

title      21417
text       21417
subject    21417
date       21417
dtype: int64

In [None]:
# Validación de la información
news_false.count()

title      23481
text       23481
subject    23481
date       23481
dtype: int64

# **1. Preprocesamiento de los archivos con las noticias reales y falsas**

In [None]:
# Preprocesamiento de los datos
# Adcionar la ruta de las librerías  personalizadas requeridas para preprocesar los datos
path_preprocessing = get_directory('preprocessing')
sys.path.append(path_preprocessing)
# Librerías a usar en el proyecto
from preprocessing import preprocessing_false, preprocessing_true, preprocessing_data

In [None]:
# Preprocesamiento de los archivos y obtención de los nuevos dataframes
df_false = preprocessing_false(news_false, 50)
df_true = preprocessing_true(news_true)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['type'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['type'] = 1


In [None]:
# Validación del número de registros resultantes
df_false.count()

title      22640
text       22640
subject    22640
type       22640
dtype: int64

In [None]:
# Validación del número de registros resultantes
df_true.count()

title      21416
text       21416
subject    21416
type       21416
dtype: int64

In [None]:
# Obtención del dataframe listo para para el modelamiento
df_news = preprocessing_data(news_true, news_false, 50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['type'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['type'] = 0


In [None]:
# Validación de la información
df_news.count()

title      44056
text       44056
subject    44056
type       44056
dtype: int64

In [None]:
# Número de noticias reales y falsas
dfg=df_news.groupby('type').count().reset_index()
dfg["type"] = dfg["type"].astype(str)
fig = px.bar(dfg, x='type', y='text', color = 'type', barmode= 'group', text_auto='6', labels={'text':'Número de noticias',
                    'type': 'Tipo de noticias: 0 - Falsas, 1 - Reales'}, title = 'Número de noticias a procesar')
fig.show()

In [None]:
# Número de noticias por subject
dfg=df_news.groupby('subject').count().reset_index()
fig = px.bar(dfg, x='subject', y='text', color = 'subject', barmode= 'group', text_auto='6', labels={'text':'Número de noticias',
                    'subject': 'Tópico de la noticia'}, title = 'Número de noticias a procesar')
fig.show()

In [None]:
# La data procesada se extrae al repositorio
name_directory = "ProyectoUNAL"
index_path = path_source.find(name_directory)
path_data = path_source[:index_path+len(name_directory)]
path_data += "\\src\\proy\\database\\"
path_file = r"" + path_data + "news_processes.csv"
df_news.to_csv(path_file)

In [None]:
# El límite para subir archivos en GitHub es 100MB, por lo cual genero dos archivos más: unoa para cada tipo de noticia
path_true = r"" + path_data + "news_true_processes.csv"
path_false = r"" + path_data + "news_false_processes.csv"
df_false.to_csv(path_false)
df_true.to_csv(path_true)

# **2. Entrenamiento del modelo**

Dado que por recursos el entrenamiento del modelo se hace en Colab. Los archivos generados con las noticias reales y falsas se leerán en este paso desde el drive del autor (previamente cargados). Sin embargo los archivos procesado se encuentran en el repositorio GitHub entregado.
Los archivos son: news_false_processes.csv y news_true_processes.csv


In [3]:
# Cargar los archivos de las noticias procesada
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Librerías requeridas
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

Mounted at /content/drive


In [4]:
# Directorios fuente de los archivos
file_news_true = '/content/drive/MyDrive/Colab Notebooks/MLOPS/Data/news_true_processes.csv'
file_news_false = '/content/drive/MyDrive/Colab Notebooks/MLOPS/Data/news_false_processes.csv'

In [5]:
# Obtención de la información de las noticias y generación de un solo dataframe
df_colab_false = pd.read_csv(file_news_false, usecols=["title", "text", "subject", "type"])
df_colab_true = pd.read_csv(file_news_true, usecols=["title", "text", "subject", "type"])

# Concatenación de los dos dataframes
dfs = [df_colab_false, df_colab_true]
df_news_total = pd.concat(dfs)

In [6]:
# Validación de la información recuparada
df_news_total.count()

title      44056
text       44056
subject    44056
type       44056
dtype: int64

In [5]:
df_news_total.head()

Unnamed: 0,title,text,subject,type
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,0


In [7]:
# Indentificación de las características y la variable objetivo
features = df_news_total.drop(["type"], axis=1)
label = df_news_total['type'].tolist()

In [8]:
# Validación
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44056 entries, 0 to 21415
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44056 non-null  object
 1   text     44056 non-null  object
 2   subject  44056 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


In [54]:
# Validación
type(label)

list

In [58]:
# Verificación de los valores únicos
set(label)

{0, 1}

In [9]:
# Las características se unen en una sola columna para el procesamiento requerido
features['all'] = features['title'] +' , '+ features['text']# +' , '+ features['subject']

In [10]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44056 entries, 0 to 21415
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44056 non-null  object
 1   text     44056 non-null  object
 2   subject  44056 non-null  object
 3   all      44056 non-null  object
dtypes: object(4)
memory usage: 1.7+ MB


In [61]:
features.head()

Unnamed: 0,title,text,subject,all
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,Pope Francis Just Called Out Donald Trump Dur...


In [11]:
# Se genera una lista para procesar con TfidfVectorizer y obtener la represnetación requerida de los datos para el modelo
feature_list = features['all'].tolist()

In [12]:
feature_list[:5]

[' Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing , Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like th

## **2.1. Definición del modelo línea base**
---

In [1]:
# Función para convertir los textos en una matriz de características TF-IDF
def vectorized(list_text):
  # create instance of TfidfVectorizer
  vectorizer = TfidfVectorizer(stop_words='english')
  # fit vectorizer
  x_transformed = vectorizer.fit_transform(list_text)
  return x_transformed

In [13]:
# Se obtiene la representación de los textos-características
vect_features = vectorized(feature_list)

In [14]:
# Partición de los datos
test_size_partition = 0.2
random_state = 0
features_train, features_test, label_train, label_test = train_test_split(vect_features, label, test_size=test_size_partition, random_state = random_state )

In [15]:
# Validación de los datos obtenidos
print(f"Número de características para entrenamiento:  {features_train.shape}")
print(f"Número de características para test:  {features_test.shape}")
print(f"Número de labels para entrenamiento:  {len(label_train)}")
print(f"Número de labels para test:  {len(label_test)}")

Número de características para entrenamiento:  (35244, 122131)
Número de características para test:  (8812, 122131)
Número de labels para entrenamiento:  35244
Número de labels para test:  8812


In [16]:
# Función para la definición del modelo
def train_model(features, label, max_depth, n_estimators, learning_rate, random_state):
    model = XGBClassifier(
        #n_estimators = n_estimators,
        #max_depth =max_depth,
        #learning_rate = learning_rate,
        #random_state = random_state,
        #objective='binary:hinge'
    ).fit(features, label)
    return model

In [17]:
# Entrenamiento del modelo
model_trained = train_model(features_train, label_train, 6, 100, 0.2, 0 )

In [18]:
# Inferencia del modelo
label_pred = model_trained.predict(features_test)

In [19]:
# Obtención del clasificación report y_true = test_y_real y y_pred=np.argmax(metrics_m1p, axis=1)
from sklearn.metrics import classification_report

print(classification_report(label_test ,label_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4486
           1       1.00      1.00      1.00      4326

    accuracy                           1.00      8812
   macro avg       1.00      1.00      1.00      8812
weighted avg       1.00      1.00      1.00      8812



In [21]:
model_trained.predict(features_test[0])

array([0])