# __**BLOCK 2**__

# 3. Using the model and analyzing the most frequent words in a set of news obtained from a media organization.

### To get the set of news, the News API is going to be used. This API has a Python client and returns news filtered by media organization, date, language, etc.
#### For more information related to this API see the below link:
#### https://newsapi.org/docs/get-started
##### Note: An internet connection is required to run the code

##### The code of this notebook was used to create part 2 of the Dash interface. Note that the analysis of the most relevant words of each corpus was not included in the Dash interface. See the "5_Dash_framework_part_2.ipynb" notebook for more information.

### To use the API the below package should be installed.

In [None]:
!pip install newsapi-python

In [None]:
from newsapi import NewsApiClient
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from datetime import date, timedelta

## 3.1 Training the model

In [None]:
#The data used to train the model: a CSV file of labelled news read into a pandas DataFrame.
#Later cells read its 'TITLE' column (the text) and its 'CATEGORY' column (the label).
path_file= './data/dataset_news.csv'
file=pd.read_csv(path_file, sep=',')

In [None]:
#A function is created to normalize the text: it lowercases it, removes punctuation symbols that
#touch whitespace and collapses double spaces. This cleaned data is saved in a new column ('TEXT').
def normalize_text(text):
    """Return a lower-cased copy of ``text`` without edge punctuation or repeated spaces.

    A non-word character is replaced by a single space when it directly follows or
    directly precedes a whitespace character, so word-internal punctuation such as
    the hyphen in "well-known" or the apostrophe in "it's" is kept. Punctuation at
    the very start or end of the string is only removed when it is next to
    whitespace, and leading/trailing whitespace is collapsed to one space rather
    than stripped.

    Parameters
    ----------
    text : str
        The text to normalize.

    Returns
    -------
    str
        The normalized text.
    """
    text = text.lower()

    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    # Raw strings keep the regex escapes intact; a plain '\s' literal is an invalid string escape.
    text = re.sub(r'\s\W', ' ', text)
    text = re.sub(r'\W\s', ' ', text)

    # make sure we didn't introduce any double spaces
    text = re.sub(r'\s+', ' ', text)

    return text

#Store the normalized version of every headline ('TITLE') in a new 'TEXT' column.
file['TEXT'] = list(map(normalize_text, file['TITLE']))

In [None]:
#The category of every news item is encoded as an integer class (y).
#NOTE(review): LabelEncoder is documented to number the classes in sorted order; later cells decode
#predictions with a different encoder, so confirm both orderings agree.
encoder = LabelEncoder()
y = encoder.fit_transform(file['CATEGORY'])

#The normalized text is vectorized into a matrix of token counts (X). For more info see the below link:
#http://scikit-learn.org/stable/modules/feature_extraction.html
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(file['TEXT'])

In [None]:
### The Naive Bayes model is evaluated, splitting the data set with StratifiedKFold
### (10 shuffled folds with a fixed seed). The accuracy of every fold and the mean are printed.
### Note: after the loop, `nb` is the model fitted on the training part of the last fold; it is the
### model that later cells use to classify the downloaded news.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
i = 1
score_mean = []

for train_index, test_index in kf.split(X, y):
    #Training part and held-out part of the current fold
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    #Naive Bayes model (fit returns the fitted estimator itself)
    nb = MultinomialNB()
    classifier_model = nb.fit(X_train, y_train)
    score = classifier_model.score(X_test, y_test)
    score_mean += [score]
    print(score)
    i += 1
print('The mean of the whole predictions is:', sum(score_mean) / len(score_mean))

## 3.2 Getting the news

### With this API, it is possible to get news from different media sources. 
### For more information see the below link:
https://newsapi.org/docs/get-started

##### Note: An internet connection is required to run the code

In [None]:
# The api_key identifies the News API account used to obtain the set of news.
# It is taken from the NEWSAPI_KEY environment variable when that is set (and not empty), so a
# personal key can be supplied without editing the notebook; otherwise the key shipped with the
# notebook is used, as before.
# NOTE(review): a key committed to source control should be treated as leaked - consider revoking it.
import os

newsapi = NewsApiClient(api_key=os.environ.get('NEWSAPI_KEY') or '9fe0d6dd387c40bc8cb5fdec346f0bda')

In [None]:
#Time window of the query, as ISO formatted dates: from two days ago up to today.
today = date.today().isoformat()
day_before_yesterday = (date.today() - timedelta(days=2)).isoformat()

#For more information related to newsapi.get_everything, see the link https://newsapi.org/docs/endpoints/everything
#NOTE(review): page=5 requests only the fifth page of the results - confirm this is intended.
all_articles = newsapi.get_everything(
    sources='bbc-news',
    from_param=day_before_yesterday,
    to=today,
    language='en',
    sort_by='relevancy',
    page=5,
)

In [None]:
#To see the available media sources.
#The bare `sources` expression on the last line makes the notebook display the response.
sources = newsapi.get_sources()
sources

## 3.3 Missing values in the data set obtained from the API news.

When the news are downloaded from the API, some of them contain
`None` in the value of the ['content'] key. These `None` values would cause problems when the news are classified
with our model, so those news are discarded.

In [None]:
#Only the news whose 'content' key holds text are kept. When the news are downloaded from the API,
#some of them have None as the value of the ['content'] key. Those None values would cause problems
#when the news are classified with our model, so only string contents go to the new list (clean_text).
#A missing 'content' key is treated like None.
clean_text = [article for article in all_articles['articles']
              if isinstance(article.get('content'), str)]
clean_text


## 3.4 Plotting the classification of the news performed by the model

In [None]:
from sklearn import preprocessing
from collections import Counter

#Decoder from the integer class predicted by the model to a readable topic name.
#NOTE(review): this assumes that the sorted order of these four names matches the integer codes given
#to file['CATEGORY'] when the model was trained - confirm against the data set.
le = preprocessing.LabelEncoder()
le.fit(["business", "science and technology", "entertainment", "health"])

#One single-element list per news item, holding its predicted topic name.
x_axis=[]

#The text of the news grouped by predicted topic.
texts_by_topic = {'business': [], 'science and technology': [], 'entertainment': [], 'health': []}

#The classification of the news is performed. Thus, each news is placed in its proper corpus.
#vectorizer.transform needs text, so clean_text must only hold news whose 'content' is a string.
for article in clean_text:
    prediction = nb.predict(vectorizer.transform([article['content']]))
    topic = le.inverse_transform(prediction)[0]
    x_axis.append([topic])
    texts_by_topic[topic].append(article['content'])

#Each corpus is the text of all its news. The news are joined with a space so that the last word of
#one news item and the first word of the next one are not fused into a single token.
corpus_business = ' '.join(texts_by_topic['business'])
corpus_sciencetechnology = ' '.join(texts_by_topic['science and technology'])
corpus_entertainment = ' '.join(texts_by_topic['entertainment'])
corpus_health = ' '.join(texts_by_topic['health'])

corpus=[corpus_business,corpus_sciencetechnology,corpus_entertainment,corpus_health]

#The results are put in just one flat list, without lists in the middle.
x_axis_list = [entry[0] for entry in x_axis]
#The number of times that each topic appears in the list is calculated.
x=Counter(x_axis_list)

#A bar graph is plotted in order to see the number of news of each topic.
#When no news were classified there is nothing to unpack or plot, so the graph is skipped.
if x:
    labels, values = zip(*x.items())

    indexes = np.arange(len(labels))

    plt.bar(indexes, values)
    plt.xticks(indexes , labels, rotation= 'vertical')
    plt.title('Number of news for each topic')
    plt.show()
else:
    print('There are no news with text content to classify, therefore there is nothing to plot')


## 3.3 Calculating the most relevant words of each class (business, entertainment, science/technology and health)

## 3.3.1 tf (*term frequency*)

[**tf**](https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Term_frequency_2) es el peso que indica la frecuencia de un término, es decir, el número de veces que una determinada palabra aparece en un documento. 

La aproximación más sencilla consiste en asignar como peso para el término $t$ en el documento $d$ del corpus $D$ (denotado como $\mbox{tf}_{t,d}$) el número de ocurrencias de $t$ en $d$. Es recomendable normalizar esta frecuencia, dividiendo el número de ocurrencias entre el número total de palabras del documento, para no penalizar los documentos breves: $\mathrm{tf}(t,d) = \frac{\mathrm{f}(t, d)}{\sum_{w \in d} \mathrm{f}(w, d)}$

Vamos a calcularlo.

In [None]:
from collections import Counter

#tf[word] is a list with one value per corpus (same order as `corpus`): the number of times the word
#appears in that corpus divided by the total number of words of the corpus. Words that do not
#appear in a corpus keep a 0 in its position.
tf={}
for i, text in enumerate(corpus):
    sentence=text.split()
    #The occurrences are counted once per corpus instead of calling sentence.count() for every word.
    for word, occurrences in Counter(sentence).items():
        tf.setdefault(word, [0]*len(corpus))[i] = occurrences/len(sentence)

In [None]:
#Display the list with the text of the four corpora to inspect it.
corpus

## 3.3.2 idf (*inverse document frequency*)

Trabajar unicamente con las frecuencias de los términos conlleva un problema: todos los términos presentes en la colección se consideran igualmente relevantes a la hora de discriminar la relevancia de los documentos, atendiendo a sus frecuencias. Y resulta que esto no es verdad. 

Imaginemos un corpus en el que la frecuencia total de dos términos concretos, *este* y *fonema*, es similar en términos absolutos. La distribución de estos términos a lo largo de la coleccion es seguramente muy diferente. El primero aparece con una distribución uniforme a lo largo del corpus, su capacidad discriminativa es baja y debería penalizarse a la hora de asignar relevancia (como el resto de *stopwords*). El segundo, por el contrario, se concentra principalmente en documentos que hablan de fonología, su capacidad discriminativa es alta y debería ser premiado.

Existen mecanismos correctores para incorporar estas penalizaciones y premios en nuestros pesos. Los más habituales pasan por recurrir a la frecuencia de documento $\mbox{df}_t$, definida como el número de documentos de la colección $D$ que contienen el término $t$: $\mbox{df}_t = {|\{d \in D: t \in d\}|}$.

Más concretamente, se calcula la frecuencia inversa de documento, o [**idf**](https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency_2) (*inverse document frequency*), definida como: $\mbox{idf}_t = \log {|D|\over \mbox{df}_t}$, donde $|D|$ indica el número total de documentos de nuestra colección. De este modo, el **idf** de un término específico pero muy discriminativo será alto, mientras que el de un término muy frecuente a lo largo de la coleccion será bajo.

##### Calculando df

In [None]:
import operator

#Every corpus is split into words the same way as for tf. A set gives fast membership tests and
#makes sure that whole words are compared: testing `word in text` on the raw string would also
#count substrings (e.g. "he" inside "the").
corpus_tokens = [set(text.split()) for text in corpus]

#df[word] is the number of corpora that contain the word.
df = {word: sum(word in tokens for tokens in corpus_tokens) for word in tf}

#The df values are sorted in descending order and the first 10 are shown.
m=sorted(df.items(),key=operator.itemgetter(1),reverse=True)
m[:10]

Los valores de **df** son números enteros: el número de documentos del corpus que contienen cada uno de los términos.

##### Calculando idf

In [None]:
import math

#idf of a word: natural logarithm of the number of corpora divided by the number of corpora
#that contain the word (its df).
idf = {term: math.log(len(corpus) / document_count) for term, document_count in df.items()}

## 3.3.3 tf.idf

[**tf.idf**](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) (*term frequency - inverse document frequency*) es una medida numérica que expresa la relevancia de una palabra de un documento con respecto a una colección de documentos. Es uno de los esquemas de pesado más comunes en las tareas relacionadas con la recuperación de información y la minería de texto.

El objetivo de esta métrica es representar los documentos de texto como vectores, ignorando el orden concreto de las palabras pero manteniendo la información relativa a las frecuencias de aparición. 

El valor de tf-idf de una palabra:

- es mayor cuanto más frecuente sea esta palabra dentro de un documento concreto, pero;
- es mayor cuanto menos común sea la palabra en otros documentos de la colección.

Estas dos características premian a los términos que son muy frecuentes en determinados documentos concretos pero poco comunes en general: estos términos pueden considerarse buenos descriptores de un conjunto de documentos. Y a la vez, penalizan aquellos términos que aparecen con mucha frecuencia a lo largo de toda la colección, como las *stopwords*.


#### Calculando **tf.idf**

**tf.idf** se calcula como el producto de dos términos: $\mathrm{tf.idf}(t, d, D) = \mathrm{tf}(t, d) \times \mathrm{idf}(t, D)$

- la frecuencia de un término (tf): el número de veces que una determinada palabra aparece en un documento. 

- la frecuencia inversa de documento (idf): el logaritmo del número total de documentos en el corpus dividido entre el número de documentos en los que el término aparece.

Ya hemos calculado previamente esos valores. Bastará con realizar los productos.

In [None]:
#The tfidf is calculated: for every word, each of its tf values (one per corpus) is multiplied
#by the idf of the word.
tfidf = {
    term: [frequency * idf[term] for frequency in frequencies]
    for term, frequencies in tf.items()
}

In [None]:
#Four rankings of the (word, tfidf values) pairs are built, one per corpus. Each ranking holds all
#the words, sorted from the highest to the lowest tfidf value in one position of the list of values:
#position 0 is the business corpus, 1 the science/technology corpus, 2 the entertainment corpus
#and 3 the health corpus.
(tfidf_business,
 tfidf_sciencetechnology,
 tfidf_entertainment,
 tfidf_health) = (
    #column is bound as a default argument so that every sort uses its own position.
    sorted(tfidf.items(), key=lambda item, column=column: item[1][column], reverse=True)
    for column in range(4)
)

## 3.3.4 Representing the most important word of each corpus in a table

### Explanation of the data presented in the following tables.

The higher the **tfidf** value of a word in a specific corpus, the more important the word is in that corpus.<br>
Note that a word is relevant in a corpus when it is repeated many times in that corpus and it is not present in the other corpora.
Therefore, the different tables show the most important words for the business, entertainment, science/technology and
health corpora. Bear in mind that these words have a high value in just one of the corpora; for the rest of them, the value is low or even 0.

In [None]:
#The top 10 most important words of the business corpus are represented in a table.

#Number of words to show. The slice below simply returns fewer rows when there are less than n words.
n=10
#tfidf_business is sorted from the highest to the lowest business value, so its first entry holds
#the maximum. When the list is empty or that maximum is 0 (there are no business news nor words to
#show), the table is not built.
if tfidf_business and tfidf_business[0][1][0]!=0:
    top_business=tfidf_business[:n]
    y_words_b=[word for word, values in top_business]
    y_business=[values[0] for word, values in top_business]
    y_business_sciencetechnolgy=[values[1] for word, values in top_business]
    y_business_entertainment=[values[2] for word, values in top_business]
    y_business_health=[values[3] for word, values in top_business]

    print('\033[1mThe most important words of the business corpus')

    display(pd.DataFrame({'0_Words':y_words_b,'1_Business':y_business, '2_science&technology':y_business_sciencetechnolgy,
                 '3_entertainment':y_business_entertainment, '4_health':y_business_health}))
else:
    print('\033[1mThere are no news classified as business type, therefore there are not words to show')

In [None]:
#The top 10 most important words from the science/technology corpus are shown in the below table.

#Number of words to show. The slice below simply returns fewer rows when there are less than n words.
n=10
#tfidf_sciencetechnology is sorted from the highest to the lowest science/technology value, so its
#first entry holds the maximum. When the list is empty or that maximum is 0 (there are no
#science/technology news nor words to show), the table is not built.
if tfidf_sciencetechnology and tfidf_sciencetechnology[0][1][1]!=0:
    top_sciencetechnology=tfidf_sciencetechnology[:n]
    y_words_sc=[word for word, values in top_sciencetechnology]
    y_sciencetechnolgy_business=[values[0] for word, values in top_sciencetechnology]
    y_sciencetechnolgy=[values[1] for word, values in top_sciencetechnology]
    y_sciencetechnolgy_entertainment=[values[2] for word, values in top_sciencetechnology]
    y_sciencetechnolgy_health=[values[3] for word, values in top_sciencetechnology]

    print('\033[1mThe most important words of the science and technolgy corpus')

    display(pd.DataFrame({'0_Words':y_words_sc,'1_Business':y_sciencetechnolgy_business, '2_science&technology':y_sciencetechnolgy,
             '3_entertainment':y_sciencetechnolgy_entertainment, '4_health':y_sciencetechnolgy_health}))
else:
    print('\033[1mThere are no news classified as science & technology type, therefore there are not words to show')

In [None]:
#The top 10 most important words from the entertainment corpus are shown in the below table.

#Number of words to show. It is defined here so that this cell does not depend on a previous table
#cell having been executed. The slice below returns fewer rows when there are less than n words.
n=10
#tfidf_entertainment is sorted from the highest to the lowest entertainment value, so its first
#entry holds the maximum. When the list is empty or that maximum is 0 (there are no entertainment
#news nor words to show), the table is not built.
if tfidf_entertainment and tfidf_entertainment[0][1][2]!=0:
    top_entertainment=tfidf_entertainment[:n]
    y_words_e=[word for word, values in top_entertainment]
    y_entertainment_business=[values[0] for word, values in top_entertainment]
    y_entertainment_sciencetechnolgy=[values[1] for word, values in top_entertainment]
    y_entertainment=[values[2] for word, values in top_entertainment]
    y_entertainment_health=[values[3] for word, values in top_entertainment]

    print('\033[1mThe most important words of the entertainment corpus')

    display(pd.DataFrame({'0_Words':y_words_e,'1_Business':y_entertainment_business, '2_science&technology':y_entertainment_sciencetechnolgy,
                 '3_entertainment':y_entertainment, '4_health':y_entertainment_health}))
else:
    print('\033[1mThere are no news classified as entertainment type, therefore there are not words to show')
    

In [None]:
#The top 10 most important words from the health corpus are shown in the below table.

#Number of words to show. It is defined here so that this cell does not depend on a previous table
#cell having been executed. The slice below returns fewer rows when there are less than n words.
n=10
#tfidf_health is sorted from the highest to the lowest health value, so its first entry holds the
#maximum. When the list is empty or that maximum is 0 (there are no health news nor words to
#show), the table is not built.
if tfidf_health and tfidf_health[0][1][3]!=0:
    top_health=tfidf_health[:n]
    y_words_h=[word for word, values in top_health]
    y_health_business=[values[0] for word, values in top_health]
    y_health_sciencetechnolgy=[values[1] for word, values in top_health]
    y_health_entertainment=[values[2] for word, values in top_health]
    y_health=[values[3] for word, values in top_health]

    print('\033[1mThe most important words of the health corpus')

    display(pd.DataFrame({'0_Words':y_words_h,'1_Business':y_health_business, '2_science&technology':y_health_sciencetechnolgy,
                 '3_entertainment':y_health_entertainment, '4_health':y_health}))
else:
    print('\033[1mThere are no news classified as health type, therefore there are not words to show')

## Conclusion: As the number of news is too low, the most representative words of each corpus are not the ones a person would typically expect.