In [33]:
# libraries importation

import pandas as pd
import re
import string
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import *
from nltk.corpus import stopwords

In [34]:
news = pd.read_csv('./data/fisrt_news.csv', names = ['title', 'text', 'author', 'url'])

In [35]:
news.head()

Unnamed: 0,title,text,author,url
0,La caprichosa y millonaria primera decisión de...,"Begoña Gómez quiere cambios más ""contundentes""...",SALOMON LUSH,https://www.periodistadigital.com/periodismo/t...
1,Santiago Abascal dice que perdona a las manife...,Santiago Abascal se ha mostrado generoso al op...,Ayudante de Becario,https://lavozdelbecario.es/santiago-abascal-di...
2,El aborto será legal durante todo el embarazo ...,La norma avala el aborto tardío en cualquier m...,Actuall / Aci Prensa,https://www.actuall.com/vida/el-aborto-sera-le...
3,El Gobierno cambiará los cementerios españoles...,La naturaleza y la composición de los ritos fu...,Mediterráneo Digital,https://www.mediterraneodigital.com/espana/and...
4,"Dos perros ""contraen matrimonio""","La concejal de Seguridad de Lorca, Murcia (Esp...",Redacción ACI Prensa,https://www.aciprensa.com/noticias/casan-a-dos...


In [36]:
type(news.text[0])

str

In [37]:
news['label'] = True 


In [38]:
news.loc[:59,'label'] = False

Preparing Text Data For Analysis

functions for cleaning, tokenizing, stemming and lemmatizing data and for deleting stop words

In [39]:
# function for cleaning

def clean_up_text(text):
    """
    The function cleans up numbers, remove punctuation and line break, and special characters from a string 
    and converts it to lowercase.

    Args:
        text: The string to be cleaned up.

    Returns:
        A string that has been cleaned up.
    """
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text) 
    text = re.sub('\w*\d\w*', '', text)    
    text = re.sub('[‘’“”…«»¿?¡!\-_\(\)]', '', text)
    text = re.sub(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', text)
  
    return text



In [40]:
# function for tokenizing

def tokenize_text(text):
        """
        Tokenize a string.

        Args:
            text: String to be tokenized.

        Returns:
            A list of words as the result of tokenization.
        """
        return word_tokenize(text)


In [41]:
# function for stemming, and lemmatizing

def stem_and_lemmatize(list_of_words):
    """
    Perform stemming and lemmatization on a list of words.

    Args:
        list_of_words: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    stemmer = nltk.stem.SnowballStemmer('spanish')
    lemmatizer = WordNetLemmatizer()
    stemmed_lemmantized_list = [stemmer.stem(lemmatizer.lemmatize(word)) for word in list_of_words]
    return stemmed_lemmantized_list

In [51]:
# function for stops words

def remove_stopwords(list_of_words):
    """
    Remove English stopwords from a list of strings.

    Args:
        list_of_words: A list of strings.

    Returns:
        A list of strings after stop words are removed.
    """
    spanish_stop_words = stopwords.words('spanish')
      
    return [w for w in list_of_words if not w in spanish_stop_words]


Executing cleaning, tokenizing, stemming, lemmatizing and deleting stop words functions

In [44]:
news['processed_text'] = news['text'].apply(lambda x: clean_up_text(str(x)))

In [46]:
news['processed_text'] = news['processed_text'].apply(tokenize_text)

In [48]:
news['processed_text'] = news['processed_text'].apply(stem_and_lemmatize)

In [52]:
news['processed_text'] = news['processed_text'].apply(remove_stopwords)

In [53]:
news.head()

Unnamed: 0,title,text,author,url,label,processed_text
0,La caprichosa y millonaria primera decisión de...,"Begoña Gómez quiere cambios más ""contundentes""...",SALOMON LUSH,https://www.periodistadigital.com/periodismo/t...,False,"[begoñ, gomez, quier, cambi, mas, contundent, ..."
1,Santiago Abascal dice que perdona a las manife...,Santiago Abascal se ha mostrado generoso al op...,Ayudante de Becario,https://lavozdelbecario.es/santiago-abascal-di...,False,"[santiag, abascal, mostr, gener, opin, sobr, m..."
2,El aborto será legal durante todo el embarazo ...,La norma avala el aborto tardío en cualquier m...,Actuall / Aci Prensa,https://www.actuall.com/vida/el-aborto-sera-le...,False,"[norm, aval, abort, tardi, cualqui, moment, ca..."
3,El Gobierno cambiará los cementerios españoles...,La naturaleza y la composición de los ritos fu...,Mediterráneo Digital,https://www.mediterraneodigital.com/espana/and...,False,"[naturalez, composicion, rit, funerari, duel, ..."
4,"Dos perros ""contraen matrimonio""","La concejal de Seguridad de Lorca, Murcia (Esp...",Redacción ACI Prensa,https://www.aciprensa.com/noticias/casan-a-dos...,False,"[concejal, segur, lorc, murci, españ, cas, do,..."


Creating Bag of Words.
 The bag of words contains all the unique words in your whole text body

In [55]:
# Creating a bag of words from the processed data.

bag_of_words = [word for word in news.processed_text]


[['begoñ',
  'gomez',
  'quier',
  'cambi',
  'mas',
  'contundent',
  'profund',
  'nunc',
  'nuestr',
  'democraci',
  'hab',
  'produc',
  'mudanz',
  'expres',
  'inquilin',
  'monclo',
  'cuestion',
  'hor',
  'graci',
  'mocion',
  'censur',
  'famili',
  'rajoy',
  'fernandez',
  'ten',
  'empaquet',
  'pertenent',
  'dej',
  'atras',
  'resident',
  'sid',
  'ultim',
  'años',
  'ana',
  'ros',
  'especul',
  'caramelit',
  'pued',
  'caer',
  'podemit',
  'mejor',
  'sanchez',
  'da',
  'radi',
  'television',
  'español',
  'public',
  'llam',
  'premur',
  'puert',
  'estab',
  'pedr',
  'sanchez',
  'omnipresent',
  'influyent',
  'primer',
  'dam',
  'begoñ',
  'gomez',
  'personal',
  'monclo',
  'trabaj',
  'destaj',
  'par',
  'prepar',
  'cambi',
  'residencial',
  'toall',
  'saban',
  'colchon',
  'cambi',
  'cas',
  'hac',
  'defect',
  'per',
  'ocasion',
  'va',
  'hab',
  'mas',
  'segun',
  'fuent',
  'palaci',
  'monclo',
  'consult',
  'period',
  'digital',
 