# <span style="color:#0077b6"> <center> Text Mining and Search - AA 2020/2021 </center> </span>

## <span style="color:#0077b6"> <center> Preprocessing </center> </span>

> <span style="color:#00b4d8">**Studente**:</span> Campironi Matteo
>
> <span style="color:#00b4d8">**Matricola**:</span> 801850

> <span style="color:#00b4d8">**Studente**:</span> Di Maggio Serena
>
> <span style="color:#00b4d8">**Matricola**:</span> 821063

## Importo librerie necessarie

In [None]:
import pandas as pd
import numpy as np
import os
import re
import unicodedata

import random
from random import sample

import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

import contractions

# Matches one or more consecutive space characters; compiled once so the
# preprocessing function can reuse it for every document.
WHITE_SPACE_PATTERN = re.compile(r' +')
# Single lemmatizer instance shared by all calls.
lemmatizer = WordNetLemmatizer()
# English stop-word list from NLTK, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

## Definisco le directory 

In [None]:
# Directories listed with os.listdir() below; each entry is read as one story
# file. Both paths end with "/" and are absolute paths on the author's machine,
# so they must be adapted before running the notebook elsewhere.
CNN_STORIES_PATH = "/home/mcampironi/TMS/cnn_stories/cnn/stories/"
DM_STORIES_PATH = "/home/mcampironi/TMS/dailymail_stories/dailymail/stories/"

## Definisco funzioni

- `loadStory`: carica gli articoli;

- `nltk_tag_to_wordnet_tag`: funzione che mappa i tag di nltk in quelli di wordnet in modo da renderli compatibili con il lemmatizer;

- `lemmatize_sentence`: effettua la lemmatization di una frase dopo aver applicato il POS tagging;

- `stringPreprocessing`: effettua il preprocessing di un testo.

In [None]:
def loadStory(path):
    """Return the full content of the UTF-8 text file at *path*.

    Args:
        path: Path of the story file to read.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises.
    with open(path, encoding='utf-8') as file:
        return file.read()

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the first character of the tag is inspected: 'J' maps to
    ``wordnet.ADJ``, 'V' to ``wordnet.VERB``, 'N' to ``wordnet.NOUN`` and
    'R' to ``wordnet.ADV``. Any other tag (including an empty one) yields
    ``None``.
    """
    # First letter of the tag -> name of the attribute on `wordnet`.
    prefix_to_attr = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = prefix_to_attr.get(nltk_tag[:1])
    if attr_name is None:
        return None
    # Resolve the constant only for a recognised prefix.
    return getattr(wordnet, attr_name)

def lemmatize_sentence(sentence):
    """Lemmatize every token of *sentence* using its POS tag.

    The sentence is tokenized and POS-tagged with NLTK. Each token whose tag
    maps to a WordNet POS is lemmatized with that POS; tokens without a
    mapping are kept unchanged. The resulting tokens are joined with single
    spaces and returned as one string.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, pos in tagged_tokens:
        wn_pos = nltk_tag_to_wordnet_tag(pos)
        # Without a WordNet POS the token is passed through as is.
        lemmas.append(
            token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos)
        )
    return " ".join(lemmas)

def stringPreprocessing(text):
    """Normalize *text* and return it as a space-separated string of tokens.

    Steps, in order: lowercase, expand contractions, delete every character
    that is neither a word character nor whitespace, delete digits, trim the
    ends and collapse runs of spaces, lemmatize, tokenize, and drop tokens
    found in ``stop_words``.
    """
    cleaned = contractions.fix(text.lower())
    cleaned = re.sub(r'[^\w\s]', '', cleaned)  # punctuation
    cleaned = re.sub(r'\d', '', cleaned)  # digits
    cleaned = WHITE_SPACE_PATTERN.sub(' ', cleaned.strip())
    lemmatized = lemmatize_sentence(cleaned)
    kept_tokens = [
        token for token in word_tokenize(lemmatized) if token not in stop_words
    ]
    return ' '.join(kept_tokens)

## Pulisco il dataset ed estraggo articoli e rispettivi riassunti

In [None]:
# Build the CNN part of the dataset: up to 10000 (article, summary) pairs
# taken from a random sample of 15000 story files.
articlesCNN = []
summariesCNN = []
i = 0  # number of stories accepted so far

# Fixed seed + sorted listing so the same files are sampled on every run.
random.seed(2226)
for filename in sample(sorted(os.listdir(CNN_STORIES_PATH)), 15000):

    # Quota reached: stop instead of idling through the rest of the sample.
    if i >= 10000:
        break

    text = loadStory(os.path.join(CNN_STORIES_PATH, filename))
    # Decompose accented characters (NFKD), then drop whatever is not ASCII.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # text cleaning
    # A blank line (with an optional period / whitespace before it) becomes
    # '. ', so a marker line "@highlight" followed by a blank line turns into
    # "@highlight. " -- hence the '@highlight.' marker searched for below.
    text = re.sub(r"(\.?\s*\n\n)", '. ', text)
    text = text.replace('\n', ' ')
    # Remove everything up to and including a "(CNN...)" tag and the dashes
    # after it. NOTE(review): `.*` is greedy, so the cut happens at the LAST
    # such tag in the text -- confirm this is intended.
    text = re.sub(r".*\(CNN[a-z]*\)\s*-*\s*", '', text)
    # Same idea for any "(...) --" lead-in; greedy as well.
    text = re.sub(r".*\(([^\)]+)\)\s*-+\s*", '', text)

    # partition() keeps the whole text as the article when the marker is
    # missing; slicing with find() == -1 would silently drop the last
    # character of the article.
    body, marker, highlights = text.partition('@highlight.')
    article = body.strip()

    # Skip stories whose body is empty or only a few characters long.
    if len(article) > 5:
        summary = highlights.split('@highlight.') if marker else []
        summary = ' '.join(sent.strip() for sent in summary)

        articlesCNN.append(article)
        summariesCNN.append(summary)
        i = i + 1

In [None]:
# Build the DailyMail part of the dataset: up to 10000 (article, summary)
# pairs taken from a random sample of 15000 story files.
articlesDM = []
summariesDM = []
i = 0  # number of stories accepted so far

# Fixed seed + sorted listing so the same files are sampled on every run.
random.seed(2226)
for filename in sample(sorted(os.listdir(DM_STORIES_PATH)), 15000):

    # Quota reached: stop instead of idling through the rest of the sample.
    if i >= 10000:
        break

    text = loadStory(os.path.join(DM_STORIES_PATH, filename))
    # Decompose accented characters (NFKD), then drop whatever is not ASCII.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # text cleaning
    # A blank line (with an optional period / whitespace before it) becomes
    # '. ', so a marker line "@highlight" followed by a blank line turns into
    # "@highlight. " -- hence the '@highlight.' marker searched for below.
    text = re.sub(r"(\.?\s*\n\n)", '. ', text)
    text = text.replace('\n', ' ')
    # Strip a leading "By ... UPDATED ... <4 digits>" header.
    text = re.sub(r"^By.*?(?:UPDATED).*?(?:\d{4}).\s*", '', text)
    # Strip a leading "PUBLISHED ... UPDATED ... <4 digits>" header.
    text = re.sub(r"^PUBLISHED.*?(?:UPDATED).*?(?:\d{4}).\s*", '', text)
    # Strip a remaining leading "By ..." byline up to the first ". ".
    text = re.sub(r"^By.*?\.\s", '', text)
    # The original pattern ended in `s*` (literal letter "s"); `\s*` is the
    # evident intent: remove the phrase together with trailing whitespace.
    text = re.sub(r"(Scroll down for video)\s*", '', text)

    # partition() keeps the whole text as the article when the marker is
    # missing; slicing with find() == -1 would silently drop the last
    # character of the article.
    body, marker, highlights = text.partition('@highlight.')
    article = body.strip()

    # Skip stories whose body is empty or only a few characters long.
    if len(article) > 5:
        summary = highlights.split('@highlight.') if marker else []
        summary = ' '.join(sent.strip() for sent in summary)

        articlesDM.append(article)
        summariesDM.append(summary)
        i = i + 1
        

In [None]:
# Combine both sources (CNN rows first, then DailyMail) into one table and add
# the preprocessed version of every article.
df = pd.DataFrame({
    "article": articlesCNN + articlesDM,
    "summary": summariesCNN + summariesDM,
})
df["articlePP"] = df["article"].apply(stringPreprocessing)
df

Unnamed: 0,article,summary,articlePP
0,As ash from Chile's Calbuco Volcano spread eas...,Volcano already has erupted twice this week. I...,ash chile calbuco volcano spread east argentin...
1,Baltimore Ravens star running back Ray Rice wa...,Baltimore Raven running back Ray Rice is indic...,baltimore raven star run back ray rice indict ...
2,"Nine years after ""Bruce Almighty,"" Universal i...",EW has confirmed that Universal has plans to r...,nine year bruce almighty universal plot second...
3,Former Italian Prime Minister Silvio Berluscon...,"A judge in Italy indicts Sergio Berlusconi, ac...",former italian prime minister silvio berluscon...
4,"Despite most humans' land-centric view, Earth ...",Oceans make life on Earth possible providing o...,despite human landcentric view earth ocean pla...
...,...,...,...
19995,A teenage girl has died after she jumped out o...,Laikyn Field hit the pavement on Saturday when...,teenage girl die jump parent move car argument...
19996,Cash on tap: Scarlet Johansson is being paid 2...,Scarlett Johansson admitted to Mail on Sunday ...,cash tap scarlet johansson pay sodastream appe...
19997,With young children inevitably set to ask a ba...,Fun infographic attempts to explain the scienc...,young child inevitably set ask barrage questio...
19998,Prince Harry proved he is an excellent uncle-t...,Prince was pictured holding the bear as he lan...,prince harry prove excellent uncletobe handdel...


## Salvo il dataset in un file csv

In [None]:
# Write the table to "df.csv" in the working directory, without the index column.
df.to_csv(path_or_buf="df.csv", index=False)