# DATA PROCESSING

In [18]:
import re
from typing import List
import spacy
from spacy.tokens import Doc
from tqdm import tqdm
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer

data = "train"

In [19]:
class SpaCyPreProcessor:

    def __init__(self, spacy_model = None, remove_numbers = False, remove_special = False, pos_to_remove = None, remove_stop_words = False, lemmatize = False, use_gpu = False) -> None:
        
        self.__remove_numbers = remove_numbers
        self.__remove_special = remove_special
        self.__pos_to_remove = pos_to_remove
        self.__remove_stop_words = remove_stop_words
        self.__lemmatize = lemmatize

        if spacy_model is None:
            self.model = spacy.load("en_core_web_sm")
        else:
            self.model = spacy_model

        if use_gpu:
            spacy.prefer_gpu()

    @staticmethod
    def download_spacy_model(model="en_core_web_sm"):
        print(f"Downloading spaCy model {model}")
        spacy.cli.download(model)
        print(f"Finished downloading model")

    @staticmethod
    def load_model(model="en_core_web_sm"):
        return spacy.load(model, disable=["ner", "parser"])

    def tokenize(self, text) -> List[str]:
        """
        Tokenize text using a spaCy pipeline
        :param text: Text to tokenize
        :return: list of str
        """
        doc = self.model(text)
        return [token.text for token in doc]

    def preprocess_text(self, text) -> str:
        """
        Runs a spaCy pipeline and removes unwanted parts from text
        :param text: text string to clean
        :return: str, clean text
        """
        doc = self.model(text)
        return self.__clean(doc)

    def preprocess_text_list(self, texts=List[str]) -> List[str]:
        """
        Runs a spaCy pipeline and removes unwantes parts from a list of text.
        Leverages spaCy's `pipe` for faster batch processing.
        :param texts: List of texts to clean
        :return: List of clean texts
        """
        clean_texts = []
        for doc in tqdm(self.model.pipe(texts)):
            clean_texts.append(self.__clean(doc))

        return clean_texts

    def __clean(self, doc: Doc) -> str:

        tokens = []
        # POS Tags removal
        if self.__pos_to_remove:
            for token in doc:
                if token.pos_ not in self.__pos_to_remove:
                    tokens.append(token)
        else:
            tokens = doc

        # Remove Numbers
        if self.__remove_numbers:
            tokens = [
                token for token in tokens if not (token.like_num or token.is_currency)
            ]

        # Remove Stopwords
        if self.__remove_stop_words:
            tokens = [token for token in tokens if not token.is_stop]
        # remove unwanted tokens
        tokens = [
            token
            for token in tokens
            if not (
                token.is_punct or token.is_space or token.is_quote or token.is_bracket
            )
        ]

        # Remove empty tokens
        tokens = [token for token in tokens if token.text.strip() != ""]

        # Lemmatize
        if self.__lemmatize:
            text = " ".join([token.lemma_ for token in tokens])
        else:
            text = " ".join([token.text for token in tokens])

        if self.__remove_special:
            # Remove non alphabetic characters
            text = re.sub(r"[^a-zA-Z\']", " ", text)
        # remove non-Unicode characters
        text = re.sub(r"[^\x00-\x7F]+", "", text)

        text = text.lower()

        return text

In [20]:
df = pd.read_csv(f'data/{data}.csv')
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [21]:
lemmatizer = WordNetLemmatizer()
keywords = list(set(df[df['keyword'].notna()]['keyword']))
keywords_lemmatized = [lemmatizer.lemmatize(word).lower() for word in keywords]
filtered_df = df[df['keyword'].isna()]
texts_without_keyword = filtered_df['text']
ids_without_keyword = filtered_df['id']
pattern = r'\#\w+'
extracted_words = []

for id, text in zip(ids_without_keyword, texts_without_keyword):
    matches = re.findall(pattern, text)
    for match in matches:
        word = re.search(r'\w+', match).group()
        if word in keywords or lemmatizer.lemmatize(word).lower() in keywords_lemmatized:
            row_index = df.index[df['id'] == id].tolist()[0]
            df['keyword'][row_index] = word
            extracted_words.append(word)
        else:
            keywords = df.loc[df['keyword'].notna() & df['text'].str.contains(word), 'keyword'].tolist()
            if len(keywords) == 1:
                row_index = df.index[df['id'] == id].tolist()[0]
                df['keyword'][row_index] = keywords[0]
df.loc[df['keyword'].isna(), 'keyword'] = 'NONE'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['keyword'][row_index] = word
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['keyword'][row_index] = word
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['keyword'][row_index] = word
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['keyword'][row_index] = word
A value is trying to be set on a copy of

In [22]:
# SpaCyPreProcessor.download_spacy_model('en_core_web_trf')

In [23]:
spacy_model = SpaCyPreProcessor.load_model('en_core_web_trf')
preprocessing_pipeline = SpaCyPreProcessor(spacy_model=spacy_model, remove_numbers=True, remove_special=True, remove_stop_words=True, lemmatize=True, use_gpu=True)

df['cleaned_text'] = ''
df['cleaned_keyword'] = ''

for i in tqdm(df.index):
    df['cleaned_text'][i] = preprocessing_pipeline.preprocess_text(df['text'][i])
    df['cleaned_keyword'][i] = preprocessing_pipeline.preprocess_text(df['keyword'][i])
df.loc[df['cleaned_keyword'] == '', 'cleaned_keyword'] = 'none'
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'][i] = preprocessing_pipeline.preprocess_text(df['text'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_keyword'][i] = preprocessing_pipeline.preprocess_text(df['keyword'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'][i] = preprocessing_pipeline.preprocess_text(df['text'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/panda

Unnamed: 0,id,keyword,location,text,target,cleaned_text,cleaned_keyword
0,1,earthquake,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive,earthquake
1,4,NONE,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,
2,5,NONE,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...,
3,6,wildfires,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...,wildfire
4,7,wildfires,,Just got sent this photo from Ruby #Alaska as ...,1,get send photo ruby alaska smoke wildfire pour...,wildfire
...,...,...,...,...,...,...,...
7608,10869,NONE,,Two giant cranes holding a bridge collapse int...,1,giant crane hold bridge collapse nearby home h...,
7609,10870,NONE,,@aria_ahrary @TheTawniest The out of control w...,1,aria ahrary thetawniest control wild fire ca...,
7610,10871,NONE,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,m utc km s volcano hawaii http ...,
7611,10872,NONE,,Police investigating after an e-bike collided ...,1,police investigate e bike collide car little p...,


In [27]:
df.to_csv(f"cleaned_{data}.csv")