### Limpiar muestras 1 y 2 en inglés

**Nota:** Las muestras en castellano _evaluation1_labeled.csv_ y _vacunes_100rt_evaluation2_label.csv_ corresponden, respectivamente, a _df_sample_en.csv_ y _df_sample_en2.csv_. Esta última pasa a ser _df_sample_en2-455.csv_ después de corregir algunas cosas usando el código de _translate.py_.

In [13]:
import spacy
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

import pandas as pd, os, numpy as np, csv, re, nltk, string
from collections import Counter
from time import time
from collections import defaultdict 
from spacy.lang.en import English
from textblob import TextBlob

In [14]:
# Read the two English sample files (first CSV column is used as the index)
# and stack them, one after the other, into a single working frame.
sample_paths = ['data/df_sample_en.csv', 'data/df_sample_en2-455.csv']
df_1, df_2 = (pd.read_csv(path, index_col=0) for path in sample_paths)
df_sample = pd.concat([df_1, df_2])

In [15]:
# Emojis

def extract_emojis(text):
    """Collect the characters of ``text`` whose Unicode code point is above 600.

    The threshold is a rough heuristic: it keeps emojis but also any other
    character beyond that code point (e.g. Greek letters), while accented
    Latin-1 letters fall below it and are skipped.

    Parameters
    ----------
    text : str
        Text to scan character by character.

    Returns
    -------
    list of str
        Matching characters, in order of appearance, duplicates included.
    """
    return [symbol for symbol in text if ord(symbol) > 600]

# Rank every "emoji-like" character found in the raw content, most frequent first.
emojis_lists = df_sample.content.apply(extract_emojis).tolist()
emoji_counts = Counter(emoji for emojis_list in emojis_lists for emoji in emojis_list)

# The csv module expects files opened with newline=""; without it the writer
# emits blank rows on Windows and the reader mis-handles embedded newlines.
with open('data/emojis.csv', "w", encoding="utf-8", newline="") as csvfile:
    csv.writer(csvfile).writerows(emoji_counts.most_common())

# Load the emoji -> text mapping. Rows with fewer than three columns are skipped;
# the first column is the key and the last column is the replacement text.
# NOTE(review): emojis_dict.csv is presumably a hand-annotated version of
# emojis.csv -- confirm where it comes from.
emojis = {}
emojis_ranking = []
with open('data/emojis_dict.csv', encoding="utf-8", newline="") as csvfile:
    for row in csv.reader(csvfile):
        if len(row) > 2:
            emojis[row[0]] = row[-1]
            emojis_ranking.append(row)
            

# Text emoticons and the word each one is replaced with.
# Insertion order is kept because the mapping is iterated when replacing.
slang_data = {
    emoticon: meaning
    for meaning, emoticons in (
        ('happy', (':)', ':-)', ':D', ':-D')),
        ('kiss', (':*',)),
        ('sad', (":'(", ':(', ':-(', 'TT')),
    )
    for emoticon in emoticons
}

In [16]:
def _expand_hashtag(hashtag):
    """Turn a ``#CamelCase`` hashtag into plain words (``'Camel Case'``)."""
    pieces = re.split(r'([A-ZÀ-ß][a-zà-ý]*)', hashtag)
    words = [re.sub(r'[#\s]', '', piece) for piece in pieces if piece]
    expanded = " ".join(words).strip()
    # Glue runs of single letters back together so acronyms survive:
    # 'C O V I D' -> 'COVID'.
    return re.sub(r'\b(\S)\s+(?=\S\b)', r'\1', expanded)


def clean_full_text(text):
    """Strip hashtags, mentions and URLs from ``text``, keeping the hashtag words.

    Hashtags are removed from their original position, split into words and
    appended at the end of the text (separated by ``'. '``). Every whitespace
    character becomes a plain space; runs of spaces are NOT collapsed here.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        Text without ``#hashtags``, ``@mentions`` or ``http...`` links, followed
        by the expanded hashtag words when there were any.
    """
    hashtag_pattern = r'#[A-ZÀ-ßa-zà-ý]+'
    mention_ = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)

    hashtags = [_expand_hashtag(tag) for tag in re.findall(hashtag_pattern, text)]

    # Remove all hashtags in one pass. Removing them one by one would let a short
    # tag (#vac) chop the beginning off a longer one (#vacuna) and leave debris.
    text = re.sub(hashtag_pattern, '', text)

    if hashtags:
        # The separating space matters: without it the first hashtag word is glued
        # to the last token, and if that token is a URL the word is deleted with it.
        text = text + ' ' + '. '.join(hashtags)

    text = re.sub(r'\s', ' ', text)
    text = mention_.sub("", text)
    text = re.sub(r"http\S+", "", text)

    return text

In [17]:
df_sample['preprocessed_text'] = df_sample.content.apply(clean_full_text)

# Replace each emoji with its text description, padded with spaces.
# Keys are literal characters, so regex matching is disabled explicitly: the
# default of `regex` differs between pandas versions, and a key containing a
# regex metacharacter (e.g. the '*' keycap) would raise or mis-match as a pattern.
for key, value in emojis.items():
    df_sample['preprocessed_text'] = df_sample['preprocessed_text'].str.replace(key, ' ' + value + ' ', regex=False)

# Count and translate text emoticons BEFORE lowercasing: several slang_data keys
# contain capitals (':D', ':-D', 'TT') and could never match lowercased text.
df_sample['slang_data'] = 0
for key, value in slang_data.items():
    pattern = re.escape(key)
    if key.isalnum():
        # Purely alphanumeric keys (e.g. 'TT') must stand alone, otherwise they
        # would be replaced inside ordinary upper-case words.
        pattern = r'\b' + pattern + r'\b'
    df_sample['slang_data'] += df_sample['preprocessed_text'].str.count(pat=pattern)
    df_sample['preprocessed_text'] = df_sample['preprocessed_text'].str.replace(pat=pattern, repl=' ' + value + ' ', regex=True)

df_sample['preprocessed_text'] = df_sample['preprocessed_text'].str.lower()

# Remove digits
df_sample['preprocessed_text'] = df_sample['preprocessed_text'].str.replace(pat=r"\d", repl=' ', regex=True)

# Remove punctuation (ASCII punctuation plus the Spanish opening marks), newlines and tabs
punctuation_class = '[' + re.escape(string.punctuation + '¿¡') + r'\n\t]'
df_sample['preprocessed_text'] = df_sample['preprocessed_text'].str.replace(pat=punctuation_class, repl=' ', regex=True)

# Trim the ends and collapse whitespace runs left behind by the replacements
df_sample['preprocessed_text'] = df_sample['preprocessed_text'].str.strip()
df_sample['preprocessed_text'] = df_sample['preprocessed_text'].str.replace(pat=r'\s+', repl=' ', regex=True)

In [18]:
# NOTE: this rebinds `stopwords` (imported from nltk.corpus at the top) to a plain
# list of words; the name and type are kept for any code that relies on them.
stopwords = nltk.corpus.stopwords.words('english')
stopword_set = set(stopwords)  # constant-time membership test for every token
nlp = spacy.load('en_core_web_sm')


t = time()

# Lemmatize and remove stop words.
# nlp.pipe processes the texts in batches instead of one call per row.
docs = nlp.pipe(df_sample['preprocessed_text'].tolist())
df_sample['preprocessed_text'] = [
    ' '.join(token.lemma_ for token in doc if token.text not in stopword_set)
    for doc in docs
]

# Number of punctuation characters in the ORIGINAL (uncleaned) content
df_sample['punctuation'] = df_sample['content'].str.count(pat='[' + re.escape(string.punctuation) + r'!¿?¡]')

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.29 mins


In [19]:
#%% Sentiment
def get_polarity(text):
    """Run TextBlob sentiment analysis on ``text``.

    Despite the name, the whole ``sentiment`` result is returned rather than
    only the polarity; callers pick the component they need by position.
    """
    blob = TextBlob(text)
    return blob.sentiment

sentiment = df_sample['preprocessed_text'].apply(get_polarity)

# Each sentiment result is indexed by position: 0 -> polarity, 1 -> subjectivity
for position, column in enumerate(('polarity', 'subjectivity')):
    df_sample[column] = sentiment.apply(lambda pair, k=position: pair[k])


def _polarity_class(score):
    """Map a polarity score to a class: 1 if positive, 2 if exactly zero, else 0."""
    if score > 0:
        return 1
    if score == 0:
        return 2
    return 0


# Classes based on polarity scores
df_sample['pol_labels'] = [_polarity_class(score) for score in df_sample['polarity']]

In [None]:
# df_sample.to_csv('data/all_sample.csv')