In [77]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_colwidth', -1)
from time import time
import re
import string
import os
import emoji
from pprint import pprint
import collections

from sklearn.base import BaseEstimator, TransformerMixin

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english')) 

In [140]:
training_path = "2018-Valence-oc-En-train.txt"
dev_path = "2018-Valence-oc-En-dev.txt"
test_path = "2018-Valence-oc-En-test-gold.txt"

training_outpath = "cleaned_training.txt"
dev_outpath = "cleaned_dev.txt"
test_outpath = "cleaned_test.txt"

In [141]:
training_df = pd.read_table(training_path)
dev_df = pd.read_table(dev_path)
test_df = pd.read_table(test_path)

test_df

Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Class
0,2018-En-01964,Gm and have a #Tuesday!,valence,0: neutral or mixed emotional state can be inferred
1,2018-En-01539,@realDonaldTrump But you have a lot of time for tweeting #ironic,valence,0: neutral or mixed emotional state can be inferred
2,2018-En-04235,I graduated yesterday and already had 8 family members asking what job I've got now 😂 #nightmare,valence,0: neutral or mixed emotional state can be inferred
3,2018-En-03711,@jaimitoelcrack7 Seriously...I've been sitting here for five minutes watching this in awe. It never gets less amazing.,valence,1: slightly positive emotional state can be inferred
4,2018-En-01177,Whether my glass is half empty or its half full. I'm just grateful I even have a glass and that there's something in it.\n #optimism 🤔,valence,2: moderately positive emotional state can be inferred
...,...,...,...,...
932,2018-En-04059,Premier League Teams should fear next seasons Arsenal's XI. #coyg #afc,valence,0: neutral or mixed emotional state can be inferred
933,2018-En-01488,"how are you my love? @Hashtag_DonJon love youu!! thanks for the smile, that motivates me to keep going!! 💛 so blessed to have you 😊",valence,3: very positive emotional state can be inferred
934,2018-En-02648,"'She is the clothed with strength and dignity, and she laughs without fear of the future.' 💛🌿 @jessconte",valence,0: neutral or mixed emotional state can be inferred
935,2018-En-03444,My dads big day is only less than 2 weeks away. 😱 #excited,valence,3: very positive emotional state can be inferred


In [142]:
def adapt_valence_scores(df): 
    valence_list = list()
    for index, row in df.iterrows():
        valence = row["Intensity Class"]
        valence = valence.replace(valence, valence[:2].replace(":", ""))
        valence_list.append(valence)
    return valence_list
        
training_valence = adapt_valence_scores(training_df)
dev_valence = adapt_valence_scores(dev_df)
test_valence = adapt_valence_scores(test_df)

In [143]:
class CleanText(BaseEstimator, TransformerMixin):
    """
    From https://towardsdatascience.com/sentiment-analysis-with-text-mining-13dd2b33de27
    """
    def remove_mentions(self, input_text):
        return re.sub(r'@\w+', '', input_text)
    
    def remove_urls(self, input_text):
        return re.sub(r'http.?://[^\s]+[\s]?', '', input_text)
    
    def emoji_oneword(self, input_text):
        # By compressing the underscore, the emoji is kept as one word
        return input_text.replace('_','')
    
    def remove_punctuation(self, input_text):
        # Make translation table
        punct = string.punctuation
        trantab = str.maketrans(punct, len(punct)*' ')  # Every punctuation symbol will be replaced by a space
        return input_text.translate(trantab)
    
    def remove_digits(self, input_text):
        return re.sub('\d+', '', input_text)
    
    def to_lower(self, input_text):
        return input_text.lower()
    
    def remove_stopwords(self, input_text):
        stopwords_list = stopwords.words('english')
        # Some words which might indicate a certain sentiment are kept via a whitelist
        whitelist = ["n't", "not", "no"]
        words = input_text.split() 
        clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
        return " ".join(clean_words) 
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **transform_params):
        clean_X = X.apply(self.remove_mentions).apply(self.remove_urls).apply(self.emoji_oneword).apply(self.remove_punctuation).apply(self.remove_digits).apply(self.to_lower).apply(self.remove_stopwords)
        return clean_X

In [144]:
ct = CleanText()
training_clean = ct.fit_transform(training_df.Tweet)
dev_clean = ct.fit_transform(dev_df.Tweet)
test_clean = ct.fit_transform(test_df.Tweet)

In [147]:
training_cleaned = pd.DataFrame(training_clean)
training_cleaned['Valence score']= training_valence

dev_cleaned = pd.DataFrame(dev_clean)
dev_cleaned['Valence score']= dev_valence

test_cleaned = pd.DataFrame(test_clean)
test_cleaned['Valence score']= test_valence

In [148]:
training_cleaned.to_csv(training_outpath, sep="\t")
dev_cleaned.to_csv(dev_outpath, sep="\t")
test_cleaned.to_csv(test_outpath, sep="\t")

In [192]:


test_df.loc[[271]]
test_cleaned.loc[[271]]

KeyError: "None of [Int64Index([271], dtype='int64')] are in the [index]"