In [11]:
import numpy as np 
import pandas as pd 
# None disables column-width truncation so full tweets are shown.
# The legacy sentinel -1 was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
from time import time
import re
import string
import os
import emoji
from pprint import pprint
import collections
import glob

from sklearn.base import BaseEstimator, TransformerMixin

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# English stopword vocabulary from NLTK, held as a set for fast membership tests.
# Needs the NLTK 'stopwords' corpus to be available locally (nltk.download('stopwords')).
# NOTE(review): no cell visible here reads this name — confirm it is still needed.
stop_words = set(stopwords.words('english')) 

In [4]:
# Raw input files, one per split; they are loaded with pd.read_table (tab-separated).
# NOTE(review): the file names suggest the SemEval-2018 English valence
# ordinal-classification ("V-oc") data — confirm provenance and version.
training_path = "data/raw/2018-Valence-oc-En-train.txt"
dev_path = "data/raw/2018-Valence-oc-En-dev.txt"
test_path = "data/raw/2018-Valence-oc-En-test-gold.txt"

# Destinations for the cleaned splits; they are written as tab-separated files.
training_outpath = "data/cleaned_en_training.txt"
dev_outpath = "data/cleaned_en_dev.txt"
test_outpath = "data/cleaned_en_test.txt"

In [5]:
# Load the three tab-separated splits into DataFrames.
training_df = pd.read_table(training_path)
dev_df = pd.read_table(dev_path)
test_df = pd.read_table(test_path)

# Preview only the first rows instead of rendering the whole frame.
test_df.head(10)

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Class
0,2018-En-01964,Gm and have a #Tuesday!,valence,0: neutral or mixed emotional state can be inferred
1,2018-En-01539,@realDonaldTrump But you have a lot of time for tweeting #ironic,valence,0: neutral or mixed emotional state can be inferred
2,2018-En-04235,I graduated yesterday and already had 8 family members asking what job I've got now 😂 #nightmare,valence,0: neutral or mixed emotional state can be inferred
3,2018-En-03711,@jaimitoelcrack7 Seriously...I've been sitting here for five minutes watching this in awe. It never gets less amazing.,valence,1: slightly positive emotional state can be inferred
4,2018-En-01177,Whether my glass is half empty or its half full. I'm just grateful I even have a glass and that there's something in it.\n #optimism 🤔,valence,2: moderately positive emotional state can be inferred
5,2018-En-01352,"Do you ever fall asleep before you set your alarm, but then the Lord wakes you up on time anyways 🙌🏻 #WontHeDoIt",valence,1: slightly positive emotional state can be inferred
6,2018-En-03829,There was a live chicken running around the cfa parking lot. No one knows where it came from or how it got there. #ironic,valence,0: neutral or mixed emotional state can be inferred
7,2018-En-02627,Memo to @PutinRF_Eng: I'm going to run for #POTUS and would appreciate your help with opposition research. #sarcasm,valence,0: neutral or mixed emotional state can be inferred
8,2018-En-02078,@TommySandhu @bbcasiannetwork tommy you r phenomenal smashed #ajjdin on the mixes #hilarious and #awesome 😜😂😂👍🏻,valence,3: very positive emotional state can be inferred
9,2018-En-00030,@LDN_Muscle a #bright and early session this morning before work! #fullbody #ldnmladies surprised that it actually helped reduce my #doms,valence,2: moderately positive emotional state can be inferred


In [6]:
def adapt_valence_scores(df):
    """Extract the class label from each "Intensity Class" entry.

    Every entry is expected to look like ``"<label>: <description>"``, e.g.
    ``"0: neutral or mixed emotional state can be inferred"``. Only the text
    before the first colon is kept.

    Args:
        df: DataFrame with an "Intensity Class" column of strings.

    Returns:
        list of str: one label per row, in row order (e.g. ``"0"``, ``"-3"``).
        Labels stay strings; they are not converted to integers.

    Raises:
        KeyError: if ``df`` has no "Intensity Class" column.
    """
    # Split on the first colon rather than slicing a fixed two characters, so
    # a label of any width (e.g. "-10") is returned intact.
    return [entry.partition(":")[0].strip() for entry in df["Intensity Class"]]
        
# Build the label list for every split with the same helper, in train/dev/test order.
training_valence, dev_valence, test_valence = (
    adapt_valence_scores(split_df) for split_df in (training_df, dev_df, test_df)
)

In [7]:
class CleanText(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalises raw tweet text.

    Adapted from
    https://towardsdatascience.com/sentiment-analysis-with-text-mining-13dd2b33de27

    ``transform`` applies the cleaning steps in this order: mention removal,
    URL removal, underscore removal, punctuation -> spaces, digit removal,
    lower-casing, and stopword / single-character removal.
    """

    # Patterns and lookup tables are built once for the class instead of once per tweet.
    _MENTION_RE = re.compile(r'@\w+')
    # "https?" matches exactly http:// or https://; the earlier ".?" accepted any character.
    _URL_RE = re.compile(r'https?://\S+\s?')
    _DIGITS_RE = re.compile(r'\d+')
    # Maps every ASCII punctuation symbol to a single space.
    _PUNCT_TABLE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    # Stopwords that may carry sentiment and are therefore kept.
    _NEGATION_WHITELIST = frozenset(["n't", "not", "no"])
    # Filled on first use so the NLTK corpus is read only once.
    _stopword_cache = None

    @classmethod
    def _english_stopwords(cls):
        """Return NLTK's English stopwords as a cached frozenset."""
        if cls._stopword_cache is None:
            cls._stopword_cache = frozenset(stopwords.words('english'))
        return cls._stopword_cache

    def remove_mentions(self, input_text):
        """Delete every ``@handle`` token from ``input_text``."""
        return self._MENTION_RE.sub('', input_text)

    def remove_urls(self, input_text):
        """Delete http/https URLs, plus one trailing whitespace character if present."""
        return self._URL_RE.sub('', input_text)

    def emoji_oneword(self, input_text):
        """Remove every underscore so underscore-joined names collapse into one word.

        NOTE(review): the stated purpose is to keep emoji names as one word, but no
        step visible in this class converts emoji to text names — confirm whether
        such a step is expected upstream.
        """
        return input_text.replace('_', '')

    def remove_punctuation(self, input_text):
        """Replace each ``string.punctuation`` character with a space."""
        return input_text.translate(self._PUNCT_TABLE)

    def remove_digits(self, input_text):
        """Delete every run of digits."""
        return self._DIGITS_RE.sub('', input_text)

    def to_lower(self, input_text):
        """Lower-case the text."""
        return input_text.lower()

    def remove_stopwords(self, input_text):
        """Drop English stopwords and single-character tokens.

        Tokens come from whitespace splitting. Whitelisted negations are kept even
        though they are stopwords. The result is the surviving tokens joined by
        single spaces (an empty string when nothing survives).
        """
        stopword_set = self._english_stopwords()
        # NOTE(review): inside transform() apostrophes are already replaced by spaces
        # before this step, so the "n't" whitelist entry cannot match there.
        kept_words = [
            word for word in input_text.split()
            if len(word) > 1
            and (word in self._NEGATION_WHITELIST or word not in stopword_set)
        ]
        return " ".join(kept_words)

    def fit(self, X, y=None, **fit_params):
        """No-op: the transformer learns nothing from the data. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Apply all cleaning steps to every element of ``X``.

        Args:
            X: object exposing ``.apply(func)`` over strings; the notebook passes a
                pandas Series of tweets.

        Returns:
            The result of chaining ``.apply`` over every cleaning step, in order.
        """
        cleaning_steps = (
            self.remove_mentions,
            self.remove_urls,
            self.emoji_oneword,
            self.remove_punctuation,
            self.remove_digits,
            self.to_lower,
            self.remove_stopwords,
        )
        clean_X = X
        for step in cleaning_steps:
            clean_X = clean_X.apply(step)
        return clean_X

In [8]:
# CleanText.fit is a no-op, so a single instance can clean all three splits.
ct = CleanText()
training_clean = ct.fit_transform(training_df["Tweet"])
dev_clean = ct.fit_transform(dev_df["Tweet"])
test_clean = ct.fit_transform(test_df["Tweet"])

In [9]:
# Pair each cleaned tweet Series with its label list in a two-column frame.
training_cleaned = training_clean.to_frame().assign(**{'Valence score': training_valence})

dev_cleaned = dev_clean.to_frame().assign(**{'Valence score': dev_valence})

# Row label 271 of the test split is removed (recorded by the author as an empty line).
test_cleaned = (
    test_clean.to_frame()
    .assign(**{'Valence score': test_valence})
    .drop(index=[271])
)

In [10]:
# Write each cleaned split to its output path as a tab-separated file.
training_cleaned.to_csv(path_or_buf=training_outpath, sep="\t")
dev_cleaned.to_csv(path_or_buf=dev_outpath, sep="\t")
test_cleaned.to_csv(path_or_buf=test_outpath, sep="\t")

In [21]:
# Print the dash-separated name components of every file under data/V_reg_raw.
# sorted() keeps the listing reproducible; glob itself returns files in arbitrary order.
raw_filepaths = sorted(glob.glob('data/V_reg_raw/*'))
for filepath in raw_filepaths:
    basename = os.path.basename(filepath)
    print(basename.split('-'))