In [2]:
import pandas as pd
import spacy
import string
import re
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS

In [3]:
pd.set_option('display.max_colwidth', None)
nlp = spacy.load('en_core_web_sm')

In [4]:
df = pd.read_csv('text_emotion.csv')

In [5]:
df.shape

(40000, 4)

In [6]:
df.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin on your call...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will."


In [7]:
df['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [8]:
sent_list = ['neutral','worry', "happiness", "sadness", 'love']
df = df[df["sentiment"].isin(sent_list)]
df.shape

(31313, 4)

In [9]:
df['sentiment'].value_counts(normalize=True)

neutral      0.275860
worry        0.270143
happiness    0.166353
sadness      0.164947
love         0.122697
Name: sentiment, dtype: float64

In [None]:
### CLEANING DATA

In [10]:
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # convert all words to lowercase
    tokens = [tokens.lower() for tokens in tokens]
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    tokens = ' '.join(tokens)
    return tokens

In [11]:
def preprocess(text):
    # Create Doc object
    doc = nlp(text, disable=['ner', 'parser'])
    # Generate lemmas
    lemmas = [token.lemma_ for token in doc]
    # Remove stopwords and non-alphabetic characters
    a_lemmas = [lemma for lemma in lemmas 
            if lemma.isalpha() and lemma not in STOP_WORDS]
    
    return ' '.join(a_lemmas)

In [12]:
# In order to reduce the vocabulary size, change words with more than two repeated letters. Example: soooo => so.
def repeated_letters(text):
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    return text

In [13]:
df['content'] = df['content'].apply(clean_doc)

In [14]:
df.head()

Unnamed: 0,tweet_id,sentiment,author,content
1,1956967666,sadness,wannamama,layin bed with headache ughhhhwaitin on your call
2,1956967696,sadness,coolfunky,funeral ceremonygloomy friday
4,1956968416,neutral,xkilljoyx,dannycastillo we want to trade with someone who has houston tickets but no one will
5,1956968477,worry,xxxPEACHESxxx,repinging why didnt you go to prom bc my bf didnt like my friends
6,1956968487,sadness,ShansBee,should be sleep but im not thinking about an old friend who want but hes married now damn amp he wants me scandalous


In [15]:
# After cleaning process, several rows are empty. Remove all instances with empty content.
df.drop(df[df["content"] == ''].index, inplace=True)

In [16]:
df.shape

(31301, 4)

In [18]:
df['content'] = df['content'].apply(preprocess)

In [19]:
df.head()

Unnamed: 0,tweet_id,sentiment,author,content
1,1956967666,sadness,wannamama,layin bed headache ughhhhwaitin
2,1956967696,sadness,coolfunky,funeral ceremonygloomy friday
4,1956968416,neutral,xkilljoyx,dannycastillo want trade houston ticket
5,1956968477,worry,xxxPEACHESxxx,repinge prom bc bf like friend
6,1956968487,sadness,ShansBee,sleep think old friend want s married damn amp want scandalous


In [20]:
df['content'] = df['content'].apply(repeated_letters)

In [21]:
df.head()

Unnamed: 0,tweet_id,sentiment,author,content
1,1956967666,sadness,wannamama,layin bed headache ughwaitin
2,1956967696,sadness,coolfunky,funeral ceremonygloomy friday
4,1956968416,neutral,xkilljoyx,dannycastillo want trade houston ticket
5,1956968477,worry,xxxPEACHESxxx,repinge prom bc bf like friend
6,1956968487,sadness,ShansBee,sleep think old friend want s married damn amp want scandalous


In [22]:
df.to_csv('sentiment_5sent_clean')