# Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import re, nltk, string
nltk.download('punkt')
nltk.download("stopwords")
nltk.download('vader_lexicon')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem.porter import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the raw dataset from CSV and preview its first rows.
csv_path = 'twcs.csv'
twcs = pd.read_csv(csv_path)
twcs.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


In [None]:
# Keep only the 'text' column (at most the first 2,811,774 rows).
# .copy() gives df its own data: df is modified after this point, and doing
# that on a slice of twcs triggers pandas' SettingWithCopyWarning.
df = twcs[['text']].iloc[:2811774].copy()
df.head()

Unnamed: 0,text
0,@115712 I understand. I would like to assist y...
1,@sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...
3,@115712 Please send us a Private Message so th...
4,@sprintcare I did.


In [None]:
# Lower-case every tweet.
df['text'] = df['text'].str.lower()
# head must be *called*; a bare `df.head` only displays the bound-method repr
# instead of a preview of the first rows.
df.head()

<bound method NDFrame.head of                                                       text
0        @115712 i understand. i would like to assist y...
1            @sprintcare and how do you propose we do that
2        @sprintcare i have sent several private messag...
3        @115712 please send us a private message so th...
4                                       @sprintcare i did.
...                                                    ...
2811769  @823869 hey, we'd be happy to look into this f...
2811770  @115714 wtf!? i’ve been having really shitty s...
2811771  @143549 @sprintcare you have to go to https://...
2811772  @823870 sounds delicious, sarah! 😋 https://t.c...
2811773  @aldiuk  warm sloe gin mince pies with ice cre...

[2811774 rows x 1 columns]>

In [None]:
# Remove punctuation: build one translation table that deletes every character
# listed in string.punctuation, then apply it to the whole column.
punctuation_table = str.maketrans('', '', string.punctuation)
df['text'] = df['text'].str.translate(punctuation_table)
df.head()

Unnamed: 0,text
0,115712 i understand i would like to assist you...
1,sprintcare and how do you propose we do that
2,sprintcare i have sent several private message...
3,115712 please send us a private message so tha...
4,sprintcare i did


In [None]:
# Remove English stopwords.
stop = stopwords.words('english')
# One alternation of all stopwords, wrapped in \b so only whole words match.
# re.escape keeps any regex metacharacter in a stopword from being interpreted.
pat = r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in stop))
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
df['text_wo_stopwords'] = df['text'].str.replace(pat, '', regex=True)
# Collapse the runs of whitespace left behind by the removed words.
df['text_wo_stopwords'] = df['text_wo_stopwords'].str.replace(r'\s+', ' ', regex=True)
# NOTE(review): punctuation has already been stripped from df['text'], so any
# stopword that contains an apostrophe can no longer match - confirm whether
# stopword removal should run before punctuation removal.

NameError: ignored

In [None]:
# Stemming
stemmer = SnowballStemmer('english')


def stem_words(tweet):
    """Return `tweet` with each whitespace-separated token replaced by its stem.

    The input is converted with str() first, and the stemmed tokens are
    re-joined with single spaces.
    """
    stemmed_tokens = (stemmer.stem(token) for token in str(tweet).split())
    return ' '.join(stemmed_tokens)


df['text_stem'] = df['text_wo_stopwords'].apply(stem_words)

In [None]:
# Labelling
# Decide whether each tweet has negative, neutral, or positive sentiment.
def compute_vader_scores(df, label):
    """Add VADER polarity scores for the text column `label` to `df`.

    Parameters
    ----------
    df : DataFrame holding the text to score; it is modified in place.
    label : name of the column in `df` whose values are scored.

    Returns
    -------
    The same `df` object, with four added columns: 'vader_neg', 'vader_neu',
    'vader_pos' and 'vader_comp' (the 'compound' score).
    """
    sid = SentimentIntensityAnalyzer()
    # Score every text once and reuse the result for all four columns, rather
    # than re-running the analyzer separately for each column.
    scores = df[label].apply(sid.polarity_scores)
    df["vader_neg"] = scores.map(lambda s: s["neg"])
    df["vader_neu"] = scores.map(lambda s: s["neu"])
    df["vader_pos"] = scores.map(lambda s: s["pos"])
    df["vader_comp"] = scores.map(lambda s: s["compound"])
    return df

df2 = compute_vader_scores(df,'text_stem')

# Map the compound score to a label:
#   compound >=  0.05 -> positive
#   compound <= -0.05 -> negative
#   otherwise         -> neutral
# The negative threshold must be -0.05; comparing against +0.05 would make the
# neutral case unreachable and label weakly positive tweets as negative.
# np.select labels the whole column at once instead of looping row by row.
compound = df2['vader_comp']
df2['label'] = np.select(
    [compound >= 0.05, compound <= -0.05],
    ['positive', 'negative'],
    default='neutral',
)

In [None]:
# Keep only the columns that are still needed.
# .copy() makes df3 an independent frame, so modifying it afterwards does not
# act on a slice of df2 (which triggers pandas' SettingWithCopyWarning).
df3 = df2[['text_stem', 'vader_comp', 'label']].iloc[:2811774].copy()
df3.head()

In [None]:
# Rename column 'text_stem' to 'tweet'.
# Rebind the result instead of using inplace=True, so the frame that df3 was
# sliced from is never mutated through it.
df3 = df3.rename(columns={'text_stem': 'tweet'})

In [None]:
# Save the cleaned data, ready to be used for building a machine-learning model.
# index=False leaves the row index out of the file.
output_path = "twcs_cleaned.csv"
df3.to_csv(output_path, index=False)