In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nomjn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load Data
Panggil data jadikan sebagai dataframe supaya lebih mudah ketika di proses

In [12]:
df = pd.read_csv('./datasets/latihan-teks-preprocessing.csv')

In [13]:
df.head()

Unnamed: 0,teks
0,RT @last_jones: Just saw 4 children killed in ...
1,@productortano The Walking dead #CaballeroDeDia
2,RT @DavidHundeyin: Eastern Europe is NOT LIKE ...
3,Tried to add their one iconic symbol from thei...
4,watching the finale season of ‘the walking dea...


## Case Folding
Ubah teks menjadi huruf kecil semua

In [14]:
# Case Folding
df['case_folded'] = df['teks'].str.lower()

In [15]:
df.head()

Unnamed: 0,teks,case_folded
0,RT @last_jones: Just saw 4 children killed in ...,rt @last_jones: just saw 4 children killed in ...
1,@productortano The Walking dead #CaballeroDeDia,@productortano the walking dead #caballerodedia
2,RT @DavidHundeyin: Eastern Europe is NOT LIKE ...,rt @davidhundeyin: eastern europe is not like ...
3,Tried to add their one iconic symbol from thei...,tried to add their one iconic symbol from thei...
4,watching the finale season of ‘the walking dea...,watching the finale season of ‘the walking dea...


## Remove Stopword
Hapus kalimat yang tidak memiliki hubungan dengan subyek utama yang dimaksud

In [20]:
additional = ['rt', 'rts', 'retweet']
words = set().union(stopwords.words('english'), additional)

In [21]:
print(words)

{'ain', "you'd", 'these', 'am', 'can', 'y', 'have', 's', 'theirs', 'she', 'ours', 'mightn', 'only', 'd', 'from', 'an', "couldn't", 'needn', 'shouldn', 'in', 'has', "wouldn't", 'the', "hasn't", 'doing', "shouldn't", "aren't", "weren't", 'hers', 'who', 'are', 'and', 'so', 'o', 'should', 'won', "you've", 'its', 'those', 'of', 'their', "should've", 'our', 'through', 'out', 'at', 'm', 'same', 'rt', 'your', 'not', 'ma', 'to', 'but', 'above', 'hasn', 'below', 'after', 'over', "didn't", 'this', "hadn't", 'until', 'll', 'had', 'wouldn', 'isn', "that'll", 'having', 'then', 'itself', 'how', 'him', 'wasn', 'which', 'haven', 'whom', 'against', "isn't", 'why', 'my', 'they', 'retweet', "mustn't", 'by', 'under', 'very', "haven't", 'once', 'with', "she's", 'were', 'yourself', 'during', 'aren', 'myself', "doesn't", "you're", 'nor', 'before', 'is', 'any', 'as', 'just', 'themselves', 'too', 'do', 'what', 'again', 'on', 't', 'all', 'ourselves', 'yourselves', 'where', 'if', 'now', 'did', 'here', 'them', 'we

## Text Filtering
Menghapus simbol, link, yang tidak ada hubungannya dengan subyek yang dimaksud

In [23]:
df['filtered_text'] = df['teks'].str.replace('(@[a-z0-9]+)\w+',' ')\
          .str.replace('(http\S+)', ' ')\
          .str.replace('([^0-9a-z \t])',' ')\
          .str.replace(' +',' ')\
          .apply(lambda x: [i for i in x.split() if not i in words])

  df['filtered_text'] = df['teks'].str.replace('(@[a-z0-9]+)\w+',' ')\


In [24]:
df.head()

Unnamed: 0,teks,case_folded,filtered_text
0,RT @last_jones: Just saw 4 children killed in ...,rt @last_jones: just saw 4 children killed in ...,"[ust, saw, 4, children, killed, kraine, shelli..."
1,@productortano The Walking dead #CaballeroDeDia,@productortano the walking dead #caballerodedia,"[alking, dead, aballero, e, ia]"
2,RT @DavidHundeyin: Eastern Europe is NOT LIKE ...,rt @davidhundeyin: eastern europe is not like ...,"[avid, undeyin, astern, urope, imagine, look, ..."
3,Tried to add their one iconic symbol from thei...,tried to add their one iconic symbol from thei...,"[ried, add, one, iconic, symbol, movie, series..."
4,watching the finale season of ‘the walking dea...,watching the finale season of ‘the walking dea...,"[watching, finale, season, walking, dead, laur..."


## Stemming
Mencari akar kata, misal walking menjadi walk

In [25]:
ps = PorterStemmer()
df['stemmed'] = df['filtered_text'].apply(lambda x: [ps.stem(i) for i in x if i != ''])

df['stemmed'].head()

0    [ust, saw, 4, children, kill, krain, shell, us...
1                         [alk, dead, aballero, e, ia]
2    [avid, undeyin, astern, urop, imagin, look, si...
3    [rie, add, one, icon, symbol, movi, seri, pict...
4    [watch, final, season, walk, dead, lauren, cohan]
Name: stemmed, dtype: object

In [26]:
df.head()

Unnamed: 0,teks,case_folded,filtered_text,stemmed
0,RT @last_jones: Just saw 4 children killed in ...,rt @last_jones: just saw 4 children killed in ...,"[ust, saw, 4, children, killed, kraine, shelli...","[ust, saw, 4, children, kill, krain, shell, us..."
1,@productortano The Walking dead #CaballeroDeDia,@productortano the walking dead #caballerodedia,"[alking, dead, aballero, e, ia]","[alk, dead, aballero, e, ia]"
2,RT @DavidHundeyin: Eastern Europe is NOT LIKE ...,rt @davidhundeyin: eastern europe is not like ...,"[avid, undeyin, astern, urope, imagine, look, ...","[avid, undeyin, astern, urop, imagin, look, si..."
3,Tried to add their one iconic symbol from thei...,tried to add their one iconic symbol from thei...,"[ried, add, one, iconic, symbol, movie, series...","[rie, add, one, icon, symbol, movi, seri, pict..."
4,watching the finale season of ‘the walking dea...,watching the finale season of ‘the walking dea...,"[watching, finale, season, walking, dead, laur...","[watch, final, season, walk, dead, lauren, cohan]"
