# Merge Data

In [85]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import re

In [86]:
import os

directory = 'data_kp'

df = pd.DataFrame()

for filename in os.scandir(directory):
    if filename.is_file():
        temp = pd.read_csv(filename.path)
        df = df.append(temp, ignore_index=True)

In [87]:
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df.sort_values(by='created_at', inplace=True)
df.reset_index(drop=True, inplace=True)
df.dropna(subset=['created_at'], inplace=True)

In [88]:
df.head()

Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,image_url,in_reply_to_screen_name,lang,location,quote_count,reply_count,retweet_count,tweet_url,user_id_str,username
0,1190066708402204672,2019-11-01 00:43:08+00:00,0,Kebijakan Pendidikan by @aab_qurani https://t....,1190066708402204672,,,in,,0,0,0,https://twitter.com/SaifudinSoleh2/status/1190...,1189937900672049152,SaifudinSoleh2
1,1190096843155181568,2019-11-01 02:42:52+00:00,0,Kakanwil Kemenag Provinsi Papua Pdt. Amsal Yow...,1190096843155181568,https://pbs.twimg.com/media/EIQSwmBWkAA9v25.jpg,,in,"Jayapura, Papua",0,0,0,https://twitter.com/kemenag_papua/status/11900...,2831210114,kemenag_papua
2,1190190121913503745,2019-11-01 08:53:32+00:00,18,Ini pendidikan politik yang baik dari anggota...,1190190121913503745,,,in,Jakarta,0,3,5,https://twitter.com/kompascom/status/119019012...,23343960,kompascom
3,1190195545022713857,2019-11-01 09:15:05+00:00,18,Ini pendidikan politik yang baik dari anggota...,1190195545022713857,,,in,Jakarta,0,1,1,https://twitter.com/kompascom/status/119019554...,23343960,kompascom
4,1190226043891679233,2019-11-01 11:16:16+00:00,7,Bekerja di institusi pendidikan mengizinkan sa...,1190226043891679233,https://pbs.twimg.com/media/EISIQ2eWsAU41fW.jpg,,in,,0,4,0,https://twitter.com/rijalram/status/1190226043...,100397036,rijalram


# Clean Data

### Slang dan Abreviasi

In [89]:
kamus_slang=pd.read_csv('src\kamus_slang.csv')
kamus_slang=kamus_slang.rename(columns = {'slang' : 'kamus_slang' , 'formal' : 'kamus_perbaikan'})
slang_mapping = dict(zip(kamus_slang['kamus_slang'], kamus_slang['kamus_perbaikan']))

kamus_singkatan = pd.read_csv('src\kamus_singkatan.csv', header=None, names=['sebelum_perbaikan', 'setelah_perbaikan'],delimiter=';')
singkatan_mapping=dict(zip(kamus_singkatan['sebelum_perbaikan'],kamus_singkatan['setelah_perbaikan']))

### Stopword, emoji, dan Stemmer Factory

In [94]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import  StopWordRemoverFactory
import emoji
from spacy.lang.id import Indonesian
import string

In [95]:
stopword_factory = StopWordRemoverFactory()
stopwords = stopword_factory.get_stop_words()
# List of words with negation meaning
data = emoji.EMOJI_DATA

# Remove negation words from stopwords
# stopwords = set(stopwords).difference(excluded_stopwords)
nlp = Indonesian()
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [2]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import  StopWordRemoverFactory
stopword_factory = StopWordRemoverFactory()
stopwords = stopword_factory.get_stop_words()
stopwords

['yang',
 'untuk',
 'pada',
 'ke',
 'para',
 'namun',
 'menurut',
 'antara',
 'dia',
 'dua',
 'ia',
 'seperti',
 'jika',
 'jika',
 'sehingga',
 'kembali',
 'dan',
 'tidak',
 'ini',
 'karena',
 'kepada',
 'oleh',
 'saat',
 'harus',
 'sementara',
 'setelah',
 'belum',
 'kami',
 'sekitar',
 'bagi',
 'serta',
 'di',
 'dari',
 'telah',
 'sebagai',
 'masih',
 'hal',
 'ketika',
 'adalah',
 'itu',
 'dalam',
 'bisa',
 'bahwa',
 'atau',
 'hanya',
 'kita',
 'dengan',
 'akan',
 'juga',
 'ada',
 'mereka',
 'sudah',
 'saya',
 'terhadap',
 'secara',
 'agar',
 'lain',
 'anda',
 'begitu',
 'mengapa',
 'kenapa',
 'yaitu',
 'yakni',
 'daripada',
 'itulah',
 'lagi',
 'maka',
 'tentang',
 'demi',
 'dimana',
 'kemana',
 'pula',
 'sambil',
 'sebelum',
 'sesudah',
 'supaya',
 'guna',
 'kah',
 'pun',
 'sampai',
 'sedangkan',
 'selagi',
 'sementara',
 'tetapi',
 'apakah',
 'kecuali',
 'sebab',
 'selain',
 'seolah',
 'seraya',
 'seterusnya',
 'tanpa',
 'agak',
 'boleh',
 'dapat',
 'dsb',
 'dst',
 'dll',
 'dahulu

In [96]:
def replace_emoji_with_space(text, emoji_data, language='id'):
    for emoji, translations in emoji_data.items():
        if language in translations:
            text = text.replace(emoji, '')
    return text

In [116]:
def process_tweet(tweet) :
  tweet=tweet.lower()
  # link
  tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet)

  # spesifik
  # tweet = re.sub(r'\[username\]|\[url\]|\[askmf\]|\[sensitive-no\]|\[satu menit kemudian\]|\[seo in ha, love rain\]|\[bb\]|\[c48\]|\[idm\]', '', tweet)

  # emoji
  tweet=replace_emoji_with_space(tweet,data)
  # tweet=tweet.replace(":",' ').replace('_','mask').replace('-','rus').strip()
  # tweet=re.sub(' +', ' ', tweet)

  # username
  tweet = re.sub('@[^\s]+','',tweet)

  # hashtag
  tweet = re.sub(r'#([^\s]+)', '', tweet)

  # tokenisasi
  tokens = tweet.split()

  tweet_tokens = []
  for ele in tokens:
    ele_kamus = kamus_singkatan.get(ele, ele)
    ele_slang = slang_mapping.get(ele_kamus, ele_kamus)
    tweet_tokens.append(ele_slang)

  tweet = ' '.join(tweet_tokens)
  tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
  tweet = re.sub(r'#([^\s]+)', '', tweet)
  tweet=re.sub(r'\d+', '', tweet)
  tweet = tweet.strip('\'"')
  tweet = tweet.lstrip('\'"')

  tweet = "".join([char for char in tweet if char not in string.punctuation])

  doc = nlp(tweet)

  tokens = [token.text for token in doc]
      # Hapus stopwords dari tokens
  filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
  tweet = ' '.join(filtered_tokens)

  tweet=stemmer.stem(tweet)
  # tweet=tweet.replace('mask',' ').replace('rus','-')

  # hapus kata bijak dan didik
  tweet = re.sub(r'\b(?:bijak|didik)\b', '', tweet)

  # hapus spasi berlebih di awal dan akhir
  tweet = tweet.strip()

  return tweet

In [98]:
df.head(5)

Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,image_url,in_reply_to_screen_name,lang,location,quote_count,reply_count,retweet_count,tweet_url,user_id_str,username
0,1190066708402204672,2019-11-01 00:43:08+00:00,0,Kebijakan Pendidikan by @aab_qurani https://t....,1190066708402204672,,,in,,0,0,0,https://twitter.com/SaifudinSoleh2/status/1190...,1189937900672049152,SaifudinSoleh2
1,1190096843155181568,2019-11-01 02:42:52+00:00,0,Kakanwil Kemenag Provinsi Papua Pdt. Amsal Yow...,1190096843155181568,https://pbs.twimg.com/media/EIQSwmBWkAA9v25.jpg,,in,"Jayapura, Papua",0,0,0,https://twitter.com/kemenag_papua/status/11900...,2831210114,kemenag_papua
2,1190190121913503745,2019-11-01 08:53:32+00:00,18,Ini pendidikan politik yang baik dari anggota...,1190190121913503745,,,in,Jakarta,0,3,5,https://twitter.com/kompascom/status/119019012...,23343960,kompascom
3,1190195545022713857,2019-11-01 09:15:05+00:00,18,Ini pendidikan politik yang baik dari anggota...,1190195545022713857,,,in,Jakarta,0,1,1,https://twitter.com/kompascom/status/119019554...,23343960,kompascom
4,1190226043891679233,2019-11-01 11:16:16+00:00,7,Bekerja di institusi pendidikan mengizinkan sa...,1190226043891679233,https://pbs.twimg.com/media/EISIQ2eWsAU41fW.jpg,,in,,0,4,0,https://twitter.com/rijalram/status/1190226043...,100397036,rijalram


In [118]:
process_tweet(df['full_text'][0])

'by lewat'

In [119]:
df['full_text'] = df['full_text'].apply(lambda x: process_tweet(str(x)))

In [122]:
df.to_csv('cleaned_kp.csv')

In [123]:
(df['full_text'])

0                                                by lewat
1       kakanwil kemenag provinsi papua pdt amsal yowe...
2       politik baik anggota dprd betul psi libat raky...
3       politik baik anggota dprd betul psi libat raky...
4       kerja institusi  izin alami baru masuk temu bi...
                              ...                        
8431    pintas masa depan agama indonesia lihat lebih ...
8432    hitung tahun ajar perintah lalu kemendikbud ja...
8433    bersikukuh  tinggi komersialisasi semenjak  pt...
8434    kagak ngotak asli dahh sbnrnya salah   sekaran...
8435                     bahasa inggris sd tantang imbang
Name: full_text, Length: 8436, dtype: object

In [124]:
df.duplicated(subset=['full_text']).sum()

958