## **Cleaning & Preprocessing the Dataset**

In [1]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [2]:
import re
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Download the NLTK resources needed below.
# 'punkt' serves older NLTK releases; newer releases load the tokenizer
# tables from 'punkt_tab' instead, so fetch both to keep word_tokenize working.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)

# Lazily-built resources shared by every preprocess_tweet call, so the
# stopword list and the stemmer are constructed once instead of once per tweet.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop_words, stemmer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        stop_words = frozenset(stopwords.words('indonesian'))
        stemmer = StemmerFactory().create_stemmer()
        # Store both together so a failure above never leaves a half-filled cache.
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, stemmer=stemmer)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_tweet(tweet):
    """Clean a single tweet and return it as a space-joined string of stems.

    Steps, in order: strip URLs, strip @mentions and #hashtags, strip
    remaining punctuation/special characters, tokenize, lowercase, drop
    Indonesian stopwords, and stem each token with the Sastrawi stemmer.

    Args:
        tweet: Raw tweet text. Non-string values (e.g. None or the NaN that
            pandas uses for missing cells) are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces; '' when nothing
        remains or the input is not a string.
    """
    # Missing cells reach .apply() as float NaN; re.sub would raise on them.
    if not isinstance(tweet, str):
        return ''

    # Remove URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet)
    # Remove mentions and hashtags (the whole tag, including its word)
    tweet = re.sub(r"@\w+|#\w+", '', tweet)
    # Remove special characters (anything that is neither word char nor whitespace)
    tweet = re.sub(r"[^\w\s]", '', tweet)

    stop_words, stemmer = _get_preprocess_resources()

    # Tokenize, lowercase, filter stopwords, then stem what is left
    tokens = (token.lower() for token in word_tokenize(tweet))
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(stems)

# Contoh penggunaan
# tweet = "Cek penelitian #kecerdasanbuatan yang menarik di https://contoh.com! @peneliti #machinelearning"
# cleaned_tweet = preprocess_tweet(tweet)
# print("Tweet bersih:", cleaned_tweet)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Load dataset.csv into a DataFrame
df = pd.read_csv('dataset.csv')

# Run preprocess_tweet over every entry of "full_text" and keep the
# cleaned text alongside the original in a new column
df['preprocessed_column'] = df['full_text'].map(preprocess_tweet)

# Show the DataFrame, now including the preprocessed column
print(df)

# Persist the result (without the index) for the next step
df.to_csv('preprocessed_dataset.csv', index=False)

     Status                                          full_text  \
0      Fact  Ahli Ajak Masyarakat Melihat Teknologi Wolbach...   
1      Fact  TEKNOLOGI wolbachia dianggap sebagai trobosan ...   
2      Fact  Kok, bisa ada nyamuk yang mencegah DBD? Bukann...   
3      Fact  Program Penyebaran Nyamuk Wolbachia, Langkah M...   
4      Fact  Wolbachia sebagai upaya menekan khasus DBD.  #...   
...     ...                                                ...   
996    Fact  @mozamozza770 @mr_banan12 @8angSaid @Abahpurwa...   
997    Fake  InsyaALLAAH sedikit penjelasan Wolbachia yg se...   
998    Fact  Hasil kajian analisis risiko yang dilakukan se...   
999    Fake  @DokterTifa Krn @DPR_RI wakil rakyat jd tolong...   
1000   Fact  Teknologi nyamuk ber-Wolbachia merupakan salah...   

                                    preprocessed_column  
0     ahli ajak masyarakat teknologi wolbachia lawan...  
1     teknologi wolbachia anggap trobosan entas dema...  
2     nyamuk cegah dbd nyamuk age

In [4]:
df_pre = pd.read_csv('preprocessed_dataset.csv')

# Encode the "Status" column numerically: "Fact" -> 1, "Fake" -> 0
df_pre['Status'] = df_pre['Status'].replace({'Fact': 1, 'Fake': 0})

# Rename the "Status" column to "label"
df_pre = df_pre.rename(columns={'Status': 'label'})

# Save the relabelled DataFrame to a new CSV file
df_pre.to_csv('new_preprocessed_dataset.csv', index=False)

# Show the DataFrame
print(df_pre)

      label                                          full_text  \
0         1  Ahli Ajak Masyarakat Melihat Teknologi Wolbach...   
1         1  TEKNOLOGI wolbachia dianggap sebagai trobosan ...   
2         1  Kok, bisa ada nyamuk yang mencegah DBD? Bukann...   
3         1  Program Penyebaran Nyamuk Wolbachia, Langkah M...   
4         1  Wolbachia sebagai upaya menekan khasus DBD.  #...   
...     ...                                                ...   
996       1  @mozamozza770 @mr_banan12 @8angSaid @Abahpurwa...   
997       0  InsyaALLAAH sedikit penjelasan Wolbachia yg se...   
998       1  Hasil kajian analisis risiko yang dilakukan se...   
999       0  @DokterTifa Krn @DPR_RI wakil rakyat jd tolong...   
1000      1  Teknologi nyamuk ber-Wolbachia merupakan salah...   

                                    preprocessed_column  
0     ahli ajak masyarakat teknologi wolbachia lawan...  
1     teknologi wolbachia anggap trobosan entas dema...  
2     nyamuk cegah dbd nyamuk age

## **Bag-of-Words Representation**

In [5]:
from collections import Counter

# Step 1: Tokenization
def tokenize(text):
    """Split a document into whitespace-separated tokens.

    Args:
        text: The document text. Non-string values are accepted because an
            empty text saved to CSV is read back by pandas as NaN (a float).

    Returns:
        list[str]: The tokens; an empty list for empty or non-string input.
    """
    if not isinstance(text, str):
        return []
    return text.split()

# Step 2: Build Vocabulary
def build_vocabulary(corpus):
    """Collect the distinct tokens of all documents.

    Args:
        corpus: Iterable of documents; each one is passed to tokenize().

    Returns:
        list: Every unique token found in the corpus, in sorted order.
    """
    unique_tokens = {token for document in corpus for token in tokenize(document)}
    return sorted(unique_tokens)

# Step 3: Create Bag-of-Words Representation
def create_bow(corpus, vocabulary):
    """Build a dense Bag-of-Words count vector for every document.

    Args:
        corpus: Iterable of documents; each one is passed to tokenize().
        vocabulary: Sequence of words defining the vector positions.

    Returns:
        list[list[int]]: One list per document, holding for each vocabulary
        word (in vocabulary order) how often it occurs in that document.
    """
    def vectorize(document):
        counts = Counter(tokenize(document))
        # A Counter yields 0 for words it has not seen, so no membership test is needed.
        return [counts[word] for word in vocabulary]

    return [vectorize(document) for document in corpus]


In [7]:
# Read the labelled, preprocessed dataset into a DataFrame
df = pd.read_csv('new_preprocessed_dataset.csv')

# Use one corpus, taken from the frame just loaded, for both the vocabulary
# and the vectors (rather than a frame left over from an earlier cell).
# Texts that were empty when saved come back from CSV as NaN; restore them to ''.
corpus = df['preprocessed_column'].fillna('')

# Build vocabulary from cleaned text
vocabulary = build_vocabulary(corpus)

# Create BoW representation
bow_representation = create_bow(corpus, vocabulary)

# Display the BoW representation of the first document
print("BoW representation of the first document:")
print(bow_representation[0])

BoW representation of the first document:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [13]:
import pandas as pd

# Wrap the BoW vectors in a DataFrame with one column per vocabulary word
bow_df = pd.DataFrame(data=bow_representation, columns=vocabulary)

# Show the DataFrame under its heading
print("Bag-of-Words Representation:", bow_df, sep="\n")


Bag-of-Words Representation:
      0  02  06  1  10  100  1000  100000  10tahun  11  ...  yogyakarta  you  \
0     0   0   0  0   0    0     0       0        0   0  ...           0    0   
1     0   0   0  0   0    0     0       0        0   0  ...           0    0   
2     0   0   0  0   0    0     0       0        0   0  ...           0    0   
3     0   0   0  0   0    0     0       0        0   0  ...           0    0   
4     0   0   0  0   0    0     0       0        0   0  ...           0    0   
...  ..  ..  .. ..  ..  ...   ...     ...      ...  ..  ...         ...  ...   
996   0   0   0  0   0    0     0       0        0   0  ...           0    0   
997   0   0   0  1   0    0     0       0        0   0  ...           0    0   
998   0   0   0  0   0    0     0       0        0   0  ...           0    0   
999   0   0   0  0   0    0     0       0        0   0  ...           0    0   
1000  0   0   0  0   0    0     0       0        0   0  ...           0    0   

      yout

In [15]:
# The label column of the dataset, e.g. 1 for fact and 0 for fake
labels_df = df['label']

# Place the labels and the BoW columns side by side, aligned on the index
combined_df = pd.concat([labels_df, bow_df], axis='columns')

# Show the combined DataFrame under its heading
print("Combined DataFrame:", combined_df, sep="\n")

# Save the combined DataFrame to a CSV file
combined_df.to_csv('bow_representation.csv', index=False)

Combined DataFrame:
      label  0  02  06  1  10  100  1000  100000  10tahun  ...  yogyakarta  \
0         1  0   0   0  0   0    0     0       0        0  ...           0   
1         1  0   0   0  0   0    0     0       0        0  ...           0   
2         1  0   0   0  0   0    0     0       0        0  ...           0   
3         1  0   0   0  0   0    0     0       0        0  ...           0   
4         1  0   0   0  0   0    0     0       0        0  ...           0   
...     ... ..  ..  .. ..  ..  ...   ...     ...      ...  ...         ...   
996       1  0   0   0  0   0    0     0       0        0  ...           0   
997       0  0   0   0  1   0    0     0       0        0  ...           0   
998       1  0   0   0  0   0    0     0       0        0  ...           0   
999       0  0   0   0  0   0    0     0       0        0  ...           0   
1000      1  0   0   0  0   0    0     0       0        0  ...           0   

      you  youtube  yoyabantul  yuk  z  zik