In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import dask.dataframe as dd

In [2]:
import spacy
from spacy.matcher import PhraseMatcher
from fuzzywuzzy import process, fuzz



In [3]:
# Build a dask DataFrame from the preprocessed CSV and preview its first rows.
input_csv = '/kaggle/input/preprocessed-satdat-dataset/preprocessed_dask.csv'
df = dd.read_csv(input_csv)
df.head()

Unnamed: 0,created_at,tcode,num_retweets,frn_cnt,flw_cnt,sts_cnt,lst_cnt,content
0,2024-01-04T09:57:09Z,rt,1248.0,266.0,107.0,9687.0,0.0,k-popers berencana kirim food truck untuk anie...
1,2024-01-04T09:57:09Z,rt,195.0,564.0,303.0,12461.0,2.0,bapak pendeta yusak ini dari magetan ke ponoro...
2,2024-01-04T09:57:10Z,rt,116.0,376.0,156.0,7488.0,1.0,"viral , gimana mak mak di jakarta tidak ter an..."
3,2024-01-04T09:57:10Z,rt,2264.0,163.0,203.0,2065.0,0.0,"mendengar pak anies disini, rasanya saya sudah..."
4,2024-01-04T09:57:11Z,rt,1157.0,1.0,1.0,798.0,0.0,media asing soroti cara anies gaet pemilih mud...


In [4]:
# Drop rows whose content ended up empty after cleaning.
# NOTE(review): without a `subset`, this removes rows that have a missing value
# in ANY column, not only `content` -- confirm that this is intended.
df = df.dropna(how='any')

In [5]:
# Blank spaCy pipeline for language code "id"; it is used below to turn text
# into Doc objects for the phrase matcher.
nlp = spacy.blank("id")

# attr="LOWER" makes the matcher compare on the tokens' LOWER attribute, so
# keyword matching does not depend on letter case.
matcher = PhraseMatcher(
    nlp.vocab,
    attr="LOWER",
)

In [6]:
# Keywords that identify each candidate pair ("paslon"). The dictionary keys are
# used as PhraseMatcher labels and as the keys of the classification result.
candidate_keywords = {
    "paslon 1": [
        "paslon 1", "calon 1", "paslon 01",
        "anies", "anis", "baswedan",
        "muhaimin", "iskandar", "cak imin", "imin", "amin",
        # "anies-muhamin" was a misspelling of the pair name; it is kept as a
        # variant and the correctly spelled form is added next to it.
        "anies-muhamin", "anies-muhaimin",
    ],
    "paslon 2": [
        "paslon 2", "calon 2", "paslon 02",
        "prabowo", "subianto", "gibran", "rakabuming",
        "pragib", "prabowo-gibran",
    ],
    "paslon 3": [
        "paslon 3", "calon 3", "paslon 03",
        "ganjar", "pranowo", "mahfud md", "mahfud", "ganjar-mahfud",
    ],
}

In [7]:
# Register every keyword list with the phrase matcher, using the candidate key
# as the match label.
for label, phrases in candidate_keywords.items():
    phrase_docs = []
    for phrase in phrases:
        phrase_docs.append(nlp.make_doc(phrase))
    matcher.add(label, phrase_docs)

In [8]:
def classify_candidates_combined(text, fuzzy_threshold=80):
    """Flag which candidate pairs are mentioned in ``text``.

    Phrase matches from ``matcher`` are used first. Only when no phrase matches
    at all does the function fall back to fuzzy matching of the whole text
    against each candidate's keyword list.

    Args:
        text: The text to classify. Values that are not strings, or strings
            that are empty or whitespace-only, are treated as mentioning
            nobody.
        fuzzy_threshold: A candidate is flagged by the fuzzy fallback when the
            best ``fuzz.partial_ratio`` score is strictly greater than this
            value. Defaults to 80.

    Returns:
        dict: One entry per key of ``candidate_keywords``, mapping the
        candidate name to ``True`` if it was detected and ``False`` otherwise.
    """
    # Start with every candidate unflagged.
    classifications = {candidate: False for candidate in candidate_keywords}

    # Missing values (e.g. NaN floats) and blank strings cannot mention anyone;
    # return early instead of failing inside the tokenizer.
    if not isinstance(text, str) or not text.strip():
        return classifications

    # Phrase matches: the match id resolves back to the candidate label.
    doc = nlp(text)
    for match_id, _start, _end in matcher(doc):
        classifications[nlp.vocab.strings[match_id]] = True

    # Fuzzy fallback, only used when the phrase matcher found nothing.
    if not any(classifications.values()):
        for candidate, keywords in candidate_keywords.items():
            match = process.extractOne(text, keywords, scorer=fuzz.partial_ratio)
            if match and match[1] > fuzzy_threshold:
                classifications[candidate] = True

    return classifications

In [9]:
def _cek_is_paslon(text, candidate):
    """Return 1 if ``classify_candidates_combined`` flags ``candidate`` for ``text``, else 0."""
    return 1 if classify_candidates_combined(text)[candidate] else 0


def cek_is_paslon_1(text):
    """Return 1 if ``text`` is classified as mentioning 'paslon 1', else 0."""
    return _cek_is_paslon(text, 'paslon 1')


def cek_is_paslon_2(text):
    """Return 1 if ``text`` is classified as mentioning 'paslon 2', else 0."""
    return _cek_is_paslon(text, 'paslon 2')


def cek_is_paslon_3(text):
    """Return 1 if ``text`` is classified as mentioning 'paslon 3', else 0."""
    return _cek_is_paslon(text, 'paslon 3')

In [10]:
# Flag rows whose content mentions candidate pair 1. The applied function
# returns plain Python ints, so cast explicitly to make the computed data match
# the int8 dtype declared in `meta`.
df['is_paslon_1'] = (
    df['content']
    .apply(cek_is_paslon_1, meta=('is_paslon_1', 'int8'))
    .astype('int8')
)

In [11]:
# Flag rows whose content mentions candidate pair 2. The applied function
# returns plain Python ints, so cast explicitly to make the computed data match
# the int8 dtype declared in `meta`.
df['is_paslon_2'] = (
    df['content']
    .apply(cek_is_paslon_2, meta=('is_paslon_2', 'int8'))
    .astype('int8')
)

In [12]:
# Flag rows whose content mentions candidate pair 3. The applied function
# returns plain Python ints, so cast explicitly to make the computed data match
# the int8 dtype declared in `meta`.
df['is_paslon_3'] = (
    df['content']
    .apply(cek_is_paslon_3, meta=('is_paslon_3', 'int8'))
    .astype('int8')
)

In [13]:
# Export the classified rows into one CSV file, without the index column.
classified_csv = 'classify_president.csv'
df.to_csv(classified_csv, single_file=True, index=False)

['/kaggle/working/classify_president.csv']

In [14]:
# Export the rows where none of the three candidate flags was set, and report
# how many there are.
try:
    no_candidate_mask = (
        (df['is_paslon_1'] == 0)
        & (df['is_paslon_2'] == 0)
        & (df['is_paslon_3'] == 0)
    )
    unidentified_df = df[no_candidate_mask]
    # NOTE(review): the row count and the CSV export are evaluated separately,
    # so the classification presumably runs once for each -- consider
    # persisting the frame if this cell is slow.
    total_rows = unidentified_df.shape[0].compute()
    unidentified_df.to_csv('unidentified_president.csv', single_file=True, index=False)
    print(f"Total baris : {total_rows}")
except Exception as exc:
    # Catch Exception rather than using a bare `except` so that interrupts
    # still propagate, and report what actually failed.
    print(f"Error: {type(exc).__name__}: {exc}")

Total baris : 346892
