In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
# The only NLTK feature used in this notebook is word_tokenize, which needs the Punkt
# tokenizer models, so fetch just those instead of the whole 'all' collection.
# Newer NLTK releases look up 'punkt_tab', older ones 'punkt'; requesting both keeps
# the notebook working on either.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

**QUICK EDA + PREPROCESSING**

* 0 = negative
* 1 = neutral
* 2 = positive


In [None]:
# Load the training data, then preview the first rows together with the frame's shape.
train_path = 'train.parquet'
df = pd.read_parquet(train_path)
(df.head(), df.shape)

(                                                text  label
 0  dulu setiap ke jakarta suka mampir ke bakmi gm...      2
 1                  Apa sek senyatane ora ana setelan      1
 2  waroeng bandoeng tempat jual steak dengan harg...      2
 3  aku jadi berpikir mungkin dia tokoh utama perp...      0
 4  Cek kecewana sara channel mandimman baik chann...      0,
 (26361, 2))

In [None]:
# Share of rows per label value (fractions instead of raw counts).
df['label'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
2,0.538219
0,0.347787
1,0.113994


Kelas tidak seimbang (imbalanced): label 2 (positif) ≈ 53,8%, label 0 (negatif) ≈ 34,8%, label 1 (netral) ≈ 11,4%.

In [None]:
# Load the CSV that is scored with the trained model further down, then preview it.
tweets_path = '#KaburDulu.csv'
test = pd.read_csv(tweets_path)
(test.head(), test.shape)

(   conversation_id_str                      created_at  favorite_count  \
 0  1891727992348365101  Tue Feb 18 23:59:17 +0000 2025               0   
 1  1892000871963345060  Tue Feb 18 23:58:54 +0000 2025               0   
 2  1891722770209292784  Tue Feb 18 23:57:22 +0000 2025               0   
 3  1891525837784064128  Tue Feb 18 23:54:48 +0000 2025               0   
 4  1891779453979705707  Tue Feb 18 23:53:30 +0000 2025               8   
 
                                            full_text               id_str  \
 0  @__LOVE_AG4EVER Lah Prabowo kok pelopor #Kabur...  1892000967668777233   
 1  #KaburAjaDulu Viral: Istana Ingatkan Skill Jad...  1892000871963345060   
 2  @sadarrdirii Kenapa gw kok bisa memilih Arab s...  1892000485483426167   
 3  @kompascom Tagar #KaburAjaDulu adalah sebuah k...  1891999842647323132   
 4  @josua41 @durenkalibata @s_24mp @gutbbam @tric...  1891999515374387462   
 
                                          image_url in_reply_to_screen_name  \

In [None]:
def clean_tweet(text):
  """Normalize a raw tweet into plain ASCII alphanumeric words separated by single spaces.

  Steps, in order: drop URLs, @mentions, #hashtags and the retweet marker "RT",
  delete every character that is not an ASCII letter, digit or whitespace, then
  collapse whitespace runs into one space and trim both ends.

  Args:
    text: The raw tweet. Non-string values are converted with ``str``; ``None`` and
      float NaN are treated as missing.

  Returns:
    The cleaned string, or ``''`` when the input is missing.
  """
  # Missing values would otherwise be turned into the literal tokens "None" / "nan".
  if text is None or (isinstance(text, float) and text != text):
    return ''
  text = str(text)
  # URLs go first so the mention / hashtag / RT patterns below never chew on URL fragments.
  text = re.sub(r'https?://\S+', '', text)
  text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # mentions
  text = re.sub(r'#\w+', '', text)  # hashtags
  # \b keeps the marker from matching inside words ("SMART tv" must not become "SMAtv").
  text = re.sub(r'\bRT\s+', '', text)
  # Keep all whitespace (not only ' ') so newlines/tabs still separate words until the
  # collapse step below; everything else that is not an ASCII letter/digit is deleted.
  text = re.sub(r'[^A-Za-z0-9\s]', '', text)
  return re.sub(r'\s+', ' ', text).strip()


In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Store the cleaned version of every training text next to the raw column, then show the frame.
df['clean_tweet'] = df['text'].apply(clean_tweet)
df

Unnamed: 0,text,label,clean_tweet
0,dulu setiap ke jakarta suka mampir ke bakmi gm...,2,dulu setiap ke jakarta suka mampir ke bakmi gm...
1,Apa sek senyatane ora ana setelan,1,Apa sek senyatane ora ana setelan
2,waroeng bandoeng tempat jual steak dengan harg...,2,waroeng bandoeng tempat jual steak dengan harg...
3,aku jadi berpikir mungkin dia tokoh utama perp...,0,aku jadi berpikir mungkin dia tokoh utama perp...
4,Cek kecewana sara channel mandimman baik chann...,0,Cek kecewana sara channel mandimman baik chann...
...,...,...,...
26356,Saluyu namina creative space kafe ku kituna se...,2,Saluyu namina creative space kafe ku kituna se...
26357,"resto ini selalu ramai pengunjung , masakan ny...",2,resto ini selalu ramai pengunjung masakan nya ...
26358,Traveloka dulu liburan siap tu wkwkwkwk lah ba...,1,Traveloka dulu liburan siap tu wkwkwkwk lah ba...
26359,adik ku diterima di teknik lingkungan itb,1,adik ku diterima di teknik lingkungan itb


In [None]:
# Features are the cleaned texts, targets are the labels.
X, y = df['clean_tweet'], df['label']
# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((21088,), (5273,), (21088,), (5273,))

**TRAINING**


1.   Ekstraksi fitur menggunakan TF-IDF
2.   Algoritma yang digunakan adalah RandomForest
dengan parameter:
```
param_grid = {
    'n_estimators': [100],
    'max_depth': [None, 500],
    'min_samples_split': [5,10],
    'max_features': ['sqrt',0.2],
    'class_weight': ['balanced']
}
```

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Texts are tokenized with NLTK's word_tokenize. token_pattern=None tells scikit-learn the
# default regex is intentionally unused, which avoids its "token_pattern will not be used"
# UserWarning when a custom tokenizer is supplied.
tfidf = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None, max_features=10000)
# The vocabulary and IDF weights are learned from the training split only; the held-out
# split is merely transformed with them.
# NOTE: X_train / X_test are rebound from text to TF-IDF matrices here, so this cell cannot
# be re-run on its own — re-run the split cell first.
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)



In [None]:
# Base estimator for the grid search. The seed is fixed so the forest's random choices
# (and therefore the search results) are repeatable; 42 matches the seed used for the
# train/test split.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Hyper-parameter candidates for the random forest: 1 x 2 x 2 x 2 x 1 = 8 combinations.
param_grid = dict(
    n_estimators=[100],
    max_depth=[None, 500],
    min_samples_split=[5, 10],
    max_features=['sqrt', 0.2],
    class_weight=['balanced'],
)

In [None]:
# Enable cuML's accelerator for scikit-learn.
# NOTE(review): scikit-learn was already imported and `rf` already instantiated in earlier
# cells. cuML's documentation asks for the extension to be loaded before those imports —
# TODO confirm this late load takes effect, or move this cell to the top of the notebook.
%load_ext cuml.accel

[2025-04-11 06:47:59.313] [CUML] [info] cuML: Installed accelerator for sklearn.
[2025-04-11 06:48:30.142] [CUML] [info] cuML: Installed accelerator for umap.
[2025-04-11 06:48:30.263] [CUML] [info] cuML: Installed accelerator for hdbscan.
[2025-04-11 06:48:30.263] [CUML] [info] cuML: Successfully initialized accelerator.


In [None]:
%%time
model = GridSearchCV(rf,param_grid,cv=5,n_jobs=-1,verbose=1,scoring='accuracy')
model.fit(X_train,y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
CPU times: user 28.6 s, sys: 1.86 s, total: 30.4 s
Wall time: 26min 21s


In [None]:
# Score on the training split, score on the held-out split, and the best mean
# cross-validation score found by the search (all use the search's configured scorer).
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
train_score, test_score, model.best_score_

(0.6609446130500759, 0.651621467855111, np.float64(0.6462443722506376))

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Weighted F1 of the tuned model on the held-out split.
# NOTE(review): the saved output of this cell (~0.89) does not look consistent with the
# ~0.65 held-out score saved a few cells above; it is probably stale output from another
# run — re-run the notebook top to bottom to confirm.
hasilX_test = model.predict(X_test)
f1 = f1_score(
    y_true=y_test,
    y_pred=hasilX_test,
    labels=[0, 1, 2],
    average='weighted',
)
print(f1)

0.8946261232506427


In [None]:
# Apply the same cleaning used for the training texts to the tweets that will be scored.
test['clean_tweet'] = test['full_text'].apply(clean_tweet)

In [None]:
# Keep only the cleaned text. The explicit copy makes `test` an independent frame, so the
# column assignments in later cells do not raise pandas' SettingWithCopyWarning.
test = test[['clean_tweet']].copy()

In [None]:
# Display the frame, which at this point only holds the cleaned tweet text.
test

Unnamed: 0,clean_tweet
0,Lah Prabowo kok pelopor Lari ke Jordan krn tak...
1,Viral Istana Ingatkan Skill Jadi Kunci Sukses ...
2,Kenapa gw kok bisa memilih Arab sebagai tujuan...
3,Tagar adalah sebuah kenyataan bahwa negara ini...
4,Melihat kamu sepresistance itu membuktikan kal...
...,...
526,dok Jepang aman
527,Tum pilih team atau team tinggal di indonesia
528,Kenapa sih banyak banget yang salty saat orang...
529,mau nggaa


In [None]:
# Vectorize the cleaned tweets with the already-fitted TF-IDF vocabulary.
# Despite its name, `pred` holds the feature matrix, not predictions.
pred = tfidf.transform(test['clean_tweet'])

In [None]:
# Predict a class id for every tweet and store it as a new column.
predicted_labels = model.predict(pred)
test['sentimen'] = predicted_labels

In [None]:
# Number of tweets per predicted class.
test['sentimen'].value_counts()

Unnamed: 0_level_0,count
sentimen,Unnamed: 1_level_1
0,412
1,84
2,35


In [None]:
# Translate numeric class ids into readable labels. Values that are not keys of the mapping
# (for example labels already translated by an earlier run of this cell) are kept as they
# are, so re-running the cell does not turn the whole column into NaN.
sentiment_names = {
    0: 'negatif',
    1: 'netral',
    2: 'positif'
}
test['sentimen'] = test['sentimen'].map(lambda value: sentiment_names.get(value, value))

In [None]:
# Write the cleaned tweets and their sentiment labels to an Excel workbook.
output_path = 'HASIL SENTIMEN.xlsx'
test.to_excel(output_path)