Text Preprocessing

In [1]:
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

namafile = 'globalwarming.csv'
data = pd.read_csv(namafile, sep=",", encoding='utf-8')
data.head()

Unnamed: 0,publishedAt,authorDisplayName,textDisplay,likeCount
0,2025-02-07T06:50:09Z,@therealcyberwolfman,"<a href=""https://www.youtube.com/watch?v=uynhv...",0
1,2025-02-07T06:33:32Z,@ZjayJD,Bruh why do the poorest countries get the wors...,0
2,2025-02-06T20:59:43Z,@paulschoeters1375,why dont you talk about the cows and meat indu...,0
3,2025-02-06T06:21:26Z,@surendrakarki8173,Himalayan nations like Nepal will face a very ...,0
4,2025-02-03T19:59:09Z,@muhammadtehzeeb759,Effects are quite visible now,1


In [2]:
data = data.drop(columns=['publishedAt', 'authorDisplayName', 'likeCount'])
data = data.rename(columns={'textDisplay': 'comment'})
data

Unnamed: 0,comment
0,"<a href=""https://www.youtube.com/watch?v=uynhv..."
1,Bruh why do the poorest countries get the wors...
2,why dont you talk about the cows and meat indu...
3,Himalayan nations like Nepal will face a very ...
4,Effects are quite visible now
...,...
10284,@@jonathanedwardgibson what if it was always l...
10285,"Snow in texas and in EU , probably global warm..."
10286,Humans now easily have the knowledge and abili...
10287,Geoengineering haarp DARPA weather warfare mot...


In [3]:
data = data.dropna(subset=['comment'])
data

Unnamed: 0,comment
0,"<a href=""https://www.youtube.com/watch?v=uynhv..."
1,Bruh why do the poorest countries get the wors...
2,why dont you talk about the cows and meat indu...
3,Himalayan nations like Nepal will face a very ...
4,Effects are quite visible now
...,...
10284,@@jonathanedwardgibson what if it was always l...
10285,"Snow in texas and in EU , probably global warm..."
10286,Humans now easily have the knowledge and abili...
10287,Geoengineering haarp DARPA weather warfare mot...


In [4]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_

True

In [5]:
#Case Folding
data['comment'] = data['comment'].astype(str)
data['comment'] = data['comment'].str.lower()
data

Unnamed: 0,comment
0,"<a href=""https://www.youtube.com/watch?v=uynhv..."
1,bruh why do the poorest countries get the wors...
2,why dont you talk about the cows and meat indu...
3,himalayan nations like nepal will face a very ...
4,effects are quite visible now
...,...
10284,@@jonathanedwardgibson what if it was always l...
10285,"snow in texas and in eu , probably global warm..."
10286,humans now easily have the knowledge and abili...
10287,geoengineering haarp darpa weather warfare mot...


In [6]:
import string 
import re 
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

In [7]:
def remove_links(text):
    # menghapus tab, new line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # menghapus non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # menghapus mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # menghapus URL
    return text.replace("http://", " ").replace("https://", " ")
                
data['comment'] = data['comment'].apply(remove_links)

#menghapus number
def remove_number(text):
    return  re.sub(r"\d+", " ", text)

data['comment'] = data['comment'].apply(remove_number)

#menghapus punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

data['comment'] = data['comment'].apply(remove_punctuation)

# menghapus single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", " ", text)

data['comment'] = data['comment'].apply(remove_singl_char)
data

Unnamed: 0,comment
0,href this is far too optimistic as many gove...
1,bruh why do the poorest countries get the wors...
2,why dont you talk about the cows and meat indu...
3,himalayan nations like nepal will face very ...
4,effects are quite visible now
...,...
10284,what if it was always like that what if its j...
10285,snow in texas and in eu probably global warming
10286,humans now easily have the knowledge and abili...
10287,geoengineering haarp darpa weather warfare mot...


In [8]:
# Tokenisasi 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

data['comment_tokenize'] = data['comment'].apply(word_tokenize_wrapper)
data

Unnamed: 0,comment,comment_tokenize
0,href this is far too optimistic as many gove...,"[href, this, is, far, too, optimistic, as, man..."
1,bruh why do the poorest countries get the wors...,"[bruh, why, do, the, poorest, countries, get, ..."
2,why dont you talk about the cows and meat indu...,"[why, dont, you, talk, about, the, cows, and, ..."
3,himalayan nations like nepal will face very ...,"[himalayan, nations, like, nepal, will, face, ..."
4,effects are quite visible now,"[effects, are, quite, visible, now]"
...,...,...
10284,what if it was always like that what if its j...,"[what, if, it, was, always, like, that, what, ..."
10285,snow in texas and in eu probably global warming,"[snow, in, texas, and, in, eu, probably, globa..."
10286,humans now easily have the knowledge and abili...,"[humans, now, easily, have, the, knowledge, an..."
10287,geoengineering haarp darpa weather warfare mot...,"[geoengineering, haarp, darpa, weather, warfar..."


In [9]:
# Menghitung Distibusi Persebaran Kata
def freqDist_wrapper(text):
    return FreqDist(text)

comment_fqsist = data['comment_tokenize'].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
print(comment_fqsist.head().apply(lambda x : x.most_common()))

Frequency Tokens : 

0    [(and, 6), (the, 5), (as, 3), (many, 3), (be, ...
1    [(the, 3), (countries, 2), (bruh, 1), (why, 1)...
2    [(the, 3), (why, 2), (about, 2), (cows, 2), (d...
3    [(winter, 2), (himalayan, 1), (nations, 1), (l...
4    [(effects, 1), (are, 1), (quite, 1), (visible,...
Name: comment_tokenize, dtype: object


In [10]:
slank_word_dict = {
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "it's": "it is",
    "i'm": "i am",
    "you're": "you are",
    "Dr.": "Doctor",
    "Mr.": "Mister",
    "Mrs.": "Mistress",
    "St.": "Saint",
    "Co.": "Company",
    }

def slank_normalized_term(document):
    return [slank_word_dict[term] if term in slank_word_dict else term for term in document]

In [11]:
data['comment_normalized'] = data['comment_tokenize'].apply(slank_normalized_term)
data

Unnamed: 0,comment,comment_tokenize,comment_normalized
0,href this is far too optimistic as many gove...,"[href, this, is, far, too, optimistic, as, man...","[href, this, is, far, too, optimistic, as, man..."
1,bruh why do the poorest countries get the wors...,"[bruh, why, do, the, poorest, countries, get, ...","[bruh, why, do, the, poorest, countries, get, ..."
2,why dont you talk about the cows and meat indu...,"[why, dont, you, talk, about, the, cows, and, ...","[why, dont, you, talk, about, the, cows, and, ..."
3,himalayan nations like nepal will face very ...,"[himalayan, nations, like, nepal, will, face, ...","[himalayan, nations, like, nepal, will, face, ..."
4,effects are quite visible now,"[effects, are, quite, visible, now]","[effects, are, quite, visible, now]"
...,...,...,...
10284,what if it was always like that what if its j...,"[what, if, it, was, always, like, that, what, ...","[what, if, it, was, always, like, that, what, ..."
10285,snow in texas and in eu probably global warming,"[snow, in, texas, and, in, eu, probably, globa...","[snow, in, texas, and, in, eu, probably, globa..."
10286,humans now easily have the knowledge and abili...,"[humans, now, easily, have, the, knowledge, an...","[humans, now, easily, have, the, knowledge, an..."
10287,geoengineering haarp darpa weather warfare mot...,"[geoengineering, haarp, darpa, weather, warfar...","[geoengineering, haarp, darpa, weather, warfar..."


In [12]:
import nltk
from nltk.corpus import stopwords

# Unduh stopwords jika belum tersedia
nltk.download('stopwords')

# Ambil daftar stopwords bahasa Inggris dari NLTK
list_stopwords_en = set(stopwords.words('english'))

# Fungsi untuk menghapus stopwords dari teks
def remove_stopwords(text):
    return [word for word in text if word.lower() not in list_stopwords_en]

data['comment_stop_removed'] = data['comment_normalized'].apply(remove_stopwords)
data 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,comment,comment_tokenize,comment_normalized,comment_stop_removed
0,href this is far too optimistic as many gove...,"[href, this, is, far, too, optimistic, as, man...","[href, this, is, far, too, optimistic, as, man...","[href, far, optimistic, many, governments, cor..."
1,bruh why do the poorest countries get the wors...,"[bruh, why, do, the, poorest, countries, get, ...","[bruh, why, do, the, poorest, countries, get, ...","[bruh, poorest, countries, get, worst, polluti..."
2,why dont you talk about the cows and meat indu...,"[why, dont, you, talk, about, the, cows, and, ...","[why, dont, you, talk, about, the, cows, and, ...","[dont, talk, cows, meat, industries, cows, res..."
3,himalayan nations like nepal will face very ...,"[himalayan, nations, like, nepal, will, face, ...","[himalayan, nations, like, nepal, will, face, ...","[himalayan, nations, like, nepal, face, severe..."
4,effects are quite visible now,"[effects, are, quite, visible, now]","[effects, are, quite, visible, now]","[effects, quite, visible]"
...,...,...,...,...
10284,what if it was always like that what if its j...,"[what, if, it, was, always, like, that, what, ...","[what, if, it, was, always, like, that, what, ...","[always, like, fear, peddled, marxists, order,..."
10285,snow in texas and in eu probably global warming,"[snow, in, texas, and, in, eu, probably, globa...","[snow, in, texas, and, in, eu, probably, globa...","[snow, texas, eu, probably, global, warming]"
10286,humans now easily have the knowledge and abili...,"[humans, now, easily, have, the, knowledge, an...","[humans, now, easily, have, the, knowledge, an...","[humans, easily, knowledge, ability, solve, pr..."
10287,geoengineering haarp darpa weather warfare mot...,"[geoengineering, haarp, darpa, weather, warfar...","[geoengineering, haarp, darpa, weather, warfar...","[geoengineering, haarp, darpa, weather, warfar..."


In [13]:
import nltk
from nltk.stem import PorterStemmer
import swifter

# Pastikan NLTK sudah mengunduh modulnya
nltk.download('punkt')

# Inisialisasi stemmer
stemmer = PorterStemmer()

# Fungsi stemming dengan caching
term_dict = {}

def stemmed_wrapper(term):
    if term not in term_dict:
        term_dict[term] = stemmer.stem(term)  # Stem hanya jika belum ada dalam dictionary
    return term_dict[term]

# Pastikan kolom 'comment_stop_removed' sudah dalam bentuk list
if isinstance(data['comment_stop_removed'].iloc[0], str):
    data['comment_stop_removed'] = data['comment_stop_removed'].apply(lambda x: nltk.word_tokenize(x.lower())) 

# Terapkan stemming ke setiap kata dalam komentar
data['comment_Stemmed'] = data['comment_stop_removed'].swifter.apply(lambda doc: [stemmed_wrapper(term) for term in doc])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Pandas Apply:   0%|          | 0/10289 [00:00<?, ?it/s]

In [14]:
data

Unnamed: 0,comment,comment_tokenize,comment_normalized,comment_stop_removed,comment_Stemmed
0,href this is far too optimistic as many gove...,"[href, this, is, far, too, optimistic, as, man...","[href, this, is, far, too, optimistic, as, man...","[href, far, optimistic, many, governments, cor...","[href, far, optimist, mani, govern, corpor, pl..."
1,bruh why do the poorest countries get the wors...,"[bruh, why, do, the, poorest, countries, get, ...","[bruh, why, do, the, poorest, countries, get, ...","[bruh, poorest, countries, get, worst, polluti...","[bruh, poorest, countri, get, worst, pollut, d..."
2,why dont you talk about the cows and meat indu...,"[why, dont, you, talk, about, the, cows, and, ...","[why, dont, you, talk, about, the, cows, and, ...","[dont, talk, cows, meat, industries, cows, res...","[dont, talk, cow, meat, industri, cow, respons..."
3,himalayan nations like nepal will face very ...,"[himalayan, nations, like, nepal, will, face, ...","[himalayan, nations, like, nepal, will, face, ...","[himalayan, nations, like, nepal, face, severe...","[himalayan, nation, like, nepal, face, sever, ..."
4,effects are quite visible now,"[effects, are, quite, visible, now]","[effects, are, quite, visible, now]","[effects, quite, visible]","[effect, quit, visibl]"
...,...,...,...,...,...
10284,what if it was always like that what if its j...,"[what, if, it, was, always, like, that, what, ...","[what, if, it, was, always, like, that, what, ...","[always, like, fear, peddled, marxists, order,...","[alway, like, fear, peddl, marxist, order, get..."
10285,snow in texas and in eu probably global warming,"[snow, in, texas, and, in, eu, probably, globa...","[snow, in, texas, and, in, eu, probably, globa...","[snow, texas, eu, probably, global, warming]","[snow, texa, eu, probabl, global, warm]"
10286,humans now easily have the knowledge and abili...,"[humans, now, easily, have, the, knowledge, an...","[humans, now, easily, have, the, knowledge, an...","[humans, easily, knowledge, ability, solve, pr...","[human, easili, knowledg, abil, solv, problem,..."
10287,geoengineering haarp darpa weather warfare mot...,"[geoengineering, haarp, darpa, weather, warfar...","[geoengineering, haarp, darpa, weather, warfar...","[geoengineering, haarp, darpa, weather, warfar...","[geoengin, haarp, darpa, weather, warfar, moth..."


In [15]:
data["comment_clean"] = [' '.join(map(str, l)) for l in data['comment_Stemmed']]
data

Unnamed: 0,comment,comment_tokenize,comment_normalized,comment_stop_removed,comment_Stemmed,comment_clean
0,href this is far too optimistic as many gove...,"[href, this, is, far, too, optimistic, as, man...","[href, this, is, far, too, optimistic, as, man...","[href, far, optimistic, many, governments, cor...","[href, far, optimist, mani, govern, corpor, pl...",href far optimist mani govern corpor plan carb...
1,bruh why do the poorest countries get the wors...,"[bruh, why, do, the, poorest, countries, get, ...","[bruh, why, do, the, poorest, countries, get, ...","[bruh, poorest, countries, get, worst, polluti...","[bruh, poorest, countri, get, worst, pollut, d...",bruh poorest countri get worst pollut done ric...
2,why dont you talk about the cows and meat indu...,"[why, dont, you, talk, about, the, cows, and, ...","[why, dont, you, talk, about, the, cows, and, ...","[dont, talk, cows, meat, industries, cows, res...","[dont, talk, cow, meat, industri, cow, respons...",dont talk cow meat industri cow respons presen...
3,himalayan nations like nepal will face very ...,"[himalayan, nations, like, nepal, will, face, ...","[himalayan, nations, like, nepal, will, face, ...","[himalayan, nations, like, nepal, face, severe...","[himalayan, nation, like, nepal, face, sever, ...",himalayan nation like nepal face sever result ...
4,effects are quite visible now,"[effects, are, quite, visible, now]","[effects, are, quite, visible, now]","[effects, quite, visible]","[effect, quit, visibl]",effect quit visibl
...,...,...,...,...,...,...
10284,what if it was always like that what if its j...,"[what, if, it, was, always, like, that, what, ...","[what, if, it, was, always, like, that, what, ...","[always, like, fear, peddled, marxists, order,...","[alway, like, fear, peddl, marxist, order, get...",alway like fear peddl marxist order get us low...
10285,snow in texas and in eu probably global warming,"[snow, in, texas, and, in, eu, probably, globa...","[snow, in, texas, and, in, eu, probably, globa...","[snow, texas, eu, probably, global, warming]","[snow, texa, eu, probabl, global, warm]",snow texa eu probabl global warm
10286,humans now easily have the knowledge and abili...,"[humans, now, easily, have, the, knowledge, an...","[humans, now, easily, have, the, knowledge, an...","[humans, easily, knowledge, ability, solve, pr...","[human, easili, knowledg, abil, solv, problem,...",human easili knowledg abil solv problem willbr...
10287,geoengineering haarp darpa weather warfare mot...,"[geoengineering, haarp, darpa, weather, warfar...","[geoengineering, haarp, darpa, weather, warfar...","[geoengineering, haarp, darpa, weather, warfar...","[geoengin, haarp, darpa, weather, warfar, moth...",geoengin haarp darpa weather warfar mother ear...


In [16]:
data = data.drop(columns=['comment', 'comment_tokenize', 'comment_normalized', 'comment_stop_removed', 'comment_Stemmed'])
data

Unnamed: 0,comment_clean
0,href far optimist mani govern corpor plan carb...
1,bruh poorest countri get worst pollut done ric...
2,dont talk cow meat industri cow respons presen...
3,himalayan nation like nepal face sever result ...
4,effect quit visibl
...,...
10284,alway like fear peddl marxist order get us low...
10285,snow texa eu probabl global warm
10286,human easili knowledg abil solv problem willbr...
10287,geoengin haarp darpa weather warfar mother ear...


PELABELAN DATA

In [17]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Mengunduh lexicon VADER
import nltk
nltk.download('vader_lexicon')

# Inisialisasi SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Fungsi untuk menentukan sentimen
def get_sentiment(text):
    score = sia.polarity_scores(text)
    # Menentukan sentimen berdasarkan nilai compound
    if score['compound'] > 0:
        return 'positif'
    elif score['compound'] == 0:
        return 'netral'
    else:
        return 'negatif'

# Menambahkan kolom sentimen ke DataFrame
data['sentimen'] = data['comment_clean'].apply(get_sentiment)

print(data)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                           comment_clean sentimen
0      href far optimist mani govern corpor plan carb...  positif
1      bruh poorest countri get worst pollut done ric...  negatif
2      dont talk cow meat industri cow respons presen...  positif
3      himalayan nation like nepal face sever result ...  positif
4                                     effect quit visibl   netral
...                                                  ...      ...
10284  alway like fear peddl marxist order get us low...  negatif
10285                   snow texa eu probabl global warm  positif
10286  human easili knowledg abil solv problem willbr...  negatif
10287  geoengin haarp darpa weather warfar mother ear...  positif
10288                              dont love fear monger  negatif

[10289 rows x 2 columns]


In [18]:
# Menghitung jumlah label sentimen
jumlah_sentimen = data['sentimen'].value_counts()

# Menampilkan hasil
print(jumlah_sentimen)

sentimen
positif    3583
netral     3494
negatif    3212
Name: count, dtype: int64


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Ekstraksi fitur menggunakan TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=5000, max_df=0.95, min_df=2)
X = vectorizer.fit_transform(data['comment_clean'])
y = data['sentimen']

# Pembagian data latih (80%) dan data uji (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Inisialisasi dan pelatihan model SVM
model = SVC(kernel='linear', class_weight='balanced', C=1.5)
model.fit(X_train, y_train)

# Prediksi dan evaluasi akurasi
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Akurasi Model SVM pada Training Set: {train_accuracy:.4f}')
print(f'Akurasi Model SVM pada Testing Set: {test_accuracy:.4f}')

Akurasi Model SVM pada Training Set: 0.9651
Akurasi Model SVM pada Testing Set: 0.8547
