In [1]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

nltk.download('punkt')  # Mengunduh data tokenizer dari NLTK


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Memuat dataset dari file CSV
df = pd.read_csv('train.csv')



In [3]:
df.head()

Unnamed: 0,tweets,class
0,Be aware dirty step to get money #staylight ...,figurative
1,#sarcasm for #people who don't understand #diy...,figurative
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative
3,@wilw Why do I get the feeling you like games?...,figurative
4,-@TeacherArthurG @rweingarten You probably jus...,figurative


In [4]:
# Noise Removal
df['cleaned_text'] = df['tweets'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

df.head()

Unnamed: 0,tweets,class,cleaned_text
0,Be aware dirty step to get money #staylight ...,figurative,be aware dirty step to get money staylight s...
1,#sarcasm for #people who don't understand #diy...,figurative,sarcasm for people who dont understand diy art...
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,iminworkjeremy medsingle dailymail readers bei...
3,@wilw Why do I get the feeling you like games?...,figurative,wilw why do i get the feeling you like games s...
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,teacherarthurg rweingarten you probably just m...


In [5]:
# Tokenisasi
df['tokenized_text'] = df['cleaned_text'].apply(lambda x: word_tokenize(x))


In [6]:
df.head()

Unnamed: 0,tweets,class,cleaned_text,tokenized_text
0,Be aware dirty step to get money #staylight ...,figurative,be aware dirty step to get money staylight s...,"[be, aware, dirty, step, to, get, money, stayl..."
1,#sarcasm for #people who don't understand #diy...,figurative,sarcasm for people who dont understand diy art...,"[sarcasm, for, people, who, dont, understand, ..."
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,iminworkjeremy medsingle dailymail readers bei...,"[iminworkjeremy, medsingle, dailymail, readers..."
3,@wilw Why do I get the feeling you like games?...,figurative,wilw why do i get the feeling you like games s...,"[wilw, why, do, i, get, the, feeling, you, lik..."
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,teacherarthurg rweingarten you probably just m...,"[teacherarthurg, rweingarten, you, probably, j..."


In [7]:

# Stemming
stemmer = SnowballStemmer('english')  # Membuat objek stemmer
df['stemmed_text'] = df['tokenized_text'].apply(lambda x: [stemmer.stem(word) for word in x])

# Menampilkan hasil
print(df['stemmed_text'])

0        [be, awar, dirti, step, to, get, money, stayli...
1        [sarcasm, for, peopl, who, dont, understand, d...
2        [iminworkjeremi, medsingl, dailymail, reader, ...
3        [wilw, whi, do, i, get, the, feel, you, like, ...
4        [teacherarthurg, rweingarten, you, probabl, ju...
                               ...                        
81403    [photo, imag, via, we, heart, it, httptcoky8nf...
81404    [i, never, knewi, better, put, this, out, to, ...
81405    [hey, just, want, to, say, thank, puberti, for...
81406    [im, sure, coverag, like, the, fox, news, spec...
81407    [skeyno16, at, u13, i, wont, believ, it, until...
Name: stemmed_text, Length: 81408, dtype: object


In [8]:
# TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Menggabungkan kembali teks yang telah di-stemming menjadi kalimat
df['processed_text'] = df['stemmed_text'].apply(lambda x: ' '.join(x))


In [11]:
# Feature Extraction dengan TF-IDF
corpus = df['processed_text'].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Menampilkan hasil
print(vectorizer.get_feature_names())
print(X.toarray())





MemoryError: Unable to allocate 66.9 GiB for an array with shape (81408, 110360) and data type float64

# tokenize and build vocab
T = tf_idf_vectorizer.fit(df['processed_text'])
print(len(T.vocabulary_))

In [12]:
df.head()

Unnamed: 0,tweets,class,cleaned_text,tokenized_text,stemmed_text,processed_text
0,Be aware dirty step to get money #staylight ...,figurative,be aware dirty step to get money staylight s...,"[be, aware, dirty, step, to, get, money, stayl...","[be, awar, dirti, step, to, get, money, stayli...",be awar dirti step to get money staylight stay...
1,#sarcasm for #people who don't understand #diy...,figurative,sarcasm for people who dont understand diy art...,"[sarcasm, for, people, who, dont, understand, ...","[sarcasm, for, peopl, who, dont, understand, d...",sarcasm for peopl who dont understand diy arta...
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,iminworkjeremy medsingle dailymail readers bei...,"[iminworkjeremy, medsingle, dailymail, readers...","[iminworkjeremi, medsingl, dailymail, reader, ...",iminworkjeremi medsingl dailymail reader be se...
3,@wilw Why do I get the feeling you like games?...,figurative,wilw why do i get the feeling you like games s...,"[wilw, why, do, i, get, the, feeling, you, lik...","[wilw, whi, do, i, get, the, feel, you, like, ...",wilw whi do i get the feel you like game sarcasm
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,teacherarthurg rweingarten you probably just m...,"[teacherarthurg, rweingarten, you, probably, j...","[teacherarthurg, rweingarten, you, probabl, ju...",teacherarthurg rweingarten you probabl just mi...


In [13]:
# Membuat fitur tambahan
df['num_words'] = df['tokenized_text'].apply(lambda x: len(x))
df['avg_word_length'] = df['cleaned_text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))


In [14]:
df.head()

Unnamed: 0,tweets,class,cleaned_text,tokenized_text,stemmed_text,processed_text,num_words,avg_word_length
0,Be aware dirty step to get money #staylight ...,figurative,be aware dirty step to get money staylight s...,"[be, aware, dirty, step, to, get, money, stayl...","[be, awar, dirti, step, to, get, money, stayli...",be awar dirti step to get money staylight stay...,12,6.833333
1,#sarcasm for #people who don't understand #diy...,figurative,sarcasm for people who dont understand diy art...,"[sarcasm, for, people, who, dont, understand, ...","[sarcasm, for, peopl, who, dont, understand, d...",sarcasm for peopl who dont understand diy arta...,9,6.888889
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,iminworkjeremy medsingle dailymail readers bei...,"[iminworkjeremy, medsingle, dailymail, readers...","[iminworkjeremi, medsingl, dailymail, reader, ...",iminworkjeremi medsingl dailymail reader be se...,13,8.615385
3,@wilw Why do I get the feeling you like games?...,figurative,wilw why do i get the feeling you like games s...,"[wilw, why, do, i, get, the, feeling, you, lik...","[wilw, whi, do, i, get, the, feel, you, like, ...",wilw whi do i get the feel you like game sarcasm,11,3.818182
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,teacherarthurg rweingarten you probably just m...,"[teacherarthurg, rweingarten, you, probably, j...","[teacherarthurg, rweingarten, you, probabl, ju...",teacherarthurg rweingarten you probabl just mi...,9,6.888889


In [15]:

# Menggabungkan fitur tambahan dengan matriks TF-IDF
feature_names = vectorizer.get_feature_names()
extra_features = ['num_words', 'avg_word_length']
feature_names.extend(extra_features)

X_extra = df[extra_features].values
X_combined = pd.concat([pd.DataFrame(X.toarray(), columns=feature_names), pd.DataFrame(X_extra, columns=extra_features)], axis=1)





MemoryError: Unable to allocate 66.9 GiB for an array with shape (81408, 110360) and data type float64