In [153]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


df = pd.read_csv('./news.csv') # Buka dataset

In [154]:
df = df.sample(frac=1, random_state=42) 

In [155]:
selected_column = ["title", "text","label"]

df_cleaned = df[selected_column].dropna() 

In [156]:
df_cleaned.head()

Unnamed: 0,title,text,label
1357,"American Dream, Revisited","Will Trump pull a Brexit times ten? What would it take, beyond WikiLeaks, to bring the Clinton (...",FAKE
2080,Clintons Are Under Multiple FBI Investigations as Agents Are Stymied,Clintons Are Under Multiple FBI Investigations as Agents Are Stymied Source: Wall street on pa...,FAKE
2718,The FBI Can’t Actually Investigate a Candidate Such as Hillary Clinton.,Dispatches from Eric Zuesse This piece is crossposted at strategic-culture.org The power above t...,FAKE
812,Confirmed: Public overwhelmingly (10-to-1) says media want Hillary to win,Print \n[Ed. – Every now and then the facade cracks. Somebody asks a question the media haven’t ...,FAKE
4886,Nanny In Jail After Force Feeding Baby To Death,"Nanny In Jail After Force Feeding Baby To Death 2 shares by Ike Mclean / October 27, 2016 / LIFE...",FAKE


In [157]:
df_cleaned.shape

(6335, 3)

In [158]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from langdetect import detect
import string
import nltk
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK data if needed
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\62822\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\62822\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [159]:
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()
corpus = df_cleaned


pd.set_option('display.max_colwidth', 100)


In [160]:
corpus['text'].head() # Sebelum Casefolding

1357    Will Trump pull a Brexit times ten? What would it take, beyond WikiLeaks, to bring the Clinton (...
2080    Clintons Are Under Multiple FBI Investigations as Agents Are Stymied   Source: Wall street on pa...
2718    Dispatches from Eric Zuesse This piece is crossposted at strategic-culture.org The power above t...
812     Print \n[Ed. – Every now and then the facade cracks. Somebody asks a question the media haven’t ...
4886    Nanny In Jail After Force Feeding Baby To Death 2 shares by Ike Mclean / October 27, 2016 / LIFE...
Name: text, dtype: object

In [161]:
corpus['text'].apply(lambda x: x.lower()).head() # Setelah CaseFolding

1357    will trump pull a brexit times ten? what would it take, beyond wikileaks, to bring the clinton (...
2080    clintons are under multiple fbi investigations as agents are stymied   source: wall street on pa...
2718    dispatches from eric zuesse this piece is crossposted at strategic-culture.org the power above t...
812     print \n[ed. – every now and then the facade cracks. somebody asks a question the media haven’t ...
4886    nanny in jail after force feeding baby to death 2 shares by ike mclean / october 27, 2016 / life...
Name: text, dtype: object

In [162]:
corpus['text'].head()

1357    Will Trump pull a Brexit times ten? What would it take, beyond WikiLeaks, to bring the Clinton (...
2080    Clintons Are Under Multiple FBI Investigations as Agents Are Stymied   Source: Wall street on pa...
2718    Dispatches from Eric Zuesse This piece is crossposted at strategic-culture.org The power above t...
812     Print \n[Ed. – Every now and then the facade cracks. Somebody asks a question the media haven’t ...
4886    Nanny In Jail After Force Feeding Baby To Death 2 shares by Ike Mclean / October 27, 2016 / LIFE...
Name: text, dtype: object

In [163]:
corpus['text'][: 10].apply(lambda x: ' '.join([word.lower() for word in x.split(' ') if word not in stop_words])).head()

1357    will trump pull brexit times ten? what would take, beyond wikileaks, bring clinton (cash) machin...
2080    clintons are under multiple fbi investigations agents are stymied   source: wall street parade \...
2718    dispatches eric zuesse this piece crossposted strategic-culture.org the power u.s. federal burea...
812     print \n[ed. – every facade cracks. somebody asks question media haven’t intervened spin yet, bi...
4886    nanny in jail after force feeding baby to death 2 shares ike mclean / october 27, 2016 / life / ...
Name: text, dtype: object

In [164]:
corpus['text'][: 10].apply(lambda x: ' '.join([porter.stem(word.lower()) for word in x.split(' ') if word not in stop_words])).head()

1357    will trump pull brexit time ten? what would take, beyond wikileaks, bring clinton (cash) machin ...
2080    clinton are under multipl fbi investig agent are stymi   source: wall street parad \ndisgrac for...
2718    dispatch eric zuess thi piec crosspost strategic-culture.org the power u.s. feder bureau investi...
812     print \n[ed. – everi facad cracks. somebodi ask question media haven’t interven spin yet, bit tr...
4886    nanni in jail after forc feed babi to death 2 share ike mclean / octob 27, 2016 / life / \nolure...
Name: text, dtype: object

In [165]:
corpus['text'][: 10].apply(lambda x: word_tokenize(' '.join([porter.stem(word.lower()) for word in x.split(' ') if word not in stop_words]))).head()

1357    [will, trump, pull, brexit, time, ten, ?, what, would, take, ,, beyond, wikileaks, ,, bring, cli...
2080    [clinton, are, under, multipl, fbi, investig, agent, are, stymi, source, :, wall, street, parad,...
2718    [dispatch, eric, zuess, thi, piec, crosspost, strategic-culture.org, the, power, u.s., feder, bu...
812     [print, [, ed, ., –, everi, facad, cracks, ., somebodi, ask, question, media, haven, ’, t, inter...
4886    [nanni, in, jail, after, forc, feed, babi, to, death, 2, share, ike, mclean, /, octob, 27, ,, 20...
Name: text, dtype: object

In [166]:
# Tokenize, remove stopwords, and perform stemming
preprocessed_corpus = []
labels = []

for index, row in corpus.iterrows():
    words = word_tokenize(row['text'])
    filtered_words = [porter.stem(word.lower()) for word in words if word.isalnum() and word.lower() not in stop_words]

    # Only append to preprocessed_corpus if filtered_words is not empty
    if filtered_words:
        preprocessed_corpus.append(' '.join(filtered_words))
        labels.append(row['label'])

# Create a DataFrame for preprocessed text and labels
result_df = pd.DataFrame({'text': preprocessed_corpus, 'label': labels})

# Save the result DataFrame to CSV
result_df.to_csv('preprocessed_corpus_with_labels.csv', index=False)

In [167]:
preprocessed_corpus = pd.read_csv('./preprocessed_corpus_with_labels.csv', dtype=str)

preprocessed_corpus.head()


Unnamed: 0,text,label
0,trump pull brexit time ten would take beyond wikileak bring clinton cash machin hillari win decl...,FAKE
1,clinton multipl fbi investig agent stymi sourc wall street parad disgrac former congressman anth...,FAKE
2,dispatch eric zuess piec crosspost power feder bureau investig fbi attorney gener person presid ...,FAKE
3,print ed everi facad crack somebodi ask question media interven spin yet bit truth peek public r...,FAKE
4,nanni jail forc feed babi death 2 share ike mclean octob 27 2016 life oluremi oyindasola 66 glen...,FAKE


In [168]:
import joblib
# Create the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=2000)

# Fit and transform the preprocessed corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(list(preprocessed_corpus['text']))

joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
# Convert the TF-IDF matrix to a DataFrame (for better visualization)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names(), 
                        index=['Doc'+str(i+1) for i in range(preprocessed_corpus.shape[0])])

# Reset indices before concatenating
preprocessed_corpus.reset_index(drop=True, inplace=True)
tfidf_df.reset_index(drop=True, inplace=True)

# Concatenate TF-IDF DataFrame with the label column
result_df = pd.concat([tfidf_df, preprocessed_corpus['label']], axis=1)

result_df.head()


Unnamed: 0,10,100,11,12,13,14,15,16,17,18,...,year,yemen,yesterday,yet,york,young,zero,zika,zone,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.014132,0.0,0.0,0.0,0.0,0.0,0.036473,0.0,0.03715,FAKE
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041144,0.0,0.0,0.0,0.066209,0.0,0.0,0.0,0.0,FAKE
2,0.025978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.007412,0.0,0.0,0.034113,0.0,0.0,0.0,0.0,0.0,FAKE
3,0.16135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.070627,0.0,0.0,0.0,0.0,0.0,FAKE
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FAKE


In [169]:
# Fit and transform the preprocessed corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(list(preprocessed_corpus['text']))

tfidf_df.head()

Unnamed: 0,10,100,11,12,13,14,15,16,17,18,...,ye,year,yemen,yesterday,yet,york,young,zero,zika,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.014132,0.0,0.0,0.0,0.0,0.0,0.036473,0.0,0.03715
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.041144,0.0,0.0,0.0,0.066209,0.0,0.0,0.0,0.0
2,0.025978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.007412,0.0,0.0,0.034113,0.0,0.0,0.0,0.0,0.0
3,0.16135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.070627,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [170]:
from sklearn.model_selection import train_test_split

# Assuming result_df is your DataFrame with preprocessed text, TF-IDF features, and labels
X = result_df.drop('label', axis=1)  # Features (TF-IDF)
y = result_df['label'].replace({"REAL": 0, "FAKE": 1}).T.groupby(level=0).last().T # Labels

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [171]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Create KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors

# Train the model
knn_model.fit(X_train, y_train)

# Make predictions on the test set
knn_predictions = knn_model.predict(X_test)

# Evaluate accuracy
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_classification_report = classification_report(y_test, knn_predictions)

print(f'KNN Accuracy: {knn_accuracy:.2f}')
print('KNN Classification Report:')
print(knn_classification_report)


KNN Accuracy: 0.85
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.80      0.84       641
           1       0.81      0.90      0.85       619

    accuracy                           0.85      1260
   macro avg       0.85      0.85      0.85      1260
weighted avg       0.85      0.85      0.85      1260



In [172]:
from sklearn.svm import SVC
import joblib

# Misalkan model Anda disimpan dalam variabel 'svm_model'


# Create SVM model
svm_model = SVC(kernel='linear')  # You can adjust the kernel type (linear, polynomial, etc.)

# Train the model
svm_model.fit(X_train, y_train)

joblib.dump(svm_model, 'svm_model.joblib')
# Make predictions on the test set
svm_predictions = svm_model.predict(X_test)

# Evaluate accuracy, precision, recall, and F1-score for SVM
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_classification_report = classification_report(y_test, svm_predictions)

print(f'SVM Accuracy: {svm_accuracy:.6f}')
print('SVM Classification Report:')
print(svm_classification_report)


SVM Accuracy: 0.911111
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       641
           1       0.90      0.92      0.91       619

    accuracy                           0.91      1260
   macro avg       0.91      0.91      0.91      1260
weighted avg       0.91      0.91      0.91      1260



In [173]:
from sklearn.linear_model import LogisticRegression

# Create Logistic Regression model
lr_model = LogisticRegression(random_state=42)

# Train the model
lr_model.fit(X_train, y_train)

# Make predictions on the test set
lr_predictions = lr_model.predict(X_test)

# Evaluate accuracy, precision, recall, and F1-score for Logistic Regression
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_classification_report = classification_report(y_test, lr_predictions)

print(f'Logistic Regression Accuracy: {lr_accuracy:.6f}')
print('Logistic Regression Classification Report:')
print(lr_classification_report)


Logistic Regression Accuracy: 0.900794
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       641
           1       0.89      0.91      0.90       619

    accuracy                           0.90      1260
   macro avg       0.90      0.90      0.90      1260
weighted avg       0.90      0.90      0.90      1260



In [174]:
from xgboost import XGBClassifier

# Create XGBoost model
xgb_model = XGBClassifier(random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
xgb_predictions = xgb_model.predict(X_test)

# Evaluate accuracy, precision, recall, and F1-score for XGBoost
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
xgb_classification_report = classification_report(y_test, xgb_predictions)

print(f'XGBoost Accuracy: {xgb_accuracy:.2f}')
print('XGBoost Classification Report:')
print(xgb_classification_report)


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBoost Accuracy: 0.93
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.92      0.93       641
           1       0.92      0.93      0.92       619

    accuracy                           0.93      1260
   macro avg       0.93      0.93      0.93      1260
weighted avg       0.93      0.93      0.93      1260

