In [1]:
#!kaggle datasets download -d aadyasingh55/fake-news-classification
#!unzip fake-news-classification.zip


In [2]:
import pandas as pd

df_train = pd.read_csv(
    "train (2).csv",
    sep=";",
    quotechar='"',
    escapechar="\\",
    engine="python")
df_test = pd.read_csv(
    "test (1).csv",
    sep=";",
    quotechar='"',
    escapechar="\\",
    engine="python")
df_evaluate = pd.read_csv(
    "evaluation.csv",
    sep=";",
    quotechar='"',
    escapechar="\\",
    engine="python")
all_dfs = [df_train, df_test, df_evaluate]

In [3]:
for df in [df_train, df_test, df_evaluate]:
    df.rename(columns={df.columns[0]: "index"}, inplace=True)
    df.title = df.title.str.lower().str.replace(r"[()\\'/:-]@.,'!‘’“”", "", regex=True)
    df.text = df.text.str.lower().str.replace("(", "").str.replace(")", "").str.replace("\\", "").str.replace("'", "").str.replace("/", "").str.replace(":", "").str.replace(" -", "").str.replace("-", "")


In [4]:
y_train_true = df_train.label.values
y_test_true = df_test.label.values
y_evaluate_true = df_evaluate.label.values

In [5]:
for df in all_dfs:
    df.drop("label", axis=1, inplace=True)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [7]:
for df in all_dfs:
    all_nas = df.isna().sum().sum()
    print(all_nas)
    assert all_nas == 0

0
0
0


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression


preprocess = ColumnTransformer(
    transformers=[
        ("tfidf_title", TfidfVectorizer(), "title"),
        ("tfidf_text", TfidfVectorizer(), "text")
    ],
    remainder="drop"
)
pipeline = Pipeline([
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=200, class_weight="balanced"))
])


In [28]:
pipeline.fit(df_train, y_train_true)

In [29]:
y_test = pipeline.predict(df_test)
y_evaluate = pipeline.predict(df_evaluate)


In [30]:
from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score
# Evaluate the model (e.g., using accuracy)
accuracy = accuracy_score(y_test, y_test_true)
confusion_matrix = confusion_matrix(y_test, y_test_true)
print(f"Accuracy: {accuracy:.2f}. Balanced accuracy score: {balanced_accuracy_score(y_test, y_test_true)}")
print(f"Accuracy on evaluation: {accuracy_score(y_evaluate, y_evaluate_true):.2f}, Balance on evaluation {balanced_accuracy_score(y_evaluate, y_evaluate_true)}")

#TODO: classes might be unbalanced! Check this

Accuracy: 0.98. Balanced accuracy score: 0.9775110449006783
Accuracy on evaluation: 0.97, Balance on evaluation 0.9726950354609929


In [31]:
import pickle
with open("pipeline_v1.pickle", "wb") as f:
    pickle.dump(pipeline, f)

In [32]:
def load_pipeline(filename: str):
    with open(filename, "rb") as f:
        saved = pickle.load(f)
    return saved

In [33]:
loaded_pipeline = load_pipeline("pipeline_v1.pickle")

y_test_loaded = pipeline.predict(df_test)
y_evaluate_loaded = pipeline.predict(df_evaluate)
print(f"Accuracy: {accuracy_score(y_test_loaded, y_test_true):.2f}")
print(f"Accuracy on evaluation: {accuracy_score(y_evaluate_loaded, y_evaluate_true):.2f}")

Accuracy: 0.98
Accuracy on evaluation: 0.97
