In [1]:
#!kaggle datasets download -d aadyasingh55/fake-news-classification
#!unzip fake-news-classification.zip


In [2]:
import pandas as pd

def read_split(path: str) -> pd.DataFrame:
    """Read one dataset split from a CSV file.

    All splits are parsed with the same options: ``;`` as the field separator,
    ``"`` as the quote character, backslash as the escape character, and the
    Python parsing engine.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The parsed split as a DataFrame.
    """
    return pd.read_csv(
        path,
        sep=";",
        quotechar='"',
        escapechar="\\",
        engine="python",
    )


df_train = read_split("train (2).csv")
df_test = read_split("test (1).csv")
df_evaluate = read_split("evaluation.csv")
# Convenience list for cells that apply the same step to every split.
all_dfs = [df_train, df_test, df_evaluate]

In [3]:
# Punctuation stripped from titles. The earlier pattern closed its character
# class right after "-", leaving "@.,'!‘’“”" as a literal tail that had to
# follow a class character, so it practically never matched and titles were
# only lower-cased. All characters now live inside one class.
# NOTE: outputs recorded before this fix were produced with uncleaned titles.
TITLE_PUNCT_PATTERN = r"[()\\'/:\-@.,!‘’“”]"
# Single characters stripped from article bodies.
TEXT_PUNCT_PATTERN = r"[()\\'/:]"

for df in all_dfs:
    # The first CSV column is renamed to "index".
    df.rename(columns={df.columns[0]: "index"}, inplace=True)
    df["title"] = (
        df["title"]
        .str.lower()
        .str.replace(TITLE_PUNCT_PATTERN, "", regex=True)
    )
    # Same result as the previous chain of single-character replaces, with
    # `regex=` spelled out so behaviour does not depend on the pandas default.
    # " -" is removed before "-" so a dash takes its leading space with it.
    df["text"] = (
        df["text"]
        .str.lower()
        .str.replace(TEXT_PUNCT_PATTERN, "", regex=True)
        .str.replace(" -", "", regex=False)
        .str.replace("-", "", regex=False)
    )


In [4]:
# Pull the ground-truth labels out of each split as arrays, in the order
# train / test / evaluation.
y_true_train, y_true_test, y_true_evaluate = (
    frame["label"].values for frame in (df_train, df_test, df_evaluate)
)

In [5]:
# Remove the target column from every split, in place.
for df in all_dfs:
    df.drop(columns=["label"], inplace=True)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [7]:
# Sanity check: no split may contain a missing value in any column.
for df in all_dfs:
    all_nas = df.isna().to_numpy().sum()  # total count of NaN cells in the frame
    print(all_nas)
    assert all_nas == 0

0
0
0


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
# Stand-alone vectorizers, one per text column.
# NOTE(review): nothing in the visible notebook reads these two names
# (TwoTfIds creates its own instances) — confirm whether they are still needed.
tfidf_title, tfidf_text = TfidfVectorizer(), TfidfVectorizer()


class TwoTfIds(TfidfVectorizer):
    """Vectorize a DataFrame's ``title`` and ``text`` columns with two TF-IDF models.

    Each column has its own ``TfidfVectorizer``. The two sparse matrices are
    stacked horizontally, title features first.

    The class subclasses ``TfidfVectorizer`` but every method defined here
    delegates to the two wrapped vectorizers (``tfidf_title`` and
    ``tfidf_text``) rather than to the inherited state.
    """

    def __init__(self):
        super().__init__()
        self.tfidf_title = TfidfVectorizer()
        self.tfidf_text = TfidfVectorizer()

    def fit(self, df: pd.DataFrame, y=None):
        """Fit both wrapped vectorizers on ``df.title`` and ``df.text``.

        Without this override the inherited ``fit`` would be called on the
        DataFrame itself and the wrapped vectorizers would stay unfitted.

        Args:
            df: Frame providing ``title`` and ``text`` columns.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            This instance, to allow call chaining.
        """
        self.tfidf_title.fit(df.title)
        self.tfidf_text.fit(df.text)
        return self

    def fit_transform(self, df: pd.DataFrame, y=None):
        """Fit both vectorizers on ``df`` and return the stacked features.

        Args:
            df: Frame providing ``title`` and ``text`` columns.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            Sparse matrix with the title features followed by the text features.
        """
        x_title = self.tfidf_title.fit_transform(df.title)
        x_text = self.tfidf_text.fit_transform(df.text)
        return hstack([x_title, x_text])

    def transform(self, df: pd.DataFrame):
        """Vectorize ``df`` with the already-fitted vectorizers.

        Args:
            df: Frame providing ``title`` and ``text`` columns.

        Returns:
            Sparse matrix with the title features followed by the text features.
        """
        x_title = self.tfidf_title.transform(df.title)
        x_text = self.tfidf_text.transform(df.text)
        return hstack([x_title, x_text])

    def load(self, tfidf_title: TfidfVectorizer, tfidf_text: TfidfVectorizer):
        """Replace the wrapped vectorizers with the given ones.

        Args:
            tfidf_title: Vectorizer to use for the ``title`` column.
            tfidf_text: Vectorizer to use for the ``text`` column.

        Returns:
            This instance, to allow call chaining.
        """
        self.tfidf_title = tfidf_title
        self.tfidf_text = tfidf_text
        return self

# Fit the title/text vectorizers on the training split and build its feature matrix.
tfid_vectorizer = TwoTfIds()
x_train = tfid_vectorizer.fit_transform(df=df_train)

In [13]:
# Logistic-regression classifier trained on the stacked TF-IDF features.
model = LogisticRegression(max_iter=200)
model.fit(X=x_train, y=y_true_train)

In [16]:
# Vectorize the held-out splits with the vectorizers fitted on the training
# data, then predict. Note: y_test / y_evaluate hold *predictions*; the
# ground-truth labels are in y_true_test / y_true_evaluate.
x_test = tfid_vectorizer.transform(df_test)
x_evaluate = tfid_vectorizer.transform(df_evaluate)
y_test = model.predict(x_test)
y_evaluate = model.predict(x_evaluate)


In [17]:
from sklearn.metrics import accuracy_score
# Evaluate the model with plain accuracy on both held-out splits.
# accuracy_score takes (y_true, y_pred); the arguments were previously passed
# the other way round. Accuracy itself is symmetric, so the reported numbers
# are unchanged, but the correct order matters if another metric is swapped in.
accuracy = accuracy_score(y_true_test, y_test)
print(f"Accuracy: {accuracy:.2f}")
accuracy_evaluation = accuracy_score(y_true_evaluate, y_evaluate)
print(f"Accuracy on evaluation: {accuracy_evaluation:.2f}")

# TODO: check the class balance of the labels; accuracy alone can be misleading if the classes are imbalanced.

Accuracy: 0.98
Accuracy on evaluation: 0.97


In [20]:
import pickle
# Persist the two fitted vectorizers and the classifier together in one pickle.
artifacts = {
    "tfid_title": tfid_vectorizer.tfidf_title,
    "tfid_text": tfid_vectorizer.tfidf_text,
    "model": model,
}
with open("model_v1.pickle", "wb") as model_file:
    pickle.dump(artifacts, model_file)

In [22]:
def load_model(filename: str):
    """Restore the vectorizer pair and the classifier from a pickle file.

    The file must contain a dict with the keys ``"tfid_title"``,
    ``"tfid_text"`` and ``"model"``. Unpickling can execute arbitrary code,
    so only load files from a trusted source.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        A ``(vectorizer, model)`` tuple, where ``vectorizer`` is a ``TwoTfIds``
        wrapping the two stored vectorizers.
    """
    with open(filename, "rb") as source:
        payload = pickle.load(source)

    # Look up all three entries first so a missing key fails before anything is built.
    title_vectorizer, text_vectorizer, classifier = (
        payload[key] for key in ("tfid_title", "tfid_text", "model")
    )
    vectorizer = TwoTfIds().load(title_vectorizer, text_vectorizer)
    return vectorizer, classifier

In [24]:
# Round-trip check: reload the pickled artifacts and re-score both held-out splits.
# Note that this rebinds `model` to the unpickled classifier.
tfid_loaded, model = load_model("model_v1.pickle")

y_test_loaded = model.predict(tfid_loaded.transform(df_test))
y_evaluate_loaded = model.predict(tfid_loaded.transform(df_evaluate))
# accuracy_score takes (y_true, y_pred); the arguments were previously reversed.
# The printed values are unaffected because accuracy is symmetric.
print(f"Accuracy: {accuracy_score(y_true_test, y_test_loaded):.2f}")
print(f"Accuracy on evaluation: {accuracy_score(y_true_evaluate, y_evaluate_loaded):.2f}")

Accuracy: 0.98
Accuracy on evaluation: 0.97
