In [17]:
import pandas as pd

data = "../data/feature_selection_data.csv"

data = pd.read_csv(data)
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,text,label,title_len,text_len,title_word_count,text_word_count
0,0,0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",fake,76,2114,11,361
1,1,1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,fake,71,2823,9,495
2,2,2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,fake,88,2402,14,379
3,3,3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,true,64,629,9,88
4,4,4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,fake,124,793,19,138


In [None]:
features = data.drop(columns="label")
target = data["label"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

In [20]:
!pip install nltk



In [None]:
import nltk

nltk.download("punkt_tab")
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/mohiuddin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /home/mohiuddin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mohiuddin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mohiuddin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/mohiuddin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
import re
import os
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def nlp_pipeline_generator(text_series):

    for text in text_series:
        if not isinstance(text, str):
            return []

        # convert lowecase
        text = text.lower()

        # Remove Punction and Numbers
        text = re.sub(r"[^a-z\s]", "", text)

        # Tokenize
        token = word_tokenize(text)

        # Remove stopwords
        filtered_tokens = [word for word in token if word not in stop_words]

        # Lemmatize
        tokens = [lemmatizer.lemmatize(t) for t in filtered_tokens]

        return tokens


ColumnTF = ColumnTransformer(
    [
        (
            "tfidf_text",
            TfidfVectorizer(analyzer=nlp_pipeline_generator, max_features=5000),
            "text",
        ),
        (
            "tfidf_title",
            TfidfVectorizer(analyzer=nlp_pipeline_generator, max_features=5000),
            "title",
        ),
        ("scaled_TitleWC", StandardScaler(), ["title_word_count"]),
        ("scaled_TextWC", StandardScaler(), ["text_word_count"]),
        ("scaled_Textl", StandardScaler(), ["text_len"]),
        ("scaled_Titlel", StandardScaler(), ["title_len"]),
    ]
)

pipeline = Pipeline(
    [("vectorizer", ColumnTF), ("RF", RandomForestClassifier(verbose=3))], verbose=True
)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

model_path = os.path.join("models", "fake_news_full_pipeline.pkl")

joblib.dump(pipeline, model_path)
print(f"Model saved successfully at: {model_path}")

folder_name = "models"


if not os.path.exists(folder_name):
    os.makedirs(folder_name)
    print(f"'{folder_name}' folder created inside your project.")

model_path = os.path.join(folder_name, "fake_news_full_pipeline.pkl")


joblib.dump(pipeline, model_path)

print(f"Success! Your model is saved at: {model_path}")

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   7.8s
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100


[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:   29.5s


building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.6min finished


[Pipeline] ................ (step 2 of 2) Processing RF, total= 1.6min


[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.9s finished


[[4306  390]
 [ 356 3928]]
              precision    recall  f1-score   support

        fake       0.92      0.92      0.92      4696
        true       0.91      0.92      0.91      4284

    accuracy                           0.92      8980
   macro avg       0.92      0.92      0.92      8980
weighted avg       0.92      0.92      0.92      8980



FileNotFoundError: [Errno 2] No such file or directory: 'models/fake_news_full_pipeline.pkl'

In [3]:
def nlp_pipeline_generator(text_series):

    for text in text_series:
        if not isinstance(text, str):
            return []

        # convert lowecase
        text = text.lower()

        # Remove Punction and Numbers
        text = re.sub(r"[^a-z\s]", "", text)

        # Tokenize
        token = word_tokenize(text)

        # Remove stopwords
        filtered_tokens = [word for word in token if word not in stop_words]

        # Lemmatize
        tokens = [lemmatizer.lemmatize(t) for t in filtered_tokens]

        return tokens


def model_traning():
    features = data.drop(columns="label")
    target = data["label"]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    ColumnTF = ColumnTransformer(
        [
            (
                "tfidf_text",
                TfidfVectorizer(analyzer=nlp_pipeline_generator, max_features=5000),
                "text",
            ),
            (
                "tfidf_title",
                TfidfVectorizer(analyzer=nlp_pipeline_generator, max_features=5000),
                "title",
            ),
            ("scaled_TitleWC", StandardScaler(), ["title_word_count"]),
            ("scaled_TextWC", StandardScaler(), ["text_word_count"]),
            ("scaled_Textl", StandardScaler(), ["text_len"]),
            ("scaled_Titlel", StandardScaler(), ["title_len"]),
        ]
    )

    pipeline = Pipeline(
        [("vectorizer", ColumnTF), ("RF", RandomForestClassifier(verbose=3))],
        verbose=True,
    )

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    model_path = os.path.join("models", "fake_news_full_pipeline.pkl")

    joblib.dump(pipeline, model_path)
    print(f"Model saved successfully at: {model_path}")

    folder_name = "models"

    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
        print(f"'{folder_name}' folder created inside your project.")

    model_path = os.path.join(folder_name, "fake_news_full_pipeline.pkl")

    joblib.dump(pipeline, model_path)

    print(f"Success! Your model is saved at: {model_path}")
    return