In [223]:
# ----- ----- Importing Section ----- ----- 
# Data libraries
import pandas as pd
import numpy as np

# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import pickle
from sklearn import metrics
# My utils
from sklearn.feature_extraction.text import CountVectorizer
from utils import df_utils
from sklearn.feature_extraction.text import TfidfVectorizer
from utils import nlp_utils


# ----- ----- ----- END ----- ----- ----- 

In [224]:
def read_data(random_state=None):
    """
    Load the real and fake news CSV files into one shuffled, labelled frame.

    Rows from './dataset/True.csv' get label 1 and rows from
    './dataset/Fake.csv' get label 0 in a new column named "c".

    Parameters
    ----------
    random_state : int, numpy Generator/RandomState or None, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be
        reproduced. The default (None) keeps the previous behaviour of a
        different row order on every call.

    Returns
    -------
    pandas.DataFrame
        Both files concatenated, shuffled, with a fresh 0..n-1 index.
    """
    true_df = df_utils.csv_to_dataframe(pd, './dataset/True.csv')
    true_df["c"] = 1
    false_df = df_utils.csv_to_dataframe(pd, './dataset/Fake.csv')
    false_df["c"] = 0

    df = pd.concat([true_df, false_df])
    # frac=1 returns every row in random order, i.e. a full shuffle.
    # drop=True discards the old index directly instead of materialising it
    # as an "index" column and deleting that column afterwards.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Todo : Remove this in production code
    # df = df[:100]

    return df

In [225]:
def prepare_data_frame():
    """
    Build the frame used for modelling.

    Loads the raw data, prints its summary, runs the cleaning/enhancement
    step, prints the summary again and returns the cleaned frame.
    """
    raw_frame = read_data()
    explore_and_plot_df(raw_frame)      # summary before cleaning

    cleaned_frame = enhance(raw_frame)
    explore_and_plot_df(cleaned_frame)  # summary after cleaning

    return cleaned_frame


In [226]:
def enhance(df):
    """
    Reduce the frame to the columns needed for modelling and clean the text.

    The 'title', 'subject' and 'date' columns are dropped, the result is
    passed through df_utils.drop_rows_with_null, and the remaining text is
    handed to process_df_text. The frame returned by that last step is the
    return value.
    """
    without_meta = df_utils.drop_cols_with_names(df, 'title', 'subject', 'date')
    without_nulls = df_utils.drop_rows_with_null(without_meta)
    # Corpus calculations as text
    return process_df_text(without_nulls)

In [227]:
def process_df_text(df):
    """
    Normalise the 'text' column and derive a stemmed 'text_stem' column.

    Steps, in order: lowercase, nlp_utils.remove_stopwords,
    nlp_utils.remove_punctuation, then nlp_utils.porter_stemmer into the new
    'text_stem' column. A preview of the frame is printed, after which the
    'text' column is removed via df_utils.drop_col_with_name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column of strings. It is not modified; all
        work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Whatever df_utils.drop_col_with_name returns for the processed copy
        with 'text' removed.
    """
    # Work on a copy so the caller's frame (possibly a slice of another
    # frame) is never mutated behind its back.
    df = df.copy()

    # Apply the helpers on the column itself rather than row-wise over the
    # whole frame (df.apply(..., axis=1)), which builds a Series per row
    # just to read one field.
    text = df['text'].str.lower()
    text = text.apply(nlp_utils.remove_stopwords)
    text = text.apply(nlp_utils.remove_punctuation)
    df['text'] = text

    # Stem
    df['text_stem'] = text.apply(nlp_utils.porter_stemmer)

    # Disabled experiments kept for reference: stopword / punctuation counts
    # (nlp_utils.count_stopwords, nlp_utils.count_punctuation), lemmatisation
    # (nlp_utils.word_net_lemmatizer) and 1/2/3-gram counters
    # (nlp_utils.counter_gram on 'text_stem').

    print(df.head())
    return df_utils.drop_col_with_name(df, "text")



In [228]:
def explore_and_plot_df(df):
    """Print the essential summary of ``df`` via df_utils; returns nothing."""
    df_utils.print_dataframe_essential_info(df, np)

In [240]:

def feature_extraction_binary_transform(df):
    """
    Vectorise 'text_stem' with CountVectorizer(binary=True).

    The frame is first passed through df_utils.drop_rows_with_null. Returns
    a tuple of (matrix from fit_transform, the 'c' label column).

    Note: the fitted vectorizer itself is not returned.
    """
    cleaned = df_utils.drop_rows_with_null(df)
    encoder = CountVectorizer(binary=True)
    return encoder.fit_transform(cleaned['text_stem']), cleaned['c']

In [230]:
def feature_extraction_CountVectorize(df):
    """
    Vectorise 'text_stem' with a default CountVectorizer.

    The frame is first passed through df_utils.drop_rows_with_null. Returns
    a tuple of (matrix from fit_transform, the 'c' label column).

    Note: the fitted vectorizer itself is not returned.
    """
    cleaned = df_utils.drop_rows_with_null(df)
    encoder = CountVectorizer()
    return encoder.fit_transform(cleaned['text_stem']), cleaned['c']

In [231]:
def feature_extraction_TfidfVectorize(df):
    """
    Vectorise 'text_stem' with a default TfidfVectorizer.

    The frame is first passed through df_utils.drop_rows_with_null. Returns
    a tuple of (matrix from fit_transform, the 'c' label column).

    Note: the fitted vectorizer itself is not returned.
    """
    cleaned = df_utils.drop_rows_with_null(df)
    encoder = TfidfVectorizer()
    return encoder.fit_transform(cleaned['text_stem']), cleaned['c']

In [232]:
def train_model(x_train, y_train):
    """
    Fit three classifiers on the same training data.

    Returns a 3-tuple, in this order:
    MultinomialNB, SVC(gamma='auto'), RandomForestClassifier(random_state=0).
    """
    # Each estimator's fit() returns the estimator itself, so construction
    # and fitting can be chained.
    naive_bayes = MultinomialNB().fit(x_train, y_train)
    support_vector = SVC(gamma='auto').fit(x_train, y_train)
    random_forest = RandomForestClassifier(random_state=0).fit(x_train, y_train)

    return naive_bayes, support_vector, random_forest


In [233]:
def save_model(model, path):
    """
    Serialise ``model`` to ``path`` with pickle.

    Parameters
    ----------
    model : object
        Any picklable object (here: a fitted classifier).
    path : str or path-like
        Destination file; it is created or overwritten.
    """
    # The context manager guarantees the handle is flushed and closed even
    # if pickling raises, instead of leaking it until garbage collection.
    with open(path, 'wb') as model_file:
        pickle.dump(model, model_file)

In [234]:
def make_predictions(model_path, x_test):
    """
    Load a pickled model from ``model_path`` and predict on ``x_test``.

    Parameters
    ----------
    model_path : str or path-like
        File previously written with pickle (e.g. by save_model).
    x_test : object
        Passed unchanged to the loaded model's ``predict`` method.

    Returns
    -------
    Whatever ``loaded_model.predict(x_test)`` returns.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files this
    # project wrote itself.
    # The context manager closes the handle as soon as loading finishes.
    with open(model_path, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    return loaded_model.predict(x_test)

In [235]:
def evaluation(model_path, x_test, y_test):
    """
    Load a pickled model, print its confusion matrix and return its score.

    Parameters
    ----------
    model_path : str or path-like
        File previously written with pickle (e.g. by save_model).
    x_test, y_test :
        Held-out features and labels, passed to the model's ``score`` and
        ``predict`` methods.

    Returns
    -------
    The value of ``loaded_model.score(x_test, y_test)``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files this
    # project wrote itself.
    # The context manager closes the handle as soon as loading finishes.
    with open(model_path, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    result = loaded_model.score(x_test, y_test)
    y_hat = loaded_model.predict(x_test)
    print(metrics.confusion_matrix(y_test, y_hat))
    return result

In [236]:
def main_binary_encoding():
    """
    Run the full experiment with binary count features.

    Reads the module-level ``df`` (it is not a parameter), holds out 30% of
    the rows for testing, trains the three classifiers, pickles them to
    'NVB.sav', 'SVC.sav' and 'RFC.sav', then prints each model's confusion
    matrix and score.
    """
    features, labels = feature_extraction_binary_transform(df)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    nb_model, svc_model, rfc_model = train_model(X_train, y_train)
    runs = (
        ('Naive Bayes Accuracy\n\n', 'NVB.sav', nb_model),
        ('SVC Accuracy\n\n', 'SVC.sav', svc_model),
        ('RFC Accuracy\n\n', 'RFC.sav', rfc_model),
    )

    # Persist every model first, then evaluate each one from disk.
    for _, model_path, model in runs:
        save_model(model, model_path)

    for heading, model_path, _ in runs:
        print(heading)
        print(evaluation(model_path, X_test, y_test))


In [237]:
def main_CountVectorize():
    """
    Run the full experiment with raw term-count features.

    Reads the module-level ``df`` (it is not a parameter), holds out 30% of
    the rows for testing, trains the three classifiers, pickles them to
    'NVB.sav', 'SVC.sav' and 'RFC.sav', then prints each model's confusion
    matrix and score.
    """
    features, labels = feature_extraction_CountVectorize(df)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    nb_model, svc_model, rfc_model = train_model(X_train, y_train)
    runs = (
        ('Naive Bayes Accuracy\n\n', 'NVB.sav', nb_model),
        ('SVC Accuracy\n\n', 'SVC.sav', svc_model),
        ('RFC Accuracy\n\n', 'RFC.sav', rfc_model),
    )

    # Persist every model first, then evaluate each one from disk.
    for _, model_path, model in runs:
        save_model(model, model_path)

    for heading, model_path, _ in runs:
        print(heading)
        print(evaluation(model_path, X_test, y_test))


In [238]:
def main_TfidfVectorize():
    """
    Run the full experiment with TF-IDF features.

    Reads the module-level ``df`` (it is not a parameter), holds out 30% of
    the rows for testing, trains the three classifiers, pickles them to
    'NVB.sav', 'SVC.sav' and 'RFC.sav', then prints each model's confusion
    matrix and score.
    """
    features, labels = feature_extraction_TfidfVectorize(df)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    nb_model, svc_model, rfc_model = train_model(X_train, y_train)
    runs = (
        ('Naive Bayes Accuracy\n\n', 'NVB.sav', nb_model),
        ('SVC Accuracy\n\n', 'SVC.sav', svc_model),
        ('RFC Accuracy\n\n', 'RFC.sav', rfc_model),
    )

    # Persist every model first, then evaluate each one from disk.
    for _, model_path, model in runs:
        save_model(model, model_path)

    for heading, model_path, _ in runs:
        print(heading)
        print(evaluation(model_path, X_test, y_test))


In [239]:

# Entry point: fetch NLP resources, build the cleaned frame, run one experiment.
if __name__ == "__main__":
    # Must run before any text processing; the captured output below shows
    # the NLTK 'book' collection being downloaded.
    nlp_utils.download_book()
    # `df` is bound at module level here; the main_* functions read it as a
    # global rather than taking it as an argument.
    df = prepare_data_frame()
    #vocab = sorted(set(word for sentence in df['text_stem'] for word in sentence.split()))
    print('binary_encoding\n\n')
    main_binary_encoding()
    # Alternative feature encodings, currently disabled:
    # print('CountVectorize\n\n')
    # main_CountVectorize()
    # print('TfidfVectorize\n\n')
    # main_TfidfVectorize()

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\mmaba\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\mmaba\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     C:\Users\mmaba\AppData\Roaming\nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\mmaba\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     C:\Users\mmaba\AppData\Roaming\nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     C:\Users\mmaba\AppData\R

Data frame shape: (44898, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   c        44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB
Data frame info: None

---Data Correlation ---
     c
c  1.0

---Data frame null count ---
title      0
text       0
subject    0
date       0
c          0
dtype: int64

Total values : 224490
Total missing values : 0
Remaining : 224490

Remaining percentage : 100.0%
Missing percentage : 0.0%
---
                                                text  c  \
0  washington reuters  clinton foundation multipl...  1   
1  unions things america want money invested bern...  0   
2  brave cop and mommy reminds us police officers...  0   
3  government continu

NameError: name 'true' is not defined

In [241]:
# Re-run of the binary-encoding experiment on its own.
# NOTE(review): main_binary_encoding() reads the global `df`, so this cell
# only works if an earlier cell has already bound `df` in the kernel - confirm
# it survives Restart & Run All.
print('binary_encoding\n\n')
main_binary_encoding()


binary_encoding


Naive Bayes Accuracy


[[6705  353]
 [ 124 6288]]
0.9645879732739421
SVC Accuracy


[[7030   28]
 [3143 3269]]
0.7645879732739421
RFC Accuracy


[[6970   88]
 [  49 6363]]
0.9898292501855976
