In [1]:
# ----- ----- Importing Section ----- ----- 
# Data libraries
import pandas as pd
import numpy as np

# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import pickle
from sklearn import metrics
# My utils
from sklearn.feature_extraction.text import CountVectorizer
from utils import df_utils
from sklearn.feature_extraction.text import TfidfVectorizer
from utils import nlp_utils


# ----- ----- ----- END ----- ----- ----- 

In [2]:
def read_data(path='./dataset/processed_data.csv', random_state=None):
    """Load the processed dataset, shuffle its rows and normalise column names.

    Parameters
    ----------
    path : str, optional
        CSV location handed to ``df_utils.csv_to_dataframe``. Defaults to the
        file this notebook has always read.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample``. ``None`` (the default) keeps
        the unseeded shuffle; pass an integer to get a repeatable row order.

    Returns
    -------
    The shuffled frame on a fresh ``0..n-1`` index, with the ``Content``
    column renamed to ``text`` and ``Category`` renamed to ``c``.
    (Assumes ``df_utils.csv_to_dataframe`` returns a pandas DataFrame.)
    """
    raw = df_utils.csv_to_dataframe(pd, path)
    # frac=1 samples every row exactly once, i.e. a full shuffle.
    shuffled = raw.sample(frac=1, random_state=random_state)
    # drop=True discards the old index directly. Materialising it as a column
    # and deleting "index" afterwards removed the wrong column whenever the
    # CSV itself already contained a column called "index".
    shuffled = shuffled.reset_index(drop=True)
    return shuffled.rename(columns={'Content': 'text', 'Category': 'c'})

In [3]:
def prepare_data_frame():
    """Run the load -> inspect -> enhance -> inspect pipeline.

    The frame from ``read_data`` is summarised with ``explore_and_plot_df``,
    passed through ``enhance``, summarised a second time, and the enhanced
    frame is returned.
    """
    raw_frame = read_data()
    explore_and_plot_df(raw_frame)        # summary before preprocessing

    enhanced_frame = enhance(raw_frame)
    explore_and_plot_df(enhanced_frame)   # summary after preprocessing

    return enhanced_frame


In [4]:
def enhance(df):
    """Trim the frame and run text preprocessing on it.

    Three steps, in order:
      1. ``df_utils.drop_cols_with_names`` is called with 'Title' and 'Link'.
      2. ``df_utils.drop_rows_with_null`` is applied to the result
         (presumably removing rows that contain nulls - confirm in df_utils).
      3. ``process_df_text`` transforms the text column.

    Returns whatever ``process_df_text`` returns.
    """
    without_meta = df_utils.drop_cols_with_names(df, 'Title', 'Link')
    complete_rows = df_utils.drop_rows_with_null(without_meta)
    return process_df_text(complete_rows)

In [5]:
def process_df_text(df):
    """Clean the ``text`` column and replace it with a stemmed ``text_stem`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. The cleaned text and the new
        ``text_stem`` column are written into this frame in place.

    Returns
    -------
    The result of ``df_utils.drop_col_with_name(df, "text")``, i.e. the frame
    carrying ``text_stem`` but (presumably) no ``text`` column.

    Side effects
    ------------
    Prints ``df.head()`` before the ``text`` column is dropped.
    """
    # Only punctuation removal and stemming are active; lower-casing,
    # stop-word handling, lemmatisation and n-gram counts are not applied here.

    # Series.apply visits each cell of the one column that is needed. The
    # row-wise DataFrame.apply(axis=1) form built a full row object per call
    # and does not yield a Series for an empty frame.
    df['text'] = df['text'].apply(nlp_utils.remove_punctuation)

    # Stemming runs on the punctuation-free text produced above.
    df['text_stem'] = df['text'].apply(nlp_utils.isir_stemmer)

    print(df.head())
    # The feature-extraction functions in this notebook read ``text_stem``,
    # so the intermediate cleaned text is dropped.
    df = df_utils.drop_col_with_name(df, "text")
    return df



In [6]:
def explore_and_plot_df(df):
    """Print summary diagnostics for ``df``.

    Delegates to ``df_utils.print_dataframe_essential_info``, handing it the
    frame and the numpy module. No plotting call is made here directly, and
    nothing is returned.
    """
    df_utils.print_dataframe_essential_info(df, np)

In [7]:

def feature_extraction_binary_transform(df):
    """Encode ``text_stem`` with a binary ``CountVectorizer``.

    ``df`` is first passed through ``df_utils.drop_rows_with_null``; a
    ``CountVectorizer(binary=True)`` is then fitted on the ``text_stem`` column.

    Returns
    -------
    (features, labels) : the matrix produced by ``fit_transform`` and the
    ``c`` column of the filtered frame.

    NOTE(review): the fitted vectorizer is not returned, and it is fitted on
    every row passed in - callers that split into train/test afterwards get a
    vocabulary that also reflects the test rows.
    """
    clean = df_utils.drop_rows_with_null(df)
    encoder = CountVectorizer(binary=True)
    return encoder.fit_transform(clean['text_stem']), clean['c']

In [8]:
def feature_extraction_CountVectorize(df):
    """Encode ``text_stem`` with a default ``CountVectorizer``.

    ``df`` is first passed through ``df_utils.drop_rows_with_null``; a
    ``CountVectorizer()`` with default settings is then fitted on the
    ``text_stem`` column.

    Returns
    -------
    (features, labels) : the matrix produced by ``fit_transform`` and the
    ``c`` column of the filtered frame.

    NOTE(review): the fitted vectorizer is not returned, and it is fitted on
    every row passed in - callers that split into train/test afterwards get a
    vocabulary that also reflects the test rows.
    """
    clean = df_utils.drop_rows_with_null(df)
    encoder = CountVectorizer()
    return encoder.fit_transform(clean['text_stem']), clean['c']

In [9]:
def feature_extraction_TfidfVectorize(df):
    """Encode ``text_stem`` with a default ``TfidfVectorizer``.

    ``df`` is first passed through ``df_utils.drop_rows_with_null``; a
    ``TfidfVectorizer()`` with default settings is then fitted on the
    ``text_stem`` column.

    Returns
    -------
    (features, labels) : the matrix produced by ``fit_transform`` and the
    ``c`` column of the filtered frame.

    NOTE(review): the fitted vectorizer is not returned, and it is fitted on
    every row passed in - callers that split into train/test afterwards get
    vocabulary and IDF weights that also reflect the test rows.
    """
    clean = df_utils.drop_rows_with_null(df)
    encoder = TfidfVectorizer()
    return encoder.fit_transform(clean['text_stem']), clean['c']

In [10]:
def train_model(x_train, y_train):
    """Fit three classifiers on the same training data.

    The estimators, in the order they are fitted and returned:
      1. ``MultinomialNB()``
      2. ``SVC(gamma='auto')``
      3. ``RandomForestClassifier(random_state=0)``

    Returns
    -------
    tuple
        ``(naive_bayes, svc, random_forest)`` - the three fitted estimators.

    NOTE(review): the outputs recorded in this notebook show the SVC scoring
    well below the other two models; the ``gamma='auto'`` setting may be worth
    revisiting.
    """
    estimators = (
        MultinomialNB(),
        SVC(gamma='auto'),
        RandomForestClassifier(random_state=0),
    )
    for estimator in estimators:
        estimator.fit(x_train, y_train)
    return estimators


In [11]:
def save_model(model, path):
    """Serialise ``model`` to ``path`` using pickle.

    Parameters
    ----------
    model : object
        Any picklable object (the fitted estimators in this notebook).
    path : str or path-like
        Destination file; it is created or overwritten.

    The file is opened in a ``with`` block so the handle is flushed and
    closed deterministically, even if pickling raises.
    """
    with open(path, 'wb') as handle:
        pickle.dump(model, handle)

In [12]:
def make_predictions(model_path, x_test):
    """Load a pickled model from ``model_path`` and predict on ``x_test``.

    Parameters
    ----------
    model_path : str or path-like
        File previously written with pickle (e.g. by ``save_model``).
    x_test : object
        Input forwarded unchanged to the loaded model's ``predict`` method.

    Returns
    -------
    Whatever ``loaded_model.predict(x_test)`` returns.
    """
    # SECURITY: unpickling can execute arbitrary code embedded in the file.
    # Only load model files produced by a trusted source.
    with open(model_path, 'rb') as handle:
        loaded_model = pickle.load(handle)
    return loaded_model.predict(x_test)

In [13]:
def evaluation(model_path, x_test, y_test):
    """Load a pickled model, print its confusion matrix and return its score.

    Parameters
    ----------
    model_path : str or path-like
        File previously written with pickle (e.g. by ``save_model``).
    x_test, y_test : object
        Held-out inputs and labels, forwarded to the model's ``score`` and
        ``predict`` methods and to ``metrics.confusion_matrix``.

    Returns
    -------
    The value of ``loaded_model.score(x_test, y_test)``.

    Side effects
    ------------
    Prints ``metrics.confusion_matrix(y_test, y_hat)`` to stdout.
    """
    # SECURITY: unpickling can execute arbitrary code embedded in the file.
    # Only load model files produced by a trusted source.
    with open(model_path, 'rb') as handle:
        loaded_model = pickle.load(handle)
    result = loaded_model.score(x_test, y_test)
    y_hat = loaded_model.predict(x_test)
    print(metrics.confusion_matrix(y_test, y_hat))
    return result

In [14]:
def main_binary_encoding():
    """Train, persist and evaluate the three classifiers on binary features.

    Reads the module-level ``df``, encodes it with
    ``feature_extraction_binary_transform``, holds out 30% of the rows
    (``random_state=42``), fits the models via ``train_model``, pickles them
    to 'NVB.sav', 'SVC.sav' and 'RFC.sav', then prints a heading and the
    ``evaluation`` result for each saved model.

    NOTE(review): the other ``main_*`` functions in this notebook write to the
    same three file names, so each run overwrites the previous run's models.
    """
    features, labels = feature_extraction_binary_transform(df)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )
    nb_model, svc_model, rfc_model = train_model(X_train, y_train)

    # (heading printed before the score, pickle path, fitted estimator)
    trained = (
        ('Naive Bayes Accuracy\n\n', 'NVB.sav', nb_model),
        ('SVC Accuracy\n\n', 'SVC.sav', svc_model),
        ('RFC Accuracy\n\n', 'RFC.sav', rfc_model),
    )

    # Persist every model first, then evaluate each from its file.
    for _, model_path, fitted in trained:
        save_model(fitted, model_path)

    for heading, model_path, _ in trained:
        print(heading)
        print(evaluation(model_path, X_test, y_test))


In [15]:
def main_CountVectorize():
    """Train, persist and evaluate the three classifiers on count features.

    Reads the module-level ``df``, encodes it with
    ``feature_extraction_CountVectorize``, holds out 30% of the rows
    (``random_state=42``), fits the models via ``train_model``, pickles them
    to 'NVB.sav', 'SVC.sav' and 'RFC.sav', then prints a heading and the
    ``evaluation`` result for each saved model.

    NOTE(review): the other ``main_*`` functions in this notebook write to the
    same three file names, so each run overwrites the previous run's models.
    """
    features, labels = feature_extraction_CountVectorize(df)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )
    nb_model, svc_model, rfc_model = train_model(X_train, y_train)

    # (heading printed before the score, pickle path, fitted estimator)
    trained = (
        ('Naive Bayes Accuracy\n\n', 'NVB.sav', nb_model),
        ('SVC Accuracy\n\n', 'SVC.sav', svc_model),
        ('RFC Accuracy\n\n', 'RFC.sav', rfc_model),
    )

    # Persist every model first, then evaluate each from its file.
    for _, model_path, fitted in trained:
        save_model(fitted, model_path)

    for heading, model_path, _ in trained:
        print(heading)
        print(evaluation(model_path, X_test, y_test))


In [16]:
def main_TfidfVectorize():
    """Train, persist and evaluate the three classifiers on TF-IDF features.

    Reads the module-level ``df``, encodes it with
    ``feature_extraction_TfidfVectorize``, holds out 30% of the rows
    (``random_state=42``), fits the models via ``train_model``, pickles them
    to 'NVB.sav', 'SVC.sav' and 'RFC.sav', then prints a heading and the
    ``evaluation`` result for each saved model.

    NOTE(review): the other ``main_*`` functions in this notebook write to the
    same three file names, so each run overwrites the previous run's models.
    """
    features, labels = feature_extraction_TfidfVectorize(df)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )
    nb_model, svc_model, rfc_model = train_model(X_train, y_train)

    # (heading printed before the score, pickle path, fitted estimator)
    trained = (
        ('Naive Bayes Accuracy\n\n', 'NVB.sav', nb_model),
        ('SVC Accuracy\n\n', 'SVC.sav', svc_model),
        ('RFC Accuracy\n\n', 'RFC.sav', rfc_model),
    )

    # Persist every model first, then evaluate each from its file.
    for _, model_path, fitted in trained:
        save_model(fitted, model_path)

    for heading, model_path, _ in trained:
        print(heading)
        print(evaluation(model_path, X_test, y_test))


In [17]:

# if __name__ == "__main__":
# nlp_utils.download_book()
# df = prepare_data_frame()
#vocab = sorted(set(word for sentence in df['text_stem'] for word in sentence.split()))
# print('binary_encoding\n\n')
# main_binary_encoding()
# print('CountVectorize\n\n')
# main_CountVectorize()
# print('TfidfVectorize\n\n')
# main_TfidfVectorize()

In [18]:
# Build the preprocessed frame once. The main_* functions defined above read
# this module-level `df`, so this cell must run before any of them is called.
df = prepare_data_frame()

Data frame shape: (8399, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8399 entries, 0 to 8398
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   8399 non-null   object
 1   text    8386 non-null   object
 2   Link    8399 non-null   object
 3   c       8399 non-null   object
dtypes: object(4)
memory usage: 262.6+ KB
Data frame info: None

---Data Correlation ---
Empty DataFrame
Columns: []
Index: []

---Data frame null count ---
Title     0
text     13
Link      0
c         0
dtype: int64

Total values : 33596
Total missing values : 13
Remaining : 33583

Remaining percentage : 99.96130491725205%
Missing percentage : 0.03869508274794618%
---
                                                text                c  \
0  اكتشف باحثون للتو يكون ساهم مرض غامض يقتل الشم...  علوم وتكنولوجيا   
1  وقد اسفرت التفجيرات المدمره هزت مرفا بيروت 4 ا...             أخرى   
2  وفي بيان صحفي قال انطونيو غوتيريش ان الوضع ناج...      

In [19]:
# Experiment 1: train and evaluate the three classifiers on features from
# CountVectorizer(binary=True).
print('BinaryEncoding\n\n')
main_binary_encoding()

BinaryEncoding


Naive Bayes Accuracy


[[836   4   1]
 [  8 835   4]
 [ 32  44 752]]
0.9630365659777425
SVC Accuracy


[[453   0 388]
 [ 21  69 757]
 [  3   0 825]]
0.5353736089030207
RFC Accuracy


[[819   7  15]
 [  8 820  19]
 [ 14  21 793]]
0.9666136724960255


In [20]:
# Experiment 2: train and evaluate the three classifiers on features from a
# default CountVectorizer().
print('CountVectorize\n\n')
main_CountVectorize()

CountVectorize


Naive Bayes Accuracy


[[831   6   4]
 [  5 835   7]
 [ 15  34 779]]
0.9717806041335453
SVC Accuracy


[[708   8 125]
 [  8 629 210]
 [  7  15 806]]
0.8517488076311606
RFC Accuracy


[[824   7  10]
 [  9 820  18]
 [ 14  28 786]]
0.965818759936407


In [21]:
# Experiment 3: train and evaluate the three classifiers on features from a
# default TfidfVectorizer().
print('TfidfVectorize\n\n')
main_TfidfVectorize()

TfidfVectorize


Naive Bayes Accuracy


[[840   1   0]
 [ 20 824   3]
 [ 55  38 735]]
0.9534976152623211
SVC Accuracy


[[  0   0 841]
 [  0   0 847]
 [  0   0 828]]
0.32909379968203495
RFC Accuracy


[[829   5   7]
 [  8 824  15]
 [ 11  20 797]]
0.9737678855325914
