<a href="https://colab.research.google.com/github/omid-sakaki-ghazvini/Projects/blob/main/Persian_Fake_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install Dependencies and Setup

<div style="direction:rtl">
<font color='green' size="5px">
 کتابخانه های مورد نیاز را نصب میکنیم
    </font>
</div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from keras import backend as K
from keras.utils import to_categorical
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

import warnings

warnings.filterwarnings('ignore')

<div style="direction:rtl">
<font color='green' size="5px">
 از لینک زیر دیتاست را دانلود کرده و در پوشه هم مسیر همین ژوپیتر نوت بوک قرار دهید
    </font>
</div>

## Dataset: [Persian Fake Corona News (Kaggle)](https://www.kaggle.com/datasets/omidsakaki1370/persian-fake-corona-news)

# 2. Load Data

<div style="direction:rtl">
<font color='green' size="5px">
توسط خط فرمان زیر، دیتا را فراخوانی میکنیم
    </font>
</div>

In [None]:
# On Kaggle the dataset is mounted under /kaggle/input. Elsewhere (Colab, local
# Jupyter) the download instructions above ask for CoronaALL.csv to be placed
# next to this notebook, so fall back to that location when the Kaggle path is
# missing instead of failing with FileNotFoundError.
try:
    df = pd.read_csv('/kaggle/input/persian-fake-corona-news/CoronaALL.csv', sep='\t')
except FileNotFoundError:
    df = pd.read_csv('CoronaALL.csv', sep='\t')

# Preview the first rows.
df.head()

In [None]:
# Remove the exported index column. errors='ignore' keeps this cell re-runnable:
# on a second execution the column is already gone and a plain drop() would
# raise KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Keep only the rows whose Class is not 2, leaving the two classes (0 and 1)
# that are mapped to names further down.
df = df[df["Class"] != 2]

df.info()

In [None]:
# Discard every row that contains at least one missing value,
# then show summary statistics of what is left.
df = df.dropna(axis=0, how='any')
df.describe()

## 2.1. Data Analysis

In [None]:
# Bar chart with the number of samples per class label.
sns.countplot(x=df['Class'], data=df);

In [None]:
# Attach a human-readable label column derived from the numeric Class column.
df = df.assign(Class_Names=df['Class'].map({0: 'Posetive', 1: 'Negetive'}))

In [None]:
# Pie chart of the share of each class name in the dataset.
plt.figure(figsize=(5, 5))

# Number of rows per class name (the slices of the pie).
target_counts = df.groupby('Class_Names')['Class_Names'].count()

palette_color = sns.color_palette('pastel')
# Pull every slice slightly away from the centre; one offset per distinct class.
explode = [0.1] * df['Class_Names'].nunique()

target_counts.plot.pie(
    autopct="%1.1f%%",
    startangle=140,
    shadow=True,
    explode=explode,
    colors=palette_color,
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
    textprops={'fontsize': 14},
)

plt.title('Class Names Distribution', fontsize=18, weight='bold')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

# 3. Data Preparation

In [None]:
# Target vector: the numeric class label of every row.
y = df['Class']

# Word-level tokenizer limited to `max_fatures` words, splitting on spaces.
max_fatures = 1500
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['Text'].values)

# Texts -> integer sequences -> matrix with exactly 150 columns per row.
X = pad_sequences(tokenizer.texts_to_sequences(df['Text'].values), maxlen=150)

# 4. Split Data

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so that every accuracy reported below is
# reproducible across runs; stratify=y keeps the class ratio identical in the
# train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity check: shapes of the four resulting pieces.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# 5. Build ML Models

In [None]:
# Classical baselines trained directly on the padded token-id sequences.
models = {
    'TREE': DecisionTreeClassifier(),
    'RF' : RandomForestClassifier(),
    'SVM': SVC(),
    'AdaBoost': AdaBoostClassifier(),
    'GradBoost': GradientBoostingClassifier(),
    'CATBoost': CatBoostClassifier(),
    'XGBCL': XGBClassifier(),
    'LGBMCL': LGBMClassifier()
}

result = {}

# The loop variable is deliberately NOT called `model`: that name is used for
# the Keras CNN later in the notebook, and re-running this cell would otherwise
# silently replace the network with the last classifier of the loop.
for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    result[name] = {
        'Accuracy': accuracy
    }

# One row per model, single 'Accuracy' column.
result_df = pd.DataFrame(result).T

In [None]:
# Test-set accuracy of each classical model (one row per model).
result_df

In [None]:
# Line chart of the accuracy obtained by each classical model.
plt.figure(figsize=(10, 6))
sns.lineplot(data=result_df)
plt.title("Results")
plt.ylabel("Accuracy");

# 6. Build CNN Models

## 6.1. Creating Embedding Matrix

In [None]:
# Sequence length fed to the network (same value that was used for padding).
input_size = 150

data = pad_sequences(X, maxlen=input_size)
# One-hot encode the class labels. The original line read an undefined
# `labels` variable (NameError); the integer labels live in `y`.
labels = to_categorical(y)

# The network is trained with categorical cross-entropy and evaluated with
# np.argmax(..., axis=-1), so the split targets must be one-hot as well.
# Converting the existing split (rather than re-splitting) keeps the CNN on
# exactly the same train/test rows as the classical models. The ndim guard
# makes the cell safe to re-run: already-encoded targets are left untouched.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, num_classes=labels.shape[1])
    y_test = to_categorical(y_test, num_classes=labels.shape[1])

In [None]:
# One-hot "embedding": token index i is mapped to the unit vector e_(i-1);
# index 0 (padding) is mapped to the all-zero vector.
#
# tokenizer.word_index contains EVERY word seen during fitting, but with
# num_words set the tokenizer only ever emits indices 1 .. num_words-1.
# Sizing the matrix by the full vocabulary would allocate a
# (V+1) x V float matrix for indices that can never occur, so cap the size at
# what the sequences can actually contain.
vocab_size = len(tokenizer.word_index)
if tokenizer.num_words:
    vocab_size = min(vocab_size, tokenizer.num_words - 1)

# Row 0 = zeros (padding), rows 1..vocab_size = identity matrix.
embedding_weights = np.vstack([np.zeros(vocab_size), np.eye(vocab_size)])

embedding_layer = Embedding(vocab_size + 1, vocab_size, input_length=input_size, weights=[embedding_weights])

## 6.2. Model Architecture

In [None]:
# Convolutional stack. Each entry is [filters, kernel_size, pool_size];
# a pool_size of -1 means "no pooling after this convolution".
conv_layers = (
    [[256, 7, 3] for _ in range(2)]
    + [[128, 3, -1] for _ in range(3)]
    + [[128, 3, 3]]
)

# Sizes of the dense layers placed after the convolutional stack.
fully_connected_layers = [128]

# Output size and regularisation strength.
num_of_classes, dropout_p = 2, 0.5

# Training setup.
optimizer, loss_type = 'adam', 'categorical_crossentropy'

In [None]:
# Integer token ids -> one-hot embedding.
inputs = Input(shape=(input_size,), name='input', dtype='int64')
x = embedding_layer(inputs)

# Convolutional stack: Conv1D -> Dropout -> ReLU, optionally followed by max pooling.
for n_filters, kernel_size, pool_size in conv_layers:
    x = Conv1D(filters=n_filters, kernel_size=kernel_size)(x)
    x = Dropout(dropout_p)(x)
    x = Activation('relu')(x)
    if pool_size == -1:
        # -1 marks a convolution without pooling.
        continue
    x = MaxPooling1D(pool_size=pool_size)(x)

# Collapse the feature maps into one vector per sample.
x = Flatten()(x)

# Fully connected head, each dense layer followed by dropout.
for units in fully_connected_layers:
    x = Dense(units, activation='relu')(x)
    x = Dropout(dropout_p)(x)

# Class probabilities.
predictions = Dense(num_of_classes, activation='softmax')(x)

In [None]:
# Assemble the network from the input and output tensors and configure training.
model = Model(inputs=inputs, outputs=predictions)
model.compile(loss=loss_type, optimizer=optimizer, metrics=['accuracy'])

# Stop once val_loss has not improved for 10 epochs and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [None]:
# Train for at most 100 epochs; the early-stopping callback may end training sooner.
# NOTE(review): the test split doubles as validation data here.
hist = model.fit(
    X_train,
    y_train,
    batch_size=100,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

In [None]:
# Training vs. validation curves, one figure per metric.
#
# Changes from the previous version of this cell:
# * the figure size is passed when the figure is created (setting
#   plt.rcParams afterwards had no effect on the figure already open);
# * x ticks are left to matplotlib instead of being hard-coded to 0..10, so
#   the axis is labelled correctly however many epochs were actually trained;
# * the two copy-pasted plotting sections are folded into one loop.
curves = [('accuracy', 'Accuracy'), ('loss', 'Loss')]

for fig_id, (metric, label) in enumerate(curves):
    plt.figure(fig_id, figsize=(8, 6))
    plt.plot(hist.history[metric], 'r')
    plt.plot(hist.history['val_' + metric], 'g')
    plt.xlabel("Num of Epochs")
    plt.ylabel(label)
    plt.title("Training {0} vs Validation {0}".format(label))
    plt.legend(['train', 'validation'])

plt.show()

In [None]:
# Class probabilities for the held-out rows.
prediction = model.predict(X_test)

# Predicted class = index of the highest probability.
y_pred = np.argmax(prediction, axis=-1)
# Targets are one-hot when they were prepared for the CNN; accept plain integer
# labels too so the cell does not depend on how y_test was encoded.
y_true = np.argmax(y_test, axis=-1) if np.ndim(y_test) > 1 else np.asarray(y_test)

# accuracy_score is already imported at the top of the notebook, whereas
# classification_report never was (NameError). The reported number is the same
# overall accuracy, now with arguments in the conventional (true, predicted) order.
CNN_Model_accuracy = accuracy_score(y_true, y_pred)

print('CNN Model Accuracy : ', CNN_Model_accuracy)

In [None]:
# Hard-coded accuracies of the classical models (each wrapped in a one-element list).
# NOTE(review): these look like numbers copied from an earlier run — they are
# not recomputed from `result_df`.
models = dict(
    TREE=[0.820557],
    RF=[0.832482],
    SVM=[0.659852],
    AdaBoost=[0.586599],
    GradBoost=[0.674049],
    CATBoost=[0.805224],
    XGBCL=[0.829074],
    LGBMCL=[0.805792],
)

In [None]:
# Show the hard-coded scores as a table: one column per model.
pd.DataFrame(models)

In [None]:
# Add the CNN accuracy measured above to the score table.
models['CNN'] = CNN_Model_accuracy

In [None]:
# Build a one-row frame labelled 'Accuracy', then flip it so that every model
# becomes a row and 'Accuracy' the only column.
result_df = pd.DataFrame(data=models, index=['Accuracy']).transpose()
result_df

In [None]:
# Accuracy of every model, CNN included, as a line chart.
plt.figure(figsize=(10, 6))
sns.lineplot(data=result_df)
plt.ylabel("Accuracy")
plt.title("Results");

One of the main disadvantages of convolutional networks is their fully connected layers, which hold the largest share of the learnable parameters. These layers are responsible for classifying the features extracted by the convolutional layers, and they are computationally heavy and time-consuming to train. For that reason, in this notebook we take the features produced by the trained network and, instead of the network's own classification layer, feed them to classical machine-learning classifiers (a support vector machine, decision tree and several ensemble models), then compare the results.

# 7. CNN + ML Models

In [None]:
# Use the first Dense layer of the trained CNN as the feature extractor.
# Selecting it by type instead of by the auto-generated name 'dense' keeps the
# cell working after the architecture cell has been re-run: Keras then names
# new layers 'dense_2', 'dense_3', ... and get_layer('dense') would raise.
feature_layer = next(layer for layer in model.layers if isinstance(layer, Dense))
layer_name = feature_layer.name
model_feat = Model(inputs=model.input, outputs=feature_layer.output)

# Dense-layer activations for both splits; these become the inputs of the
# classical models below.
feat_train = model_feat.predict(X_train)
print(feat_train.shape)
feat_test = model_feat.predict(X_test)
print(feat_test.shape)

In [None]:
# Same classical models as before, now trained on the CNN features.
models = {
    'TREE': DecisionTreeClassifier(),
    'RF' : RandomForestClassifier(),
    'SVM': SVC(),
    'AdaBoost': AdaBoostClassifier(),
    'GradBoost': GradientBoostingClassifier(),
    'CATBoost': CatBoostClassifier(),
    'XGBCL': XGBClassifier(),
    'LGBMCL': LGBMClassifier()
}

# The classifiers need integer class ids, not one-hot rows. The conversion is
# loop-invariant, so do it once up front.
train_labels = np.argmax(y_train, axis=-1)
test_labels = np.argmax(y_test, axis=-1)

result = {}

# The loop variable must not be called `model`: that would overwrite the
# trained Keras network, and the feature-extraction cell above (which reads
# `model.input`) could no longer be re-run.
for name, clf in models.items():
    clf.fit(feat_train, train_labels)
    y_pred = clf.predict(feat_test)

    accuracy = accuracy_score(test_labels, y_pred)

    result[name] = {
        'Accuracy': accuracy
    }

# One row per model, single 'Accuracy' column.
result_df = pd.DataFrame(result).T

# 8. Results

In [None]:
# Hard-coded accuracies of the earlier experiments (raw-sequence models and the CNN).
# NOTE(review): these appear to be copied from a previous run, not recomputed here.
previous_scores = [
    ('TREE', 0.820557),
    ('RF', 0.832482),
    ('SVM', 0.659852),
    ('AdaBoost', 0.586599),
    ('GradBoost', 0.674049),
    ('CATBoost', 0.805224),
    ('XGBCL', 0.829074),
    ('LGBMCL', 0.805792),
    ('CNN', 0.867121),
]

# Same layout as before: model name -> one-element list holding its accuracy.
models = {name: [score] for name, score in previous_scores}

In [None]:
# One row per model with a single 'Previous_Accuracy' column.
Previous_results = pd.DataFrame(
    data=models,
    index=['Previous_Accuracy'],
).transpose()
Previous_results

In [None]:
# Accuracy of the classical models trained on the CNN features (one row per model).
result_df

In [None]:
# Side-by-side comparison: earlier accuracies (left) against the accuracies of
# the models trained on CNN features (right), on a shared x axis.
fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(20, 5))

panels = [
    (Previous_results, 'Previous Results'),
    (result_df, 'Current Results'),
]
for ax, (frame, heading) in zip(axes, panels):
    sns.lineplot(data=frame, ax=ax)
    ax.set_title(heading);