<a href="https://colab.research.google.com/github/lymoelopez/automated-filipino-fake-news-detector/blob/main/baselineModelTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline Model Training

## Import Libraries

In [None]:
import numpy as np
import pandas as pd

## Load Preprocessed Training Dataset

In [None]:
from google.colab import files
import io

In [47]:
# Import preprocessed training dataset from local files.
# The upload helper comes from google.colab; its result is indexed by the
# uploaded file's name ('train.csv') when the CSV is parsed afterwards.
preprocessedTrainingDataset = files.upload()

Saving train.csv to train.csv


In [41]:
# Store preprocessed training dataset into a Pandas DataFrame.
# The upload result is indexed by filename, so a file uploaded under any name
# other than 'train.csv' would make a hard-coded lookup fail with KeyError.
# Prefer the expected name; otherwise fall back to the first uploaded file.
if not preprocessedTrainingDataset:
  raise ValueError('No file was uploaded; re-run the upload cell and choose the training CSV.')

if 'train.csv' in preprocessedTrainingDataset:
  uploadedFileName = 'train.csv'
else:
  uploadedFileName = next(iter(preprocessedTrainingDataset))

# The uploaded content is raw bytes, so wrap it in a file-like object for read_csv.
preprocessedTrainingDataFrame = pd.read_csv(io.BytesIO(preprocessedTrainingDataset[uploadedFileName]))

In [42]:
# Preview the first five rows to sanity-check the loaded columns.
preprocessedTrainingDataFrame.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,article
0,0,0,"<s> ayon Ġsa Ġthe w rap . com , Ġnaghain Ġng Ġ..."
1,1,0,<s> kilala Ġrin Ġang Ġsinger Ġsa Ġpagk ump as ...
2,2,0,"<s> bl ant y re , Ġmal awi Ġ-- Ġbumiyahe Ġpatu..."
3,3,0,"<s> kasama Ġsa Ġprograma Ġang Ġpananalangin , ..."
4,4,0,<s> l inisin Ġang Ġfriendship Ġdepartment Ġdah...


## II. Feature Extraction

In [43]:
# Split the frame into the model input (article text) and the target (label).
trainingData = preprocessedTrainingDataFrame["article"]
trainingLabel = preprocessedTrainingDataFrame["label"]

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF extractor: terms appearing in more than 70% of the articles
# or in fewer than 5 articles are excluded from the vocabulary.
featureExtraction = TfidfVectorizer(
  min_df=5,
  max_df=0.7,
)

# Learn the vocabulary/IDF weights from the training articles and vectorize them in one pass.
tfidf_TrainingData = featureExtraction.fit_transform(trainingData)

### Save Fitted TF-IDF Vectorizer

In [45]:
import pickle

def saveModel(model, filename):
  """Serialize ``model`` to ``filename`` with pickle.

  Args:
    model: Any picklable object (here: a fitted vectorizer or classifier).
    filename: Destination path; an existing file is overwritten.

  Returns:
    None.
  """
  # The context manager closes (and therefore flushes) the file even if
  # pickling fails, instead of leaving the handle open until garbage collection.
  with open(filename, 'wb') as outputFile:
    pickle.dump(model, outputFile)

In [46]:
# Pickle the fitted TF-IDF vectorizer to disk.
# NOTE(review): 'Vectiruzer' looks like a misspelling of 'Vectorizer'; anything
# that loads this file must use the identical name — confirm with consumers
# before renaming.
filename = 'baselineTfidfVectiruzer.sav'
saveModel(featureExtraction, filename)

## III. Classification Layer

In [None]:
def trainModel(model, tfidf_TrainingData, trainingLabel):
  """Fit an estimator on the training features and hand the same object back.

  Args:
    model: Object exposing ``fit(features, labels)``.
    tfidf_TrainingData: Feature matrix passed as the first argument to ``fit``.
    trainingLabel: Target labels passed as the second argument to ``fit``.

  Returns:
    The ``model`` object that was passed in, after ``fit`` has been called on it.
  """
  # Whatever fit() returns is ignored; the caller always gets its own object back.
  model.fit(
    tfidf_TrainingData,
    trainingLabel,
  )
  return model

### Train Classifier Models

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression, SGDClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.ensemble import VotingClassifier

In [None]:
# Individual baseline classifiers.
logisticRegressionClassifier = LogisticRegression()
xgbClassifier = xgb.XGBClassifier()
svmClassifier = SVC()
# random_state expects an integer seed (or None / a RandomState). The previous
# `False` was coerced to seed 0, which reads as "seeding disabled" but in fact
# pins the seed; spell the seed out so the intent is explicit.
sgdClassifier = SGDClassifier(random_state=0)
passiveAggressiveClassifier = PassiveAggressiveClassifier(random_state=0)

# Hard-voting ensemble over the five classifiers above.
estimatorList = [
  ('lr', logisticRegressionClassifier),
  ('xgb', xgbClassifier),
  ('svm', svmClassifier),
  ('sgd', sgdClassifier),
  ('pac', passiveAggressiveClassifier),
]
votingClassifier = VotingClassifier(estimators=estimatorList, voting='hard')



In [None]:
# Every estimator to train: the five individual classifiers plus the ensemble.
modelsList = [
  logisticRegressionClassifier,
  xgbClassifier,
  svmClassifier,
  sgdClassifier,
  passiveAggressiveClassifier,
  votingClassifier,
]

# trainModel fits the object it is given and returns that same object, so the
# list entries are the trained models once the loop finishes.
for model in modelsList:
  trainModel(model, tfidf_TrainingData, trainingLabel)

### Save Trained Models

In [None]:
# Short names used as filename prefixes; paired with modelsList by position.
modelNamesList = [
  'lr',
  'xgb',
  'svm',
  'sgd',
  'pac',
  'voting',
]

# Pickle each trained model to '<name>Baseline.sav'.
for modelName, model in zip(modelNamesList, modelsList):
  filename = f'{modelName}Baseline.sav'
  saveModel(model, filename)

# References

[1] https://data-flair.training/blogs/advanced-python-project-detecting-fake-news/

[2] https://github.com/nabi-hassan/Fake-news-Detection-using-Ensemble-Learning-/blob/master/newnote.ipynb

[3] https://towardsdatascience.com/fake-news-detection-with-machine-learning-using-python-3347d9899ad1

[4] https://jovian.ai/piero-paialunga/notebook

[5] https://youtu.be/5X27excCyXk

[6] https://medium.com/@yashj302/text-cleaning-using-regex-python-f1dded1ac5bd

[7] https://stackoverflow.com/questions/32705962/removing-any-single-letter-on-a-string-in-python?answertab=scoredesc#tab-top