**Load the necessary libraries**

In [None]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import ARLSTem
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

**Load the training and testing datasets**

In [None]:
# Read the train and test splits from CSV files given as relative paths.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv')
)

**Clean Dataset**

In [None]:
# Emoji / pictograph code-point ranges stripped by clean_text; compiled once
# instead of on every call.
# NOTE(review): the last range (U+24C2..U+1F251) is very wide -- besides emoji
# it also covers CJK ideographs and the Arabic Presentation Forms blocks
# (U+FB50..U+FDFF, U+FE70..U+FEFF). It is kept unchanged here to preserve the
# existing cleaning behaviour; confirm that this is intended for this corpus.
_EMOJI_PATTERN = re.compile(
  "["
  u"\U0001F600-\U0001F64F"  # emoticons
  u"\U0001F300-\U0001F5FF"  # symbols & pictographs
  u"\U0001F680-\U0001F6FF"  # transport & map symbols
  u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
  u"\U00002702-\U000027B0"
  u"\U000024C2-\U0001F251"
  "]+",
  flags=re.UNICODE,
)


def clean_text(tweet):
  """Return `tweet` with colons, digits, periods and emoji removed.

  Args:
    tweet: Raw text of one claim. Anything that is not a `str` (for example
      the float NaN pandas uses for a missing cell, or None) is treated as
      empty text.

  Returns:
    The cleaned string. Whitespace is left untouched, so removing a token can
    leave consecutive spaces behind. Returns "" for non-string input.
  """
  # Missing values would otherwise make the regex calls raise TypeError.
  if not isinstance(tweet, str):
    return ""

  text = tweet.replace(":", "")
  # Raw strings keep the regex escapes from being parsed as (invalid) string
  # escapes. For str patterns, \d matches any Unicode decimal digit.
  text = re.sub(r"\d+", "", text)
  text = re.sub(r"\.+", "", text)
  return _EMOJI_PATTERN.sub("", text)

In [None]:
# Run the same cleaning over the claim text of both splits, overwriting the
# 'claim_s' column in each DataFrame.
for split_df in (df_train, df_test):
    split_df['claim_s'] = split_df['claim_s'].apply(clean_text)

**Convert DataFrame to a list for BOW and TF-IDF vectorization**

We use the `CountVectorizer` or `TfidfVectorizer` class from the `sklearn.feature_extraction.text` module.
Calling `fit_transform` on the vectorizer with a list of strings — where each string is a sentence or any other text fragment, such as a document or paragraph — returns a (sparse) matrix of numbers.
Each row is the vector representation of one sentence or text fragment.
Each column corresponds to one word (token) in the learned vocabulary.


In [None]:
# Text goes into plain Python lists (one string per claim); the labels are the
# 'fake_flag' column of each split, kept as pandas Series.
X_train, y_train = df_train['claim_s'].tolist(), df_train['fake_flag']
X_test, y_test = df_test['claim_s'].tolist(), df_test['fake_flag']

In [None]:
# Option 1 (bag of words): vectorizer = CountVectorizer()
# Option 2 (TF-IDF), the one used here:
vectorizer = TfidfVectorizer()

# Vectorize the cleaned text straight from the DataFrames. X_train / X_test are
# rebound to the resulting matrices below, so reading the text from those same
# names would make this cell fail when it is run a second time (the vectorizer
# would be handed matrices instead of strings).
# The vocabulary is learned from the training split only; the test split is
# projected onto that same vocabulary.
X_train = vectorizer.fit_transform(df_train['claim_s'])
X_test = vectorizer.transform(df_test['claim_s'])

# Tokens of the learned vocabulary, in matrix column order.
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Densify each vectorized split into a DataFrame with one column per token
# (df1 = train features, df2 = test features).
df1, df2 = (
    pd.DataFrame(matrix.toarray(), columns=feature_names)
    for matrix in (X_train, X_test)
)

**Build a machine learning model to classify news as fake or not**

In [None]:
# Logistic-regression classifier: fit on the training features, then report
# accuracy on the test split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so `lr` is the fitted model.
lr = LogisticRegression().fit(df1, df_train['fake_flag'])
prd = lr.predict(df2)

# Fraction of test claims whose predicted label matches 'fake_flag'.
print(accuracy_score(y_true=df_test['fake_flag'], y_pred=prd))