In [None]:
!pip install bangla-stemmer

In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB

from nltk.tokenize import WhitespaceTokenizer

from bangla_stemmer.stemmer import stemmer

In [None]:
# Read the authentic and the fake labelled CSV files into one DataFrame each.
labelled_authentic, labelled_fake = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/BanFakeNews/LabeledAuthentic-7K.csv",
        "data/BanFakeNews/LabeledFake-1K.csv",
    )
)

In [None]:
labelled_authentic.head()

In [None]:
labelled_fake.head()

In [None]:
# Remove the listed columns from the authentic frame in place, in a single
# call; any other column the CSV provides is left untouched.
authentic_unused_columns = [
    "articleID",
    "domain",
    "date",
    "category",
    "source",
    "relation",
    "headline",
]
labelled_authentic.drop(columns=authentic_unused_columns, inplace=True)

In [None]:
labelled_authentic.tail()

In [None]:
# Remove the listed columns from the fake frame in place, in a single call;
# any other column the CSV provides is left untouched.
fake_unused_columns = [
    "articleID",
    "domain",
    "date",
    "category",
    "source",
    "relation",
    "headline",
    "F-type",
]
labelled_fake.drop(columns=fake_unused_columns, inplace=True)

In [None]:
labelled_fake.tail()

In [None]:
# Persist both trimmed frames without writing the index column.
for frame, csv_path in (
    (labelled_authentic, "data/BanFakeNews/Updated/labelled_authentic.csv"),
    (labelled_fake, "data/BanFakeNews/Updated/labelled_fake.csv"),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# No cell in this notebook writes labelled_combined.csv, so a fresh
# "Restart & Run All" would stop here if the file was never created by hand.
try:
    labelled_combined = pd.read_csv("data/BanFakeNews/Updated/labelled_combined.csv")
except FileNotFoundError:
    # NOTE(review): presumably the combined file is the authentic and fake
    # frames stacked together - confirm (row order / shuffling is unknown).
    # ignore_index=True yields a 0..n-1 index for the rebuilt frame.
    print("labelled_combined.csv not found; rebuilding it from labelled_authentic and labelled_fake")
    labelled_combined = pd.concat([labelled_authentic, labelled_fake], ignore_index=True)

In [None]:
labelled_combined.head()

In [None]:
labelled_combined.tail()

### Function for standardizing text (removing punctuation and Bangla digits)

In [None]:
def standardize_text(df, feature):
    """Replace punctuation, the Bangla danda and Bangla digits with spaces.

    Every matched character is replaced by a single space. The column is
    rewritten on ``df`` itself and the same DataFrame object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    feature : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # regex=True must be explicit: since pandas 2.0 Series.str.replace treats
    # the pattern as a literal string by default, so the character classes
    # below would otherwise never match anything.
    df[feature] = (
        df[feature]
        .str.replace(r"[(),!?@\'\/\`\-\"\_\n]", " ", regex=True)
        # The danda is one literal character, so no regex is needed for it.
        .str.replace("।", " ", regex=False)
        .str.replace(r"[১২৩৪৫৬৭৮৯০]", " ", regex=True)
    )
    return df

In [None]:
# standardize_text rewrites the column on the frame it is given, so pass a copy:
# labelled_combined keeps the raw text and data_standardized is a separate frame.
data_standardized = standardize_text(labelled_combined.copy(), "content")
data_standardized.head()

In [None]:
data_standardized.content[0]

### Function for tokenizing text

In [None]:
def tokenizing(df, token_feature, feature):
    """Store the whitespace tokens of ``df[feature]`` in ``df[token_feature]``.

    Each value of the source column is passed to
    ``WhitespaceTokenizer.tokenize`` and the result is written to the target
    column of the same frame. Nothing is returned.
    """
    split_on_whitespace = WhitespaceTokenizer().tokenize
    token_column = df[feature].apply(split_on_whitespace)
    df[token_feature] = token_column

In [None]:
# Tokenize on a copy so that data_standardized does not gain the token column.
data_tokenized = data_standardized.copy()
tokenizing(df=data_tokenized, token_feature="content_tokens", feature="content")

data_tokenized.head()

### Removing stop words

In [None]:
# Read the stop-word file: one entry per line, surrounding whitespace stripped.
with open('data/BanFakeNews/Updated/stop_words.txt', 'r', encoding="utf8") as stop_words_file:
    stop_words = list(map(str.strip, stop_words_file))

print(stop_words)

In [None]:
# Set membership is O(1); the list lookup was repeated for every token.
stop_word_set = set(stop_words)

data_stop_removed = data_tokenized.copy()

# Build a new token list per row instead of calling list.remove() while
# iterating: removing during iteration skips the element after each removal,
# so consecutive stop words were left behind. Creating new lists also leaves
# the lists held by data_tokenized untouched (DataFrame.copy() does not copy
# the Python list objects stored in the cells), and applying over the whole
# column removes the hard-coded row count.
data_stop_removed["content_tokens"] = data_stop_removed["content_tokens"].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)

In [None]:
data_stop_removed["content_tokens"][0]

### Function for stemming

In [None]:
def stemming(df, feature, start, end):
    """Run the Bangla stemmer over ``df[feature]`` for index labels start..end.

    For every label ``i`` from ``start`` to ``end`` (both inclusive) the cell
    value is passed to ``BanglaStemmer.stem`` and the result is stored back in
    the same cell. ``df`` is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; its index must contain every integer label in range.
    feature : str
        Column whose cells are stemmed.
    start, end : int
        First and last index label to process (inclusive).
    """
    stmr = stemmer.BanglaStemmer()

    for i in range(start, end + 1):
        # Write through df.at: the chained form df[feature][i] = ... assigns
        # to an intermediate Series and is not guaranteed to update df
        # (it is a no-op under pandas copy-on-write).
        df.at[i, feature] = stmr.stem(df.at[i, feature])

In [None]:
data_stemmed = data_stop_removed.copy()
# Derive the last row from the frame instead of hard-coding 8500, so a dataset
# of a different size is neither truncated nor indexed out of range.
# Like the hard-coded bound, this relies on the default 0..n-1 index.
stemming(data_stemmed, "content_tokens", 0, len(data_stemmed) - 1)

In [None]:
data_stemmed.head()

### Function for turning list to string

In [None]:
def list_to_string(df, feature_1, feature_2, start, end):
    """Join the token lists in ``feature_2`` into strings stored in ``feature_1``.

    For index labels ``start`` through ``end`` (both inclusive) the tokens of
    ``df[feature_2]`` are joined with single spaces and written to
    ``df[feature_1]``. ``df`` is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update.
    feature_1 : str
        Column that receives the joined strings.
    feature_2 : str
        Column holding the lists of string tokens.
    start, end : int
        First and last index label to process (inclusive).
    """
    # .loc slices by label and includes both ends, matching range(start, end + 1)
    # on the default integer index. Assigning through .loc updates df itself,
    # unlike the chained df[feature_1][i] = ... form.
    joined = df.loc[start:end, feature_2].map(" ".join)
    df.loc[start:end, feature_1] = joined

In [None]:
data_final = data_stemmed.copy()
# Derive the last row from the frame instead of hard-coding 8500, so a dataset
# of a different size is neither truncated nor indexed out of range.
# Like the hard-coded bound, this relies on the default 0..n-1 index.
list_to_string(data_final, "content", "content_tokens", 0, len(data_final) - 1)
data_final.head()

In [None]:
# The token lists have been joined back into "content"; remove the column in place.
data_final.drop(columns=["content_tokens"], inplace=True)
data_final.head()

In [None]:
# Write the preprocessed frame to disk without the index column.
data_final.to_csv(
    "data/BanFakeNews/Updated/data_final.csv",
    index=False,
)