In [2]:
import pandas as pd
df = pd.read_csv('../datasets/20_newsgroup.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,text,target,title,date
0,0,I was wondering if anyone out there could enli...,7,rec.autos,2023-10-17 19:10:32.493373
1,17,I recently posted an article asking what kind ...,7,rec.autos,2023-10-17 19:10:32.493373
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos,2023-10-17 19:10:32.493373


In [3]:
columns_to_drop = ['Unnamed: 0', 'title', 'date']
df = df.drop(columns_to_drop, axis=1)

In [4]:
df.dropna(inplace=True)

In [5]:
"""
Now we do Data Preprocessing.
"""
import string, re, nltk
from string import punctuation
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

regexp = RegexpTokenizer("[\w']+")

def convert_to_lowercase(text):
    return text.lower()
def remove_whitespace(text):
    return text.strip()
def remove_punctuation(text):
    punct_str = string.punctuation
    punct_str = punct_str.replace("'", "") # discarding apostrophe from the string to keep the contractions intact
    return text.translate(str.maketrans("", "", punct_str))
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)
def remove_http(text):
    http = "https?://\S+|www\.\S+" # matching strings beginning with http (but not just "http")
    pattern = r"({})".format(http) # creating pattern
    return re.sub(pattern, "", text)
# Stopwords
stops = stopwords.words("english") # stopwords
addstops = ["among", "onto", "shall", "thrice", "thus", "twice", "unto", "us", "would"] # additional stopwords
allstops = stops + addstops
def remove_stopwords(text):
    return " ".join([word for word in regexp.tokenize(text) if word not in allstops])
stemmer = PorterStemmer()
def text_stemmer(text):
    text_stem = " ".join([stemmer.stem(word) for word in regexp.tokenize(text)])
    return text_stem
def discard_non_alpha(text):
    word_list_non_alpha = [word for word in regexp.tokenize(text) if word.isalpha()]
    text_non_alpha = " ".join(word_list_non_alpha)
    return text_non_alpha


In [6]:
def text_normalizer(text):
    text = convert_to_lowercase(text)
    text = remove_whitespace(text)
    text = re.sub('\n' , '', text) # converting text to one line
    text = re.sub('\[.*?\]', '', text) # removing square brackets
    text = remove_http(text)
    text = remove_punctuation(text)
    text = remove_html(text)
    text = remove_stopwords(text)
    text = discard_non_alpha(text)
    return text

In [8]:
df['text'] = df['text'].apply(text_normalizer)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,2), max_features=7500, smooth_idf=True, sublinear_tf=True)
vectorizer.fit(df['text'])
df_features = vectorizer.transform(df['text'])
df_target = df['target']

df_features_train, df_features_test, df_target_train, df_target_test = train_test_split(df_features.toarray(), df_target, test_size = 0.2, random_state=42)

In [11]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_model.fit(df_features_train, df_target_train)
df_target_prediction = nb_model.predict(df_features_test)

In [12]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(df_target_test, df_target_prediction))

              precision    recall  f1-score   support

           0       0.76      0.35      0.48        88
           1       0.56      0.61      0.58       117
           2       0.68      0.57      0.62       134
           3       0.54      0.62      0.58       122
           4       0.66      0.65      0.66       103
           5       0.65      0.79      0.72       112
           6       0.69      0.71      0.70       107
           7       0.77      0.68      0.73       126
           8       0.74      0.71      0.72       123
           9       0.90      0.73      0.80       129
          10       0.64      0.89      0.74       103
          11       0.77      0.81      0.79       126
          12       0.70      0.62      0.66       117
          13       0.89      0.77      0.82       129
          14       0.82      0.71      0.76       113
          15       0.40      0.96      0.56       108
          16       0.64      0.73      0.68        93
          17       0.90    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
