In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
import seaborn as sns
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV


In [8]:
warnings.filterwarnings("ignore")


In [9]:
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
df_newsgroups = pd.DataFrame({
    'Category': newsgroups.target,
    'Text': newsgroups.data    
})

df_newsgroups.head()

Unnamed: 0,Category,Text
0,10,\n\nI am sure some bashers of Pens fans are pr...
1,3,My brother is in the market for a high-perform...
2,17,\n\n\n\n\tFinally you said what you dream abou...
3,3,\nThink!\n\nIt's the SCSI card doing the DMA t...
4,4,1) I have an old Jasmine drive which I cann...


In [10]:
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()

    text = re.sub(r'[^a-zA-Z\s]', '', text)

    text = text.lower()

    text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])

    return text

In [11]:
df_newsgroups['cleaned_text'] = df_newsgroups['Text'].apply(clean_text)

In [26]:
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

df_newsgroups['lemmatized_text'] = df_newsgroups['cleaned_text'].apply(lemmatize_text)

print(df_newsgroups[['cleaned_text', 'lemmatized_text']].head())

                                        cleaned_text  \
0  sure bashers pens fans pretty confused lack ki...   
1  brother market highperformance video card supp...   
2  finally said dream mediterranean new area grea...   
3  think scsi card doing dma transfers disks scsi...   
4  old jasmine drive use new understanding upsate...   

                                     lemmatized_text  
0  sure basher pen fan pretty confused lack kind ...  
1  brother market highperformance video card supp...  
2  finally say dream mediterranean new area great...  
3  think scsi card dma transfer disk scsi card dm...  
4  old jasmine drive use new understanding upsate...  


In [24]:
nlp = spacy.load('en_core_web_sm')

## word embeddings using glove 6b 100d

In [27]:
glove_path = "C:\\Users\\maria\\OneDrive\\Desktop\\glove.6B.100d.txt"  

In [28]:
df_newsgroups['tokens'] = df_newsgroups['lemmatized_text'].apply(lambda x: x.split())


In [29]:
embedding_dict = {}
with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = vector

In [30]:
def text_to_vector(tokens, embedding_dict):
    word_vectors = [embedding_dict[word] for word in tokens if word in embedding_dict]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(100)  

df_newsgroups['doc_vector'] = df_newsgroups['tokens'].apply(lambda tokens: text_to_vector(tokens, embedding_dict))


In [36]:
df_newsgroups

Unnamed: 0,Category,Text,cleaned_text,lemmatized_text,tokens,doc_vector
0,10,\n\nI am sure some bashers of Pens fans are pr...,sure bashers pens fans pretty confused lack ki...,sure basher pen fan pretty confused lack kind ...,"[sure, basher, pen, fan, pretty, confused, lac...","[-0.09148375, 0.1848849, 0.3459299, -0.4693587..."
1,3,My brother is in the market for a high-perform...,brother market highperformance video card supp...,brother market highperformance video card supp...,"[brother, market, highperformance, video, card...","[-0.18886025, -0.020166146, -0.009497073, -0.3..."
2,17,\n\n\n\n\tFinally you said what you dream abou...,finally said dream mediterranean new area grea...,finally say dream mediterranean new area great...,"[finally, say, dream, mediterranean, new, area...","[-0.016622422, 0.2094453, 0.26692814, -0.01739..."
3,3,\nThink!\n\nIt's the SCSI card doing the DMA t...,think scsi card doing dma transfers disks scsi...,think scsi card dma transfer disk scsi card dm...,"[think, scsi, card, dma, transfer, disk, scsi,...","[-0.2786962, 0.04869291, 0.03302267, -0.281397..."
4,4,1) I have an old Jasmine drive which I cann...,old jasmine drive use new understanding upsate...,old jasmine drive use new understanding upsate...,"[old, jasmine, drive, use, new, understanding,...","[-0.10370016, 0.13565378, 0.19263354, -0.00533..."
...,...,...,...,...,...,...
18841,13,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,dn nyedacnsvaxuwecedu david nye dn neurology d...,dn nyedacnsvaxuwecedu david nye dn neurology d...,"[dn, nyedacnsvaxuwecedu, david, nye, dn, neuro...","[-0.16151656, 0.08059155, 0.19148546, 0.007797..."
18842,12,\nNot in isolated ground recepticles (usually ...,isolated ground recepticles usually unusual co...,isolate ground recepticle usually unusual colo...,"[isolate, ground, recepticle, usually, unusual...","[-0.27700564, 0.15688774, 0.055569075, -0.0099..."
18843,3,I just installed a DX2-66 CPU in a clone mothe...,just installed dx cpu clone motherboard tried ...,instal dx cpu clone motherboard try mount cpu ...,"[instal, dx, cpu, clone, motherboard, try, mou...","[-0.4123846, 0.24455638, 0.120964006, -0.19238..."
18844,1,\nWouldn't this require a hyper-sphere. In 3-...,wouldnt require hypersphere space points speci...,not require hypersphere space point specifie s...,"[not, require, hypersphere, space, point, spec...","[-0.09420526, 0.19943206, 0.42651805, 0.169537..."


In [31]:
from sklearn.model_selection import train_test_split

X = np.vstack(df_newsgroups['doc_vector'].values)
y = df_newsgroups['Category'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

svm = SVC()

param_grid = {
    'C': [0.1, 1, 10, 100],  
    'kernel': ['linear', 'rbf'],  
    'gamma': ['scale', 0.1, 0.01, 0.001]  
}

grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)  

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

best_svm = grid_search.best_estimator_
test_accuracy = best_svm.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Cross-Validation Accuracy: 0.606725980127219
Test Accuracy: 0.6090185676392573


In [34]:
from sklearn.ensemble import RandomForestClassifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
{'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
0.5666617964693049


In [38]:
from sklearn.ensemble import GradientBoostingClassifier

gbm = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3)
gbm.fit(X_train, y_train)
print(f"Test Accuracy: {gbm.score(X_test, y_test)}")


Test Accuracy: 0.5514588859416446
