In [None]:
import pandas as pd 
import numpy as np 
pd.set_option('max_colwidth', None) 
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings 
warnings.filterwarnings('ignore')
import re 
import nltk 

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC




In [2]:
df= pd.read_csv("bbc_news.csv")

In [3]:
df.columns

Index(['title', 'pubDate', 'guid', 'link', 'description'], dtype='object')

In [4]:
df.shape

(42115, 5)

In [5]:
df.head()

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian atrocities,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-60638042?at_medium=RSS&at_campaign=KARANGA,The Ukrainian president says the country will not forgive or forget those who murder its civilians.
1,War in Ukraine: Taking cover in a town under attack,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-60641873?at_medium=RSS&at_campaign=KARANGA,"Jeremy Bowen was on the frontline in Irpin, as residents came under Russian fire while trying to flee."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?at_medium=RSS&at_campaign=KARANGA,One of the world's biggest fertiliser firms says the conflict could deliver a shock to food supplies.
3,Manchester Arena bombing: Saffie Roussos's parents on hearing the truth,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medium=RSS&at_campaign=KARANGA,The parents of the Manchester Arena bombing's youngest victim speak about their life since she died.
4,Ukraine conflict: Oil price soars to highest level since 2008,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?at_medium=RSS&at_campaign=KARANGA,Consumers are feeling the impact of higher energy costs as fuel prices and household bills jump.


In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
import sys
!{sys.executable} -m pip install nltk



In [None]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

def remove_special_characters(text):
    text = text.lower()
    text= text.strip()
    pattern = '[^A-Za-z0-9]+'
    new_text = ''.join(re.sub(pattern, ' ', text))
    words = new_text.split()
    new_text1 = ' '.join([word for word in words if word not in stopwords.words('english')])

    return new_text1


# Clean both title and description columns in one go
for col in ['title', 'description']:
    df[f'cleaned_{col}'] = df[col].apply(remove_special_characters)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mehakgoel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
df.loc[0:3, ['title','cleaned_title', 'description','cleaned_description']]

Unnamed: 0,title,cleaned_title,description,cleaned_description
0,Ukraine: Angry Zelensky vows to punish Russian atrocities,ukraine angry zelensky vows punish russian atrocities,The Ukrainian president says the country will not forgive or forget those who murder its civilians.,ukrainian president says country forgive forget murder civilians
1,War in Ukraine: Taking cover in a town under attack,war ukraine taking cover town attack,"Jeremy Bowen was on the frontline in Irpin, as residents came under Russian fire while trying to flee.",jeremy bowen frontline irpin residents came russian fire trying flee
2,Ukraine war 'catastrophic for global food',ukraine war catastrophic global food,One of the world's biggest fertiliser firms says the conflict could deliver a shock to food supplies.,one world biggest fertiliser firms says conflict could deliver shock food supplies
3,Manchester Arena bombing: Saffie Roussos's parents on hearing the truth,manchester arena bombing saffie roussos parents hearing truth,The parents of the Manchester Arena bombing's youngest victim speak about their life since she died.,parents manchester arena bombing youngest victim speak life since died


In [10]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

for col in ['title', 'description']:
    df[f'lemmatized_{col}'] = df[f'cleaned_{col}'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mehakgoel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mehakgoel/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
df.loc[0:3, ['title','lemmatized_title', 'description','lemmatized_description']]

Unnamed: 0,title,lemmatized_title,description,lemmatized_description
0,Ukraine: Angry Zelensky vows to punish Russian atrocities,ukraine angry zelensky vow punish russian atrocities,The Ukrainian president says the country will not forgive or forget those who murder its civilians.,ukrainian president say country forgive forget murder civilians
1,War in Ukraine: Taking cover in a town under attack,war ukraine take cover town attack,"Jeremy Bowen was on the frontline in Irpin, as residents came under Russian fire while trying to flee.",jeremy bowen frontline irpin residents come russian fire try flee
2,Ukraine war 'catastrophic for global food',ukraine war catastrophic global food,One of the world's biggest fertiliser firms says the conflict could deliver a shock to food supplies.,one world biggest fertiliser firm say conflict could deliver shock food supply
3,Manchester Arena bombing: Saffie Roussos's parents on hearing the truth,manchester arena bomb saffie roussos parent hear truth,The parents of the Manchester Arena bombing's youngest victim speak about their life since she died.,parent manchester arena bomb youngest victim speak life since die


In [13]:
import spacy 
nlp = spacy.load('en_core_web_sm')
def spacy_lemmatize_text(text):
    doc = nlp(text)
    lemmatized_words = [token.lemma_ for token in doc]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text  


In [14]:
# for col in ['title', 'description']:
#     df[f'spacy_lemmatized_{col}'] = df[f'cleaned_{col}'].apply(spacy_lemmatize_text) because this is taking to much of our time we will use nlp.pipe

for col in ['title', 'description']:
    texts = df[f'cleaned_{col}'].astype(str).tolist()
    docs = list(nlp.pipe(texts))
    df[f'spacy_lemmatized_{col}'] = [' '.join([token.lemma_ for token in doc]) for doc in docs]



In [None]:

df['text'] = df['spacy_lemmatized_title'].fillna("") + '. ' + df['spacy_lemmatized_description'].fillna("")

In [None]:
df.head(5)

Unnamed: 0,title,pubDate,guid,link,description,cleaned_title,cleaned_description,lemmatized_title,lemmatized_description,spacy_lemmatized_title,spacy_lemmatized_description,text
0,Ukraine: Angry Zelensky vows to punish Russian atrocities,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-60638042?at_medium=RSS&at_campaign=KARANGA,The Ukrainian president says the country will not forgive or forget those who murder its civilians.,ukraine angry zelensky vows punish russian atrocities,ukrainian president says country forgive forget murder civilians,ukraine angry zelensky vow punish russian atrocities,ukrainian president say country forgive forget murder civilians,ukraine angry zelensky vow punish russian atrocity,ukrainian president say country forgive forget murder civilian,ukraine angry zelensky vow punish russian atrocity ukrainian president say country forgive forget murder civilian
1,War in Ukraine: Taking cover in a town under attack,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-60641873?at_medium=RSS&at_campaign=KARANGA,"Jeremy Bowen was on the frontline in Irpin, as residents came under Russian fire while trying to flee.",war ukraine taking cover town attack,jeremy bowen frontline irpin residents came russian fire trying flee,war ukraine take cover town attack,jeremy bowen frontline irpin residents come russian fire try flee,war ukraine take cover town attack,jeremy bowen frontline irpin resident come russian fire try flee,war ukraine take cover town attack jeremy bowen frontline irpin resident come russian fire try flee
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?at_medium=RSS&at_campaign=KARANGA,One of the world's biggest fertiliser firms says the conflict could deliver a shock to food supplies.,ukraine war catastrophic global food,one world biggest fertiliser firms says conflict could deliver shock food supplies,ukraine war catastrophic global food,one world biggest fertiliser firm say conflict could deliver shock food supply,ukraine war catastrophic global food,one world big fertiliser firm say conflict could deliver shock food supply,ukraine war catastrophic global food one world big fertiliser firm say conflict could deliver shock food supply
3,Manchester Arena bombing: Saffie Roussos's parents on hearing the truth,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medium=RSS&at_campaign=KARANGA,The parents of the Manchester Arena bombing's youngest victim speak about their life since she died.,manchester arena bombing saffie roussos parents hearing truth,parents manchester arena bombing youngest victim speak life since died,manchester arena bomb saffie roussos parent hear truth,parent manchester arena bomb youngest victim speak life since die,manchester arena bombing saffie roussos parent hear truth,parents manchester arena bombing young victim speak life since die,manchester arena bombing saffie roussos parent hear truth parents manchester arena bombing young victim speak life since die
4,Ukraine conflict: Oil price soars to highest level since 2008,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?at_medium=RSS&at_campaign=KARANGA,Consumers are feeling the impact of higher energy costs as fuel prices and household bills jump.,ukraine conflict oil price soars highest level since 2008,consumers feeling impact higher energy costs fuel prices household bills jump,ukraine conflict oil price soar highest level since 2008,consumers feel impact higher energy cost fuel price household bill jump,ukraine conflict oil price soar high level since 2008,consumer feel impact high energy cost fuel price household bill jump,ukraine conflict oil price soar high level since 2008 consumer feel impact high energy cost fuel price household bill jump


In [17]:
df = df.drop(['cleaned_title', 'cleaned_description', 'lemmatized_title', 'lemmatized_description'], axis=1)
df.head(5)

Unnamed: 0,title,pubDate,guid,link,description,spacy_lemmatized_title,spacy_lemmatized_description,text
0,Ukraine: Angry Zelensky vows to punish Russian atrocities,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-60638042?at_medium=RSS&at_campaign=KARANGA,The Ukrainian president says the country will not forgive or forget those who murder its civilians.,ukraine angry zelensky vow punish russian atrocity,ukrainian president say country forgive forget murder civilian,ukraine angry zelensky vow punish russian atrocity ukrainian president say country forgive forget murder civilian
1,War in Ukraine: Taking cover in a town under attack,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-60641873?at_medium=RSS&at_campaign=KARANGA,"Jeremy Bowen was on the frontline in Irpin, as residents came under Russian fire while trying to flee.",war ukraine take cover town attack,jeremy bowen frontline irpin resident come russian fire try flee,war ukraine take cover town attack jeremy bowen frontline irpin resident come russian fire try flee
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?at_medium=RSS&at_campaign=KARANGA,One of the world's biggest fertiliser firms says the conflict could deliver a shock to food supplies.,ukraine war catastrophic global food,one world big fertiliser firm say conflict could deliver shock food supply,ukraine war catastrophic global food one world big fertiliser firm say conflict could deliver shock food supply
3,Manchester Arena bombing: Saffie Roussos's parents on hearing the truth,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medium=RSS&at_campaign=KARANGA,The parents of the Manchester Arena bombing's youngest victim speak about their life since she died.,manchester arena bombing saffie roussos parent hear truth,parents manchester arena bombing young victim speak life since die,manchester arena bombing saffie roussos parent hear truth parents manchester arena bombing young victim speak life since die
4,Ukraine conflict: Oil price soars to highest level since 2008,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?at_medium=RSS&at_campaign=KARANGA,Consumers are feeling the impact of higher energy costs as fuel prices and household bills jump.,ukraine conflict oil price soar high level since 2008,consumer feel impact high energy cost fuel price household bill jump,ukraine conflict oil price soar high level since 2008 consumer feel impact high energy cost fuel price household bill jump


In [None]:

import re

def extract_bbc_category(url, title="", description=""):
    url = url.lower().strip()

#     # 1. EARLY IDENTIFICATION
    if "/sport/" in url:
        return "sport"
    if "/business/" in url:
        return "business"
    if "/science/" in url:
        return "science"
    if "/technology/" in url or "/tech/" in url:
        return "technology"
    if "/entertainment/" in url or "/culture/" in url:
        return "entertainment"
    if "/health/" in url:
        return "health"
    if "/politics/" in url or "/uk-politics/" in url:
        return "politics"
    if "/world/" in url:
        return "world"
    
#     # 2. GENERAL NEWS PATTERNS
    # Capture the segment after '/news/'
    if "/news/" in url:
        raw = url.split("/news/")[1]
        raw = raw.split("/")[0]  # first segment
    else:
        # If URL has no identifiable pattern → misc
        return "misc"

    # Remove article IDs like “12345678”
    raw = re.sub(r'\d+', '', raw)
    raw = raw.replace("-", " ").strip()

#     # STRONG SUBCATEGORY NORMALIZATION
    subcat_map = {
        # ---- WORLD NEWS ----
        "world": "world",
        "world europe": "world",
        "world asia": "world",
        "world us canada": "world",
        "world africa": "world",
        "world latin america": "world",
        "europe": "world",
        "africa": "world",
        "asia": "world",
        "latin america": "world",
        "middle east": "world",
        "international": "world",
        "global": "world",



        # ---- UK & POLITICS ----
        "uk": "politics",
        "uk england": "politics",
        "uk scotland": "politics",
        "uk wales": "politics",
        "uk northern ireland": "politics",
        "england": "politics",
        "scotland": "politics",
        "wales": "politics",
        "northern ireland": "politics",
        "election": "politics",
        "politics": "politics",
        "parliament": "politics",


        # ---- BUSINESS ----
        "market": "business",
        "companies": "business",
        "economy": "business",
        "money": "business",
        "business": "business",
        "finance": "business",
        "investment": "business",
        "trade": "business",
        "industry": "business",
        "corporate": "business",
        "stock": "business",
        "shares": "business",
        "financial": "business",
        "business news": "business",    


        # ---- SCIENCE & CLIMATE ----
        "science": "science",
        "environment": "science",
        "climate": "science",
        "green": "science",
        "climate change": "science",
        "sustainability": "science",
        "nature": "science",
        "wildlife": "science",
        "ecosystem": "science",
        "biodiversity": "science",
        "pollution": "science",
        "conservation": "science",
        "research": "science",
        "discovery": "science",
        "space": "science",
        "astronomy": "science",
        "physics": "science",
        "biology": "science",
        "chemistry": "science",
        "geology": "science",
        "ocean": "science",
        "weather": "science",
        "meteorology": "science",
        


        # ---- TECHNOLOGY ----
        "tech": "technology",
        "technology": "technology",
        "digital": "technology",
        "future": "technology",
        "innovation": "technology",
        "gadgets": "technology",
        "internet": "technology",
        "apps": "technology",
        "social media": "technology",
        "ai": "technology",
        "artificial intelligence": "technology",
        "cybersecurity": "technology",
        "software": "technology",
        "hardware": "technology",
        "computing": "technology",
        "programming": "technology",
        "gaming": "technology",
        "blockchain": "technology",
        "cryptocurrency": "technology", 





        # ---- ENTERTAINMENT ----
        "arts": "entertainment",
        "entertainment": "entertainment",
        "culture": "entertainment",
        "film": "entertainment",
        "tv": "entertainment",
        "music": "entertainment",
        "blog": "entertainment",
        "media": "entertainment",
        "theatre": "entertainment",
        "books": "entertainment",
        "literature": "entertainment",
        "celebrity": "entertainment",
        "fashion": "entertainment",
        "lifestyle": "entertainment",
        "arts culture": "entertainment",
        "movies": "entertainment",
        "actors": "entertainment",
        "actress": "entertainment",
        "director": "entertainment",
        "producer": "entertainment",
        "music festival": "entertainment",    

        # ---- SPORT ----
        "football": "sport",
        "cricket": "sport",
        "tennis": "sport",
        "golf": "sport",
        "rugby": "sport",
        "formula": "sport",
        "motorsport": "sport",
        "athletics": "sport",
        "cycling": "sport",
        "boxing": "sport",
        "sport": "sport",
        "sports": "sport",
        "swimming": "sport",
        "olympics": "sport",
        "paralympics": "sport", 
        "world cup": "sport",


        # ---- HEALTH ----
        "health": "health",
        "medical": "health",
        "medicine": "health",
        "covid": "health",
        "coronavirus": "health",
        "pandemic": "health",
        "vaccine": "health",
        "vaccination": "health",
        "nhs": "health",
        "hospital": "health",
        "doctor": "health",
        "doctors": "health",
        "nurse": "health",
        "nursing": "health",
        "mental health": "health",
        "wellbeing": "health",
        "fitness": "health",
        "disease": "health",
        "virus": "health",
        "infection": "health",
        "infections": "health",
        "epidemic": "health",
        "public health": "health",
        "healthcare": "health",
        "treatment": "health",
        "patients": "health",
        "patient": "health",
        "aging": "health",
        "nutrition": "health",
        "diet": "health",
        "food safety": "health",


    }

#     # Match raw key EXACT or PARTIAL
    for key in subcat_map:
        if raw.startswith(key):
            return subcat_map[key]
        
    


df["final_category"] = df["guid"].apply(extract_bbc_category)


In [20]:
df['final_category'].value_counts()

final_category
politics         9665
sport            8395
world            8367
business         2646
entertainment    2495
health            688
misc              643
science           562
technology        430
Name: count, dtype: int64

In [21]:
df["final_category"].apply(type).value_counts()
df = df[df["final_category"] != "misc"]
df = df[df["final_category"].notna()]

In [23]:
df = df.reset_index(drop=True)
df["final_category"].value_counts()


final_category
politics         9665
sport            8395
world            8367
business         2646
entertainment    2495
health            688
science           562
technology        430
Name: count, dtype: int64

In [38]:
X = df["text"]
y = df["final_category"]
print(y.apply(type).value_counts())

final_category
<class 'str'>    33248
Name: count, dtype: int64


In [54]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# 3. PIPELINES (TF-IDF + MODEL)

# ---- Naive Bayes Pipeline ----
nb_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=50000)),
    ("clf", MultinomialNB())
])
# ---- Logistic Regression Pipeline ----
lr_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=50000)),
    ("clf", LogisticRegression(max_iter=500))
])

# ---- SVM Pipeline ----
svm_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=50000)),
    ("clf", LinearSVC(C=1))
])

# 4. Train & Evaluate
pipelines = {
    "Naive Bayes": nb_pipeline,
    "Logistic Regression": lr_pipeline,
    "SVM": svm_pipeline
}

for name, model in pipelines.items():
    print(f"\n====================== {name} ======================")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))



Accuracy: 0.7646616541353384
               precision    recall  f1-score   support

     business       0.93      0.37      0.53       529
entertainment       0.95      0.15      0.25       499
       health       0.00      0.00      0.00       138
     politics       0.61      0.92      0.73      1933
      science       0.00      0.00      0.00       112
        sport       0.92      0.96      0.94      1679
   technology       0.00      0.00      0.00        86
        world       0.83      0.85      0.84      1674

     accuracy                           0.76      6650
    macro avg       0.53      0.41      0.41      6650
 weighted avg       0.77      0.76      0.72      6650


Accuracy: 0.8413533834586466
               precision    recall  f1-score   support

     business       0.81      0.69      0.75       529
entertainment       0.85      0.68      0.75       499
       health       0.82      0.51      0.63       138
     politics       0.76      0.88      0.82      1933
 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# PIPELINE (TF-IDF + SVM)
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("clf", LinearSVC())
])

# HYPERPARAMETER GRID
param_grid = {
    "tfidf__max_features": [20000, 30000, 50000],
    "tfidf__ngram_range": [(1,1), (1,2)],
    "tfidf__min_df": [2, 5],
    "tfidf__max_df": [0.7, 0.85, 1.0],

    "clf__C": [0.1, 0.5, 1, 2],
    "clf__class_weight": [None, "balanced"]
}

# GRID SEARCH
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,          
    n_jobs=-1,          
    verbose=2
)

grid.fit(X_train, y_train)


Fitting 3 folds for each of 288 candidates, totalling 864 fits
[CV] END clf__C=0.1, clf__class_weight=None, tfidf__max_df=0.7, tfidf__max_features=20000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.1, clf__class_weight=None, tfidf__max_df=0.7, tfidf__max_features=20000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.5s
[CV] END clf__C=0.1, clf__class_weight=None, tfidf__max_df=0.7, tfidf__max_features=20000, tfidf__min_df=5, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.1, clf__class_weight=None, tfidf__max_df=0.7, tfidf__max_features=20000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.1, clf__class_weight=None, tfidf__max_df=0.7, tfidf__max_features=20000, tfidf__min_df=5, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.1, clf__class_weight=None, tfidf__max_df=0.7, tfidf__max_features=20000, tfidf__min_df=5, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf_

0,1,2
,estimator,Pipeline(step...LinearSVC())])
,param_grid,"{'clf__C': [0.1, 0.5, ...], 'clf__class_weight': [None, 'balanced'], 'tfidf__max_df': [0.7, 0.85, ...], 'tfidf__max_features': [20000, 30000, ...], ...}"
,scoring,
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [52]:
print("Best Parameters:", grid.best_params_)
print("Best CV Score:", grid.best_score_)

Best Parameters: {'clf__C': 1, 'clf__class_weight': None, 'tfidf__max_df': 0.7, 'tfidf__max_features': 50000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Best CV Score: 0.8509286412512219


In [53]:
best_model = grid.best_estimator_

preds = best_model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))


Test Accuracy: 0.8669172932330828
               precision    recall  f1-score   support

     business       0.80      0.77      0.78       529
entertainment       0.83      0.76      0.79       499
       health       0.73      0.72      0.73       138
     politics       0.83      0.87      0.85      1933
      science       0.70      0.55      0.62       112
        sport       0.96      0.98      0.97      1679
   technology       0.59      0.44      0.51        86
        world       0.88      0.87      0.88      1674

     accuracy                           0.87      6650
    macro avg       0.79      0.75      0.77      6650
 weighted avg       0.87      0.87      0.87      6650

