In [1]:
import pandas as pd

In [7]:
df = pd.read_csv("../Downloads/AG_news.csv")

In [9]:
df

Unnamed: 0,Class Index,Title,Description
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...
...,...,...,...
7595,1,Around the world,Ukrainian presidential candidate Viktor Yushch...
7596,2,Void is filled with Clement,With the supply of attractive pitching options...
7597,2,Martinez leaves bitter,Like Roger Clemens did almost exactly eight ye...
7598,3,5 of arthritis patients in Singapore take Bext...,SINGAPORE : Doctors in the United States have ...


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7600 entries, 0 to 7599
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class Index  7600 non-null   int64 
 1   Title        7600 non-null   object
 2   Description  7600 non-null   object
dtypes: int64(1), object(2)
memory usage: 178.3+ KB


In [13]:
df.describe()

Unnamed: 0,Class Index
count,7600.0
mean,2.5
std,1.118108
min,1.0
25%,1.75
50%,2.5
75%,3.25
max,4.0


In [19]:
df.count()

Class Index    7600
Title          7600
Description    7600
dtype: int64

In [21]:
df.isnull().sum()

Class Index    0
Title          0
Description    0
dtype: int64

In [23]:
df['Class Index'].value_counts()

Class Index
3    1900
4    1900
2    1900
1    1900
Name: count, dtype: int64

In [25]:
import re
import nltk

In [27]:
df['text'] = df['Title'] + " " + df['Description']

In [29]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)     
    text = re.sub(r'[^a-z\s]', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['text'] = df['text'].apply(clean_text)

In [31]:
from nltk.tokenize import word_tokenize
df['tokens'] = df['text'].apply(word_tokenize)

In [33]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

df['tokens'] = df['tokens'].apply(
    lambda words: [word for word in words if word not in stop_words]
)

In [35]:
def remove_special_characters(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text
df['text'] = df['text'].apply(remove_special_characters)

In [37]:
df['clean_text'] = df['tokens'].apply(lambda words: " ".join(words))

In [39]:
df[['Class Index', 'clean_text']].head()

Unnamed: 0,Class Index,clean_text
0,3,fears n pension talks unions representing work...
1,4,race second private team sets launch date huma...
2,4,ky company wins grant study peptides ap ap com...
3,4,prediction unit helps forecast wildfires ap ap...
4,4,calif aims limit farmrelated smog ap ap southe...


In [41]:
x = df['text']
y = df['Class Index']

In [43]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english',max_features=5000)
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

In [47]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
mnb = MultinomialNB()
mnb.fit(x_train_tfidf, y_train)
y_pred = mnb.predict(x_test_tfidf)

In [49]:
print("Multinomial NB Accuracy: ",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Multinomial NB Accuracy:  0.8809210526315789
              precision    recall  f1-score   support

           1       0.87      0.90      0.88       373
           2       0.93      0.97      0.95       389
           3       0.83      0.86      0.85       359
           4       0.90      0.79      0.84       399

    accuracy                           0.88      1520
   macro avg       0.88      0.88      0.88      1520
weighted avg       0.88      0.88      0.88      1520



In [51]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train_tfidf, y_train)
y_pred = lr.predict(x_test_tfidf)

In [53]:
print("Logistic Regression Accuracy: ",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Logistic Regression Accuracy:  0.8828947368421053
              precision    recall  f1-score   support

           1       0.87      0.90      0.88       373
           2       0.94      0.97      0.95       389
           3       0.84      0.85      0.85       359
           4       0.87      0.82      0.85       399

    accuracy                           0.88      1520
   macro avg       0.88      0.88      0.88      1520
weighted avg       0.88      0.88      0.88      1520



In [55]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier( n_estimators=100,random_state=42)
rf.fit(x_train_tfidf, y_train)
y_pred = rf.predict(x_test_tfidf)

In [57]:
print("Random Forest Accuracy: ",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Random Forest Accuracy:  0.819078947368421
              precision    recall  f1-score   support

           1       0.83      0.84      0.84       373
           2       0.89      0.90      0.89       389
           3       0.78      0.76      0.77       359
           4       0.78      0.77      0.77       399

    accuracy                           0.82      1520
   macro avg       0.82      0.82      0.82      1520
weighted avg       0.82      0.82      0.82      1520

