In [1]:
!pip3 install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

### Loading Dataset

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from datasets import load_dataset

# train_dataset = load_dataset('ag_news', split='train')
# test_dataset = load_dataset('ag_news', split='test')

dataset = load_dataset('ag_news')

train_data = dataset['train']

df = pd.DataFrame(train_data)

# X_train = df['text']
y_train = df['label']

test_data = dataset['test']

df_test = pd.DataFrame(test_data)

# X_test = df_test['text']
y_test = df_test['label']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Data Preprocessing

In [3]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove non-alphanumeric characters
    text = re.sub(r'\W', ' ', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [4]:
X_train = df['text'].apply(preprocess_text)
X_test = df_test['text'].apply(preprocess_text)

In [7]:
gini_accuracies = []
entropy_accuracies = []

for loss in ['gini', 'entropy']:
  for i in range(50, 80, 5):
    model = make_pipeline(TfidfVectorizer(), DecisionTreeClassifier(criterion=loss, max_depth=i))

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Display results
    print(f"Accuracy: {accuracy}")
    if(loss == 'gini'):
      gini_accuracies.append(accuracy)
    else:
      entropy_accuracies.append(accuracy)

print(gini_accuracies)
print(entropy_accuracies)

Accuracy: 0.7192105263157895
Accuracy: 0.7309210526315789
Accuracy: 0.7367105263157895
Accuracy: 0.7456578947368421
Accuracy: 0.7523684210526316
Accuracy: 0.7592105263157894
Accuracy: 0.7126315789473684
Accuracy: 0.7288157894736842
Accuracy: 0.7390789473684211
Accuracy: 0.7467105263157895
Accuracy: 0.751578947368421
Accuracy: 0.7531578947368421
[0.7192105263157895, 0.7309210526315789, 0.7367105263157895, 0.7456578947368421, 0.7523684210526316, 0.7592105263157894]
[0.7126315789473684, 0.7288157894736842, 0.7390789473684211, 0.7467105263157895, 0.751578947368421, 0.7531578947368421]


In [8]:
model = make_pipeline(TfidfVectorizer(), DecisionTreeClassifier())

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display results
print(f"Accuracy: {accuracy}")

Accuracy: 0.8168421052631579
