In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('bbc-text.csv')


print(df.head())

        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
X = df['text']
y = df['category']

#list of text
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

tfidf = TfidfVectorizer(max_features=3000, stop_words='english')

# Learn the vocabulary and IDF weights from the training text (fit),
# then convert each training document into a TF-IDF numeric vector (transform).
X_train = tfidf.fit_transform(X_train_raw).toarray()
X_test = tfidf.transform(X_test_raw).toarray()

print(f"Data split into {len(X_train)} training and {len(X_test)} testing samples.")

Data split into 1780 training and 445 testing samples.


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Find the majority class in the training set
majority_class = y_train.value_counts().idxmax()
print(f"The majority class is: {majority_class}")

# Predict the majority class for all test samples
y_pred_simple = [majority_class] * len(y_test)

# Compute metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred_simple):.4f}")
print(f"Precision (weighted): {precision_score(y_test, y_pred_simple, average='weighted', zero_division=0):.4f}")
print(f"Recall (weighted): {recall_score(y_test, y_pred_simple, average='weighted', zero_division=0):.4f}")

# Recall for the 'sport' class only
print(f"Recall (sport): {recall_score(y_test, y_pred_simple, labels=['sport'], average=None)[0]:.4f}")

The majority class is: sport
Accuracy: 0.2427
Precision (weighted): 0.0589
Recall (weighted): 0.2427
Recall (sport): 1.0000


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# 1. יצירת מילון מילים לכל קטגוריה בעזרת TF-IDF
# אנחנו לוקחים את המילים עם הציון הגבוה ביותר לכל מחלקה
tfidf_gen = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = tfidf_gen.fit_transform(X_train_raw)
words = np.array(tfidf_gen.get_feature_names_out())

category_dictionaries = {}

for cat in y_train.unique():
    # מציאת הכתבות ששייכות לקטגוריה הספציפית
    cat_indices = np.where(y_train == cat)[0]
    # חישוב ממוצע ציוני ה-TF-IDF של המילים בקטגוריה זו
    cat_means = np.mean(X_train_tfidf[cat_indices], axis=0).A1
    # לקיחת 20 המילים המובילות
    top_indices = cat_means.argsort()[-20:]
    category_dictionaries[cat] = set(words[top_indices])
    print(f"Top words for {cat}: {list(category_dictionaries[cat])[:5]}...")

# 2. פונקציית חיזוי לפי המילונים
def predict_by_dictionary(text):
    text_words = set(text.lower().split())
    scores = {}
    for cat, dict_words in category_dictionaries.items():
        # ספירה כמה מילים מהטקסט נמצאות במילון של הקטגוריה
        scores[cat] = len(text_words.intersection(dict_words))

    # החזרת הקטגוריה עם הניקוד הגבוה ביותר
    return max(scores, key=scores.get)

# 3. הרצה על סט הבדיקה וחישוב Accuracy
y_pred_dict = X_test_raw.apply(predict_by_dictionary)
print(f"\nDictionary-based Baseline Accuracy: {accuracy_score(y_test, y_pred_dict):.4f}")
print(f"Precision (weighted): {precision_score(y_test, y_pred_dict, average='weighted', zero_division=0):.4f}")
print(f"Recall (weighted): {recall_score(y_test, y_pred_dict, average='weighted', zero_division=0):.4f}")


Top words for tech: ['mr', 'use', 'broadband', 'said', 'microsoft']...
Top words for politics: ['mr', 'government', 'tories', 'said', 'chancellor']...
Top words for sport: ['chelsea', 'win', 'match', 'said', 'club']...
Top words for business: ['growth', 'mr', 'government', 'oil', 'said']...
Top words for entertainment: ['films', 'actor', 'uk', 'said', 'film']...

Dictionary-based Baseline Accuracy: 0.8831
Precision (weighted): 0.8861
Recall (weighted): 0.8831
