In [13]:
import os
import pandas as pd
import regex
import re
import requests
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [14]:
# Preprocesarea textului
def remove_punc(text):
    """Elimină semnele de punctuație din text."""
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_emojis(text):
    """Elimină emoji-urile din text."""
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return emoji_pattern.sub('', text)

def remove_url(text):
    """Elimină URL-urile din text."""
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

def remove_stopwords(text):
    """Elimină stopword-urile din text."""
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

def replace_abbreviations(text, abbreviation_dict):
    """Înlocuiește abrevierile cu forma completă."""
    for abbreviation, full_form in abbreviation_dict.items():
        text = text.replace(abbreviation, full_form)
    return text

def preprocess_text(text, abbreviation_dict):
    """Pipeline pentru preprocesarea textului."""
    if not isinstance(text, str) or text.strip() == "":
        return ""
    text = text.lower()
    text = remove_url(text)
    text = remove_emojis(text)
    text = remove_punc(text)
    text = remove_stopwords(text)
    text = replace_abbreviations(text, abbreviation_dict)
    return text

In [15]:
# Calea către fișierul CSV
file_path = 'C:\\Users\\djtsms\\Desktop\\Master\\MODUL 2\\Semantica si pragmatica limbajului natural-S1\\Proiect Old Team\\english_news_dataset.csv'


In [16]:
# Verifică dacă fișierul există
if not os.path.exists(file_path):
    print(f"Fișierul nu a fost găsit: {file_path}")
    exit()

In [17]:
# Încărcă datele
df = pd.read_csv(file_path)
# Afișează primele 10 rânduri
print(df.head(10))

                                            Headline  \
0  Congress leader Baljinder Singh shot dead at h...   
1  17-year-old girl preparing for NEET dies by su...   
2  Hampers to welcome MPs in new Parliament tomor...   
3  Only 10% women lawmakers in RS, while only 14%...   
4  Ganesh temple decorated with notes, coins wort...   
5  Bee attack during funeral in K'taka leaves 1 d...   
6  Manipur govt forms inquiry commission to probe...   
7  Cabinet approves Bill giving 33% reservation t...   
8  Won't accept his remains: Wife of Army soldier...   

                                             Content News Categories  \
0  Congress leader Baljinder Singh was shot dead ...    ['national']   
1  Another NEET aspirant died by suicide in Rajas...    ['national']   
2  In order to mark the first-ever working day of...    ['national']   
3  Congress President Mallikarjun Kharge, while s...    ['national']   
4  The Sri Sathya Ganapathi Temple in Bengaluru a...    ['national']   
5  One 

In [18]:
# Gruparea claselor rare
threshold = 5
class_counts = df['News Categories'].value_counts()
rare_classes = class_counts[class_counts < threshold].index
df['category_grouped'] = df['News Categories'].apply(lambda x: 'Other' if x in rare_classes else x)


In [19]:
abbreviation_dict = {
    'LOL': 'laugh out loud',
    'BRB': 'be right back',
    'OMG': 'oh my god',
    'AFAIK': 'as far as I know',
    'AFK': 'away from keyboard',
    'ASAP': 'as soon as possible',
    'ATK': 'at the keyboard',
    'ATM': 'at the moment',
    'A3': 'anytime, anywhere, anyplace',
    'BAK': 'back at keyboard',
    'BBL': 'be back later',
    'BBS': 'be back soon',
    'BFN': 'bye for now',
    'B4N': 'bye for now',
    'BRB': 'be right back',
    'BRT': 'be right there',
    'BTW': 'by the way',
    'B4': 'before',
    'B4N': 'bye for now',
    'CU': 'see you',
    'CUL8R': 'see you later',
    'CYA': 'see you',
    'FAQ': 'frequently asked questions',
    'FC': 'fingers crossed',
    'FWIW': 'for what it\'s worth',
    'FYI': 'For Your Information',
    'GAL': 'get a life',
    'GG': 'good game',
    'GN': 'good night',
    'GMTA': 'great minds think alike',
    'GR8': 'great!',
    'G9': 'genius',
    'IC': 'i see',
    'ICQ': 'i seek you',
    'ILU': 'i love you',
    'IMHO': 'in my honest/humble opinion',
    'IMO': 'in my opinion',
    'IOW': 'in other words',
    'IRL': 'in real life',
    'KISS': 'keep it simple, stupid',
    'LDR': 'long distance relationship',
    'LMAO': 'laugh my a.. off',
    'LOL': 'laughing out loud',
    'LTNS': 'long time no see',
    'L8R': 'later',
    'MTE': 'my thoughts exactly',
    'M8': 'mate',
    'NRN': 'no reply necessary',
    'OIC': 'oh i see',
    'PITA': 'pain in the a..',
    'PRT': 'party',
    'PRW': 'parents are watching',
    'QPSA?': 'que pasa?',
    'ROFL': 'rolling on the floor laughing',
    'ROFLOL': 'rolling on the floor laughing out loud',
    'ROTFLMAO': 'rolling on the floor laughing my a.. off',
    'SK8': 'skate',
    'STATS': 'your sex and age',
    'ASL': 'age, sex, location',
    'THX': 'thank you',
    'TTFN': 'ta-ta for now!',
    'TTYL': 'talk to you later',
    'U': 'you',
    'U2': 'you too',
    'U4E': 'yours for ever',
    'WB': 'welcome back',
    'WTF': 'what the f...',
    'WTG': 'way to go!',
    'WUF': 'where are you from?',
    'W8': 'wait...',
    '7K': 'sick laughter',
    'TFW': 'that feeling when',
    'MFW': 'my face when',
    'MRW': 'my reaction when',
    'IFYP': 'i feel your pain',
    'LOL': 'laughing out loud',
    'TNTL': 'trying not to laugh',
    'JK': 'just kidding',
    'IDC': 'i don’t care',
    'ILY': 'i love you',
    'IMU': 'i miss you',
    'ADIH': 'another day in hell',
    'IDC': 'i don’t care',
    'ZZZ': 'sleeping, bored, tired',
    'WYWH': 'wish you were here',
    'TIME': 'tears in my eyes',
    'BAE': 'before anyone else',
    'FIMH': 'forever in my heart',
    'BSAAW': 'big smile and a wink',
    'BWL': 'bursting with laughter',
    'LMAO': 'laughing my a** off',
    'BFF': 'best friends forever',
    'CSL': 'can’t stop laughing',
}

In [20]:
# Preprocesarea datelor
df['Content'] = df['Content'].apply(lambda x: preprocess_text(x, abbreviation_dict))

In [21]:
# Pregătirea datelor pentru model
X = df['Content']
y = df['category_grouped']

In [22]:
X

0         congress leader baljinder singh shot dead hous...
1         another neet aspirant died suicide rajasthans ...
2         order mark firstever working day new parliamen...
3         congress president mallikarjun kharge speaking...
4         sri sathya ganapathi temple bengaluru adorned ...
                                ...                        
199701    pancreas incredibly important organ particular...
199702    recent study published oral diseases reported ...
199703    hospitalacquired infections hais refer infecti...
199704    scientists university oxford uk launched first...
199705    high blood pressure happens force blood pushin...
Name: Content, Length: 199706, dtype: object

In [23]:
# Encoding labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [24]:
# Împărțirea datelor
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


In [25]:
# Construirea și antrenarea modelului
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

In [26]:
# Evaluare pe datele de test
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.892
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        32
           1       0.00      0.00      0.00         8
           2       1.00      1.00      1.00        11
           3       1.00      0.83      0.91        30
           4       1.00      1.00      1.00         9
           5       0.00      0.00      0.00         7
           6       0.00      0.00      0.00         1
           7       1.00      1.00      1.00         2
           8       1.00      1.00      1.00        18
           9       1.00      0.71      0.83         7
          10       0.00      0.00      0.00         2
          11       1.00      1.00      1.00         2
          12       1.00      1.00      1.00        15
          13       0.00      0.00      0.00         7
          14       1.00      1.00      1.00        10
          15       1.00      1.00      1.00         6
          16       1.00      1.00      1.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# Clasificarea unui text preluat de pe o pagină web
from bs4 import BeautifulSoup
def classify_text_from_web(url):
    """Preprocesează și clasifică textul preluat de pe o pagină web."""
    try:
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Eroare la accesarea paginii: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        content = ' '.join([p.get_text() for p in paragraphs])
        processed_text = preprocess_text(content, abbreviation_dict)
        predicted_label_encoded = model.predict([processed_text])[0]
        predicted_label = le.inverse_transform([predicted_label_encoded])[0]
        return predicted_label

    except Exception as e:
        print(f"Eroare: {e}")
        return

In [32]:
# Exemplu de utilizare
from bs4 import BeautifulSoup
example_url = 'https://www.cnbc.com/2025/01/09/google-donates-1-million-to-trumps-inauguration-fund.html'
predicted_category = classify_text_from_web(example_url)
if predicted_category:
    print(f"Categoria prezisă: {predicted_category}")

Categoria prezisă: ['business']
