In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

%matplotlib inline

In [2]:
#Example file to see what kind of data we are working with
df_example = pd.read_excel('articles_example_2024.xlsx')
df_example

Unnamed: 0,Date,Title,URL,Robot?,Stat,Heading,Author,Top,Lead article?
0,2024.12.31 21:02,Example article title 1,https://www.newsportal.hu/health/20241231/exam...,Not a robot article,13094,Health,Author_1,0,Not lead article
1,2024.12.31 20:01,Example article title 2,https://www.newsportal.hu/global/20241231/exam...,Not a robot article,50306,Global,Author_1,0,Not lead article
2,2024.12.31 19:03,Example article title 3,https://www.newsportal.hu/economy/20241231/exa...,Not a robot article,9974,Economy,Author_2,2,Afternoon lead article
3,2024.12.31 18:29,Example article title 4,https://www.newsportal.hu/home/20241231/exampl...,Not a robot article,5534,Home,Author_3,3,Not lead article
4,2024.12.31 18:02,Example article title 5,https://www.newsportal.hu/entertainment/202412...,Robot-generated article,3530,Entertainment,Author_1,0,Not lead article
5,2024.12.31 17:31,Example article title 6,https://www.newsportal.hu/global/20241231/exam...,Not a robot article,5256,Global,Author_4,0,Not lead article
6,2024.12.31 17:10,Example article title 7,https://www.newsportal.hu/pension/20241231/exa...,Not a robot article,24768,Pension,Author_3,3,Not lead article
7,2024.12.31 17:02,Example article title 8,https://www.newsportal.hu/global/20241231/exam...,Robot-generated article,2540,Global,Author_1,0,Not lead article
8,2024.12.31 16:34,Example article title 9,https://www.newsportal.hu/enterprise/20241231/...,Not a robot article,29260,Enterprise,Author_4,0,Not lead article
9,2024.12.31 16:01,Example article title 10,https://www.newsportal.hu/savings/20241231/exa...,Not a robot article,25308,Savings,Author_5,2,Afternoon lead article


In [3]:
df = pd.read_excel('articles_2024.xlsx')

In [4]:
df = df[2:-17] #Cutting off articles from 2023 and 2025

In [5]:
df.drop('URL', axis = 1, inplace = True) #Drop URL column

In [6]:
df = df[df['Author'] != 'Automatic'] #Drop automatic articles
df.reset_index(inplace = True, drop = True)

In [7]:
#For some reason there is a 'No heading' category, we will rename it
df[df['Heading'] == 'No heading']
df.iloc[6373, 4] = 'Savings'

In [8]:
df = pd.get_dummies(df, columns = ['Lead article?'], drop_first = False) #Grouping leaders into separate columns with dummy variables
df.iloc[:,-5:] = df.iloc[:,-5:].astype(int) #Convert True and False values in new columns to numbers

1        0
2        0
3        0
4        0
        ..
15294    0
15295    0
15296    0
15297    0
15298    0
Name: Lead article?_Afternoon lead article, Length: 15299, dtype: int64' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  df.iloc[:,-5:] = df.iloc[:,-5:].astype(int) #Convert True and False values in new columns to numbers
1        0
2        0
3        0
4        0
        ..
15294    0
15295    0
15296    1
15297    0
15298    0
Name: Lead article?_Early morning lead article, Length: 15299, dtype: int64' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  df.iloc[:,-5:] = df.iloc[:,-5:].astype(int) #Convert True and False values in new columns to numbers
1        0
2        0
3        0
4        0
        ..
15294    0
15295    0
15296    0
15297    0
15298    0
Name: Lead article?_Morning lead article, Length: 15299, dtype: int64' has dtype incompatible with bool, please explicitly cast to a compatibl

In [9]:
df.drop(['Date', 'Robot?', 'Heading', 'Author', 'Top'], axis = 1, inplace = True) #Drop unnecessary columns

In [10]:
df['Cleaned_title'] = df['Title'].str.lower() #Lowercase titles

In [11]:
#Special characters need to be removed
import re

df['Cleaned_title'] = df['Cleaned_title'].apply(lambda x: re.sub(r'[^a-záéíóöőúüű0-9\s]', '', x)) #Removing missing characters from []
df['Cleaned_title'] = df['Cleaned_title'].str.replace(r'\s+', ' ', regex = True).str.strip() #Merge multiple consecutive spaces into one space

In [12]:
#Removing stop words (e.g. 'and', 'a', 'an', 'the')

from nltk.corpus import stopwords

#Adding unique stop words
unique_stop_words = ['is', 'ha', 'le', 'fog', '2024', '2024ben', 'te', 'előtt', 'dolog', 'nap', 'párizs', 'kvíz', 'mire', 'olimpia']

stop_words = set(stopwords.words('hungarian'))
stop_words.update(unique_stop_words)
df['Cleaned_title'] = df['Cleaned_title'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [13]:
df.reset_index(inplace = True, drop = True)

In [14]:
from collections import Counter

all_words = ' '.join(df['Cleaned_title']).split() #Concatenate all titles into a single string with spaces, then split them
                                                   #along the spaces, putting the words into one big list
word_counts = Counter(all_words)
print(word_counts.most_common(10))

[('magyar', 3143), ('rengeteg', 886), ('magyarok', 783), ('magyarországon', 767), ('óriási', 613), ('súlyos', 557), ('fontos', 482), ('komoly', 393), ('forint', 367), ('sokan', 356)]


In [15]:
all_words = list(set(all_words))
#all_words

In [16]:
len(all_words)

28951

In [17]:
#LET'S LOOK AT THE MODEL ITSELF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [19]:
#Extract the line(s) containing the longest text
max_length = df['Cleaned_title'].apply(len).max()

In [20]:
vectorizer = TfidfVectorizer(max_features = 200)
X_text = vectorizer.fit_transform(df['Cleaned_title']).toarray()

In [21]:
X_features = df.iloc[:, 2:-1]
X = pd.concat([pd.DataFrame(X_text), X_features], axis = 1) #Convert X_text to DF and merge it with other features

In [22]:
y = df['Stat']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
#Convert column names to strings (this is needed for RandomForestRegressor)
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [25]:
print(X_train.dtypes)

0                                           float64
1                                           float64
2                                           float64
3                                           float64
4                                           float64
                                             ...   
Lead article?_Afternoon lead article          int64
Lead article?_Early morning lead article      int64
Lead article?_Morning lead article            int64
Lead article?_Not lead article                int64
Lead article?_Tabloid lead article            int64
Length: 205, dtype: object


In [26]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [27]:
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared = False)
print(f'RMSE: {rmse}')

RMSE: 49902.72355070007




In [28]:
y_pred

array([ 6890.095     ,  6683.78699248, 12343.08814286, ...,
       86845.21142857, 21590.27299666, 29662.70123119])

In [29]:
#TESTING

In [30]:
probe = pd.DataFrame({
    'Title': ['Őrült árvágtában a magyarok kedvenc gyümölcslevei: mindenki nézhet egy nagyot a kasszáknál'], #Example article title
    'Lead article?_Afternoon lead article': [1],
    'Lead article?_Early morning lead article': [0],
    'Lead article?_Morning lead article': [0],
    'Lead article?_Not lead article': [0],
    'Lead article?_Tabloid lead article': [0]
}
                    )

In [31]:
probe

Unnamed: 0,Title,Lead article?_Afternoon lead article,Lead article?_Early morning lead article,Lead article?_Morning lead article,Lead article?_Not lead article,Lead article?_Tabloid lead article
0,Őrült árvágtában a magyarok kedvenc gyümölcsle...,1,0,0,0,0


In [32]:
probe['Cleaned_title'] = probe['Title'].str.lower() #Lowercase titles

In [33]:
probe['Cleaned_title'] = probe['Cleaned_title'].apply(lambda x: re.sub(r'[^a-záéíóöőúüű0-9\s]', '', x)) #Removing missing characters from []
probe['Cleaned_title'] = probe['Cleaned_title'].str.replace(r'\s+', ' ', regex = True).str.strip() #Merge multiple consecutive spaces into one space

In [34]:
probe['Cleaned_title'] = probe['Cleaned_title'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [35]:
#vectorizer = TfidfVectorizer(max_features = 500)
probe_text = vectorizer.transform(probe['Cleaned_title']).toarray()

In [36]:
probe = pd.concat([pd.DataFrame(probe_text), probe], axis = 1) ##Convert probe_text to DF and merge it with other features

In [37]:
probe.drop(['Title', 'Cleaned_title'], inplace = True, axis = 1)

In [38]:
#Convert column names to strings (this is needed for RandomForestRegressor)
probe.columns = probe.columns.astype(str)

In [39]:
predicted_clicks = model.predict(probe)

In [40]:
print(predicted_clicks)

[38732.86511842]


In [41]:
#Save the model
import joblib
joblib.dump(model, 'readership_predictor.pkl')

['readership_predictor.pkl']

In [42]:
#Use saved model
loaded_model = joblib.load('readership_predictor.pkl')
prediction = loaded_model.predict(probe)
print(prediction)

[38732.86511842]


In [43]:
#Save the trained vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']