# Analisi Dataset

In [38]:
import pandas as pd
import re
from collections import Counter
import numpy as np

In [39]:
# Count occurrences of years in a specific range within a text column of a DataFrame
def count_years_in_texts(df, start_year, end_year, column='texts'):
    years = [str(year) for year in range(start_year, end_year + 1)]
    year_pattern = re.compile(r'\b(' + '|'.join(years) + r')\b')

    all_years = []
    for text in df[column]:
        found_years = year_pattern.findall(str(text))
        all_years.extend(found_years)

    year_counts = Counter(all_years)
    for year in years:
        print(f"{year}: {year_counts.get(year, 0)}")
    
    return year_pattern


# Extract the most common year from the text
def extract_most_common_year(text, year_pattern):
    found_years = year_pattern.findall(text)
    if not found_years:
        return np.nan  # or 'No year found'
    counter = Counter(found_years)
    # returns the most common year
    return counter.most_common(1)[0][0]

## Celebrity

paper di riferimento: https://arxiv.org/abs/1708.07104

tema: gossip e celebrità

pubblicazione: 2017

date paper: non segna date

dataset originale (NON CI SONO date): https://web.eecs.umich.edu/~mihalcea/downloads.html?utm_source=chatgpt.com#FakeNews

----

**RISULTATO:**
* Molto probabile dal 2010 al 2017
* Meno probabile dal 2000 al 2009

In [40]:
percorso_file = "datasets/Celebrity/df_celebrity.csv"
df = None
df = pd.read_csv(percorso_file, sep="\t", encoding="utf-8")

In [41]:
df.head()

Unnamed: 0.1,Unnamed: 0,texts,labels
0,0,Robert Pattinson and FKA Twigs Take Cannes in ...,0
1,1,Kathy Griffin Wants To Get Roasted On Comedy C...,1
2,2,WATCH: Donald Trump's Ex-Wife Marla Maples Get...,0
3,3,Jennifer Hudson And Kelly Clarkson Are Coming ...,0
4,4,'Girl' Fight! Kanye West Confronts Anna Wintou...,1


In [42]:
# print total length
print("Total number of rows:", len(df))

# print number of 0 labels
print("Number of Real News:", (df['labels'] == 0).sum())
# print number of 1 labels
print("Number of Fake News:", (df['labels'] == 1).sum())

Total number of rows: 500
Number of Real News: 250
Number of Fake News: 250


In [43]:
start_year = 2000
end_year = 2017
year_pattern = count_years_in_texts(df, start_year, end_year, column='texts')

2000: 11
2001: 14
2002: 17
2003: 19
2004: 17
2005: 18
2006: 11
2007: 17
2008: 22
2009: 13
2010: 24
2011: 15
2012: 29
2013: 33
2014: 56
2015: 52
2016: 83
2017: 53


In [44]:
df['year'] = df['texts'].apply(extract_most_common_year, year_pattern=year_pattern)

# count NaN "year" rows
print(f"NaN year count: {df['year'].isnull().sum()} over {len(df)} rows")

NaN year count: 288 over 500 rows


In [45]:
df['year'].value_counts().sort_index()

year
2000     2
2001     3
2002     5
2003     6
2004     6
2005     8
2006     4
2007     7
2008     9
2009     6
2010     9
2011     6
2012    12
2013    14
2014    20
2015    24
2016    38
2017    33
Name: count, dtype: int64

## CIDII

paper di riferimento: http://mjs.um.edu.my/index.php/MJCS/article/view/41935

tema: problemi Islamici

pubblicazione: 

date paper: dal 1/1/2016 al 20/1/2021

dataset originale (NON CI SONO date): https://drive.google.com/drive/folders/14HBFsDAe8hbNn6S6qTJ84o-9_IETDOq0

----

**RISULTATO:**
* Non si trovano date neanche nel testo

In [46]:
percorso_file = "datasets/CIDII/df_cidii.csv"
df = None
df = pd.read_csv(percorso_file, sep="\t", encoding="utf-8")

In [47]:
df.head()

Unnamed: 0.1,Unnamed: 0,texts,labels
0,0,"Muhammad permitted lying (Sahih Muslim 6303, B...",1
1,1,And one of His signs is that He created for yo...,0
2,2,"She reported God's Messenger as saying, The be...",0
3,3,Sahl bin Hunaif and Qais bin Sad were sitting ...,0
4,4,It should be noted that in the long history of...,0


In [48]:
# print total length
print("Total number of rows:", len(df))

# print number of 0 labels
print("Number of Real News:", (df['labels'] == 0).sum())
# print number of 1 labels
print("Number of Fake News:", (df['labels'] == 1).sum())

Total number of rows: 722
Number of Real News: 422
Number of Fake News: 300


In [49]:
start_year = 2016
end_year = 2021
year_pattern = count_years_in_texts(df, start_year, end_year, column='texts')

2016: 0
2017: 0
2018: 0
2019: 0
2020: 0
2021: 0


## Fa-KES

paper di riferimento: https://ojs.aaai.org/index.php/ICWSM/article/view/3254

tema: guerra in Siria

pubblicazione: 2019

date apper: dal 2011 al 2018

dataset originale (CI SONO date): https://ojs.aaai.org/index.php/ICWSM/article/view/3254

----

**RISULTATO:**
* Molte news dal 2013 al 2017
* Poche news del 2011 e 2012

In [50]:
percorso_file = "datasets/FaKES/df_fakes.csv"
df = None
df = pd.read_csv(percorso_file, sep="\t", encoding="utf-8")

In [51]:
df.head()

Unnamed: 0.1,Unnamed: 0,texts,labels
0,0,6 September 2016 10 civilians killed 15 injure...,1
1,1,05-09-2013 Damascus Car bomb Kills Four . Four...,1
2,2,Thu Aug 18 2016 70 Militants Killed in Aleppos...,0
3,3,1 February 2016 Russian Aerospace Forces destr...,0
4,4,Date of publication 5 July 2016 The bomber was...,0


In [52]:
# print total length
print("Total number of rows:", len(df))

# print number of 0 labels
print("Number of Real News:", (df['labels'] == 0).sum())
# print number of 1 labels
print("Number of Fake News:", (df['labels'] == 1).sum())

Total number of rows: 804
Number of Real News: 426
Number of Fake News: 378


In [53]:
start_year = 2011
end_year = 2018
year_pattern = count_years_in_texts(df, start_year, end_year, column='texts')

2011: 210
2012: 54
2013: 115
2014: 166
2015: 139
2016: 482
2017: 171
2018: 22


In [None]:
percorso_file_date = "datasets/FaKES/date-FA-KES-Dataset.csv"
df_dates = pd.read_csv(percorso_file_date, encoding="ISO-8859-1")

df_dates = df_dates.rename(columns={"article_content": "texts"})
# join on "texts" column, keeping all rows from df, but adding "date" from df_dates where available
df_merged = pd.merge(df, df_dates[['texts', 'date']], on="texts", how="left")


df_dates['date'] = pd.to_datetime(df_dates['date'], errors='coerce')
df_dates.head()

# check
print(df_merged['date'].isna().sum(), "rows without a match")

39 rows without a match


In [18]:
df_merged['year'] = pd.to_datetime(df_merged['date'], errors='coerce').dt.year
df_merged['year'].value_counts().sort_index()

year
2011.0      2
2012.0      2
2013.0     51
2014.0    114
2015.0     84
2016.0    392
2017.0    158
Name: count, dtype: int64

## FakeVsSatire

paper di riferimento: https://www.researchgate.net/publication/325212005_Fake_News_vs_Satire_A_Dataset_and_Analysis

tema: politica

pubblicazione: 2018

date paper: tra 01/2016 e 10/2017

dataset originale (NON CI SONO date): https://github.com/jgolbeck/fakenews

----

**RISULTATO:**
* Solo news del 2016 e 2017

In [19]:
percorso_file = "datasets/FakeVsSatire/df_fakevssatire.csv"
df = None
df = pd.read_csv(percorso_file, sep="\t", encoding="utf-8")

In [20]:
df.head()

Unnamed: 0.1,Unnamed: 0,texts,labels
0,0,HOSPITAL INSIDER REVEALS Ã¢ÂÂTHE GOVERNMENT ...,1
1,1,Whaaaat? Trump Chief Was Wiretapped Both Befor...,1
2,2,Obama Challenges ISIS to Meet U.N. Emissions G...,0
3,3,George Soros Furious After Cops Shut Down Leth...,1
4,4,Civil War Monuments are uprooting themselves a...,0


In [21]:
# print total length
print("Total number of rows:", len(df))

# print number of 0 labels
print("Number of Real News:", (df['labels'] == 0).sum())
# print number of 1 labels
print("Number of Fake News:", (df['labels'] == 1).sum())

Total number of rows: 486
Number of Real News: 203
Number of Fake News: 283


In [22]:
start_year = 2016
end_year = 2017
year_pattern = count_years_in_texts(df, start_year, end_year, column='texts')

2016: 108
2017: 233


In [23]:
df['year'] = df['texts'].apply(extract_most_common_year, year_pattern=year_pattern)

# count NaN "year" rows
print(f"NaN year count: {df['year'].isnull().sum()} over {len(df)} rows")

NaN year count: 241 over 486 rows


In [24]:
df['year'].value_counts().sort_index()

year
2016     60
2017    185
Name: count, dtype: int64

## Horne

paper di riferimento: https://ojs.aaai.org/index.php/ICWSM/article/view/14976

tema: politica

pubblicazione: 2017

date paper: probabilmente news del 2016

dataset originale (NON CI SONO date): https://github.com/BenjaminDHorne/fakenewsdata1

----

**RISULTATO:**
* Ci sono più news del 2016 (ma poche sul totale)



In [25]:
percorso_file = "datasets/Horne/df_horne_all.csv"
df = None
df = pd.read_csv(percorso_file, sep="\t", encoding="utf-8")

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,texts,labels
0,0,"NEW YORK, NYÃÂDuring his Sunday interview wi...",0
1,1,Congress is preparing to do major battle next ...,0
2,2,President-elect Donald Trump is lashing out in...,0
3,3,"WASHINGTON DC ÃÂ On Monday afternoon, an ext...",1
4,4,Senate Democrats dropped their objections Frid...,0


In [27]:
# print total length
print("Total number of rows:", len(df))

# print number of 0 labels
print("Number of Real News:", (df['labels'] == 0).sum())
# print number of 1 labels
print("Number of Fake News:", (df['labels'] == 1).sum())

Total number of rows: 326
Number of Real News: 203
Number of Fake News: 123


In [28]:
start_year = 2015
end_year = 2017
year_pattern = count_years_in_texts(df, start_year, end_year, column='texts')

2015: 47
2016: 153
2017: 17


In [29]:
df['year'] = df['texts'].apply(extract_most_common_year, year_pattern=year_pattern)

# count NaN "year" rows
print(f"NaN year count: {df['year'].isnull().sum()} over {len(df)} rows")

NaN year count: 229 over 326 rows


In [30]:
df['year'].value_counts().sort_index()

year
2015    23
2016    61
2017    13
Name: count, dtype: int64

## Infodemic

paper di riferimento: https://arxiv.org/abs/2011.03327

tema: covid

pubblicazione: 2021

date paper: sicuramente 2020 (forse anche 2021)

dataset originale (NON CI SONO date): https://competitions.codalab.org/competitions/26655#phases

----

**RISULTATO:**
* Solo news del 2020 (ma poche sul totale)

In [31]:
percorso_file = "datasets/Infodemic/df_infodemic.csv"
df = None
df = pd.read_csv(percorso_file, sep="\t", encoding="utf-8")

In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,texts,labels
0,0,The CDC currently reports 99031 deaths. In gen...,0.0
1,1,States reported 1121 deaths a small rise from ...,0.0
2,2,Politically Correct Woman (Almost) Uses Pandem...,1.0
3,3,#IndiaFightsCorona: We have 1524 #COVID testin...,0.0
4,4,Populous states can generate large case counts...,0.0


In [33]:
# print total length
print("Total number of rows:", len(df))

# print number of 0 labels
print("Number of Real News:", (df['labels'] == 0).sum())
# print number of 1 labels
print("Number of Fake News:", (df['labels'] == 1).sum())

Total number of rows: 10561
Number of Real News: 5526
Number of Fake News: 5033


In [34]:
start_year = 2020
end_year = 2021
year_pattern = count_years_in_texts(df, start_year, end_year, column='texts')

2020: 422
2021: 16


In [35]:
df['year'] = df['texts'].apply(extract_most_common_year, year_pattern=year_pattern)

# count NaN "year" rows
print(f"NaN year count: {df['year'].isnull().sum()} over {len(df)} rows")

NaN year count: 10196 over 10561 rows


In [36]:
df['year'].value_counts().sort_index()

year
2020    352
2021     13
Name: count, dtype: int64

## ISOT

paper di riferimento: https://www.researchgate.net/publication/320300831_Detection_of_Online_Fake_News_Using_N-Gram_Analysis_and_Machine_Learning_Techniques

tema: Elezioni USA 2016

pubblicazione: 2017

date paper: news del 2016

dataset originale (NON CI SONO date): https://onlineacademiccommunity.uvic.ca/isot/2022/11/27/fake-news-detection-datasets/?utm_source=chatgpt.com

----

**RISULTATO:**
* Molte news del 2016
* Possibli news del 2015 e 2017 (anni molto presenti anche se nel paper segna solo 2016)

In [37]:
percorso_file = "datasets/ISOT/df_isot.csv"
df = None
df = pd.read_csv(percorso_file, sep="\t", encoding="utf-8")

In [38]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,texts,labels
0,0,0,WASHINGTON (Reuters) - U.S. House Speaker Paul...,0
1,1,1,BRUSSELS (Reuters) - The European UnionÃ¢ÂÂs...,0
2,2,2,MEXICO CITY (Reuters) - Mexican President Enri...,0
3,3,3,Far too many people don t understand the aweso...,1
4,4,4,MAPUTO (Reuters) - Gun battles last week betwe...,0


In [39]:
# print total length
print("Total number of rows:", len(df))

# print number of 0 labels
print("Number of Real News:", (df['labels'] == 0).sum())
# print number of 1 labels
print("Number of Fake News:", (df['labels'] == 1).sum())

Total number of rows: 44271
Number of Real News: 21416
Number of Fake News: 22855


In [40]:
start_year = 2015
end_year = 2017
year_pattern = count_years_in_texts(df, start_year, end_year, column='texts')

2015: 5222
2016: 10223
2017: 5629


In [41]:
df['year'] = df['texts'].apply(extract_most_common_year, year_pattern=year_pattern)

# count NaN "year" rows
print(f"NaN year count: {df['year'].isnull().sum()} over {len(df)} rows")

NaN year count: 32857 over 44271 rows


In [42]:
df['year'].value_counts().sort_index()

year
2015    3213
2016    5759
2017    2442
Name: count, dtype: int64

## Kaggle_clement

tema: politica (e un po attualità)

dataset originale (CI SONO date)

----

**RISULTATO:**
* Solo news dal 2015 al 2017
* Sicure dal dataset originale

In [43]:
percorso_file = "datasets/Kaggle_clement/df_kaggle_clement_def.csv"
df = None

df = pd.read_csv(percorso_file, encoding="utf-8")

In [44]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,labels
0,0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [45]:
# print different values of 'subject' column
df['subject'].unique()

array(['politicsNews', 'worldnews', 'News', 'politics', 'Government News',
       'left-news', 'US_News', 'Middle-east'], dtype=object)

In [46]:
# print total length
print("Total number of rows:", len(df))

# print number of 0 labels
print("Number of Real News:", (df['labels'] == 0).sum())
# print number of 1 labels
print("Number of Fake News:", (df['labels'] == 1).sum())

Total number of rows: 44898
Number of Real News: 21417
Number of Fake News: 23481


In [47]:
years = [str(year) for year in range(2015, 2018)]
year_pattern = re.compile(r'\b(201[5-7])\b')

all_years = []
for text in df['date']:
    found_years = year_pattern.findall(text)
    all_years.extend(found_years)

year_counts = Counter(all_years)
for year in years:
    print(f"{year}: {year_counts.get(year, 0)}")

2015: 2485
2016: 16470
2017: 25904


## Kaggle_meg

tema: ??

dataset originale (CI SONO date): ??

----

**RISULTATO:**
* TUTTE news del 2016
* Sicure dal dataset originale


In [48]:
percorso_file = "datasets/Kaggle_meg/fake.csv"
df = None

df = pd.read_csv(percorso_file, encoding="utf-8")

In [49]:
df.head()

Unnamed: 0,uuid,ord_in_thread,author,published,title,text,language,crawled,site_url,country,domain_rank,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares,type
0,6a175f46bcd24d39b3e962ad0f29936721db70db,0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,english,2016-10-27T01:49:27.168+03:00,100percentfedup.com,US,25689.0,Muslims BUSTED: They Stole Millions In Gov’t B...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
1,2bdc29d12605ef9cf3f09f9875040a7113be5d5b,0,reasoning with facts,2016-10-29T08:47:11.259+03:00,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,2016-10-29T08:47:11.259+03:00,100percentfedup.com,US,25689.0,Re: Why Did Attorney General Loretta Lynch Ple...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
2,c70e149fdd53de5e61c29281100b9de0ed268bc3,0,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,2016-10-31T01:41:49.479+02:00,100percentfedup.com,US,25689.0,BREAKING: Weiner Cooperating With FBI On Hilla...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
3,7cf7c15731ac2a116dd7f629bd57ea468ed70284,0,Fed Up,2016-11-01T05:22:00.000+02:00,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,2016-11-01T15:46:26.304+02:00,100percentfedup.com,US,25689.0,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,0.068,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias
4,0206b54719c7e241ffe0ad4315b808290dbe6c0f,0,Fed Up,2016-11-01T21:56:00.000+02:00,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,2016-11-01T23:59:42.266+02:00,100percentfedup.com,US,25689.0,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,0.865,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias


In [50]:
# print different values of 'domain_rank' column
df['domain_rank'].unique()

array([25689.,    nan, 65078., 18533., 37161.,  5772., 78393., 25842.,
        3734., 94243., 42160., 12993., 48959., 61606.,  6659.,  7587.,
       93026.,   967., 16425., 22665., 10352., 87365.,   486., 23950.,
       39980., 36674., 15846., 60570., 11584., 77628., 22680., 25709.,
         553., 48175., 35381., 32069., 40299., 40480., 34811., 19116.,
       12387., 70635., 51784., 56000., 78345., 60463., 17592., 67400.,
       65917., 82758.,  2774., 68648., 32398., 79662., 53149., 44479.,
       26413., 30230., 23040., 21114., 68691., 98679., 54210.,  6416.,
       88660., 43327., 27046., 23349., 10414., 61423., 91187.,  4617.,
       12191., 75353., 38347., 18495., 41475., 17423., 22432., 44451.,
       16994., 57497., 28149., 25945.,   538., 31016., 63847., 62759.,
       22891., 91245.,  1616., 96853., 18896., 91878., 33221., 63072.,
       56487., 65975., 65060.,  8968., 18369., 93150.,  2182., 19375.,
       42575., 37614.,  9645., 14958., 30890., 84363., 24453., 49688.,
      

In [51]:
# print total length
print("Total number of rows:", len(df))


# print number of 0 labels
print("Number of Real News:", (df['spam_score'] <= 0.5).sum())
# print number of 1 labels
print("Number of Fake News:", (df['spam_score'] > 0.5).sum())

Total number of rows: 12999
Number of Real News: 12723
Number of Fake News: 276


In [52]:
years = [str(year) for year in range(2015, 2018)]
year_pattern = re.compile(r'\b(201[5-7])\b')

all_years = []
for text in df['published']:
    found_years = year_pattern.findall(text)
    all_years.extend(found_years)

year_counts = Counter(all_years)
for year in years:
    print(f"{year}: {year_counts.get(year, 0)}")

2015: 0
2016: 12999
2017: 0


## LIAR-PLUS

paper di riferimento: https://aclanthology.org/W18-5513/ 

tema: politica

pubblicazione: 2018

date paper: non è specificato, ma è un estensione di LIAR

dataset originale (NON CI SONO date): https://github.com/Tariq60/LIAR-PLUS/tree/master

----

**RISULTATO:**

* Non si capisce che date siano

In [53]:
percorso_file = "datasets/LIAR-PLUS/df_liar_plus_complete.csv"
df = None
df = pd.read_csv(percorso_file, sep="\t", encoding="utf-8")

In [54]:
df.head()

Unnamed: 0.1,Unnamed: 0,texts,labels
0,0,Says the Annies List political group supports ...,1.0
1,1,When did the decline of coal start? It started...,0.0
2,2,"Hillary Clinton agrees with John McCain ""by vo...",0.0
3,3,Health care reform legislation is likely to ma...,1.0
4,4,The economic turnaround started at the end of ...,0.0


In [55]:
# print total length
print("Total number of rows:", len(df))

# print number of 0 labels
print("Number of Real News:", (df['labels'] == 0).sum())
# print number of 1 labels
print("Number of Fake News:", (df['labels'] == 1).sum())

Total number of rows: 12786
Number of Real News: 7130
Number of Fake News: 5654


In [56]:
start_year = 2005
end_year = 2018
year_pattern = count_years_in_texts(df, start_year, end_year, column='texts')

2005: 18
2006: 32
2007: 38
2008: 87
2009: 86
2010: 87
2011: 103
2012: 87
2013: 45
2014: 43
2015: 35
2016: 24
2017: 5
2018: 1


In [57]:
df['year'] = df['texts'].apply(extract_most_common_year, year_pattern=year_pattern)

# count NaN "year" rows
print(f"NaN year count: {df['year'].isnull().sum()}")

NaN year count: 12162


In [58]:
df['year'].value_counts().sort_index()

year
2005    18
2006    30
2007    34
2008    82
2009    85
2010    79
2011    89
2012    74
2013    40
2014    33
2015    30
2016    24
2017     5
2018     1
Name: count, dtype: int64

## Politifact

paper di riferimento: https://arxiv.org/abs/1708.01967

tema: politica

pubblicazione: 2017

date paper: non viene specificato

dataset originale (NON CI SONO date, e non è Politifact ma è LIAR): https://sites.cs.ucsb.edu/~william/data/

----

**RISULTATO:**

* Non si capisce che date siano

In [59]:
percorso_file = "datasets/Politifact (from FakeNewsNet)/df_politifact.csv"
df = None
df = pd.read_csv(percorso_file, sep="\t", encoding="utf-8")

In [60]:
df.head()

Unnamed: 0.1,Unnamed: 0,texts,labels
0,0,Rose Garden\n\n5:35 P.M. EDT\n\nMR. BARDEN: He...,0
1,1,Money woes in the most populous US state have ...,0
2,2,Username\n\nPassword\n\nNeed help? Contact the...,0
3,3,President Donald Trump has sent a strong messa...,1
4,4,BREAKING!\n\nLiberal rag Huffington Post is re...,1


In [61]:
# print total length
print("Total number of rows:", len(df))

# print number of 0 labels
print("Number of Real News:", (df['labels'] == 0).sum())
# print number of 1 labels
print("Number of Fake News:", (df['labels'] == 1).sum())

Total number of rows: 504
Number of Real News: 321
Number of Fake News: 183


In [62]:
start_year = 2005
end_year = 2018
year_pattern = count_years_in_texts(df, start_year, end_year, column='texts')

2005: 103
2006: 54
2007: 63
2008: 96
2009: 68
2010: 75
2011: 51
2012: 46
2013: 54
2014: 94
2015: 54
2016: 86
2017: 45
2018: 39


In [63]:
df['year'] = df['texts'].apply(extract_most_common_year, year_pattern=year_pattern)

# count NaN "year" rows
print(f"NaN year count: {df['year'].isnull().sum()}")

NaN year count: 276


In [64]:
df['year'].value_counts().sort_index()

year
2005    12
2006     9
2007    17
2008    29
2009    19
2010    23
2011    11
2012    11
2013     7
2014    13
2015    16
2016    36
2017    13
2018    12
Name: count, dtype: int64

## NDF

paper di riferimento: https://colinglab.fileli.unipi.it/wp-content/uploads/2022/10/1-s2.0-S0020025522008167-main.pdf

tema: incendio Notre Dame

pubblicazione: 2022

date paper: Incendio Notre-Dame di Aprile 2019

dataset originale (NON CI SONO date): https://github.com/Unipisa/NDFDataset

----

**RISULTATO:**
* Molte news del 2019
* 445 su 554 sarebbero senza data

In [65]:
percorso_file = "datasets/UNIPI-NLE-NDF/df_ndf.csv"
df = None

df = pd.read_csv(percorso_file, sep="\t", encoding="utf-8")

In [66]:
df.head()

Unnamed: 0.1,Unnamed: 0,texts,labels
0,0,So sad to see a piece of historical majesty go...,0
1,1,The Vice-President of the French student union...,1
2,2,"All the wonderful craftsmanship, that has been...",0
3,3,"Early, unconfirmed reports claim Notre Dame fi...",1
4,4,"""This story is from 2016 and unrelated to the ...",1


In [67]:
# print total length
print("Total number of rows:", len(df))

# print number of 0 labels
print("Number of Real News:", (df['labels'] == 0).sum())
# print number of 1 labels
print("Number of Fake News:", (df['labels'] == 1).sum())

Total number of rows: 554
Number of Real News: 338
Number of Fake News: 216


In [68]:
start_year = 2019
end_year = 2019
year_pattern = count_years_in_texts(df, start_year, end_year, column='texts')

2019: 141


In [69]:
df['year'] = df['texts'].apply(extract_most_common_year, year_pattern=year_pattern)

# count NaN "year" rows
print(f"NaN year count: {df['year'].isnull().sum()} over {len(df)} rows")

NaN year count: 445 over 554 rows


In [70]:
df['year'].value_counts().sort_index()

year
2019    109
Name: count, dtype: int64

## Altro

### Fake and Real News Dataset 

**RISULTATO:**
* Stessi dati di Kaggle_clement

In [71]:
percorso_file_true = "datasets/0_more/FakeAndRealNewsDataset/True.csv"
percorso_file_fake = "datasets/0_more/FakeAndRealNewsDataset/Fake.csv"
df_true = None
df_fake = None

df_true = pd.read_csv(percorso_file_true, encoding="utf-8")
df_fake = pd.read_csv(percorso_file_fake, encoding="utf-8")

In [72]:
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [73]:
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [74]:
# merde two dataset
df_merged = pd.concat([df_true, df_fake], ignore_index=True)

In [75]:
# print total length
print("Total number of rows:", len(df_merged))

print("Number of Real News:", len(df_true))
print("Number of Fake News:", len(df_fake))

Total number of rows: 44898
Number of Real News: 21417
Number of Fake News: 23481


In [76]:
years = [str(year) for year in range(2015, 2018)]
year_pattern = re.compile(r'\b(201[5-7])\b')

all_years = []
for text in df_merged['date']:
    found_years = year_pattern.findall(text)
    all_years.extend(found_years)

year_counts = Counter(all_years)
for year in years:
    print(f"{year}: {year_counts.get(year, 0)}")

2015: 2485
2016: 16470
2017: 25904


### Getting Real About FakeNews

**RISULTATO:**
* Stessi dati di Kaggle_meg

In [77]:
percorso_file = "datasets/0_more/GettingRealAboutFakeNews.csv"
df = None

df = pd.read_csv(percorso_file, encoding="utf-8")

In [78]:
df.head()

Unnamed: 0,uuid,ord_in_thread,author,published,title,text,language,crawled,site_url,country,domain_rank,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares,type
0,6a175f46bcd24d39b3e962ad0f29936721db70db,0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,english,2016-10-27T01:49:27.168+03:00,100percentfedup.com,US,25689.0,Muslims BUSTED: They Stole Millions In Gov’t B...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
1,2bdc29d12605ef9cf3f09f9875040a7113be5d5b,0,reasoning with facts,2016-10-29T08:47:11.259+03:00,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,2016-10-29T08:47:11.259+03:00,100percentfedup.com,US,25689.0,Re: Why Did Attorney General Loretta Lynch Ple...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
2,c70e149fdd53de5e61c29281100b9de0ed268bc3,0,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,2016-10-31T01:41:49.479+02:00,100percentfedup.com,US,25689.0,BREAKING: Weiner Cooperating With FBI On Hilla...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
3,7cf7c15731ac2a116dd7f629bd57ea468ed70284,0,Fed Up,2016-11-01T05:22:00.000+02:00,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,2016-11-01T15:46:26.304+02:00,100percentfedup.com,US,25689.0,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,0.068,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias
4,0206b54719c7e241ffe0ad4315b808290dbe6c0f,0,Fed Up,2016-11-01T21:56:00.000+02:00,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,2016-11-01T23:59:42.266+02:00,100percentfedup.com,US,25689.0,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,0.865,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias


### FakeNewsNet: BuzzFeed

**RISULTATO:**
* Stessi dati di Kaggle_clement

In [79]:
percorso_file_true = "datasets/0_more/FakeNewsNet/BuzzFeed_real_news_content.csv"
percorso_file_fake = "datasets/0_more/FakeNewsNet/BuzzFeed_fake_news_content.csv"
df_true = None
df_fake = None

df_true = pd.read_csv(percorso_file_true, encoding="utf-8")
df_fake = pd.read_csv(percorso_file_fake, encoding="utf-8")

In [80]:
df_true.head()

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data
0,Real_1-Webpage,Another Terrorist Attack in NYC…Why Are we STI...,"On Saturday, September 17 at 8:30 pm EST, an e...",http://eaglerising.com/36942/another-terrorist...,http://eaglerising.com/wp-content/uploads/2016...,"View All Posts,Leonora Cravotta",http://eaglerising.com,{'$date': 1474528230000},,http://constitution.com/wp-content/uploads/201...,http://eaglerising.com/36942/another-terrorist...,"{""description"": ""\u201cWe believe at this poin..."
1,Real_10-Webpage,"Donald Trump: Drugs a 'Very, Very Big Factor' ...",Less than a day after protests over the police...,http://abcn.ws/2d4lNn9,http://a.abcnews.com/images/Politics/AP_donald...,"More Candace,Adam Kelsey,Abc News,More Adam",http://abcn.ws,,,http://www.googleadservices.com/pagead/convers...,http://abcnews.go.com/Politics/donald-trump-dr...,"{""fb_title"": ""Trump: Drugs a 'Very, Very Big F..."
2,Real_11-Webpage,"Obama To UN: ‘Giving Up Liberty, Enhances Secu...","Obama To UN: ‘Giving Up Liberty, Enhances Secu...",http://rightwingnews.com/barack-obama/obama-un...,http://rightwingnews.com/wp-content/uploads/20...,Cassy Fiano,http://rightwingnews.com,{'$date': 1474476044000},https://www.youtube.com/embed/ji6pl5Vwrvk,http://rightwingnews.com/wp-content/uploads/20...,http://rightwingnews.com/barack-obama/obama-un...,"{""googlebot"": ""noimageindex"", ""og"": {""site_nam..."
3,Real_12-Webpage,Trump vs. Clinton: A Fundamental Clash over Ho...,Getty Images Wealth Of Nations Trump vs. Clint...,http://politi.co/2de2qs0,http://static.politico.com/e9/11/6144cdc24e319...,"Jack Shafer,Erick Trickey,Zachary Karabell",http://politi.co,{'$date': 1474974420000},,https://static.politico.com/dims4/default/8a1c...,http://www.politico.com/magazine/story/2016/09...,"{""description"": ""He sees it as zero-sum. She b..."
4,Real_13-Webpage,"President Obama Vetoes 9/11 Victims Bill, Sett...",President Obama today vetoed a bill that would...,http://abcn.ws/2dh2NFs,http://a.abcnews.com/images/US/AP_Obama_BM_201...,"John Parkinson,More John,Abc News,More Alexander",http://abcn.ws,,,http://www.googleadservices.com/pagead/convers...,http://abcnews.go.com/Politics/president-obama...,"{""fb_title"": ""President Obama Vetoes 9/11 Vict..."


In [81]:
df_fake.head()

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data
0,Fake_1-Webpage,Proof The Mainstream Media Is Manipulating The...,I woke up this morning to find a variation of ...,http://www.addictinginfo.org/2016/09/19/proof-...,http://addictinginfo.addictinginfoent.netdna-c...,Wendy Gittleson,http://www.addictinginfo.org,{'$date': 1474243200000},,"http://i.imgur.com/JeqZLhj.png,http://addictin...",http://addictinginfo.com/2016/09/19/proof-the-...,"{""publisher"": ""Addicting Info | The Knowledge ..."
1,Fake_10-Webpage,Charity: Clinton Foundation Distributed “Water...,Former President Bill Clinton and his Clinton ...,http://eaglerising.com/36899/charity-clinton-f...,http://eaglerising.com/wp-content/uploads/2016...,View All Posts,http://eaglerising.com,{'$date': 1474416521000},,http://constitution.com/wp-content/uploads/201...,http://eaglerising.com/36899/charity-clinton-f...,"{""description"": ""The possibility that CHAI dis..."
2,Fake_11-Webpage,A Hillary Clinton Administration May be Entire...,After collapsing just before trying to step in...,http://eaglerising.com/36880/a-hillary-clinton...,http://eaglerising.com/wp-content/uploads/2016...,"View All Posts,Tony Elliott",http://eaglerising.com,{'$date': 1474416638000},,http://constitution.com/wp-content/uploads/201...,http://eaglerising.com/36880/a-hillary-clinton...,"{""description"": ""Hillary Clinton may be the fi..."
3,Fake_12-Webpage,Trump’s Latest Campaign Promise May Be His Mos...,"Donald Trump is, well, deplorable. He’s sugges...",http://www.addictinginfo.org/2016/09/19/trumps...,http://addictinginfo.addictinginfoent.netdna-c...,John Prager,http://www.addictinginfo.org,{'$date': 1474243200000},,"http://i.imgur.com/JeqZLhj.png,http://2.gravat...",http://addictinginfo.com/2016/09/19/trumps-lat...,"{""publisher"": ""Addicting Info | The Knowledge ..."
4,Fake_13-Webpage,Website is Down For Maintenance,Website is Down For Maintenance,http://www.proudcons.com/clinton-foundation-ca...,,,http://www.proudcons.com,,,,,"{""og"": {""url"": ""http://www.proudcons.com"", ""ty..."


In [82]:
# merde two dataset
df_merged = pd.concat([df_true, df_fake], ignore_index=True)

In [83]:
# print total length
print("Total number of rows:", len(df_merged))

print("Number of Real News:", len(df_true))
print("Number of Fake News:", len(df_fake))

Total number of rows: 182
Number of Real News: 91
Number of Fake News: 91


In [84]:
import ast

# Applica solo se la cella non è NaN
df_merged['publish_date'] = df_merged['publish_date'].apply(
    lambda x: ast.literal_eval(x)['$date'] if pd.notna(x) else x
)

# Converte solo i valori non NaN in datetime
df_merged['publish_date'] = pd.to_datetime(
    df_merged['publish_date'], 
    unit='ms', 
    errors='coerce'  # forza a NaT se qualcosa non è convertibile
)
# riconverti in stringa per l'analisi successiva
df_merged['publish_date'] = df_merged['publish_date'].astype(str)

In [85]:
years = [str(year) for year in range(2015, 2018)]
year_pattern = re.compile(r'\b(201[5-7])\b')

all_years = []
for text in df_merged['publish_date']:
    found_years = year_pattern.findall(text)
    all_years.extend(found_years)

year_counts = Counter(all_years)
for year in years:
    print(f"{year}: {year_counts.get(year, 0)}")

2015: 2
2016: 130
2017: 1


### FakeNewsNet: PolitiFact 

**RISULTATO:**
* Stessi dati di Kaggle_clement

In [86]:
percorso_file_true = "datasets/0_more/FakeNewsNet/PolitiFact_real_news_content.csv"
percorso_file_fake = "datasets/0_more/FakeNewsNet/PolitiFact_fake_news_content.csv"
df_true = None
df_fake = None

df_true = pd.read_csv(percorso_file_true, encoding="utf-8")
df_fake = pd.read_csv(percorso_file_fake, encoding="utf-8")

In [87]:
df_true.head()

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data
0,Real_1-Webpage,Trump Just Insulted Millions Who Lost Everythi...,16.8k SHARES SHARE THIS STORY\n\nHillary Clint...,http://occupydemocrats.com/2016/09/27/trump-ju...,http://occupydemocrats.com/wp-content/uploads/...,"Brett Bose,Grant Stern,Steve Bernstein,Natalie...",http://occupydemocrats.com,{'$date': 1474934400000},,http://occupydemocrats.com/wp-content/uploads/...,http://occupydemocrats.com/2016/09/27/trump-ju...,"{""generator"": ""Powered by Visual Composer - dr..."
1,Real_10-Webpage,Famous dog killed in spot she waited a year fo...,Famous dog killed in spot she waited a year fo...,http://rightwingnews.com/top-news/famous-dog-k...,http://rightwingnews.com/wp-content/uploads/20...,,http://rightwingnews.com,{'$date': 1474948336000},,http://rightwingnews.com/wp-content/uploads/20...,http://rightwingnews.com/top-news/famous-dog-k...,"{""googlebot"": ""noimageindex"", ""og"": {""site_nam..."
2,Real_100-Webpage,House oversight panel votes Clinton IT chief i...,Story highlights The House Oversight panel vot...,http://cnn.it/2deaH2d,http://i2.cdn.cnn.com/cnnnext/dam/assets/16091...,"Tom Lobianco,Deirdre Walsh",http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/17050...,http://www.cnn.com/2016/09/22/politics/bryan-p...,"{""description"": ""Members of the House Oversigh..."
3,Real_101-Webpage,America Just Tragically Lost A Country Music I...,We are absolutely heartbroken to hear about th...,http://newsbake.com/entertainment-news/music-e...,http://newsbake.com/wp-content/uploads/2016/05...,Nancy Wells,http://newsbake.com,{'$date': 1474898600000},https://www.youtube.com/embed/8ozTJcu-_BU,http://0.gravatar.com/avatar/0d702c6042933cd78...,http://newsbake.com/entertainment-news/music-e...,"{""shareaholic"": {""site_name"": ""NewsBake"", ""lan..."
4,Real_102-Webpage,Monuments to the Battle for the New South,"Nine years ago, a driver lost control of his p...",http://politi.co/2dd9U1x,http://static.politico.com/25/ed/85332de14c45b...,"Jack Shafer,Lisa Rab",http://politi.co,{'$date': 1473941820000},,http://static.politico.com/25/ed/85332de14c45b...,http://www.politico.com/magazine/story/2016/09...,"{""description"": ""Virginia, increasingly divers..."


In [88]:
df_fake.head()

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data
0,Real_1-Webpage,Trump Just Insulted Millions Who Lost Everythi...,16.8k SHARES SHARE THIS STORY\n\nHillary Clint...,http://occupydemocrats.com/2016/09/27/trump-ju...,http://occupydemocrats.com/wp-content/uploads/...,"Brett Bose,Grant Stern,Steve Bernstein,Natalie...",http://occupydemocrats.com,{'$date': 1474934400000},,http://occupydemocrats.com/wp-content/uploads/...,http://occupydemocrats.com/2016/09/27/trump-ju...,"{""generator"": ""Powered by Visual Composer - dr..."
1,Real_10-Webpage,Famous dog killed in spot she waited a year fo...,Famous dog killed in spot she waited a year fo...,http://rightwingnews.com/top-news/famous-dog-k...,http://rightwingnews.com/wp-content/uploads/20...,,http://rightwingnews.com,{'$date': 1474948336000},,http://rightwingnews.com/wp-content/uploads/20...,http://rightwingnews.com/top-news/famous-dog-k...,"{""googlebot"": ""noimageindex"", ""og"": {""site_nam..."
2,Real_100-Webpage,House oversight panel votes Clinton IT chief i...,Story highlights The House Oversight panel vot...,http://cnn.it/2deaH2d,http://i2.cdn.cnn.com/cnnnext/dam/assets/16091...,"Tom Lobianco,Deirdre Walsh",http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/17050...,http://www.cnn.com/2016/09/22/politics/bryan-p...,"{""description"": ""Members of the House Oversigh..."
3,Real_101-Webpage,America Just Tragically Lost A Country Music I...,We are absolutely heartbroken to hear about th...,http://newsbake.com/entertainment-news/music-e...,http://newsbake.com/wp-content/uploads/2016/05...,Nancy Wells,http://newsbake.com,{'$date': 1474898600000},https://www.youtube.com/embed/8ozTJcu-_BU,http://0.gravatar.com/avatar/0d702c6042933cd78...,http://newsbake.com/entertainment-news/music-e...,"{""shareaholic"": {""site_name"": ""NewsBake"", ""lan..."
4,Real_102-Webpage,Monuments to the Battle for the New South,"Nine years ago, a driver lost control of his p...",http://politi.co/2dd9U1x,http://static.politico.com/25/ed/85332de14c45b...,"Jack Shafer,Lisa Rab",http://politi.co,{'$date': 1473941820000},,http://static.politico.com/25/ed/85332de14c45b...,http://www.politico.com/magazine/story/2016/09...,"{""description"": ""Virginia, increasingly divers..."


In [89]:
# merde two dataset
df_merged = pd.concat([df_true, df_fake], ignore_index=True)
df_merged

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data
0,Real_1-Webpage,Trump Just Insulted Millions Who Lost Everythi...,16.8k SHARES SHARE THIS STORY\n\nHillary Clint...,http://occupydemocrats.com/2016/09/27/trump-ju...,http://occupydemocrats.com/wp-content/uploads/...,"Brett Bose,Grant Stern,Steve Bernstein,Natalie...",http://occupydemocrats.com,{'$date': 1474934400000},,http://occupydemocrats.com/wp-content/uploads/...,http://occupydemocrats.com/2016/09/27/trump-ju...,"{""generator"": ""Powered by Visual Composer - dr..."
1,Real_10-Webpage,Famous dog killed in spot she waited a year fo...,Famous dog killed in spot she waited a year fo...,http://rightwingnews.com/top-news/famous-dog-k...,http://rightwingnews.com/wp-content/uploads/20...,,http://rightwingnews.com,{'$date': 1474948336000},,http://rightwingnews.com/wp-content/uploads/20...,http://rightwingnews.com/top-news/famous-dog-k...,"{""googlebot"": ""noimageindex"", ""og"": {""site_nam..."
2,Real_100-Webpage,House oversight panel votes Clinton IT chief i...,Story highlights The House Oversight panel vot...,http://cnn.it/2deaH2d,http://i2.cdn.cnn.com/cnnnext/dam/assets/16091...,"Tom Lobianco,Deirdre Walsh",http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/17050...,http://www.cnn.com/2016/09/22/politics/bryan-p...,"{""description"": ""Members of the House Oversigh..."
3,Real_101-Webpage,America Just Tragically Lost A Country Music I...,We are absolutely heartbroken to hear about th...,http://newsbake.com/entertainment-news/music-e...,http://newsbake.com/wp-content/uploads/2016/05...,Nancy Wells,http://newsbake.com,{'$date': 1474898600000},https://www.youtube.com/embed/8ozTJcu-_BU,http://0.gravatar.com/avatar/0d702c6042933cd78...,http://newsbake.com/entertainment-news/music-e...,"{""shareaholic"": {""site_name"": ""NewsBake"", ""lan..."
4,Real_102-Webpage,Monuments to the Battle for the New South,"Nine years ago, a driver lost control of his p...",http://politi.co/2dd9U1x,http://static.politico.com/25/ed/85332de14c45b...,"Jack Shafer,Lisa Rab",http://politi.co,{'$date': 1473941820000},,http://static.politico.com/25/ed/85332de14c45b...,http://www.politico.com/magazine/story/2016/09...,"{""description"": ""Virginia, increasingly divers..."
...,...,...,...,...,...,...,...,...,...,...,...,...
235,Real_95-Webpage,"Donald Trump, Germany’s disfavored son – POLITICO","KALLSTADT, Germany — Few places in Germany are...",http://politi.co/2csN1WG,http://www.politico.eu/wp-content/uploads/2016...,"Matthew Karnitschnig,Janosch Delcker",http://politi.co,{'$date': 1474601447000},,http://g8fip1kplyr33r3krz5b97d1.wpengine.netdn...,http://www.politico.eu/article/donald-trump-an...,"{""description"": ""In the idyllic hamlet of Kall..."
236,Real_96-Webpage,BREAKING: Hollywood Legend Just Died Of Terrib...,Hollywood loses yet another one of their deare...,http://newsbake.com/entertainment-news/tv/brea...,http://newsbake.com/wp-content/uploads/2016/09...,Nancy Wells,http://newsbake.com,{'$date': 1474250978000},https://www.youtube.com/embed/pcj4boVT4fc,http://0.gravatar.com/avatar/0d702c6042933cd78...,http://newsbake.com/entertainment-news/tv/brea...,"{""shareaholic"": {""site_name"": ""NewsBake"", ""lan..."
237,Real_97-Webpage,Worst. President. Ever.,"As my 25th wedding anniversary approached, I t...",http://politi.co/2d1qa5t,http://static.politico.com/2f/8c/e44158e84ee8a...,"Jack Shafer,Robert Strauss",http://politi.co,{'$date': 1474196700000},,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",http://www.politico.com/magazine/story/2016/09...,"{""description"": ""People are debating who will ..."
238,Real_98-Webpage,Don King drops N-word while introducing Donald...,Story highlights Trump was sitting in a chair ...,http://cnn.it/2dh5vq9,http://i2.cdn.cnn.com/cnnnext/dam/assets/16092...,Jeremy Diamond,http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/12102...,http://www.cnn.com/2016/09/21/politics/don-kin...,"{""description"": ""The controversial boxing prom..."


In [90]:
# print total length
print("Total number of rows:", len(df_merged))

print("Number of Real News:", len(df_true))
print("Number of Fake News:", len(df_fake))

Total number of rows: 240
Number of Real News: 120
Number of Fake News: 120


In [91]:
# Applica solo se la cella non è NaN
df_merged['publish_date'] = df_merged['publish_date'].apply(
    lambda x: ast.literal_eval(x)['$date'] if pd.notna(x) else x
)

# Converte solo i valori non NaN in datetime
df_merged['publish_date'] = pd.to_datetime(
    df_merged['publish_date'], 
    unit='ms', 
    errors='coerce'  # forza a NaT se qualcosa non è convertibile
)
# riconverti in stringa per l'analisi successiva
df_merged['publish_date'] = df_merged['publish_date'].astype(str)

In [92]:
years = [str(year) for year in range(2015, 2018)]
year_pattern = re.compile(r'\b(201[5-7])\b')

all_years = []
for text in df_merged['publish_date']:
    found_years = year_pattern.findall(text)
    all_years.extend(found_years)

year_counts = Counter(all_years)
for year in years:
    print(f"{year}: {year_counts.get(year, 0)}")

2015: 0
2016: 162
2017: 0
