In [3]:
# Importing Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# Load the dataset
fake_news_data = pd.read_csv('Resources/train.csv')
fake_news_data.head()

Unnamed: 0,ID,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,barely_true_counts,false_counts,...,sentiment_score,sentiment_magnitude,anger,fear,joy,disgust,sad,speaker_id,list,sentiment_code
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,-0.5,0.5,0.121137,0.008926,0.026096,0.263479,0.531887,_0_,"[0, 1]",_NEG_
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,-0.4,0.8,0.095352,0.124566,0.191357,0.016999,0.102045,_1_,"[0, 1]",_NEG_
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.3,0.3,0.039559,0.024162,0.500384,0.454228,0.052453,_2_,"[1, 0]",_NEG_
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,...,-0.3,0.3,0.004804,0.194674,0.375055,0.022509,0.383403,_3_,"[0, 1]",_NEG_
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,...,0.0,0.0,0.044237,0.215996,0.222402,0.045672,0.274343,_4_,"[0, 1]",


In [9]:
# Check for missing values
fake_news_data.isnull().sum()

ID                      0
label                   0
statement               0
subject                 0
speaker                 0
speaker_job             0
state_info              0
party_affiliation       0
barely_true_counts      0
false_counts            0
half_true_counts        0
mostly_true_counts      0
pants_on_fire_counts    0
context                 0
sentiment               0
sentiment_score         0
sentiment_magnitude     0
anger                   0
fear                    0
joy                     0
disgust                 0
sad                     0
speaker_id              0
list                    0
sentiment_code          0
dtype: int64

In [10]:
# Drop rows with missing values
fake_news_data.dropna(inplace=True)

In [11]:
# Check the shape of the data
fake_news_data.shape

(6565, 25)

In [14]:
# Preprocess the 'statement' column (lowercase, remove punctuation, etc.)
fake_news_data['statement'] = fake_news_data['statement'].apply(lambda x: x.lower())
fake_news_data

Unnamed: 0,ID,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,barely_true_counts,false_counts,...,sentiment_score,sentiment_magnitude,anger,fear,joy,disgust,sad,speaker_id,list,sentiment_code
0,2635.json,false,says the annies list political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,-0.5,0.5,0.121137,0.008926,0.026096,0.263479,0.531887,_0_,"[0, 1]",_NEG_
1,10540.json,half-true,when did the decline of coal start? it started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,-0.4,0.8,0.095352,0.124566,0.191357,0.016999,0.102045,_1_,"[0, 1]",_NEG_
2,324.json,mostly-true,"hillary clinton agrees with john mccain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.3,0.3,0.039559,0.024162,0.500384,0.454228,0.052453,_2_,"[1, 0]",_NEG_
5,12465.json,true,the chicago bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,...,-0.3,0.3,0.044534,0.037243,0.294698,0.035936,0.598971,_5_,"[1, 0]",_NEG_
7,153.json,half-true,i'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.2,0.2,0.021023,0.077569,0.032182,0.038037,0.438594,_2_,"[0, 1]",_NEG_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11511,7013.json,barely-true,says u.s. rep. charles bass wants to privatize...,social-security,ann-mclane-kuster,Attorney,New Hampshire,democrat,2.0,1.0,...,-0.3,0.3,0.151639,0.198275,0.135165,0.106559,0.406074,_240_,"[0, 1]",_NEG_
11513,2661.json,pants-fire,"in the past two years, democrats have spent mo...","federal-budget,history",eric-cantor,House Majority Leader,Virginia,republican,9.0,6.0,...,-0.5,0.5,0.097422,0.064776,0.088630,0.187085,0.715000,_54_,"[0, 1]",_NEG_
11514,3419.json,half-true,"for the first time in more than a decade, impo...","energy,oil-spill,trade",barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.2,0.2,0.039024,0.103790,0.104699,0.046181,0.758238,_4_,"[0, 1]",_NEG_
11515,12548.json,mostly-true,says donald trump has bankrupted his companies...,candidates-biography,hillary-clinton,Presidential candidate,New York,democrat,40.0,29.0,...,-0.7,0.7,0.064967,0.180882,0.074947,0.054953,0.710395,_8_,"[1, 0]",_NEG_


In [15]:
# Analyze the distribution of classes (labels)
fake_news_data['label'].value_counts()

label
half-true      1377
mostly-true    1331
false          1257
true           1090
barely-true    1069
pants-fire      441
Name: count, dtype: int64

In [16]:
# Analyze the distribution of classes (labels)
fake_news_data['state_info'].value_counts()

state_info
Texas             913
New York          640
Florida           580
Wisconsin         534
Illinois          519
                 ... 
Virginia            1
United Kingdom      1
China               1
Russia              1
Mississippi         1
Name: count, Length: 74, dtype: int64

In [17]:
# Analyze the distribution of classes (labels)
fake_news_data['party_affiliation'].value_counts()

party_affiliation
republican                      3345
democrat                        2516
none                             367
independent                      123
libertarian                       32
newsmaker                         32
columnist                         27
activist                          23
organization                      23
journalist                        22
talk-show-host                    14
state-official                    12
labor-leader                      12
business-leader                    6
tea-party-member                   4
green                              3
education-official                 1
democratic-farmer-labor            1
ocean-state-tea-party-action       1
constitution-party                 1
Name: count, dtype: int64

In [39]:
# Removing .json after ID 
fake_news_data['ID'] = fake_news_data['ID'].str.replace('.json', '')
fake_news_data

Unnamed: 0,ID,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,barely_true_counts,false_counts,...,sentiment_score,sentiment_magnitude,anger,fear,joy,disgust,sad,speaker_id,list,sentiment_code
0,2635,false,says the annies list political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,-0.5,0.5,0.121137,0.008926,0.026096,0.263479,0.531887,_0_,"[0, 1]",_NEG_
1,10540,half-true,when did the decline of coal start? it started...,energy,scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,-0.4,0.8,0.095352,0.124566,0.191357,0.016999,0.102045,_1_,"[0, 1]",_NEG_
1,10540,half-true,when did the decline of coal start? it started...,history,scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,-0.4,0.8,0.095352,0.124566,0.191357,0.016999,0.102045,_1_,"[0, 1]",_NEG_
1,10540,half-true,when did the decline of coal start? it started...,job-accomplishments,scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,-0.4,0.8,0.095352,0.124566,0.191357,0.016999,0.102045,_1_,"[0, 1]",_NEG_
2,324,mostly-true,"hillary clinton agrees with john mccain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.3,0.3,0.039559,0.024162,0.500384,0.454228,0.052453,_2_,"[1, 0]",_NEG_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11514,3419,half-true,"for the first time in more than a decade, impo...",oil-spill,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.2,0.2,0.039024,0.103790,0.104699,0.046181,0.758238,_4_,"[0, 1]",_NEG_
11514,3419,half-true,"for the first time in more than a decade, impo...",trade,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.2,0.2,0.039024,0.103790,0.104699,0.046181,0.758238,_4_,"[0, 1]",_NEG_
11515,12548,mostly-true,says donald trump has bankrupted his companies...,candidates-biography,hillary-clinton,Presidential candidate,New York,democrat,40.0,29.0,...,-0.7,0.7,0.064967,0.180882,0.074947,0.054953,0.710395,_8_,"[1, 0]",_NEG_
11518,9117,barely-true,no one claims the report vindicating new jerse...,candidates-biography,rudy-giuliani,Attorney,New York,republican,9.0,11.0,...,-0.6,0.6,0.137646,0.051547,0.168532,0.564583,0.166095,_20_,"[0, 1]",_NEG_


In [40]:
# Grouping subject column data
grouped_subject_data = fake_news_data.groupby('subject').size().reset_index
grouped_subject_data

<bound method Series.reset_index of subject
10-news-tampa-bay      2
Alcohol               15
abc-news-week         63
abortion             145
afghanistan           26
                    ... 
wealth                40
weather               23
welfare               32
women                126
workers              180
Length: 139, dtype: int64>

In [41]:
# Split 'subject' column by comma
fake_news_data['subject'] = fake_news_data['subject'].str.split(',')

# Explode the resulting list so that each element becomes a separate row
fake_news_data = fake_news_data.explode('subject')

# Group by 'subject' and count occurrences
subject_counts = fake_news_data.groupby('subject').size().reset_index(name='counts')
subject_counts

Unnamed: 0,subject,counts
0,10-news-tampa-bay,2
1,Alcohol,15
2,abc-news-week,63
3,abortion,145
4,afghanistan,26
...,...,...
134,wealth,40
135,weather,23
136,welfare,32
137,women,126


In [38]:
# Sort in descending order
subject_counts.sort_values(by='counts', ascending=False)

Unnamed: 0,subject,counts
40,economy,846
62,health-care,731
122,taxes,639
48,federal-budget,513
75,jobs,498
...,...,...
0,10-news-tampa-bay,2
32,death-penalty,1
46,fake-news,1
50,fires,1
