In [124]:
# Importing Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler


In [125]:
# Load the dataset
fake_news_data = pd.read_csv('Resources/train.csv')
fake_news_data.head(10)

Unnamed: 0,ID,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,barely_true_counts,false_counts,...,sentiment_score,sentiment_magnitude,anger,fear,joy,disgust,sad,speaker_id,list,sentiment_code
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,-0.5,0.5,0.121137,0.008926,0.026096,0.263479,0.531887,_0_,"[0, 1]",_NEG_
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,-0.4,0.8,0.095352,0.124566,0.191357,0.016999,0.102045,_1_,"[0, 1]",_NEG_
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.3,0.3,0.039559,0.024162,0.500384,0.454228,0.052453,_2_,"[1, 0]",_NEG_
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,...,-0.3,0.3,0.004804,0.194674,0.375055,0.022509,0.383403,_3_,"[0, 1]",_NEG_
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,...,0.0,0.0,0.044237,0.215996,0.222402,0.045672,0.274343,_4_,"[0, 1]",
5,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,...,-0.3,0.3,0.044534,0.037243,0.294698,0.035936,0.598971,_5_,"[1, 0]",_NEG_
6,2342.json,barely-true,Jim Dunnam has not lived in the district he re...,candidates-biography,republican-party-texas,,Texas,republican,3.0,1.0,...,-0.6,0.6,0.093937,0.019159,0.278842,0.110865,0.172604,_6_,"[0, 1]",_NEG_
7,153.json,half-true,I'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.2,0.2,0.021023,0.077569,0.032182,0.038037,0.438594,_2_,"[0, 1]",_NEG_
8,5602.json,half-true,"However, it took $19.5 million in Oregon Lotte...",jobs,oregon-lottery,,,organization,0.0,0.0,...,-0.3,0.3,0.116767,0.099105,0.219329,0.051303,0.114302,_7_,"[0, 1]",_NEG_
9,9741.json,mostly-true,Says GOP primary opponents Glenn Grothman and ...,"energy,message-machine-2014,voting-record",duey-stroebel,State representative,Wisconsin,republican,0.0,0.0,...,-0.4,0.4,0.163977,0.101336,0.109979,0.330754,0.203412,_8_,"[1, 0]",_NEG_


In [126]:
# Check for missing values
fake_news_data.isnull().sum()

ID                         0
label                      0
statement                  0
subject                    0
speaker                    0
speaker_job             3239
state_info              2486
party_affiliation          0
barely_true_counts         0
false_counts               0
half_true_counts           0
mostly_true_counts         0
pants_on_fire_counts       0
context                  112
sentiment               1541
sentiment_score            0
sentiment_magnitude        0
anger                      0
fear                       0
joy                        0
disgust                    0
sad                        0
speaker_id                 0
list                       0
sentiment_code          1541
dtype: int64

In [127]:
# Drop rows with missing values
fake_news_data.dropna(inplace=True)

In [128]:
# Check the shape of the data
fake_news_data.shape

fake_news_drop = fake_news_data
fake_news_drop

Unnamed: 0,ID,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,barely_true_counts,false_counts,...,sentiment_score,sentiment_magnitude,anger,fear,joy,disgust,sad,speaker_id,list,sentiment_code
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,-0.5,0.5,0.121137,0.008926,0.026096,0.263479,0.531887,_0_,"[0, 1]",_NEG_
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,-0.4,0.8,0.095352,0.124566,0.191357,0.016999,0.102045,_1_,"[0, 1]",_NEG_
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.3,0.3,0.039559,0.024162,0.500384,0.454228,0.052453,_2_,"[1, 0]",_NEG_
5,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,...,-0.3,0.3,0.044534,0.037243,0.294698,0.035936,0.598971,_5_,"[1, 0]",_NEG_
7,153.json,half-true,I'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.2,0.2,0.021023,0.077569,0.032182,0.038037,0.438594,_2_,"[0, 1]",_NEG_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11511,7013.json,barely-true,Says U.S. Rep. Charles Bass wants to privatize...,social-security,ann-mclane-kuster,Attorney,New Hampshire,democrat,2.0,1.0,...,-0.3,0.3,0.151639,0.198275,0.135165,0.106559,0.406074,_240_,"[0, 1]",_NEG_
11513,2661.json,pants-fire,"In the past two years, Democrats have spent mo...","federal-budget,history",eric-cantor,House Majority Leader,Virginia,republican,9.0,6.0,...,-0.5,0.5,0.097422,0.064776,0.088630,0.187085,0.715000,_54_,"[0, 1]",_NEG_
11514,3419.json,half-true,"For the first time in more than a decade, impo...","energy,oil-spill,trade",barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.2,0.2,0.039024,0.103790,0.104699,0.046181,0.758238,_4_,"[0, 1]",_NEG_
11515,12548.json,mostly-true,Says Donald Trump has bankrupted his companies...,candidates-biography,hillary-clinton,Presidential candidate,New York,democrat,40.0,29.0,...,-0.7,0.7,0.064967,0.180882,0.074947,0.054953,0.710395,_8_,"[1, 0]",_NEG_


In [129]:
# Preprocess the 'statement' column (lowercase, remove punctuation, etc.)
# fake_news_drop['statement'] = fake_news_drop['statement'].apply(lambda x: x.lower())
# fake_news_drop.head(10)

In [130]:
# Analyze the distribution of classes (labels)
fake_news_drop['label'].value_counts()

label
half-true      1377
mostly-true    1331
false          1257
true           1090
barely-true    1069
pants-fire      441
Name: count, dtype: int64

In [131]:
# Analyze the distribution of classes (state_info)
fake_news_drop['state_info'].value_counts()

state_info
Texas             913
New York          640
Florida           580
Wisconsin         534
Illinois          519
                 ... 
Virginia            1
United Kingdom      1
China               1
Russia              1
Mississippi         1
Name: count, Length: 74, dtype: int64

In [132]:
# Analyze the distribution of classes (party_affiliation)
fake_news_drop['party_affiliation'].value_counts()

party_affiliation
republican                      3345
democrat                        2516
none                             367
independent                      123
libertarian                       32
newsmaker                         32
columnist                         27
activist                          23
organization                      23
journalist                        22
talk-show-host                    14
state-official                    12
labor-leader                      12
business-leader                    6
tea-party-member                   4
green                              3
education-official                 1
democratic-farmer-labor            1
ocean-state-tea-party-action       1
constitution-party                 1
Name: count, dtype: int64

In [133]:
# Removing .json after ID 
fake_news_drop['ID'] = fake_news_drop['ID'].str.replace('.json', '')
fake_news_drop.head(10)

Unnamed: 0,ID,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,barely_true_counts,false_counts,...,sentiment_score,sentiment_magnitude,anger,fear,joy,disgust,sad,speaker_id,list,sentiment_code
0,2635,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,-0.5,0.5,0.121137,0.008926,0.026096,0.263479,0.531887,_0_,"[0, 1]",_NEG_
1,10540,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,-0.4,0.8,0.095352,0.124566,0.191357,0.016999,0.102045,_1_,"[0, 1]",_NEG_
2,324,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.3,0.3,0.039559,0.024162,0.500384,0.454228,0.052453,_2_,"[1, 0]",_NEG_
5,12465,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,...,-0.3,0.3,0.044534,0.037243,0.294698,0.035936,0.598971,_5_,"[1, 0]",_NEG_
7,153,half-true,I'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,democrat,70.0,71.0,...,-0.2,0.2,0.021023,0.077569,0.032182,0.038037,0.438594,_2_,"[0, 1]",_NEG_
9,9741,mostly-true,Says GOP primary opponents Glenn Grothman and ...,"energy,message-machine-2014,voting-record",duey-stroebel,State representative,Wisconsin,republican,0.0,0.0,...,-0.4,0.4,0.163977,0.101336,0.109979,0.330754,0.203412,_8_,"[1, 0]",_NEG_
10,7115,mostly-true,"For the first time in history, the share of th...",elections,robert-menendez,U.S. Senator,New Jersey,democrat,1.0,3.0,...,-0.3,0.3,0.042787,0.071241,0.448579,0.247391,0.189096,_9_,"[1, 0]",_NEG_
11,4148,half-true,"Since 2000, nearly 12 million Americans have s...","economy,jobs,new-hampshire-2012,poverty",bernie-s,U.S. Senator,Vermont,independent,18.0,12.0,...,-0.8,0.8,0.260516,0.174539,0.035247,0.115204,0.693537,_10_,"[0, 1]",_NEG_
12,5947,false,When Mitt Romney was governor of Massachusetts...,"history,state-budget",mitt-romney,Former governor,Massachusetts,republican,34.0,32.0,...,-0.3,0.3,0.168539,0.084288,0.386121,0.188206,0.239488,_11_,"[0, 1]",_NEG_
14,8705,barely-true,Most of the (Affordable Care Act) has already ...,health-care,george-will,Columnist,Maryland,columnist,7.0,6.0,...,-0.2,0.2,0.135979,0.083334,0.115334,0.078517,0.492167,_13_,"[0, 1]",_NEG_


In [134]:
# Split 'subject' column by comma
fake_news_drop['subject'] = fake_news_drop['subject'].str.split(',')

# Explode the resulting list so that each element becomes a separate row
fake_news_subject = fake_news_drop.explode('subject')

# Group by 'subject' and count occurrences
subject_counts = fake_news_subject.groupby('subject').size().reset_index(name='counts')
subject_counts

Unnamed: 0,subject,counts
0,10-news-tampa-bay,2
1,Alcohol,15
2,abc-news-week,63
3,abortion,145
4,afghanistan,26
...,...,...
134,wealth,40
135,weather,23
136,welfare,32
137,women,126


In [135]:
# Sort in descending order
subject_counts.sort_values(by='counts', ascending=False)

Unnamed: 0,subject,counts
40,economy,846
62,health-care,731
122,taxes,639
48,federal-budget,513
75,jobs,498
...,...,...
0,10-news-tampa-bay,2
32,death-penalty,1
46,fake-news,1
50,fires,1


In [136]:
fake_news_drop.dtypes

ID                       object
label                    object
statement                object
subject                  object
speaker                  object
speaker_job              object
state_info               object
party_affiliation        object
barely_true_counts      float64
false_counts            float64
half_true_counts        float64
mostly_true_counts      float64
pants_on_fire_counts    float64
context                  object
sentiment                object
sentiment_score         float64
sentiment_magnitude     float64
anger                   float64
fear                    float64
joy                     float64
disgust                 float64
sad                     float64
speaker_id               object
list                     object
sentiment_code           object
dtype: object

In [137]:
# Drop the 'speaker_id' column from the DataFrame
fake_news_drop.drop(columns=['speaker_id'], inplace=True)
fake_news_drop.head()

Unnamed: 0,ID,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,barely_true_counts,false_counts,...,sentiment,sentiment_score,sentiment_magnitude,anger,fear,joy,disgust,sad,list,sentiment_code
0,2635,false,Says the Annies List political group supports ...,[abortion],dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,NEGATIVE,-0.5,0.5,0.121137,0.008926,0.026096,0.263479,0.531887,"[0, 1]",_NEG_
1,10540,half-true,When did the decline of coal start? It started...,"[energy, history, job-accomplishments]",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,NEGATIVE,-0.4,0.8,0.095352,0.124566,0.191357,0.016999,0.102045,"[0, 1]",_NEG_
2,324,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",[foreign-policy],barack-obama,President,Illinois,democrat,70.0,71.0,...,NEGATIVE,-0.3,0.3,0.039559,0.024162,0.500384,0.454228,0.052453,"[1, 0]",_NEG_
5,12465,true,The Chicago Bears have had more starting quart...,[education],robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,...,NEGATIVE,-0.3,0.3,0.044534,0.037243,0.294698,0.035936,0.598971,"[1, 0]",_NEG_
7,153,half-true,I'm the only person on this stage who has work...,[ethics],barack-obama,President,Illinois,democrat,70.0,71.0,...,NEGATIVE,-0.2,0.2,0.021023,0.077569,0.032182,0.038037,0.438594,"[0, 1]",_NEG_


In [138]:
# Drop the 'sentiment_code' column from the DataFrame
fake_news_drop.drop(columns=['sentiment_code'], inplace=True)
fake_news_drop.head()

Unnamed: 0,ID,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,barely_true_counts,false_counts,...,context,sentiment,sentiment_score,sentiment_magnitude,anger,fear,joy,disgust,sad,list
0,2635,false,Says the Annies List political group supports ...,[abortion],dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,a mailer,NEGATIVE,-0.5,0.5,0.121137,0.008926,0.026096,0.263479,0.531887,"[0, 1]"
1,10540,half-true,When did the decline of coal start? It started...,"[energy, history, job-accomplishments]",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,a floor speech.,NEGATIVE,-0.4,0.8,0.095352,0.124566,0.191357,0.016999,0.102045,"[0, 1]"
2,324,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",[foreign-policy],barack-obama,President,Illinois,democrat,70.0,71.0,...,Denver,NEGATIVE,-0.3,0.3,0.039559,0.024162,0.500384,0.454228,0.052453,"[1, 0]"
5,12465,true,The Chicago Bears have had more starting quart...,[education],robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,...,a an online opinion-piece,NEGATIVE,-0.3,0.3,0.044534,0.037243,0.294698,0.035936,0.598971,"[1, 0]"
7,153,half-true,I'm the only person on this stage who has work...,[ethics],barack-obama,President,Illinois,democrat,70.0,71.0,...,"a Democratic debate in Philadelphia, Pa.",NEGATIVE,-0.2,0.2,0.021023,0.077569,0.032182,0.038037,0.438594,"[0, 1]"


In [139]:
# Analyze the distribution of classes (speaker_job)
fake_news_drop['speaker_job'].value_counts()

speaker_job
President                                  467
U.S. Senator                               457
Governor                                   365
President-Elect                            275
U.S. senator                               248
                                          ... 
Ohio State University president              1
Radio personality                            1
DeKalb County Chief Executive Officer        1
Human resources director                     1
Member of the House of Representatinves      1
Name: count, Length: 974, dtype: int64

In [140]:
# Analyze the distribution of classes (speaker)
fake_news_drop['speaker'].value_counts()

speaker
barack-obama           465
donald-trump           275
hillary-clinton        231
mitt-romney            165
john-mccain            149
                      ... 
mike-hymes               1
gloria-romero-roses      1
charles-odimgbe          1
gun-owners-america       1
sal-esquivel             1
Name: count, Length: 1604, dtype: int64

In [141]:
# Analyze the distribution of classes (sentiment)
fake_news_drop['sentiment'].value_counts()

sentiment
NEGATIVE    5589
POSITIVE     976
Name: count, dtype: int64

In [147]:
# # Split our preprocessed data into our features and target arrays
# X = fake_news_drop.drop('barely_true_counts', axis=1).values
# y = fake_news_drop['barely_true_counts'].values

# # Split the preprocessed data into a training and testing dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

# # Create a StandardScaler instances
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [148]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the categorical and numeric features
categorical_features = ['speaker', 'state', 'party', 'context', 'sentiment_code']
numeric_features = ['barely_true_counts', 'false_counts', 'sentiment_magnitude', 
                    'anger', 'fear', 'joy', 'disgust', 'sad']

# Create separate pipelines for categorical and numeric features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Use OneHotEncoder for categorical variables
])

# Use ColumnTransformer to apply transformations to the correct columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Apply preprocessing to X
X_processed = pipeline.fit_transform(X)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42) 

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [None]:
# fake_news_drop.to_csv("cleaned_data.csv", index=False)