In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import warnings

In [None]:
# Read File
file_path = Path("Resources/Data/news-jl.csv")
news_df = pd.read_csv(file_path, header=0, encoding='utf-8')
# Display sample data
news_df.head(10)

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


## Step 1 - Clean the data

In [None]:
#check count of all rows
news_df.describe()

Unnamed: 0,id,title,text,label
count,7576,7185,6929,6755
unique,7531,7085,6644,437
top,#NAME?,OnPolitics | 's politics blog,"Killing Obama administration rules, dismantlin...",REAL
freq,16,5,58,3161


In [None]:
#drop NAs from all rows
news_df = news_df.dropna()


In [None]:
news_df.head()

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [None]:
#recheck all rows
news_df.describe()

Unnamed: 0,id,title,text,label
count,6754,6754,6754,6754
unique,6736,6667,6472,436
top,#NAME?,OnPolitics | 's politics blog,"Killing Obama administration rules, dismantlin...",REAL
freq,9,5,58,3161


In [None]:
#see datatype
news_df.dtypes

id       object
title    object
text     object
label    object
dtype: object

In [None]:
# Changing the data type of 'id' to float

news_df['id'] = pd.to_numeric(news_df['id'], errors='coerce')

# Using .loc to modify the 'id' column
# news_df_cleaned.loc[:, 'id'] = pd.to_numeric(news_df_cleaned['id'], errors='coerce')


In [None]:
news_df.dtypes

id       float64
title     object
text      object
label     object
dtype: object

In [None]:
# Changing the data type of 'title' 'text' 'label' to string

# news_df['title'] = news_df['title'].astype(str)
# news_df['text'] = news_df['text'].astype(str)
# news_df['label'] = news_df['label'].astype(str)

news_df = news_df.astype(dtype={'title': 'string', 'text': 'string', 'label': 'string'})


In [None]:
news_df.head()

Unnamed: 0,id,title,text,label
0,8476.0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294.0,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608.0,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142.0,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875.0,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [None]:
news_df.dtypes

id       float64
title     string
text      string
label     string
dtype: object

In [None]:
#drop the id column
filter_df = news_df.loc[:,["title", "text", "label"]]
filter_df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


## Step 2 - Get the columns that we need by transforming the columns

In [None]:
# Splitting the 'TextColumn' for each row based on whitespace
filter_df['title_array'] = filter_df['title'].apply(lambda x: x.split())
filter_df['text_array'] = filter_df['text'].apply(lambda x: x.split())

#only grab the label and the new array columns
transform_array_df = filter_df[['label', 'title_array', 'text_array']]

transform_array_df.head()

Unnamed: 0,label,title_array,text_array
0,FAKE,"[You, Can, Smell, Hillary’s, Fear]","[Daniel, Greenfield,, a, Shillman, Journalism,..."
1,FAKE,"[Watch, The, Exact, Moment, Paul, Ryan, Commit...","[Google, Pinterest, Digg, Linkedin, Reddit, St..."
2,REAL,"[Kerry, to, go, to, Paris, in, gesture, of, sy...","[U.S., Secretary, of, State, John, F., Kerry, ..."
3,FAKE,"[Bernie, supporters, on, Twitter, erupt, in, a...","[—, Kaydee, King, (@KaydeeKing), November, 9,,..."
4,REAL,"[The, Battle, of, New, York:, Why, This, Prima...","[It's, primary, day, in, New, York, and, front..."


In [None]:
#join array together

join_array_df = transform_array_df.loc[:,["label"]]

# Combine the arrays from both columns into a single array
join_array_df['combo_array'] = transform_array_df.apply(lambda row: np.concatenate([row['title_array'], row['text_array']]), axis=1)


join_array_df.head()

Unnamed: 0,label,combo_array
0,FAKE,"[You, Can, Smell, Hillary’s, Fear, Daniel, Gre..."
1,FAKE,"[Watch, The, Exact, Moment, Paul, Ryan, Commit..."
2,REAL,"[Kerry, to, go, to, Paris, in, gesture, of, sy..."
3,FAKE,"[Bernie, supporters, on, Twitter, erupt, in, a..."
4,REAL,"[The, Battle, of, New, York:, Why, This, Prima..."


### Title and Label Columns ONLY

In [None]:
# Expand only the title array into new columns
title_expand_array_df = transform_array_df.loc[:, ["label"]]

# Create new columns for each element in the arrays
title_array_columns = pd.DataFrame(transform_array_df['title_array'].tolist(), dtype='string')

# Concatenate the new columns with the original DataFrame
title_expand_array_df = pd.concat([title_expand_array_df, title_array_columns], axis=1)

# Fill in NA with "NA"
title_expand_array_df = title_expand_array_df.fillna("NA")

# Display the DataFrame
title_expand_array_df.head()

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,123,124,125,126,127,128,129,130,131,132
0,FAKE,You,Can,Smell,Hillary’s,Fear,,,,,...,,,,,,,,,,
1,FAKE,Watch,The,Exact,Moment,Paul,Ryan,Committed,Political,Suicide,...,,,,,,,,,,
2,REAL,Kerry,to,go,to,Paris,in,gesture,of,sympathy,...,,,,,,,,,,
3,FAKE,Bernie,supporters,on,Twitter,erupt,in,anger,against,the,...,,,,,,,,,,
4,REAL,The,Battle,of,New,York:,Why,This,Primary,Matters,...,,,,,,,,,,


In [None]:
title_expand_array_df.dtypes

label    string
0        string
1        string
2        string
3        string
          ...  
128      string
129      string
130      string
131      string
132      string
Length: 134, dtype: object

In [None]:
#should we drop duplicate words?


### Text and Label Columns ONLY

In [None]:
#expand only the title array into new columns
text_expand_array_df = transform_array_df.loc[:,["label"]]

# Create new columns for each element in the arrays
text_array_columns = pd.DataFrame(transform_array_df['text_array'].tolist(), dtype='string')

# Concatenate the new columns with the original DataFrame
text_expand_array_df = pd.concat([text_expand_array_df, text_array_columns], axis=1)

#fill in na to "none"
text_expand_array_df = text_expand_array_df.fillna("NA")

text_expand_array_df.head()

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,5510,5511,5512,5513,5514,5515,5516,5517,5518,5519
0,FAKE,Daniel,"Greenfield,",a,Shillman,Journalism,Fellow,at,the,Freedom,...,,,,,,,,,,
1,FAKE,Google,Pinterest,Digg,Linkedin,Reddit,Stumbleupon,Print,Delicious,Pocket,...,,,,,,,,,,
2,REAL,U.S.,Secretary,of,State,John,F.,Kerry,said,Monday,...,,,,,,,,,,
3,FAKE,—,Kaydee,King,(@KaydeeKing),November,9,2016,The,lesson,...,,,,,,,,,,
4,REAL,It's,primary,day,in,New,York,and,front-runners,Hillary,...,,,,,,,,,,


### Combo(text and title) and Label Columns ONLY

In [None]:
#expand only the combo array into new columns
combo_expand_array_df = join_array_df.loc[:,["label"]]

# Create new columns for each element in the arrays
# combo_array_columns = join_array_df['combo_array'].apply(lambda x: pd.Series(x, dtype='string'))
combo_array_columns = pd.DataFrame(join_array_df['combo_array'].tolist(), dtype='string')


# Concatenate the new columns with the original DataFrame
combo_expand_array_df = pd.concat([combo_expand_array_df, combo_array_columns], axis=1)

#fill in na to "none"
combo_expand_array_df = combo_expand_array_df.fillna("")

combo_expand_array_df.head()

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,5516,5517,5518,5519,5520,5521,5522,5523,5524,5525
0,FAKE,You,Can,Smell,Hillary’s,Fear,Daniel,"Greenfield,",a,Shillman,...,,,,,,,,,,
1,FAKE,Watch,The,Exact,Moment,Paul,Ryan,Committed,Political,Suicide,...,,,,,,,,,,
2,REAL,Kerry,to,go,to,Paris,in,gesture,of,sympathy,...,,,,,,,,,,
3,FAKE,Bernie,supporters,on,Twitter,erupt,in,anger,against,the,...,,,,,,,,,,
4,REAL,The,Battle,of,New,York:,Why,This,Primary,Matters,...,,,,,,,,,,


## Step 2 - prep data to be trained

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split


# Drop 'label' column for X and assign 'label' column to y
X = title_expand_array_df.drop('label', axis=1)
y = title_expand_array_df['label']

# Create a column transformer with a pipeline for one-hot encoding each column
transformers = []

for i in range(len(X.columns)):
    transformers.append((str(i), Pipeline([('onehot', OneHotEncoder())]), [i]))

ct = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Fit and transform X
X_transformed = ct.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=1/3, random_state=0)

In [None]:
print(X)

           0           1        2             3          4           5    \
0          You         Can    Smell     Hillary’s       Fear          NA   
1        Watch         The    Exact        Moment       Paul        Ryan   
2        Kerry          to       go            to      Paris          in   
3       Bernie  supporters       on       Twitter      erupt          in   
4          The      Battle       of           New      York:         Why   
...        ...         ...      ...           ...        ...         ...   
6554     Poll:     Clinton  Support        Spikes  Following  Democratic   
6556   Florida        Once    Again             a      Focus          in   
6558       The     House’s      new  conservative  politburo          NA   
6560  Bizarre!       Drone  Records      Speeding        UFO        Over   
6561   Prepare    Yourself      For           The     Higher    Energies   

             6          7         8     9    ... 123 124 125 126 127 128 129  \
0      

In [None]:
print(y)

0       FAKE
1       FAKE
2       REAL
3       FAKE
4       REAL
        ... 
6554      NA
6556      NA
6558      NA
6560      NA
6561      NA
Name: label, Length: 7764, dtype: string


### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
print(X_train)

  (0, 293)	41.549174640028745
  (0, 5329)	25.455874528045893
  (0, 8414)	71.95137381062794
  (0, 9373)	29.388225996277022
  (0, 15381)	71.95137381062794
  (0, 18010)	71.95137381062794
  (0, 21161)	71.95137381062794
  (0, 24873)	8.057116885527806
  (0, 26899)	71.95137381062794
  (0, 29494)	71.95137381062794
  (0, 29865)	71.95137381062794
  (0, 32364)	71.95137381062794
  (0, 33391)	2.3786847596201
  (0, 34468)	2.654903625841938
  (0, 35303)	3.0157440636409456
  (0, 35916)	3.409598749405188
  (0, 36408)	3.8248426316516375
  (0, 36785)	4.334260543281542
  (0, 37092)	4.8529218054803245
  (0, 37346)	5.458166759875881
  (0, 37535)	6.019976313662221
  (0, 37720)	6.5916893740379
  (0, 37857)	7.0281534186187
  (0, 37973)	7.692515774943026
  (0, 38082)	8.313853806669863
  :	:
  (5175, 39496)	1.0
  (5175, 39498)	1.0
  (5175, 39500)	1.0
  (5175, 39502)	1.0
  (5175, 39504)	1.0
  (5175, 39506)	1.0
  (5175, 39508)	1.0
  (5175, 39510)	1.0
  (5175, 39512)	1.0
  (5175, 39514)	1.0
  (5175, 39516)	1.0
  (5

In [None]:
print(X_test)

  (0, 349)	50.882220731294005
  (0, 3436)	1.0
  (0, 7656)	9.752939608083153
  (0, 10430)	41.549174640028745
  (0, 14438)	1.0
  (0, 16225)	1.0
  (0, 19555)	1.0
  (0, 22751)	29.388225996277022
  (0, 26388)	14.719759680502934
  (0, 28816)	14.14503346127581
  (0, 30535)	71.95137381062794
  (0, 31592)	71.95137381062794
  (0, 33505)	1.0
  (0, 34188)	71.95137381062794
  (0, 35741)	32.190075596850676
  (0, 35954)	1.0
  (0, 36408)	3.8248426316516375
  (0, 36785)	4.334260543281542
  (0, 37092)	4.8529218054803245
  (0, 37346)	5.458166759875881
  (0, 37535)	6.019976313662221
  (0, 37720)	6.5916893740379
  (0, 37857)	7.0281534186187
  (0, 37973)	7.692515774943026
  (0, 38082)	8.313853806669863
  :	:
  (2587, 39496)	1.0
  (2587, 39498)	1.0
  (2587, 39500)	1.0
  (2587, 39502)	1.0
  (2587, 39504)	1.0
  (2587, 39506)	1.0
  (2587, 39508)	1.0
  (2587, 39510)	1.0
  (2587, 39512)	1.0
  (2587, 39514)	1.0
  (2587, 39516)	1.0
  (2587, 39518)	1.0
  (2587, 39520)	1.0
  (2587, 39522)	1.0
  (2587, 39524)	1.0
  (2

### Training the Logistic Regression model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=1000, random_state = 0)
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test)

### Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[  0   0   0 ...   0   1   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   1   0]
 ...
 [  0   0   0 ...  95 165   0]
 [  0   0   0 ... 206 440   0]
 [  0   0   0 ...   0   0   0]]


0.3693972179289026

### Visualising the Training set results

### Visualising the Test set results