### Importing Packages

### Loading Data:
Before running this notebook, download the dataset from https://drive.google.com/file/d/1vD4DtyJOIjRzchPtCQu-KPrUjgTiWSmo/view and unzip it from a terminal (`unzip NeuralNews.zip`).

### Creating new environment to avoid clashes
`python -m venv /Users/yzhao/ai4allc6g3` (replace `/Users/yzhao/ai4allc6g3` with a path on your own machine)

Step 1: Filtering to election-based articles

In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be
# used by the preprocessing function further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/yzhao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the labelled articles; the path is relative, so news_dataset.csv must be
# in the notebook's working directory.
news_df = pd.read_csv("news_dataset.csv")
# Preview the first rows to check the columns that were read.
news_df.head()

Unnamed: 0,Labels,Articles
0,fake,A longtime champion of the homeless and batter...
1,fake,"Tucked away in the Marais, two warring groups ..."
2,fake,There are plenty of things that can impede wom...
3,fake,"New York City is home to more than 2,500 tiny ..."
4,fake,A man wearing a hat emblazoned with the words ...


In [4]:
# (number of rows, number of columns) of the raw dataset.
news_df.shape

(64000, 2)

In [5]:
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stop_words():
    """
    Return NLTK's English stop words as a frozenset, loading them only once.

    The set is stored on the function object after the first call, so applying
    text_preprocessing to many articles does not reload the word list (or scan
    it as a list) for every article.
    """
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def text_preprocessing(text):
    """
    Clean one article for bag-of-words style vectorisation.

    Steps, in order:
      1. delete every character listed in string.punctuation,
      2. lowercase the text,
      3. split on whitespace, drop NLTK English stop words (commonly used
         words such as 'the', 'a', etc.) and re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The cleaned, lowercased text with tokens separated by single spaces.
    """
    # NOTE(review): string.punctuation only covers ASCII punctuation, so
    # typographic characters such as ’ “ ” are kept in the output.
    # NOTE(review): punctuation is stripped before the stop-word lookup, so a
    # contraction like "don't" is looked up as "dont" -- confirm this order is
    # intended.
    text = text.translate(_PUNCTUATION_TABLE)
    text = text.lower()
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every article element-wise and keep the result next to the raw text.
news_df['Cleaned_Article'] = news_df['Articles'].map(text_preprocessing)

In [6]:
# Spot-check the first five cleaned articles.
news_df['Cleaned_Article'].iloc[:5]

0    longtime champion homeless battered times unio...
1    tucked away marais two warring groups compete ...
2    plenty things impede women’s road career succe...
3    new york city home 2500 tiny churches yearroun...
4    man wearing hat emblazoned words “driving libe...
Name: Cleaned_Article, dtype: object

In [7]:
# Series.str.contains documentation: https://pandas.pydata.org/docs/reference/api/pandas.Series.str.contains.html
# Keep only articles whose cleaned text contains at least one of the substrings
# "election", "trump", "biden" or "harris". This is a substring match, so longer
# words that embed one of these terms (e.g. "selection") also match.
election_mask = news_df["Cleaned_Article"].str.contains("election|trump|biden|harris", case=False)

# Build the filtered frame in a single chain instead of mutating a slice of
# news_df in place (which can raise SettingWithCopyWarning):
#   - renumber the rows from 0,
#   - rename Labels -> Fake and Articles -> Article,
#   - encode the label as an integer: "fake" -> 1, "real" -> 0.
# Any label other than "fake"/"real" maps to NaN and makes astype(int) raise,
# so unexpected labels are not silently kept.
filtered_df = (
    news_df.loc[election_mask]
    .reset_index(drop=True)
    .rename(columns={'Labels': 'Fake', 'Articles': 'Article'})
    .assign(Fake=lambda frame: frame["Fake"].map({"fake": 1, "real": 0}).astype(int))
)

In [8]:
# Class balance of the filtered data: Fake is 1 for AI-generated, 0 for real.
n_articles = len(filtered_df)
n_fake = filtered_df['Fake'].sum()
n_real = n_articles - n_fake

print(f'The shape of the filtered data frame is: {filtered_df.shape}')
print(f"Number of real articles {n_real}")
print(f"Number of AI generated articles {n_fake}")

# Show the first five rows of the filtered frame.
filtered_df.head()

The shape of the filtered data frame is: (11559, 3)
Number of real articles 4636
Number of AI generated articles 6923


Unnamed: 0,Fake,Article,Cleaned_Article
0,1,There are plenty of things that can impede wom...,plenty things impede women’s road career succe...
1,1,"Louisiana, New York, San Diego, New York.\nNo ...",louisiana new york san diego new york city pro...
2,1,Paul Manafort pleaded guilty to two counts of ...,paul manafort pleaded guilty two counts conspi...
3,1,Dozens of lawmakers with significant oppositio...,dozens lawmakers significant opposition trump’...
4,1,An ex-pimp whose book How to Lead A Slave Love...,expimp whose book lead slave lover paradise in...


In [16]:
# Bag-of-words over the cleaned, election-related articles.
count_vectorizer = CountVectorizer()
bag_words = count_vectorizer.fit_transform(filtered_df['Cleaned_Article'])

# Total occurrences of each vocabulary term across all articles. Summing the
# sparse matrix column-wise avoids materialising the dense
# (n_articles x n_terms) array that bag_words.toarray() would allocate.
total_counts = np.asarray(bag_words.sum(axis=0)).ravel()

# One row per vocabulary term, sorted from rarest to most frequent.
# The feature names come from count_vectorizer, the vectorizer fitted above.
word_counts = pd.DataFrame({
    'word': count_vectorizer.get_feature_names_out(),
    'count': total_counts,
}).sort_values(by='count', ascending=True)

In [17]:
# Summary statistics of the per-term total counts.
word_counts["count"].describe()

count    104567.000000
mean         29.982126
std         255.024821
min           1.000000
25%           1.000000
50%           2.000000
75%           7.000000
max       30584.000000
Name: count, dtype: float64

In [18]:
# Keep only terms that appear in at least 30 different articles. The threshold
# 30 was chosen from the mean total word count of the vocabulary (~30).
# NOTE(review): min_df is a document-frequency cut-off (number of articles
# containing the term), not a total-occurrence cut-off, so it is not directly
# comparable to that mean -- confirm the threshold is still the intended one.
tfidf_vectorizer = TfidfVectorizer(min_df = 30)

In [19]:
# TF-IDF matrix for the filtered articles (rows: articles, columns: kept terms).
fake_tfidf = tfidf_vectorizer.fit_transform(filtered_df["Cleaned_Article"])

# Vocabulary kept by the TF-IDF vectorizer versus the full bag-of-words
# vocabulary. get_feature_names_out() takes no meaningful argument for these
# vectorizers, so nothing is passed.
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
count_vocabulary = count_vectorizer.get_feature_names_out()

size_tfidf = tfidf_vocabulary.shape[0]
size_vocab = count_vocabulary.shape[0]
# Fraction of the full vocabulary that survives the min_df filter.
size_compare = np.round(size_tfidf / size_vocab, 2)

print(f"Size of tfidf vocabulary is {size_compare} of the overall vocabulary.")
print(tfidf_vocabulary)

Size of tfidf vocabulary is 0.09 of the overall vocabulary.
['10' '100' '1000' ... 'zone' 'zones' 'zuckerberg']
