### Importing Packages

### Loading Data:
Need to download actual dataset from https://drive.google.com/file/d/1vD4DtyJOIjRzchPtCQu-KPrUjgTiWSmo/view and unzip via Terminal (unzip NeuralNews.zip)

### Creating new environment to avoid clashes
python -m venv /Users/yzhao/ai4allc6g3

Step 1: Filtering to election based articles

In [2]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# installs used 
# pip install transformers
# pip install ipywidgets
# pip3 install torch torchvision torchaudio (may vary based on device)
# pip install tensorflow
# pip install tf_keras
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

pipe = pipeline("text-classification", model="launch/POLITICS")

In [29]:
from gensim import corpora
from gensim.models import LdaModel

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/yzhao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
news_df = pd.read_csv("news_dataset.csv")
news_df.head()

Unnamed: 0,Labels,Articles
0,fake,A longtime champion of the homeless and batter...
1,fake,"Tucked away in the Marais, two warring groups ..."
2,fake,There are plenty of things that can impede wom...
3,fake,"New York City is home to more than 2,500 tiny ..."
4,fake,A man wearing a hat emblazoned with the words ...


In [5]:
news_df.shape

(64000, 2)

## Vectorizer and Bag of Words Approach

In [6]:
def text_preprocessing(text):
    """
    A function that accepts string, text, and removes the punctuation, pronouns,
    and commonly used words that don't provide additional information such as 
    'the', 'a', etc.
    """
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = stopwords.words('english')
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

news_df['Cleaned_Article'] = news_df['Articles'].apply(text_preprocessing)

In [32]:
news_df['Cleaned_Article'].iloc[:5]

0    longtime champion homeless battered times unio...
1    tucked away marais two warring groups compete ...
2    plenty things impede women’s road career succe...
3    new york city home 2500 tiny churches yearroun...
4    man wearing hat emblazoned words “driving libe...
Name: Cleaned_Article, dtype: object

In [55]:
# contains documentation: https://pandas.pydata.org/docs/reference/api/pandas.Series.str.contains.html
# Filter to text that only includes the substring election, trump, biden, and harris
filtered_df = news_df[news_df["Cleaned_Article"].str.contains(""" election|
             campaign| vote| ballot| voting| polling| candidate| nominee| politician|
             leader| opposition| incumbent| poll| polling| approval rating|
             electorate| conservative| liberal| democrat| republican| left-wing|
             right-wing| centrist| far-right| far-left| populist|
             governor| mayor| senator| representative| joe biden| bernie sanders| 
             elizabeth warren| pete buttigieg| andrew yang| tulsi gabbard| 
             kamala harris""", case=False)]

filtered_df.reset_index(drop=True, inplace=True)
filtered_df.loc[filtered_df["Labels"] == "fake", "Labels"] = 1
filtered_df.loc[filtered_df["Labels"] == "real", "Labels"] = 0
filtered_df = filtered_df.rename(columns={'Labels': 'Fake', 'Articles':'Article'})
filtered_df["Fake"] = filtered_df["Fake"].astype(int)

In [None]:
filtered_noprez = news_df[news_df["Cleaned_Article"].str.contains("""election|
            campaign|vote|ballot|voting|polling|candidate|nominee|politician|
            leader|opposition|incumbent|poll|polling|approval rating|
            electorate|conservative|liberal|democrat|republican|left-wing|
            right-wing|centrist|far-right|far-left|populist|
            governor|mayor|senator|representative""", case=False)]

In [56]:
print(f'The shape of the filtered data frame is: {filtered_df.shape}')
print (f"Number of real articles {filtered_df.shape[0] - sum(filtered_df['Fake'])}")
print (f"Number of AI generated articles {sum(filtered_df['Fake'])}")

The shape of the filtered data frame is: (17344, 3)
Number of real articles 8665
Number of AI generated articles 8679


In [48]:
for article in filtered_df["Cleaned_Article"]:
    print (article)

friday guam raised security level “high alert” — move guam gov eddie calvo called “timely necessary” order prepare missile launch north korea calvo said guam government working closely federal government prepared disaster plan 13 million residents us territory “if put action change world” calvo said told cnn guam “must act protect homeland” citing “unusual increasing military activities” region calvo said guam “increasingly concerned north korea’s many aggressive weapons programs” however guam officials also noted island enjoys “large array defenses” launch would “dramatic effect” north korea according guam’s daily star north korea launched missile since 2006 reason concerned north korea following us president donald trump’s praise “rocket man” kim jong un earlier month tuesday briefing press trip asia trump revealed trumpkim summit plans hold singapore track although many experts said summit would “historic mistake” trump insists back meeting happen “first term” “i firmly committed me

In [10]:
count_vectorizer = CountVectorizer()
bag_words = count_vectorizer.fit_transform(filtered_df['Cleaned_Article'])
word_counts = pd.DataFrame({'word': count_vectorizer.get_feature_names_out(), 'count': bag_words.toarray().sum(axis=0)})
word_counts.sort_values(by='count', ascending=True, inplace=True)

In [11]:
word_counts["count"].describe()

count    57187.000000
mean        14.937661
std         92.200608
min          1.000000
25%          1.000000
50%          2.000000
75%          6.000000
max       9792.000000
Name: count, dtype: float64

In [12]:
# word needs to appear at least 10 times (the mean word count for the vocabulary)
tfidf_vectorizer = TfidfVectorizer(min_df = 10)

In [13]:
fake_tfidf = tfidf_vectorizer.fit_transform(filtered_df["Cleaned_Article"])
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
count_vocabulary = count_vectorizer.get_feature_names_out(word_counts)
size_tfidf = tfidf_vocabulary.shape[0]
size_vocab = count_vocabulary.shape[0]
size_compare = np.round(size_tfidf / size_vocab, 2)
print ("Size of tfidf vocabulary is " + str(size_compare) +
      " of the overall vocabulary.")
print (tfidf_vocabulary)

Size of tfidf vocabulary is 0.15 of the overall vocabulary.
['10' '100' '1000' ... 'zero' 'zone' 'zones']


## Classification of article party alignment

In [37]:
news_sample = news_df.sample(n = 10)

In [38]:
def classify_party(article):
    result = pipe(article)[0]
    return (pd.Series([result['label'], result['score']]))

In [39]:
def find_leng(article):
    return (len(article.split()) < 500)
short_articles = news_sample[news_sample["Cleaned_Article"].apply(find_leng)]
short_articles

Unnamed: 0,Labels,Articles,Cleaned_Article
6177,fake,A proposed federal plan to create a flight pat...,proposed federal plan create flight pattern lo...
25699,fake,Jaya Bachchan and Gautam Makhija aren’t sure i...,jaya bachchan gautam makhija aren’t sure presi...
53407,real,"""Capital in the Twenty-First Century,"" Thomas ...",capital twentyfirst century thomas pikettys ne...
52829,real,"""Romancing the Joan"" is intended to complement...",romancing joan intended complement conventiona...
43318,real,The contentious plan to turn yet another New Y...,contentious plan turn yet another new york hos...
61333,real,In so lovingly collecting and editing Flann O'...,lovingly collecting editing flann obriens wide...
12677,fake,A New York County horse stable closed after si...,new york county horse stable closed six horses...
49498,real,"The diminutive size, steep topography or far-f...",diminutive size steep topography farflung natu...
14951,fake,You hear a lot about the “golden generation” o...,hear lot “golden generation” grandparents nowa...
32889,real,"MINNEAPOLIS -- Lutheran congregants, I've lear...",minneapolis lutheran congregants ive learned s...


In [40]:
short_articles[['label', 'confidence']] = short_articles['Cleaned_Article'].apply(classify_party)

RuntimeError: The expanded size of the tensor (546) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 546].  Tensor sizes: [1, 514]

In [24]:

tokenizer = AutoTokenizer.from_pretrained("launch/POLITICS")
model = AutoModelForMaskedLM.from_pretrained("launch/POLITICS")
encoded = short_articles['Cleaned_Article'].apply(tokenizer, return_tensors="pt")

Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


In [26]:
short_articles.shape
for article in short_articles["Cleaned_Article"]:
    if (len(article.split()) >= 512):
        print (len(article.split()))

In [42]:
tokenizer.model_max_length
tokenizer.max_len_single_sentence

510

In [45]:
text = short_articles['Cleaned_Article'].iloc[1]
#text.apply(tokenizer, return_tensors="pt")
tokenizer(text, return_tensors="pt")

{'input_ids': tensor([[    0,   267,  5210,   741,  1488, 14717,   821,  4255,   424,   475,
          7352, 12733,  2025,    17,    27,    90,   686,   394,  2003,  2990,
          6168,  2583,   146,   637,   120,  2834, 12076,  1727,    24,    17,
            27,    29,   678,  3610,  3553, 46806,   183,  4981,   212,  4115,
          4247, 21104,  3774, 20285,   293,  8775, 11433,  1519,  4247,  3774,
         20285,   293,    93,  4247,   797,  3188, 32407,  1339,  2236,  8952,
          2110,    93,  8852,  3752,  1300, 10535,   156,  1637,   741,  9718,
          5552,    78,  1145,  6168,  2583,   484,   107,   536,  3744,  4023,
         30467,  1926, 38187,   260,   825,  1855,  1243,   475,  7352, 12733,
         12812,   942,  4190, 13605,  1272,  2114, 10409,   982,   187,   842,
          3320, 22542,  5560, 12442,   342,  4667,   946,  5327,  3114,  6955,
          2074, 11738,  1830,  4247, 29901,   741,  1488, 14717,  4201,   559,
          8036, 12076,   637,  1920,  