### Importing Packages

### Loading Data:
Need to download actual dataset from https://drive.google.com/file/d/1vD4DtyJOIjRzchPtCQu-KPrUjgTiWSmo/view and unzip via Terminal (unzip NeuralNews.zip)

### Creating new environment to avoid clashes
python -m venv /Users/yzhao/ai4allc6g3

Step 1: Filtering to election based articles

In [2]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# installs used 
# pip install transformers
# pip install ipywidgets
# pip3 install torch torchvision torchaudio (may vary based on device)
# pip install tensorflow
# pip install tf_keras
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

pipe = pipeline("text-classification", model="launch/POLITICS")

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/yzhao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
news_df = pd.read_csv("news_dataset.csv")
news_df.head()

Unnamed: 0,Labels,Articles
0,fake,A longtime champion of the homeless and batter...
1,fake,"Tucked away in the Marais, two warring groups ..."
2,fake,There are plenty of things that can impede wom...
3,fake,"New York City is home to more than 2,500 tiny ..."
4,fake,A man wearing a hat emblazoned with the words ...


In [5]:
news_df.shape

(64000, 2)

## Vectorizer and Bag of Words Approach

In [6]:
def text_preprocessing(text):
    """
    A function that accepts string, text, and removes the punctuation, pronouns,
    and commonly used words that don't provide additional information such as 
    'the', 'a', etc.
    """
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = stopwords.words('english')
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

news_df['Cleaned_Article'] = news_df['Articles'].apply(text_preprocessing)

In [7]:
news_df['Cleaned_Article'].iloc[:5]

0    longtime champion homeless battered times unio...
1    tucked away marais two warring groups compete ...
2    plenty things impede women’s road career succe...
3    new york city home 2500 tiny churches yearroun...
4    man wearing hat emblazoned words “driving libe...
Name: Cleaned_Article, dtype: object

In [8]:
# contains documentation: https://pandas.pydata.org/docs/reference/api/pandas.Series.str.contains.html
# Filter to text that only includes the substring election, trump, biden, and harris
filtered_df = news_df[news_df["Cleaned_Article"].str.contains("election|trump|biden|harris|ballot|president|nomination|vote", case=False)]

filtered_df.reset_index(drop=True, inplace=True)
filtered_df.loc[filtered_df["Labels"] == "fake", "Labels"] = 1
filtered_df.loc[filtered_df["Labels"] == "real", "Labels"] = 0
filtered_df = filtered_df.rename(columns={'Labels': 'Fake', 'Articles':'Article'})
filtered_df["Fake"] = filtered_df["Fake"].astype(int)

In [9]:
print(f'The shape of the filtered data frame is: {filtered_df.shape}')
print (f"Number of real articles {filtered_df.shape[0] - sum(filtered_df['Fake'])}")
print (f"Number of AI generated articles {sum(filtered_df['Fake'])}")
filtered_df = filtered_df.sample(n = 3000)
filtered_df.iloc[:5,:]

The shape of the filtered data frame is: (21604, 3)
Number of real articles 10454
Number of AI generated articles 11150


Unnamed: 0,Fake,Article,Cleaned_Article
7732,1,"Speaking to reporters on Friday, Secretary of ...",speaking reporters friday secretary state mike...
13091,0,"McLEAN, Va. -- President Obama on Tuesday pres...",mclean va president obama tuesday pressed cong...
4409,1,"Walking into the Capitol Friday morning, journ...",walking capitol friday morning journalists und...
2154,1,"On February 9, 1990, in South Africa, filmmake...",february 9 1990 south africa filmmaker trish o...
9959,1,All four lanes of the FDR Drive and the FDR Dr...,four lanes fdr drive fdr drivebarney street ex...


In [10]:
count_vectorizer = CountVectorizer()
bag_words = count_vectorizer.fit_transform(filtered_df['Cleaned_Article'])
word_counts = pd.DataFrame({'word': count_vectorizer.get_feature_names_out(), 'count': bag_words.toarray().sum(axis=0)})
word_counts.sort_values(by='count', ascending=True, inplace=True)

In [11]:
word_counts["count"].describe()

count    57187.000000
mean        14.937661
std         92.200608
min          1.000000
25%          1.000000
50%          2.000000
75%          6.000000
max       9792.000000
Name: count, dtype: float64

In [12]:
# word needs to appear at least 30 times (the mean word count for the vocabulary)
tfidf_vectorizer = TfidfVectorizer(min_df = 10)

In [13]:
fake_tfidf = tfidf_vectorizer.fit_transform(filtered_df["Cleaned_Article"])
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
count_vocabulary = count_vectorizer.get_feature_names_out(word_counts)
size_tfidf = tfidf_vocabulary.shape[0]
size_vocab = count_vocabulary.shape[0]
size_compare = np.round(size_tfidf / size_vocab, 2)
print ("Size of tfidf vocabulary is " + str(size_compare) +
      " of the overall vocabulary.")
print (tfidf_vocabulary)

Size of tfidf vocabulary is 0.15 of the overall vocabulary.
['10' '100' '1000' ... 'zero' 'zone' 'zones']


## Classification of article party alignment

In [15]:
news_sample = news_df.sample(n = 1000)

In [16]:
def classify_party(article):
    result = pipe(article)[0]
    return (pd.Series([result['label'], result['score']]))

In [22]:
def find_leng(article):
    return (len(article.split()) < 500)
short_articles = news_sample[news_sample["Cleaned_Article"].apply(find_leng)]
short_articles

Unnamed: 0,Labels,Articles,Cleaned_Article
50187,real,"""Who is that?"" a befuddled Glen Campbell asks ...",befuddled glen campbell asks wife kim watches ...
53296,real,Coach Alain Vigneault knows the Rangers may tr...,coach alain vigneault knows rangers may troop ...
60576,real,"Senator Rand Paul, with a fiercely libertarian...",senator rand paul fiercely libertarian conserv...
9895,fake,Wine Cocktail Mixology Class\nFind out how to ...,wine cocktail mixology class find make perfect...
56049,real,"The Yankees' trade for Castro, a three-time Al...",yankees trade castro threetime allstar affecte...
...,...,...,...
31292,fake,Yale Medical School has revoked the admissions...,yale medical school revoked admissions privile...
27587,fake,Inspired by the phenomenon of “hooked” — the d...,inspired phenomenon “hooked” — desire belong s...
29322,fake,"On Monday, former Trump administration advisor...",monday former trump administration advisor hal...
54678,real,Yet the real strength of the Thakoon collectio...,yet real strength thakoon collection rendering...


In [23]:
short_articles[['label', 'confidence']] = short_articles['Cleaned_Article'].apply(classify_party)

RuntimeError: The expanded size of the tensor (532) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 532].  Tensor sizes: [1, 514]

In [24]:

tokenizer = AutoTokenizer.from_pretrained("launch/POLITICS")
model = AutoModelForMaskedLM.from_pretrained("launch/POLITICS")
encoded = short_articles['Cleaned_Article'].apply(tokenizer, return_tensors="pt")

Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


In [26]:
short_articles.shape
for article in short_articles["Cleaned_Article"]:
    if (len(article.split()) >= 512):
        print (len(article.split()))