# 207 Final Project - Peter Benzoni

## Data Setup
 
### Load the data, creating the appropriate files if they don't already exist

In [24]:
import pandas as pd

# Flag read by the following cells: it stays False when the combined dataset
# has to be rebuilt from the original files, and becomes True when the cached
# combined CSV could be loaded directly.
BALANCED_DATASET = False

try:
    # Fast path: reuse the combined dataset if a previous run already saved it.
    df = pd.read_csv('combined-news-articles.csv')
except FileNotFoundError:
    # Slow path: no cached file yet, so start from the original datasets.

    # Translates the raw `site` values of the Chinese dataset into
    # publication names (two different site strings both map to Xinhua).
    site_map = {
        "http://en.people.cn/": "People's Daily",
        "http://global.chinadaily.com.cn/": "China Daily",
        "http://www.globaltimes.cn/": "Global Times",
        "http://www.xinhuanet.com/english/": "Xinhua",
        "xinhuanet.com (english)": "Xinhua",
        "https://www.cgtn.com/": "CGTN",
    }

    df = pd.read_csv('all-the-news-2-1.csv')
    df_china = pd.read_csv('China_Articles.csv')
else:
    BALANCED_DATASET = True


### Filter the Data to desired dates and publications

In [25]:
if not BALANCED_DATASET:
    # Keep only news-first publications; tabloid, significantly slanted,
    # analysis/long-form and opinion publications are deliberately left out.
    included_publications = ["Axios", "Business Insider", "CNBC", "CNN", "Fox News", "Reuters", "The Hill", "The New York Times", "Washington Post",]
    df = df[df['publication'].isin(included_publications)]

    # Restrict to July 2019 through July 2020 to match df_china.
    # The trailing .copy() makes df an independent frame, so adding columns
    # to it later does not raise SettingWithCopyWarning.
    df = df[((df['year'] == 2019) & (df['month'] >= 7)) | ((df['year'] == 2020) & (df['month'] <= 7))].copy()

    # Drop the excluded sites from df_china and translate the remaining
    # `site` values into publication names. .assign returns a new frame
    # instead of writing into the filtered slice.
    # Note: a site that is missing from site_map gets NaN as its publication.
    excluded_publications_china = ['china-un.org (english)']
    df_china = (
        df_china[~df_china['site'].isin(excluded_publications_china)]
        .assign(publication=lambda frame: frame['site'].map(site_map))
    )


### Normalize formatting and columns

In [36]:
if not BALANCED_DATASET:
    # Parse the publication timestamp once; it feeds the date, year and month
    # columns that mirror the ones df already has.
    published_at = pd.to_datetime(df_china['published'], format='mixed')

    # Each step below returns a new frame (.assign / .dropna) rather than
    # writing into a filtered slice, which avoids SettingWithCopyWarning.
    df_china = (
        df_china
        .assign(
            date=published_at,
            year=published_at.dt.year,
            month=published_at.dt.month,
        )
        # Remove empty articles: both the title and the excerpt must be present.
        .dropna(subset=['title', 'excerpt'])
        # Text used for NLP analysis: "<title>: <excerpt>".
        .assign(combined_content=lambda frame: frame['title'].astype(str) + ': ' + frame['excerpt'].astype(str))
    )

    df = (
        df
        # Remove empty articles: both the title and the article body must be present.
        .dropna(subset=['title', 'article'])
        # Only the first 1000 characters of the article are used, matching
        # what is available in the other dataset.
        .assign(combined_content=lambda frame: frame['title'].astype(str) + ': ' + frame['article'].str[:1000].astype(str))
    )


### Balance datasets
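For the "All the News" dataset: the smallest publication overall is Buzzfeed News with 3,657 articles, but it is not among the included publications; of those, the smallest is Axios with 7,374 articles. To ensure that Axios is represented each month, set N to around 610 (since 7374/12 ≈ 614).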

For the Chinese state media dataset: The smallest publication is Global Times with 5,242 articles. To ensure representation each month, set N to around 430 (since 5242/12 = 437).

That way, the data should be balanced by publication and date. We could balance by actor as well (China vs. non-China), but it's useful to have a broader context that the Chinese media fits into.

In [37]:
if not BALANCED_DATASET:
    # Print the per-publication article counts to justify balancing the datasets.
    print(df['publication'].value_counts())
    print(df_china['publication'].value_counts())

    SAMPLE_SEED = 42            # fixed seed so the balanced sample is reproducible
    MAX_PER_MONTH = 610         # monthly cap per publication for df
    MAX_PER_MONTH_CHINA = 430   # monthly cap per publication for df_china
    GROUP_KEYS = ['year', 'month', 'publication']

    def sample_per_group(frame, cap):
        """Return at most `cap` randomly chosen rows per (year, month, publication).

        Groups smaller than `cap` are kept whole. The result is ordered by
        year, month and publication and has a fresh 0..n-1 index.
        """
        # Rows with a missing grouping key cannot belong to any group.
        keyed = frame.dropna(subset=GROUP_KEYS)
        # Shuffle once, then keep the first `cap` rows of every group: this is a
        # random sample without replacement that tolerates small groups, and it
        # replaces the per-month loop that concatenated frames one at a time.
        shuffled = keyed.sample(frac=1, random_state=SAMPLE_SEED)
        capped = shuffled.groupby(GROUP_KEYS, sort=False).head(cap)
        return capped.sort_values(GROUP_KEYS, kind='stable').reset_index(drop=True)

    balanced_df = sample_per_group(df, MAX_PER_MONTH)

    # Only keep Chinese articles from (year, month) pairs that actually occur
    # in df. Pairing every year with every month would also admit months that
    # lie outside the date window df was filtered to.
    observed_months = set(zip(df['year'], df['month']))
    in_window = pd.Series(
        [pair in observed_months for pair in zip(df_china['year'], df_china['month'])],
        index=df_china.index,
        dtype=bool,
    )
    balanced_df_china = sample_per_group(df_china[in_window], MAX_PER_MONTH_CHINA)



publication
Reuters               202042
CNBC                   71754
Business Insider       48583
The New York Times     42655
Washington Post        40483
The Hill               37940
CNN                    29921
Axios                   7374
Name: count, dtype: int64
publication
Xinhua            56695
CGTN              17820
People's Daily    10701
China Daily        9417
Global Times       5242
Name: count, dtype: int64


### Combine the Balanced datasets

In [38]:
if not BALANCED_DATASET:

    # Combine the dataframes, keeping only the intersection of columns.
    # ignore_index=True gives the result a fresh 0..n-1 index; without it the
    # row labels of the two inputs would repeat, and the frame built here
    # would differ from the one reloaded from the CSV on later runs.
    df = pd.concat([balanced_df, balanced_df_china], axis=0, join='inner', ignore_index=True)
    print(df['publication'].value_counts())

    # Now, df['combined_content'] can be used for NLP analysis
    print(df.head())

    # Save the dataframe to a csv file for future use
    df.to_csv('combined-news-articles.csv', index=False)

publication
The New York Times    5546
Business Insider      5497
CNBC                  5490
CNN                   5490
Reuters               5490
The Hill              5490
Axios                 3661
Washington Post       3290
Xinhua                3061
CGTN                  2929
People's Daily        2770
China Daily           2753
Global Times          2729
Name: count, dtype: int64
                  date  year  month            author  \
0  2019-07-23 00:00:00  2019    7.0  Kia Kokalitcheva   
1  2019-07-11 00:00:00  2019    7.0      Erica Pandey   
2  2019-07-15 00:00:00  2019    7.0       Dan Primack   
3  2019-07-23 00:00:00  2019    7.0     Alayna Treene   
4  2019-07-11 00:00:00  2019    7.0         Ben Geman   

                                               title publication  \
0              DoorDash's "tips" model is under fire       Axios   
1  Amazon's competitors will see their revenue ju...       Axios   
2     AB InBev cancels $9 billion-plus Hong Kong IPO       Axios

## Topic Modeling

Start by importing the necessary libraries and downloading the NLTK resources they rely on

In [47]:
import pprint
import re
import string

import nltk
import spacy
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import Word2Vec, Phrases
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK resources used by the preprocessing step: the stop-word list
# and the WordNet data for the lemmatizer ...
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
# ... and the punkt tokenizer models; kept as the last expression so the cell
# still displays the download's return value.
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\benzo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\benzo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\benzo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

### NLP Preprocessing

In [50]:
# Tokenization and Stop-word removal
def preprocess(documents, lemmatize=False, stem=False):
    """Turn raw article texts into cleaned token lists and phrase-merged token lists.

    Each document is lowercased, tokenized with ``nltk.word_tokenize``, stripped
    of English stop words, stripped of punctuation and digit characters, and
    reduced to tokens longer than two characters. A gensim ``Phrases`` model is
    then fitted on the cleaned tokens and applied to every document.

    Lemmatization and stemming are optional. With the default flags the
    returned values are the same as before these flags existed, when the
    lemmatized and stemmed tokens were computed but never used. No
    frequency-based pruning happens here: the LDA cells prune the vocabulary
    with ``Dictionary.filter_extremes``.

    Parameters
    ----------
    documents : iterable of str
        Raw texts, one per document.
    lemmatize : bool, default False
        If True, run ``WordNetLemmatizer`` over the cleaned tokens.
    stem : bool, default False
        If True, run the English ``SnowballStemmer`` over the cleaned tokens
        (after lemmatization when both flags are set).

    Returns
    -------
    documents_tokens : list of list of str
        Cleaned tokens per document.
    documents_bigrams : list
        Each document's tokens after applying the fitted ``Phrases`` model.
    """
    # Build the per-call helpers once instead of once per document.
    stop_words = set(stopwords.words('english'))
    strip_chars = re.compile('[%s]' % re.escape(string.punctuation + string.digits))
    lemmatizer = WordNetLemmatizer() if lemmatize else None
    stemmer = SnowballStemmer("english") if stem else None

    documents_tokens = []
    for doc in documents:
        # Lowercasing + tokenization
        tokens = nltk.word_tokenize(doc.lower())

        # Stop-word removal
        tokens = [word for word in tokens if word not in stop_words]

        # Remove punctuation and digit characters from inside each token
        tokens = [strip_chars.sub('', word) for word in tokens]

        # Drop short words; this also discards tokens emptied by the step above
        tokens = [word for word in tokens if len(word) > 2]

        if lemmatizer is not None:
            tokens = [lemmatizer.lemmatize(word) for word in tokens]
        if stemmer is not None:
            tokens = [stemmer.stem(word) for word in tokens]

        documents_tokens.append(tokens)

    # Bi-grams: learn frequent word pairs from the cleaned tokens and merge them
    bigram = Phrases(documents_tokens, min_count=5, threshold=50)
    documents_bigrams = [bigram[doc] for doc in documents_tokens]

    # Word Embeddings - save this for later?
    # model = Word2Vec(sentences=documents_tokens, vector_size=100, window=5, min_count=1, workers=4)
    # model.save("word2vec.model")

    return documents_tokens, documents_bigrams

# Run the preprocessing over every article's combined title + text; the two
# results are the inputs of the single-word and bigram LDA models below.
documents_tokens, documents_bigrams = preprocess(df['combined_content'])

: 

## Building models

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, random_state=42):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Parameters
    ----------
    dictionary
        gensim dictionary, passed to ``LdaModel`` as ``id2word`` and to
        ``CoherenceModel`` as ``dictionary``.
    corpus
        Bag-of-words corpus the models are trained on.
    texts
        Tokenized documents, passed to ``CoherenceModel`` as ``texts``.
    limit : int
        Exclusive upper bound of the topic counts to try.
    start : int, default 2
        First topic count to try.
    step : int, default 3
        Increment between consecutive topic counts.
    random_state : default 42
        Seed handed to every ``LdaModel``, matching the seed used for the
        other LDA models in this notebook, so that scores are comparable
        across runs.

    Returns
    -------
    model_list : list
        The trained models, one per topic count in ``range(start, limit, step)``.
    coherence_values : list
        The coherence score of each model, in the same order.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, random_state=random_state)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
# Before we pick how many topics we want to model, we can use the coherence score to evaluate the model
# Comment this out after the first run, as it takes a while to run
# model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=df['tokens'], start=5, limit=50, step=5)
# print(coherence_values)


### LDA - Single word model

In [None]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(documents_tokens)

# Filter out words that occur in less than 20 documents, or more than 70% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.7)

# Convert the dictionary into a bag-of-words corpus.
corpus = [dictionary.doc2bow(doc) for doc in documents_tokens]

# Train the LDA model (fixed seed so the topics are reproducible)
num_topics = 25
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)

# Print the topics. print_topics shows only 20 topics unless told otherwise,
# so request all of them explicitly.
pprint.pprint(lda.print_topics(num_topics=num_topics, num_words=15))

### LDA - Bigrams

In [None]:
# Create a dictionary representation of the documents with bigrams.
dictionary_bigrams = Dictionary(documents_bigrams)

# Filter out words that occur in less than 20 documents, or more than 70% of the documents.
dictionary_bigrams.filter_extremes(no_below=20, no_above=0.7)

# Convert the dictionary into a bag-of-words corpus.
corpus_bigrams = [dictionary_bigrams.doc2bow(doc) for doc in documents_bigrams]

# Train the LDA model on the bigram corpus (fixed seed so the topics are reproducible)
num_topics = 25
lda_bigrams = LdaModel(corpus=corpus_bigrams, id2word=dictionary_bigrams, num_topics=num_topics, random_state=42)

# Print the topics for the model trained on bigrams. print_topics shows only
# 20 topics unless told otherwise, so request all of them explicitly.
pprint.pprint(lda_bigrams.print_topics(num_topics=num_topics, num_words=15))