# 207 final project
### Peter Benzoni

## Data loading, filtering, and cleaning 

In [1]:
import pandas as pd

# Load the three raw article dumps.
df1 = pd.read_csv('articles1.csv')
df2 = pd.read_csv('articles2.csv')
df3 = pd.read_csv('articles3.csv')

# Publications that are left out of the analysis.
excluded_publications = ['Breitbart', 'New York Post', 'Fox News', 'Talking Points Memo']

# Combine the dataframes, drop duplicate articles based on 'id' (keeping the
# first occurrence), and filter out the excluded publications.
# The explicit .copy() makes `df` an independent frame, so the column
# assignment further down does not operate on a slice of the concatenated
# frame (which would raise SettingWithCopyWarning).
df = (
    pd.concat([df1, df2, df3], ignore_index=True)
    .drop_duplicates(subset='id', keep='first')
    .loc[lambda frame: ~frame['publication'].isin(excluded_publications)]
    .copy()
)

# Handle missing values: fill only the non-numeric columns with a placeholder.
# Filling the whole frame would write the string 'Unknown' into numeric
# columns such as 'year' and 'month' and turn them into mixed-type object
# columns; numeric gaps are therefore left as NaN.
text_columns = df.select_dtypes(exclude='number').columns
df = df.fillna({column: 'Unknown' for column in text_columns})

# Build the single text field used for NLP analysis: "<title>: <content>".
df['combined_content'] = df['title'] + ': ' + df['content']

# Now, df['combined_content'] can be used for NLP analysis
print(df.head())

   Unnamed: 0     id                                              title  \
0           0  17283  House Republicans Fret About Winning Their Hea...   
1           1  17284  Rift Between Officers and Residents as Killing...   
2           2  17285  Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...   
3           3  17286  Among Deaths in 2016, a Heavy Toll in Pop Musi...   
4           4  17287  Kim Jong-un Says North Korea Is Preparing to T...   

      publication                         author        date    year month  \
0  New York Times                     Carl Hulse  2016-12-31  2016.0  12.0   
1  New York Times  Benjamin Mueller and Al Baker  2017-06-19  2017.0   6.0   
2  New York Times                   Margalit Fox  2017-01-06  2017.0   1.0   
3  New York Times               William McDonald  2017-04-10  2017.0   4.0   
4  New York Times                  Choe Sang-Hun  2017-01-02  2017.0   1.0   

                                             content  \
0  WASHINGTON  —   Congr

In [2]:
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel
from sklearn.feature_extraction.text import CountVectorizer

# Tokenization, stop-word removal, and bag-of-words transformation.
# X is a sparse (n_documents x n_terms) count matrix; column j corresponds to tokens[j].
vectorizer = CountVectorizer(stop_words='english', max_features=10000)
X = vectorizer.fit_transform(df['combined_content'])
tokens = vectorizer.get_feature_names_out()

# Convert to Gensim's format.
# Each ROW of X is one article, hence documents_columns=False. This streams the
# real per-article word counts into LDA; building a bag-of-words from the
# vocabulary list itself would give a single pseudo-document in which every
# word occurs once, and the model would learn nothing about the articles.
corpus = Sparse2Corpus(X, documents_columns=False)

# Term ids in the corpus are the vectorizer's column indices, so the id -> word
# mapping is simply the enumeration of the feature names.
id2word = dict(enumerate(tokens))

# LDA topic modeling. A fixed random_state keeps the topics reproducible
# across kernel restarts.
lda_model = LdaModel(corpus, num_topics=10, id2word=id2word, random_state=42)

# Print the topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

ModuleNotFoundError: No module named 'sklearn'