
# Google Daily News - Exploratory Data Analysis (EDA)

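This notebook explores the `Google_Daily_News.csv` dataset: how many articles are published per day, how articles are distributed across news categories, and which words appear most often in headlines (as a proxy for trending topics).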


In [None]:

from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

# Load the dataset in a single pipeline:
#   1. parse `date` into datetimes (unparseable values are coerced to NaT),
#   2. drop rows that lack a title or a valid date,
#   3. order the remaining rows chronologically.
df = (
    pd.read_csv("Google_Daily_News.csv")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
    .dropna(subset=['title', 'date'])
    .sort_values('date')
)
df.head()


In [None]:

# Daily article volume: count rows per calendar day, order the days
# chronologically, and draw the counts as a line chart.
daily_counts = df['date'].dt.date.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 4))
daily_counts.plot(ax=ax)
ax.set_title("Jumlah Berita per Hari")
ax.set_xlabel("Tanggal")
ax.set_ylabel("Jumlah Berita")
fig.tight_layout()
plt.show()


In [None]:

# News category distribution: bar chart of the ten most frequent categories.
top_categories = df['category'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 5))
# rot=45 tilts the category labels so long names do not overlap.
top_categories.plot(kind='bar', ax=ax, rot=45)
ax.set_title("Distribusi Kategori Berita")
ax.set_xlabel("Kategori")
ax.set_ylabel("Jumlah")
fig.tight_layout()
plt.show()


In [None]:

# Word cloud and top keywords extracted from the news titles.

# Seed for the word cloud layout. WordCloud places words randomly, so without
# a fixed seed the figure differs on every Restart & Run All.
WORDCLOUD_SEED = 42

# Lowercase titles for processing. astype(str) guards against non-string
# values, which .str.lower() would turn into NaN and which would then make
# the ' '.join(...) below raise a TypeError.
titles = df['title'].dropna().astype(str).str.lower()

# Generate the word cloud from all titles concatenated into one string.
text = ' '.join(titles)
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    random_state=WORDCLOUD_SEED,
).generate(text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Word Cloud - Topik Populer dalam Judul Berita")
plt.show()

# Find the 10 most common words across titles, ignoring English stop words.
vectorizer = CountVectorizer(stop_words='english')
word_matrix = vectorizer.fit_transform(titles)
# Column sums give one total per vocabulary term; .A1 flattens the resulting
# 1 x n_terms matrix into a 1-D array aligned with `vocab`.
word_counts = word_matrix.sum(axis=0).A1
vocab = vectorizer.get_feature_names_out()
word_freq = dict(zip(vocab, word_counts))
top_keywords = Counter(word_freq).most_common(10)

# Horizontal bar chart: one bar per keyword, length = number of occurrences.
keywords, frequencies = zip(*top_keywords)
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=list(frequencies), y=list(keywords), ax=ax)
ax.set_title("10 Kata Paling Umum dalam Judul Berita")
ax.set_xlabel("Jumlah")
ax.set_ylabel("Kata")
fig.tight_layout()
plt.show()
