Seongmin Hwang 20102127

Marion Schmitt 25170158

Seungwon Jeon 16102288

**Data Science Practice : Project (FIXED VERSION)**

# Analyzing the Relationship between News Bias and Audience Influence

**Objective: Predict the influence of a media outlet based on its bias**

In [1]:
import pandas as pd
import numpy as np
import json
import os

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud

# Korean Font Setting
plt.rcParams['font.family'] = 'Malgun Gothic'  # Windows
# plt.rcParams['font.family'] = 'AppleGothic'  # Mac
plt.rcParams['axes.unicode_minus'] = False

### Bias Lexique

In [2]:
# AI HUB (json) file
folder = "data_aihub" # fichier en .7z
files_to_load = [f"talksets-train-{i}.json" for i in range(1, 6)]

aihub_data = []
for file_name in files_to_load:
    path = os.path.join(folder, file_name)
    with open(path, 'r', encoding='utf-8') as f :
        aihub_data.extend(json.load(f))

print(f"Total AI Hub data loaded: {len(aihub_data):,}")

FileNotFoundError: [Errno 2] No such file or directory: 'data_aihub/talksets-train-1.json'

In [None]:
# Extract immoral sentences
immoral_sentences = [
    s["text"]
    for entry in aihub_data
    for s in entry.get("sentences", [])
    if s.get("is_immoral") is True and "text" in s
]

print(f"Total immoral sentences: {len(immoral_sentences):,}")
print("\nSample immoral sentences:")
for i, sent in enumerate(immoral_sentences[:5]):
    print(f"{i+1}. {sent}")

### Immoral lexic : N-gram

In [None]:
# N-gram : unethical lexicon
vectorizer = CountVectorizer(
    ngram_range=(2, 3),
    min_df=5,
)

X_immoral = vectorizer.fit_transform(immoral_sentences)
unethical_terms = vectorizer.get_feature_names_out()

print(f"Total unethical N-grams extracted: {len(unethical_terms):,}")
print("\nSample unethical N-grams:")
for i, term in enumerate(unethical_terms[:20]):
    print(f"{i+1}. {term}")

### Bias Score

In [None]:
df_bigkinds = pd.read_excel('NewsResult_20241127-20251127.xlsx', sheet_name='sheet')
print("Columns in BigKinds data:")
print(df_bigkinds.columns.tolist())
print(f"\nTotal news articles: {len(df_bigkinds):,}")

# Columns definition
MEDIA_COL = "언론사"
TEXT_COL = "본문"  # Use 본문 instead of 키워드

df_bigkinds.dropna(subset=[MEDIA_COL, TEXT_COL], inplace=True)
print(f"After dropping NaN: {len(df_bigkinds):,}")

# Extract first 1000 characters
df_bigkinds["excerpt"] = df_bigkinds[TEXT_COL].astype(str).str[:1000]

# Show sample
print("\nSample excerpt:")
print(df_bigkinds["excerpt"].iloc[0][:200])

### Count immoral N-gram

In [None]:
# Count N-grams
vectorizer_news = CountVectorizer(
    vocabulary=unethical_terms,
    ngram_range=(2, 3)
)

X_news = vectorizer_news.transform(df_bigkinds["excerpt"])
unethical_counts = np.asarray(X_news.sum(axis=1)).ravel()

# Compute bias score
df_bigkinds["unethical_count"] = unethical_counts
df_bigkinds["word_count"] = df_bigkinds["excerpt"].str.split().str.len()
df_bigkinds["Bias_Score"] = (df_bigkinds["unethical_count"] / (df_bigkinds["word_count"] + 1)) * 100

# Data validation
print("\n=== Data Validation ===")
print("\nBias Score statistics:")
print(df_bigkinds["Bias_Score"].describe())
print("\nWord count statistics:")
print(df_bigkinds["word_count"].describe())
print("\nUnethical count distribution:")
print(df_bigkinds["unethical_count"].value_counts().head(10))

### Exploratory Data Analysis

In [None]:
# Distribution of Bias Score
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram
axes[0].hist(df_bigkinds["Bias_Score"], bins=50, edgecolor='black')
axes[0].set_xlabel('Bias Score (%)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Bias Score')
axes[0].grid(True, alpha=0.3)

# Box plot by media
top_media = df_bigkinds[MEDIA_COL].value_counts().head(10).index
df_top = df_bigkinds[df_bigkinds[MEDIA_COL].isin(top_media)]
df_top.boxplot(column='Bias_Score', by=MEDIA_COL, ax=axes[1], rot=45)
axes[1].set_title('Bias Score by Top 10 Media Outlets')
axes[1].set_xlabel('Media')
axes[1].set_ylabel('Bias Score (%)')
plt.suptitle('')

plt.tight_layout()
plt.show()

# Summary by media
print("\n=== Bias Score Summary by Media ===")
media_summary = df_bigkinds.groupby(MEDIA_COL).agg({
    'Bias_Score': ['mean', 'median', 'std', 'count']
}).round(3)
media_summary.columns = ['Mean', 'Median', 'Std', 'Article Count']
print(media_summary.sort_values('Mean', ascending=False).head(15))

### Influence Score

In [None]:
df_traffic = pd.read_excel('datalab_all.xlsx', sheet_name='Sheet1')
print("Columns in traffic data:")
print(df_traffic.columns.tolist())

# Columns definition
ID_COL = '날짜'
MEDIA_COLUMNS = [col for col in df_traffic.columns if col != ID_COL]

# Processing
results_dict = {}
for media in MEDIA_COLUMNS:
    values = pd.to_numeric(df_traffic[media], errors="coerce").dropna()
    results_dict[media] = values.mean()

# Create influence dataframe
influence_df = pd.DataFrame(results_dict.items(), columns=["Media_Name", "AVG_TREND_INDEX"])
influence_df["Influence_Score"] = np.log(influence_df["AVG_TREND_INDEX"] + 1e-6)

# CRITICAL: Standardize media names (한자 → 한글)
# BigKinds uses 한글, Naver Datalab uses 한자
name_mapping = {
    '朝鮮日報': '조선일보',  # ← 조선일보 mapping
    '東亞日報': '동아일보',
    '中央日報': '중앙일보',
    '京鄉新聞': '경향신문',
    '國民日報': '국민일보',
    '文化日報': '문화일보',
    '世界日報': '세계일보',
    '韓國日報': '한국일보',
    '韓겨레': '한겨레',
    '서울新聞': '서울신문',
    '內일新聞': '내일신문',
}

print("\n=== Name Mapping (한자 → 한글) ===")
for hanja, hangul in name_mapping.items():
    if hanja in influence_df['Media_Name'].values:
        print(f"  {hanja} → {hangul}")

influence_df['Media_Name'] = influence_df['Media_Name'].replace(name_mapping)

print("\n=== Influence Score by Media (after name standardization) ===")
print(influence_df.sort_values('Influence_Score', ascending=False))

### Fusion Bias + Influence

In [None]:
# Aggregate Bias Score by Media
bias_by_media = df_bigkinds.groupby(MEDIA_COL).agg({
    'Bias_Score': 'mean',
    'unethical_count': 'sum',
    'word_count': 'sum'
}).reset_index()

print(f"media which extracted from BigKinds: {len(bias_by_media)}개")
print("\nMedia List:")
for media in sorted(bias_by_media[MEDIA_COL]):
    print(f"  - {media}")

In [None]:
# Check matching before merge
print("\n=== MATCHING ANALYSIS ===")
print(f"BigKinds media: {len(bias_by_media)}개")
print(f"Influence media: {len(influence_df)}개")

bigkinds_set = set(bias_by_media[MEDIA_COL])
influence_set = set(influence_df['Media_Name'])

matched = bigkinds_set & influence_set
only_bigkinds = bigkinds_set - influence_set
only_influence = influence_set - bigkinds_set

print(f"\nthe number of media that matched: {len(matched)}")
for media in sorted(matched):
    print(f"  - {media}")

if only_bigkinds:
    print(f"\nonly exists in BigKinds (Influence data X): {len(only_bigkinds)}")
    for media in sorted(only_bigkinds):
        print(f"  - {media}")

if only_influence:
    print(f"\nonly exists in Influence (BigKinds article X): {len(only_influence)}개")
    for media in sorted(only_influence):
        print(f"  - {media}")

In [None]:
# Merge data
final_df = bias_by_media.merge(
    influence_df,
    left_on=MEDIA_COL,
    right_on="Media_Name",
    how="inner"
)

print(f"\n✓ final dataset: {len(final_df)} media")
print("\n=== Final Dataset ===")
print(final_df[[MEDIA_COL, "Bias_Score", "Influence_Score"]].sort_values('Bias_Score', ascending=False))

### Correlation Analysis

In [None]:
# Calculate correlation
correlation = final_df['Bias_Score'].corr(final_df['Influence_Score'])
print(f"\n=== Correlation between Bias and Influence ===")
print(f"Pearson correlation coefficient: {correlation:.4f}")

# Scatter plot with regression line
plt.figure(figsize=(10, 6))
plt.scatter(final_df['Bias_Score'], final_df['Influence_Score'],
            s=100, alpha=0.6, edgecolors='black')

# Add media names
for idx, row in final_df.iterrows():
    plt.annotate(row[MEDIA_COL],
                (row['Bias_Score'], row['Influence_Score']),
                fontsize=9, alpha=0.7)

# Add regression line
z = np.polyfit(final_df['Bias_Score'], final_df['Influence_Score'], 1)
p = np.poly1d(z)
plt.plot(final_df['Bias_Score'], p(final_df['Bias_Score']),
         "r--", alpha=0.8, linewidth=2, label=f'Fit line (r={correlation:.3f})')

plt.xlabel('Bias Score (%)', fontsize=12)
plt.ylabel('Influence Score (log scale)', fontsize=12)
plt.title('Relationship between Bias and Influence', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

### Clustering (K-Means)

In [None]:
# Elbow method
X = final_df[["Bias_Score", "Influence_Score"]].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

inertias = []
K_range = range(2, min(8, len(final_df)))

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)

plt.figure(figsize=(8, 5))
plt.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
plt.xlabel('Number of Clusters (K)', fontsize=12)
plt.ylabel('Inertia', fontsize=12)
plt.title('Elbow Method for Optimal K', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nInertia values:")
for k, inertia in zip(K_range, inertias):
    print(f"K={k}: {inertia:.2f}")

In [None]:
# Apply K-Means
optimal_k = 3

kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
final_df["cluster"] = kmeans.fit_predict(X_scaled)

print(f"\n=== Cluster Summary (K={optimal_k}) ===")
for cluster_id in range(optimal_k):
    cluster_data = final_df[final_df['cluster'] == cluster_id]
    print(f"\nCluster {cluster_id}:")
    print(f"  How many are in this cluster: {len(cluster_data)}")
    print(f"  Avg Bias Score: {cluster_data['Bias_Score'].mean():.2f}%")
    print(f"  Avg Influence Score: {cluster_data['Influence_Score'].mean():.2f}")
    print(f"  Media: {', '.join(cluster_data[MEDIA_COL].tolist())}")

print("\n=== Full Results ===")
print(final_df[[MEDIA_COL, "Bias_Score", "Influence_Score", "cluster"]].sort_values('cluster'))

### Visualization

In [None]:
# Enhanced scatter plot
plt.figure(figsize=(12, 8))

colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']
cluster_colors = [colors[i % len(colors)] for i in final_df['cluster']]

scatter = plt.scatter(
    final_df["Bias_Score"],
    final_df["Influence_Score"],
    c=cluster_colors,
    s=200,
    alpha=0.6,
    edgecolors='black',
    linewidths=1.5
)

# Add labels
for idx, row in final_df.iterrows():
    plt.annotate(
        row[MEDIA_COL],
        (row['Bias_Score'], row['Influence_Score']),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=10,
        alpha=0.8,
        bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7)
    )

# Add cluster centers
centers = scaler.inverse_transform(kmeans.cluster_centers_)
plt.scatter(
    centers[:, 0],
    centers[:, 1],
    marker='X',
    s=500,
    c='red',
    edgecolors='black',
    linewidths=2,
    label='Cluster Centers',
    zorder=10
)

plt.xlabel('Bias Score (%)', fontsize=14, fontweight='bold')
plt.ylabel('Influence Score (log scale)', fontsize=14, fontweight='bold')
plt.title('News Bias vs Audience Influence (with K-Means Clustering)',
          fontsize=16, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3, linestyle='--')
plt.tight_layout()
plt.show()

### Key Insights & Conclusions

In [None]:
print("="*70)
print("KEY FINDINGS")
print("="*70)

print("\n1. OVERALL CORRELATION")
print(f"   - Bias-Influence correlation: {correlation:.4f}")
if abs(correlation) < 0.3:
    print("   → Weak correlation")
elif abs(correlation) < 0.7:
    print("   → Moderate correlation")
else:
    print("   → Strong correlation")

print("\n2. BIAS DISTRIBUTION")
print(f"   - Mean: {final_df['Bias_Score'].mean():.2f}%")
print(f"   - Median: {final_df['Bias_Score'].median():.2f}%")
print(f"   - Range: {final_df['Bias_Score'].min():.2f}% - {final_df['Bias_Score'].max():.2f}%")

print("\n3. INFLUENCE DISTRIBUTION")
print(f"   - Mean: {final_df['Influence_Score'].mean():.2f}")
print(f"   - Median: {final_df['Influence_Score'].median():.2f}")

print("\n4. EXTREME CASES")
print(f"   Highest Bias: {final_df.loc[final_df['Bias_Score'].idxmax(), MEDIA_COL]} ({final_df['Bias_Score'].max():.2f}%)")
print(f"   Lowest Bias: {final_df.loc[final_df['Bias_Score'].idxmin(), MEDIA_COL]} ({final_df['Bias_Score'].min():.2f}%)")
print(f"   Highest Influence: {final_df.loc[final_df['Influence_Score'].idxmax(), MEDIA_COL]} ({final_df['Influence_Score'].max():.2f})")
print(f"   Lowest Influence: {final_df.loc[final_df['Influence_Score'].idxmin(), MEDIA_COL]} ({final_df['Influence_Score'].min():.2f})")

print("\n" + "="*70)

In [None]:
# Export results
final_df.to_csv('final_results_with_chosun.csv', index=False, encoding='utf-8-sig')
print("✓ Results exported to 'final_results.csv'")
print(f"✓ Total media outlets: {len(final_df)}")

if '조선일보' in final_df[MEDIA_COL].values:
    chosun_data = final_df[final_df[MEDIA_COL] == '조선일보'].iloc[0]
    print(f"   - Bias Score: {chosun_data['Bias_Score']:.4f}%")
    print(f"   - Influence Score: {chosun_data['Influence_Score']:.4f}")
    print(f"   - Cluster: {int(chosun_data['cluster'])}")