In [1]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
df = pd.read_csv('data/output.csv')

In [3]:
df['countries']

# Proprocessing

In [10]:
# download nltk data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.downloader.download('maxent_ne_chunker')
nltk.downloader.download('words')
nltk.downloader.download('treebank')
nltk.downloader.download('maxent_treebank_pos_tagger')


[nltk_data] Downloading package punkt to /Users/zmao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/zmao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zmao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zmao/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [95]:
import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
doc = nlp("Thyroid-associated orbitopathy (TO) is an autoimmune-mediated orbital inflammation that can lead to disfigurement and blindness. Multiple genetic loci have been associated with Graves' disease, but the genetic basis for TO is largely unknown. This study aimed to identify loci associated with TO in individuals with Graves' disease, using a genome-wide association scan (GWAS) for the first time to our knowledge in TO.Genome-wide association scan was performed on pooled DNA from an Australian Caucasian discovery cohort of 265 participants with Graves' disease and TO (cases) and 147 patients with Graves disease without TO (controls).")

# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.


[('Graves', 177, 183, 'ORG'), ('Graves', 316, 322, 'ORG'), ('first', 385, 390, 'ORDINAL'), ('Australian', 484, 494, 'NORP'), ('Caucasian', 495, 504, 'NORP'), ('265', 525, 528, 'CARDINAL'), ('Graves', 547, 553, 'PERSON'), ('147', 582, 585, 'CARDINAL')]


In [94]:
import en_core_web_sm
nlp = en_core_web_sm.load()

ModuleNotFoundError: No module named 'en_core_web_sm'

In [92]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [108]:
from krovetzstemmer import Stemmer
import pycountry
lemmatizer = WordNetLemmatizer()
stemmer = Stemmer() 

def is_adjective(word):
    pos = nltk.pos_tag([word])[0][1]
    return pos in ['JJ', 'JJR', 'JJS']

# text data cleaning
def clean_text_lem(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(word) for word in text if word not in stopwords.words('english')]
    lemmatized = ' '.join(lemmatized)

    return lemmatized

def clean_text_stem(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = word_tokenize(text)
    stemmed = [stemmer.stem(word) for word in text if word not in stopwords.words('english')]
    stemmed = ' '.join(stemmed)
    return stemmed

def clean_text_adj(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = word_tokenize(text)
    stemmed = [word for word in text if is_adjective(word) or word == 'terrorism' ]
    stemmed = ' '.join(stemmed)
    return stemmed

# Get a list of all country names
countries = [country.name.lower() for country in pycountry.countries]
def get_countries(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    doc = nlp(text)
    ents = [e.text for e in doc.ents if e.label_ in ['NORP', 'GPE']]

    return ents


df['cleaned_text_lem'] = df['full text'].apply(clean_text_lem)
df['cleaned_text_stem'] = df['full text'].apply(clean_text_stem)
df['cleaned_text_adj'] = df['full text'].apply(clean_text_adj)
df['countries'] = df['full text'].apply(get_countries)
df['countries'] = df['countries'].apply(lambda x: list(set([stemmer.stem(country) for country in x])))

In [136]:
df.to_csv('data/output.csv', index=False)

# Places

In [42]:
import ast

def get_places_stat(df, label):
    df['countries'] = df['countries'].apply(ast.literal_eval)
    places = df['countries'].tolist()
    # convert a list string to a list of strings
    
    places = [place for sublist in places for place in sublist]
    # replace 'the united states' 'america' 'us' with 'united states'
    places = [place if place not in ['the united states', 'america', 'us'] else 'united states' for place in places]
    places = [place if place not in ['soviet', 'the soviet union'] else 'soviet union' for place in places]
    places = [place if place not in ['lebanon', 'beirut'] else 'lebanon' for place in places]
    # remove lines with places in 'oklahoma city' and 'arab' as they are not countries
    places = [place for place in places if place not in ['oklahoma city', 'arab', 'muslim', 'islam', 'new york', 'washington']]
    places = pd.Series(places)
    places = places.value_counts().reset_index()
    places.columns = ['place', 'count']
    places['count_normalized'] = places['count'] / places['count'].sum()
    places['label'] = label
    places['place'] = places['place'].str.title()
    return places

In [43]:
date_range = [[1980, 1985], [1986, 1990], [1991, 1995], [1996, 2000], [2001, 2005], [2006, 2010], [2011, 2015], [2016, 2020]]

In [44]:
# get the places for each date range
for date in date_range:
    df_date = df.loc[(df['year'] >= date[0]) & (df['year'] <= date[1])]
    places = get_places_stat(df_date, f'{date[0]} - {date[1]}')
    places.to_csv(f'output/places/places_{date[0]}_{date[1]}.csv', index=False)
    
# combine the results
df_output_places = pd.concat([pd.read_csv(f'output/places/places_{date[0]}_{date[1]}.csv') for date in date_range])
df_output_places.to_csv('output/places/places_by_years.csv', index=False)

In [45]:
df_output_places

Unnamed: 0,place,count,count_normalized,label
0,United States,1437,0.143959,1980 - 1985
1,Lebanon,425,0.042577,1980 - 1985
2,Soviet Union,395,0.039571,1980 - 1985
3,Israel,335,0.033560,1980 - 1985
4,France,208,0.020838,1980 - 1985
...,...,...,...,...
2165,Brandenburg,1,0.000029,2016 - 2020
2166,Karim,1,0.000029,2016 - 2020
2167,Elizabeth City State University,1,0.000029,2016 - 2020
2168,Lgbt Americans,1,0.000029,2016 - 2020


In [46]:
import plotly.subplots as sp
import plotly.graph_objects as go
import plotly.io as py
import matplotlib.cm as cm
import matplotlib.pyplot as plt

df_output_places_filtered = df_output_places.groupby('place').filter(lambda x:  np.std(x.count_normalized)  >= 0.005)

cmap = plt.colormaps.get_cmap('Set1')

norm = cm.colors.Normalize(vmin=0, vmax=len(df_output_places_filtered['place'].unique())-1)
color_map = {place: 'rgba' + str(cmap(norm(i))) for i, place in enumerate(df_output_places_filtered['place'].unique())}

# Create subplot
fig = sp.make_subplots(rows=2, cols=1, shared_xaxes=True)

for place in df_output_places_filtered['place'].unique():
    df_filtered = df_output_places_filtered[df_output_places_filtered['place'] == place]

    # Add traces to subplot 1
    fig.add_trace(go.Scatter(x=df_filtered['label'], y=df_filtered['count_normalized'], mode='lines', name=place, line=dict(color=color_map[place])), row=1, col=1)

    # Add traces to subplot 2
    fig.add_trace(go.Scatter(x=df_filtered['label'], y=df_filtered['count_normalized'], mode='lines', name=place, showlegend=False, line=dict(color=color_map[place])), row=2, col=1)

# Update layout for subplot 1
fig.update_layout(
    title='Frequency change of countries across the years(',
    title_x=0.5,
    yaxis_title="Frequency",
    yaxis=dict(range=[0.12, 0.17]),  # Set y-axis range for subplot 1
    width=1200,
    height=400,
    template='plotly_white'
)

# Update layout for subplot 2
fig.update_layout(
    title='Frequency change of mentioned countries across the years',
    title_x=0.5,
    yaxis_title="Frequency",
    yaxis2=dict(range=[0, 0.05], title='Frequency'),  # Set y-axis range for subplot 2
    xaxis2=dict(title="Year Range"),  # Set x-axis title for subplot 2
    width=1200,
    height=800,
    template='plotly_white'
)

py.write_image(fig, 'output/places/places_by_years.png')

# Show the plots
fig.show()

# Colocates

In [47]:
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from gensim.utils import simple_preprocess

def get_colocates(df, label):
    data = df.values.tolist()
    
    data_words = [[word for word in simple_preprocess(str(doc))] for doc in data]
    tokens = [i for j in data_words for i in j]
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    
    word = 'terrorism'
    df_output_col = pd.DataFrame(columns=['rank', 'word_of_interest', 'colocate', 'score', 'score_normalized', 'label'])
    word_filter = lambda *w: word not in w
    
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(3)
    finder.apply_ngram_filter(word_filter)
    counter = 0 
    def get_word_in_tuple(tup):
        if tup[0] == 'terrorism':
            return tup[1]
        elif tup[1] == 'terrorism':
            return tup[0]
        else:
            return None
    for i in finder.score_ngrams(bigram_measures.likelihood_ratio):
        
        df_output_col.loc[len(df_output_col.index)] = [counter, word, get_word_in_tuple(i[0]), i[1], i[1] / finder.N, label] 
        counter += 1
    # remove colocates which have task, charge, case, joint, bill, commission, related, expert in it
    df_output_filtered = df_output_col[~df_output_col['colocate'].str.contains('task|charge|case|joint|bill|commission|related|expert')]
    # merge the colocates which have combat, anti, fight, counter in it into one word counter and sum up the normalized score
    df_output_filtered.loc[df_output_filtered['colocate'].str.contains('combat|anti|fight|counter'), 'colocate'] = 'counter'
    # merge the colocates which have sponsor, support in it into one word support and sum up the normalized score
    df_output_filtered.loc[df_output_filtered['colocate'].str.contains('sponsor|support'), 'colocate'] = 'support'
    # merge the colocates which have renunciation, renounce in it into one word renounce and sum up the normalized score
    df_output_filtered.loc[df_output_filtered['colocate'].str.contains('renunciation|renounce'), 'colocate'] = 'renounce'
    # merge the colocates which have bill, legislation in it into one word legislation and sum up the normalized score
    df_output_filtered.loc[df_output_filtered['colocate'].str.contains('bill|legislation'), 'colocate'] = 'legislation'
    # merge the colocates which have domestic, homegrown in it into one word domestic and sum up the normalized score
    df_output_filtered.loc[df_output_filtered['colocate'].str.contains('domestic|homegrown'), 'colocate'] = 'domestic'
    # merge the colocates which have international, global in it into one word international and sum up the normalized score
    df_output_filtered.loc[df_output_filtered['colocate'].str.contains('international|global'), 'colocate'] = 'international'
    # merge the colocates which have act, acts in it into one word act and sum up the normalized score
    df_output_filtered.loc[df_output_filtered['colocate'].str.contains('act|acts'), 'colocate'] = 'act'
    #  replace 
    df_output_col = df_output_filtered.groupby(['colocate', 'label']).agg({'score_normalized': 'sum', 'score': 'sum'}).reset_index()
    return df_output_col

In [48]:
# get the colocates for each date range
for date in date_range:
    df_date = df.loc[(df['year'] >= date[0]) & (df['year'] <= date[1])]
    res = get_colocates(df_date['cleaned_text_stem'], f'{date[0]} - {date[1]}')
    res.to_csv(f'output/colocates/colocates_{date[0]}_{date[1]}.csv', index=False)

# combine the results
df_output = pd.concat([pd.read_csv(f'output/colocates/colocates_{date[0]}_{date[1]}.csv') for date in date_range])
# df_output = df_output.groupby(['colocate', 'label']).agg({'score_normalized': 'sum', 'score': 'sum'}).reset_index()
df_output.to_csv('output/colocates/colocates_by_years_stem.csv', index=False)

In [49]:
df_output

Unnamed: 0,colocate,label,score_normalized,score
0,abet,1980 - 1985,0.000069,29.769743
1,abroad,1980 - 1985,0.000034,14.505753
2,according,1980 - 1985,0.000024,10.222210
3,accused,1980 - 1985,0.000040,17.172277
4,across,1980 - 1985,0.000009,3.806082
...,...,...,...,...
1598,yes,2016 - 2020,0.000001,2.698115
1599,yet,2016 - 2020,0.000003,4.926823
1600,yorker,2016 - 2020,0.000002,4.452952
1601,zealand,2016 - 2020,0.000003,6.432913


In [50]:
import numpy as np


df_output_filtered = df_output.groupby('colocate').filter(lambda x:  np.std(x.score_normalized)  >= 0.00015)

print(df_output_filtered[:10])

import plotly.express as px

fig = px.line(df_output_filtered, x='label', y='score_normalized', color='colocate', 
              title='Score change of colocates across the years', 
              labels={'label': 'Label', 'score_normalized': 'Score Normalized', 'colocate': 'Colocate'})

fig.update_layout(autosize=False, width=1200, height=800, template='plotly_white')
fig.write_image('output/colocates/colocates_by_years_stem.png')
fig.show()

          colocate        label  score_normalized        score
5              act  1980 - 1985      1.487192e-03   643.616352
102        counter  1980 - 1985      3.602817e-03  1559.201944
105       coverage  1980 - 1985      2.727997e-05    11.806034
129       domestic  1980 - 1985      1.226358e-04    53.073483
227  international  1980 - 1985      3.964224e-03  1715.609160
377       renounce  1980 - 1985      1.487903e-04    64.392431
459        support  1980 - 1985      1.468517e-03   635.534395
462        suspect  1980 - 1985      6.837498e-07     0.295908
499          urban  1980 - 1985      2.205828e-04    95.462260
512            war  1980 - 1985      3.346512e-05    14.482799


In [51]:
df5 = df.loc[df['publisher'] == 'the new york times'] 
res5 = get_colocates(df5['cleaned_text_stem'], 'nyt')
res5


Unnamed: 0,colocate,label,score_normalized,score
0,abandon,nyt,2.231927e-06,5.328839
1,abet,nyt,2.612988e-05,62.386427
2,abroad,nyt,1.950512e-05,46.569459
3,absence,nyt,6.555202e-07,1.565088
4,abu,nyt,2.172497e-07,0.518695
...,...,...,...,...
1796,yet,nyt,5.149965e-06,12.295803
1797,york,nyt,2.175273e-05,51.935744
1798,young,nyt,2.191058e-06,5.231263
1799,yugoslavia,nyt,1.493042e-06,3.564713


In [52]:
df6 = df.loc[df['publisher'] == 'pittsburgh post-gazette'] 
res6 = get_colocates(df6['cleaned_text_stem'], 'ppg')
res6

Unnamed: 0,colocate,label,score_normalized,score
0,according,ppg,0.000019,5.972492
1,act,ppg,0.001505,463.576735
2,administration,ppg,0.000007,2.161845
3,adviser,ppg,0.000011,3.367004
4,afghanistan,ppg,0.000003,0.977181
...,...,...,...,...
367,worry,ppg,0.000042,12.880542
368,would,ppg,0.000011,3.475151
369,years,ppg,0.000004,1.106956
370,yesterday,ppg,0.000005,1.499528


In [53]:

df_output_by_media = pd.concat([res5, res6])
df_output_by_media.to_csv('output/colocates/colocates_by_media_stem.csv', index=False)


In [54]:
# filter out collates that appears in only one label
df_output_by_media_filtered = df_output_by_media.groupby('colocate').filter(lambda x:  np.std(x.score_normalized)  > 0.00008)

import plotly.graph_objects as go

# Get unique colocates
colocates = df_output_by_media_filtered['colocate'].unique()

# Pivot the DataFrame
df_pivot = df_output_by_media_filtered.pivot(index='colocate', columns='label', values='score_normalized').reset_index()

# Create the plot
fig = go.Figure()

# Add NYT scores
fig.add_trace(go.Bar(
    x=df_pivot['colocate'],
    y=df_pivot['nyt'],
    name='NYT'
))

# Add PPG scores
fig.add_trace(go.Bar(
    x=df_pivot['colocate'],
    y=df_pivot['ppg'],
    name='PPG'
))

# Update layout for grouped bars
fig.update_layout(
    barmode='group',
    width=1200, height=800, 
    title='Normalized Scores for Colocates (NYT vs PPG)',
    xaxis_title='Colocate',
    yaxis_title='Normalized Score',
    template='plotly_white'
)

fig.write_image('output/colocates/colocates_by_media_stem.png')
fig.show()

## Adjectives

In [55]:
# get the colocates for each date range
for date in date_range:
    df_date = df.loc[(df['year'] >= date[0]) & (df['year'] <= date[1])]
    res = get_colocates(df_date['cleaned_text_adj'], f'{date[0]} - {date[1]}')
    res.to_csv(f'output/colocates/colocates_adj_{date[0]}_{date[1]}.csv', index=False)
    
# combine the results
df_output = pd.concat([pd.read_csv(f'output/colocates/colocates_adj_{date[0]}_{date[1]}.csv') for date in date_range])
df_output.to_csv('output/colocates/colocates_adj_by_years_stem.csv', index=False)

In [56]:
df_output_filtered = df_output.groupby('colocate').filter(lambda x:  np.std(x.score_normalized)  >= 0.00015)

# Create a new figure
fig = go.Figure()

# Add a trace for each unique 'colocate' in the DataFrame
for colocate in df_output_filtered['colocate'].unique():
    df_filtered = df_output_filtered[df_output_filtered['colocate'] == colocate]
    fig.add_trace(go.Scatter(x=df_filtered['label'], y=df_filtered['score_normalized'], mode='lines', name=colocate))

# Update layout
fig.update_layout(autosize=False, width=1200, height=800,
                  title='Score change of adjective colocates across the years',
                  xaxis_title='Label',
                  yaxis_title='Score Normalized',
                  template='plotly_white'
                  )

# Save the figure as an image and display it
fig.write_image('output/colocates/colocates_adj_by_years_stem.png')
fig.show()

In [57]:
df5 = df.loc[df['publisher'] == 'the new york times'] 
res5 = get_colocates(df5['cleaned_text_adj'], 'nyt')
res5 = res5.sort_values(by='score_normalized', ascending=False)
res5

Unnamed: 0,colocate,label,score_normalized,score
278,international,nyt,6.122286e-03,1794.576812
130,domestic,nyt,5.512556e-03,1615.851336
222,guilty,nyt,2.548158e-04,74.692129
216,graphic,nyt,1.865417e-04,54.679486
328,modern,nyt,1.613610e-04,47.298459
...,...,...,...,...
337,municipal,nyt,1.039348e-08,0.003047
181,fanatical,nyt,1.039348e-08,0.003047
552,uncomfortable,nyt,9.507789e-09,0.002787
416,productive,nyt,4.106974e-09,0.001204


In [58]:
df6 = df.loc[df['publisher'] == 'pittsburgh post-gazette'] 
res6 = get_colocates(df6['cleaned_text_adj'], 'ppg')
res6 = res6.sort_values(by='score_normalized', ascending=False)
res6

Unnamed: 0,colocate,label,score_normalized,score
105,international,ppg,7.288690e-03,265.410374
55,domestic,ppg,3.984872e-03,145.105120
38,controversial,ppg,4.099391e-04,14.927521
72,federal,ppg,3.706510e-04,13.496886
240,vulnerable,ppg,3.535614e-04,12.874584
...,...,...,...,...
26,commercial,ppg,4.111544e-07,0.014972
250,worst,ppg,2.660188e-07,0.009687
53,diplomatic,ppg,6.226471e-08,0.002267
164,poor,ppg,1.090156e-08,0.000397


In [59]:

df_output_by_media = pd.concat([res5, res6])
df_output_by_media.to_csv('output/colocates/colocates_adj_by_media_stem.csv', index=False)


In [61]:
# list top 20 colocates that is unique to NYT 
df_output_by_media_unique_nyt = df_output_by_media.groupby('colocate').filter(lambda x:  len(x) == 1)
df_output_by_media_unique_nyt = df_output_by_media_unique_nyt.loc[df_output_by_media_unique_nyt['label'] == 'nyt']
df_output_by_media_unique_nyt = df_output_by_media_unique_nyt.sort_values(by='score_normalized', ascending=False)
df_output_by_media_unique_nyt.to_csv('output/colocates/colocates_adj_by_media_unique_nyt_stem.csv', index=False)
df_output_by_media_unique_nyt[:20]

Unnamed: 0,colocate,label,score_normalized,score
122,dictionary,nyt,0.000101,29.596669
516,synonymous,nyt,8.7e-05,25.638688
503,subnational,nyt,7.2e-05,21.160659
248,incalculable,nyt,7.1e-05,20.766657
207,gaseous,nyt,7.1e-05,20.706801
62,bubble,nyt,6.6e-05,19.34074
413,probable,nyt,5.2e-05,15.225488
535,transnational,nyt,5e-05,14.768094
573,unite,nyt,4.6e-05,13.485047
212,glorious,nyt,4.3e-05,12.679201


In [62]:
# list top 20 colocates that is unique to PPG
df_output_by_media_unique_ppg = df_output_by_media.groupby('colocate').filter(lambda x:  len(x) == 1)
df_output_by_media_unique_ppg = df_output_by_media_unique_ppg.loc[df_output_by_media_unique_ppg['label'] == 'ppg']
df_output_by_media_unique_ppg = df_output_by_media_unique_ppg.sort_values(by='score_normalized', ascending=False)
df_output_by_media_unique_ppg.to_csv('output/colocates/colocates_adj_by_media_unique_ppg_stem.csv', index=False)
df_output_by_media_unique_ppg[:20]

Unnamed: 0,colocate,label,score_normalized,score
231,unethical,ppg,0.000288,10.487518
188,ridiculous,ppg,0.000164,5.956542
230,undersecretary,ppg,0.000121,4.414064
236,unsealed,ppg,7e-05,2.534198
57,duncan,ppg,8e-06,0.28887


In [63]:
df_output_by_media_filtered = df_output_by_media.groupby('colocate').filter(lambda x:  np.std(x.score_normalized)  > 0.0001)
# remove many, new
df_output_by_media_filtered = df_output_by_media_filtered.loc[~df_output_by_media_filtered['colocate'].isin(['many', 'new'])]

# Pivot the DataFrame
df_pivot = df_output_by_media_filtered.pivot(index='colocate', columns='label', values='score_normalized').reset_index()

# Create the plot
fig = go.Figure()

# Add NYT scores
fig.add_trace(go.Bar(
    x=df_pivot['colocate'],
    y=df_pivot['nyt'],
    name='NYT'
))

# Add PPG scores
fig.add_trace(go.Bar(
    x=df_pivot['colocate'],
    y=df_pivot['ppg'],
    name='PPG'
))

# Update layout for grouped bars
fig.update_layout(
    barmode='group',
    width=1200, height=800, 
    title='Normalized Scores for Adjective Colocates (NYT vs PPG)',
    xaxis_title='Colocate',
    yaxis_title='Normalized Score',
    template='plotly_white'
)

fig.write_image('output/colocates/colocates_adj_by_media_stem.png')
fig.show()



# Dynamic Topic Modeling by Years

In [132]:
from bertopic import BERTopic

def get_topics(df_news, topic_cnt = None):
    news = df_news.loc[df_news['year'] >= 1980]['cleaned_text_stem'].tolist()
    topic_model = BERTopic(verbose=True)
    topics, probs = topic_model.fit_transform(news)
    if topic_cnt is not None:
        topic_model.reduce_topics(news, nr_topics=topic_cnt)
    dates = df_news.loc[df_news['year'] >= 1980]['year'].tolist()
    topics_over_time = topic_model.topics_over_time(news, dates, datetime_format='%Y', nr_bins=20)
    topics_per_class_groupby = topics_over_time.groupby('Timestamp').apply(lambda x: x['Frequency'] / x['Timestamp'].sum())
    topics_over_time['Frequency'] = topics_per_class_groupby.values
    return topic_model, topics_over_time


# Without topic number reduction

In [133]:
topic_model, topics_over_time = get_topics(df)
topic_model.get_topic_info().to_csv('output/dtm/topic_info.csv', index=False)
topic_model.get_topic_info()

2024-05-22 19:45:16,300 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/756 [00:00<?, ?it/s]

2024-05-22 19:47:09,494 - BERTopic - Embedding - Completed ✓
2024-05-22 19:47:09,495 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-22 19:47:24,894 - BERTopic - Dimensionality - Completed ✓
2024-05-22 19:47:24,895 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,8422,-1_terrorism_america_terrorist_said,"[terrorism, america, terrorist, said, war, new...",[visitor national september memorial take view...
1,0,1100,0_israel_palestinian_arafat_peace,"[israel, palestinian, arafat, peace, hama, plo...",[israel palestine liberation organization whos...
2,1,567,1_insurance_insurer_coverage_risk,"[insurance, insurer, coverage, risk, tria, bac...",[washington legislation would extend terrorism...
3,2,519,2_france_paris_charlie_hebdo,"[france, paris, charlie, hebdo, macron, europe...",[man knife kill three people church nice assau...
4,3,256,3_film_movie_character_novel,"[film, movie, character, novel, play, comedy, ...",[terrorism long dose harsh reality increase mo...
...,...,...,...,...,...
306,305,10,305_shoot_hate_dandach_tomajczyk,"[shoot, hate, dandach, tomajczyk, poem, chatta...",[four people charged connection violence polit...
307,306,10,306_ghailani_ghaith_guantanamo_trial,"[ghailani, ghaith, guantanamo, trial, mohammed...",[lawyer terrorism suspect spent nearly five ye...
308,307,10,307_pape_suicide_fundamentalism_tiger,"[pape, suicide, fundamentalism, tiger, bomber,...",[student faculty community member packed filen...
309,308,10,308_imaginary_patriotism_heinlein_patriot,"[imaginary, patriotism, heinlein, patriot, ism...",[washington words weapon nearly every new repo...


In [134]:
topics_over_time.to_csv('output/dtm/topics_over_time.csv', index=False)
topics_over_time


Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"soviet, guatemala, subcommittee, reagan, denton",0.000946,1979.96
1,0,"israel, palestinian, lebanon, arab, arafat",0.000441,1979.96
2,2,"france, mitterrand, paris, jew, jewish",0.000192,1979.96
3,3,"desideria, film, fassbinder, novel, amble",0.000064,1979.96
4,4,"libya, qaddafi, colonel, reagan, oil",0.000273,1979.96
...,...,...,...,...
2543,299,"jersey, kosher, anderson, grew, fulop",0.000026,2018.00
2544,300,"rwanda, rusesabagina, kagame, kigali, genocide",0.000029,2018.00
2545,305,"hate, splc, shoot, crime, dayton",0.000003,2018.00
2546,308,"ism, suffix, partisan, rac, ist",0.000013,2018.00


In [179]:
# 1 
topics_over_time_filtered = topics_over_time.groupby('Topic').filter(lambda x:  np.std(x.Frequency)  > 0.000048)
fig = topic_model.visualize_topics_over_time(topics_over_time_filtered)
# enlarge the size of the plot
fig.update_layout(width=1200, height=800)
fig.write_image('output/dtm/topics_over_time.png', engine="orca")
fig.show()

# Topic number 200

In [150]:
topic_model, topics_over_time = get_topics(df, 200)
topic_model.get_topic_info().to_csv('output/dtm/topic_info_200.csv', index=False)
topic_model.get_topic_info()

2024-05-23 10:32:38,830 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/756 [00:00<?, ?it/s]

2024-05-23 10:34:42,247 - BERTopic - Embedding - Completed ✓
2024-05-23 10:34:42,248 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-23 10:34:50,310 - BERTopic - Dimensionality - Completed ✓
2024-05-23 10:34:50,311 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,8565,-1_terrorism_said_terrorist_america,"[terrorism, said, terrorist, america, attack, ...",[washington break forceful bush administration...
1,0,1067,0_israel_palestinian_arafat_peace,"[israel, palestinian, arafat, peace, hama, ara...",[sharm el sheik egypt dramatic show solidarity...
2,1,689,1_iraq_obama_war_laden,"[iraq, obama, war, laden, bin, bush, al, qaida...",[announce osama bin laden death sunday night p...
3,2,670,2_insurance_coverage_risk_insurer,"[insurance, coverage, risk, insurer, tria, los...",[terrorism risk insurance act establish three ...
4,3,504,3_shoot_white_gun_domestic,"[shoot, white, gun, domestic, supremacist, tru...",[washington law enforce official sound alarm m...
...,...,...,...,...,...
195,194,11,194_charity_nonprofit_treasury_donor,"[charity, nonprofit, treasury, donor, organiza...",[group nonprofit organization pressing u treas...
196,195,11,195_clinic_dear_colorado_domestic,"[clinic, dear, colorado, domestic, parenthood,...",[washington man accused killing three people c...
197,196,10,196_austria_vienna_nehammer_kurz,"[austria, vienna, nehammer, kurz, gunman, shoo...",[philipp jenne frank jordan associate press vi...
198,197,10,197_air_medal_force_enduring,"[air, medal, force, enduring, deploy, mathew, ...",[cheyenne troops left supply room f e warren a...


In [151]:
topics_over_time.to_csv('output/dtm/topics_over_time_200.csv', index=False)
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"turkey, soviet, france, politics, mr",0.001572,1979.96
1,0,"israel, palestinian, arab, lebanon, arafat",0.000496,1979.96
2,3,"klan, hine, spartacist, liverpool, wilkinson",0.000029,1979.96
3,4,"zia, pakistan, haq, heroin, ul",0.000019,1979.96
4,7,"libya, qaddafi, colonel, reagan, oil",0.000324,1979.96
...,...,...,...,...
1847,188,"jersey, kosher, anderson, grew, fulop",0.000037,2018.00
1848,190,"omar, trump, ilhan, venezuela, election",0.000018,2018.00
1849,195,"psilocybin, clinic, domestic, dear, injunction",0.000004,2018.00
1850,196,"austria, vienna, nehammer, kurz, restaurant",0.000026,2018.00


In [154]:
topics_over_time_filtered = topics_over_time.groupby('Topic').filter(lambda x:  np.std(x.Frequency)  > 0.00006)
fig = topic_model.visualize_topics_over_time(topics_over_time_filtered)
# enlarge the size of the plot
fig.update_layout(width=1200, height=800)
fig.write_image('output/dtm/topics_over_time_200.png', engine="orca")
fig.show()

# Topic number 150

In [140]:
topic_model, topics_over_time = get_topics(df, 150)
topic_model.get_topic_info().to_csv('output/dtm/topic_info_150.csv', index=False)
topic_model.get_topic_info()

2024-05-22 20:02:54,472 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/756 [00:00<?, ?it/s]

2024-05-22 20:04:47,393 - BERTopic - Embedding - Completed ✓
2024-05-22 20:04:47,394 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-22 20:04:56,781 - BERTopic - Dimensionality - Completed ✓
2024-05-22 20:04:56,783 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-22 20:04:57,450 - BERTopic - Cluster - Completed ✓
2024-05-22 20:04:57,456 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-22 20:05:06,417 - BERTopic - Representation - Completed ✓
2024-05-22 20:05:10,143 - BERTopic - Topic reduction - Reducing number of topics
2024-05-22 20:05:19,451 - BERTopic - Topic reduction - Reduced number of topics from 309 to 150
20it [00:26,  1.33s/it]




Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,8567,-1_terrorism_said_terrorist_america,"[terrorism, said, terrorist, america, attack, ...",[el paso shoot call give government tool addre...
1,0,1131,0_israel_palestinian_arafat_peace,"[israel, palestinian, arafat, peace, arab, ham...",[israel reel worst wave arab violence years in...
2,1,908,1_pakistan_india_indian_kashmir,"[pakistan, india, indian, kashmir, musharraf, ...",[india prime minister said yesterday governmen...
3,2,718,2_iraq_bush_obama_laden,"[iraq, bush, obama, laden, bin, al, war, qaida...",[washington country move president lead sept n...
4,3,623,3_insurance_coverage_risk_insurer,"[insurance, coverage, risk, insurer, tria, los...",[government study showing available afford ter...
...,...,...,...,...,...
145,144,11,144_pipeline_oil_crude_train,"[pipeline, oil, crude, train, energy, csx, dak...",[bellingham wash federal local authority inves...
146,145,11,145_aleppo_rebel_evacuate_assad,"[aleppo, rebel, evacuate, assad, syria, bashar...",[last remnant rebel supporter evacuate eastern...
147,146,11,146_aids_hiv_garrett_africa,"[aids, hiv, garrett, africa, health, disease, ...",[last saturday world aids day special day reme...
148,147,11,147_austria_vienna_nehammer_police,"[austria, vienna, nehammer, police, gunman, sh...",[philipp jenne frank jordan associate press vi...


In [141]:
topics_over_time.to_csv('output/dtm/topics_over_time_150.csv', index=False)
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"soviet, france, mr, politics, terrorist",0.001556,1979.96
1,0,"israel, palestinian, arab, lebanon, arafat",0.000577,1979.96
2,1,"naxalite, zia, pakistan, landless, hijack",0.000041,1979.96
3,2,"iraq, baghdad, hussein, azzawi, iran",0.000021,1979.96
4,4,"denton, senator, casey, amato, search",0.000021,1979.96
...,...,...,...,...
1572,133,"klinghoffer, salamon, opera, hijack, leon",0.000009,2018.00
1573,134,"utrecht, tani, tram, netherlands, dutch",0.000032,2018.00
1574,143,"rwanda, rusesabagina, kagame, kigali, genocide",0.000055,2018.00
1575,144,"pipeline, oil, crude, train, derail",0.000018,2018.00


In [149]:
topics_over_time_filtered = topics_over_time.groupby('Topic').filter(lambda x:  np.std(x.Frequency)  > 0.000068)
fig = topic_model.visualize_topics_over_time(topics_over_time_filtered)
# enlarge the size of the plot
fig.update_layout(width=1200, height=800)
fig.write_image('output/dtm/topics_over_time_150.png', engine="orca")
fig.show()

# Topic number 100

In [135]:
topic_model, topics_over_time = get_topics(df, 100)
topic_model.get_topic_info().to_csv('output/dtm/topic_info_100.csv', index=False)
topic_model.get_topic_info()

2024-05-22 19:49:54,319 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/756 [00:00<?, ?it/s]

2024-05-22 19:51:47,298 - BERTopic - Embedding - Completed ✓
2024-05-22 19:51:47,299 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-22 19:51:53,948 - BERTopic - Dimensionality - Completed ✓
2024-05-22 19:51:53,949 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-22 19:51:54,466 - BERTopic - Cluster - Completed ✓
2024-05-22 19:51:54,469 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-22 19:52:00,669 - BERTopic - Representation - Completed ✓
2024-05-22 19:52:03,467 - BERTopic - Topic reduction - Reducing number of topics
2024-05-22 19:52:09,041 - BERTopic - Topic reduction - Reduced number of topics from 296 to 100
20it [00:15,  1.27it/s]




Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,8394,-1_terrorism_said_terrorist_america,"[terrorism, said, terrorist, america, attack, ...",[president obama address nation speech oval of...
1,0,1404,0_israel_palestinian_arafat_syria,"[israel, palestinian, arafat, syria, peace, ar...",[israel reel worst wave arab violence years in...
2,1,1070,1_muslim_islam_america_terrorism,"[muslim, islam, america, terrorism, terrorist,...",[america million proud muslim take new zealand...
3,2,1063,2_fbi_bill_law_senate,"[fbi, bill, law, senate, ashcroft, house, patr...",[associate press washington congress taken fir...
4,3,1020,3_pakistan_india_afghanistan_indian,"[pakistan, india, afghanistan, indian, taliban...",[washington c pakistan report decision relocat...
...,...,...,...,...,...
95,94,12,94_air_force_command_mobility,"[air, force, command, mobility, base, th, patt...",[ceremony wednesday scott air force base air m...
96,95,11,95_headley_rana_mumbai_pakistan,"[headley, rana, mumbai, pakistan, chicago, den...",[chicago federal jury thursday found chicago b...
97,96,11,96_rwanda_rusesabagina_kagame_kigali,"[rwanda, rusesabagina, kagame, kigali, genocid...",[dubai meeting reappear rwanda authority said ...
98,97,11,97_pipeline_oil_crude_train,"[pipeline, oil, crude, train, csx, dakota, der...",[bellingham wash federal local authority inves...


In [136]:
topics_over_time.to_csv('output/dtm/topics_over_time_100.csv', index=False)
topics_over_time


Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"soviet, mr, terrorism, reagan, america",0.001679,1979.96
1,0,"israel, palestinian, lebanon, arab, arafat",0.000805,1979.96
2,1,"puerto, rican, bomb, rico, group",0.000232,1979.96
3,2,"committee, bureau, bill, subcommittee, senator",0.000109,1979.96
4,3,"naxalite, pakistan, zia, afghanistan, hijack",0.000055,1979.96
...,...,...,...,...
1129,88,"klinghoffer, salamon, opera, hijack, leon",0.000013,2018.00
1130,94,"lecake, air, nasic, springfield, fitzgerald",0.000006,2018.00
1131,96,"rwanda, rusesabagina, kagame, kigali, genocide",0.000071,2018.00
1132,97,"oil, pipeline, crude, train, derail",0.000026,2018.00


In [138]:
topics_over_time_filtered = topics_over_time.groupby('Topic').filter(lambda x:  np.std(x.Frequency)  > 0.0001)
fig = topic_model.visualize_topics_over_time(topics_over_time_filtered)
# enlarge the size of the plot
fig.update_layout(width=1200, height=800)
fig.write_image('output/dtm/topics_over_time_100.png', engine="orca")
fig.show()

# Dynamic Topic Modeling by Media

In [30]:
def get_topics_by_classes(df_news):
    news = df_news['cleaned_text_stem'].tolist()
    topic_model = BERTopic(verbose=True)
    topics, probs = topic_model.fit_transform(news)
    classes = df_news['publisher'].tolist()
    topics_per_class = topic_model.topics_per_class(news, classes=classes)
    topics_per_class_media_groupby = topics_per_class.groupby('Class').apply(lambda x: x['Frequency'] / x['Frequency'].sum())
    topics_per_class['Frequency'] = topics_per_class_media_groupby.values
    
    # join topics_per_class with topic_model_media.get_topic_info() on topic column to add Name column to topics_per_class
    topics_per_class = topics_per_class.merge(topic_model.get_topic_info(), left_on='Topic', right_on='Topic')
    return topic_model, topics_per_class

In [35]:
df_nyt = df.loc[df['publisher'] == 'the new york times'] 
df_nyt['publisher'] = 'NYT'
df_ppg = df.loc[df['publisher'] == 'pittsburgh post-gazette'] 
df_ppg['publisher'] = 'PPG'


In [36]:
# concat df_nyt and df_ppg
df_media = pd.concat([df_nyt, df_ppg])
topic_model_media, topics_per_class_media = get_topics_by_classes(df_media)
topic_model_media.get_topic_info()

2024-05-23 11:36:37,947 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/196 [00:00<?, ?it/s]

2024-05-23 11:37:08,901 - BERTopic - Embedding - Completed ✓
2024-05-23 11:37:08,902 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-23 11:37:12,306 - BERTopic - Dimensionality - Completed ✓
2024-05-23 11:37:12,307 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-23 11:37:12,432 - BERTopic - Cluster - Completed ✓
2024-05-23 11:37:12,436 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-23 11:37:14,820 - BERTopic - Representation - Completed ✓
2it [00:01,  1.43it/s]




Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1896,-1_mr_said_terrorism_terrorist,"[mr, said, terrorism, terrorist, america, atta...",[certainly image oratory last week fast spread...
1,0,284,0_israel_palestinian_arafat_peace,"[israel, palestinian, arafat, peace, arab, jew...",[yasir arafat p l committed seek politics dest...
2,1,221,1_pakistan_india_indian_afghanistan,"[pakistan, india, indian, afghanistan, mumbai,...",[prime minister india pakistan agree cooperate...
3,2,167,2_france_paris_macron_attack,"[france, paris, macron, attack, police, strasb...",[man knife kill three people church nice assau...
4,3,163,3_shoot_gun_white_domestic,"[shoot, gun, white, domestic, mass, violence, ...",[washington law enforce official sound alarm m...
...,...,...,...,...,...
105,104,11,104_soviet_union_webster_intelligence,"[soviet, union, webster, intelligence, agency,...",[soviet union agree meet united states discuss...
106,105,11,105_extradite_ireland_northern_treaty,"[extradite, ireland, northern, treaty, britain...",[joseph p doherty member extremist provisional...
107,106,10,106_port_cargo_customs_container,"[port, cargo, customs, container, ship, securi...",[could imagine post world united states govern...
108,107,10,107_assad_syria_peace_clinton,"[assad, syria, peace, clinton, israel, christo...",[president clinton made one day venture shuttl...


In [37]:
topics_per_class_media.to_csv('output/dtm/topics_per_media.csv', index=False)
topics_per_class_media

Unnamed: 0,Topic,Words,Frequency,Class,Count,Name,Representation,Representative_Docs
0,-1,"mr, said, terrorism, terrorist, america",0.300787,NYT,1896,-1_mr_said_terrorism_terrorist,"[mr, said, terrorism, terrorist, america, atta...",[certainly image oratory last week fast spread...
1,0,"israel, palestinian, arafat, peace, arab",0.045165,NYT,284,0_israel_palestinian_arafat_peace,"[israel, palestinian, arafat, peace, arab, jew...",[yasir arafat p l committed seek politics dest...
2,1,"pakistan, india, indian, afghanistan, kashmir",0.034295,NYT,221,1_pakistan_india_indian_afghanistan,"[pakistan, india, indian, afghanistan, mumbai,...",[prime minister india pakistan agree cooperate...
3,2,"france, paris, macron, attack, police",0.029798,NYT,167,2_france_paris_macron_attack,"[france, paris, macron, attack, police, strasb...",[man knife kill three people church nice assau...
4,3,"shoot, gun, white, domestic, mass",0.027361,NYT,163,3_shoot_gun_white_domestic,"[shoot, gun, white, domestic, mass, violence, ...",[washington law enforce official sound alarm m...
...,...,...,...,...,...,...,...,...
202,99,"tunisia, morocco, email, consulate, tunis",0.002198,PPG,12,99_tunisia_morocco_tunis_niqab,"[tunisia, morocco, tunis, niqab, ennahda, sous...",[tunis tunisia prime minister youssef chahe tu...
203,102,"boim, lewin, damages, salah, appeal",0.001099,PPG,11,102_punitive_court_appeal_damages,"[punitive, court, appeal, damages, judge, ruli...",[federal appeal court tuesday threw terrorism ...
204,103,"africa, mali, somalia, troops, continent",0.004396,PPG,11,103_africa_mali_troops_pentagon,"[africa, mali, troops, pentagon, somalia, sene...",[niamey niger nearly every day sometimes twice...
205,107,"assad, clinton, syria, peace, israel",0.003297,PPG,10,107_assad_syria_peace_clinton,"[assad, syria, peace, clinton, israel, christo...",[president clinton made one day venture shuttl...


In [41]:
topics_per_class_media_filtered = topics_per_class_media.groupby('Topic').filter(lambda x:  np.std(x.Frequency)  > 0.004)
# filter out Topic -1
topics_per_class_media_filtered = topics_per_class_media_filtered.loc[topics_per_class_media_filtered['Topic'] != -1]

# Pivot the DataFrame
df_pivot = topics_per_class_media_filtered.pivot(index='Name', columns='Class', values='Frequency').reset_index()

# Create the plot
fig = go.Figure()

# Add NYT scores
fig.add_trace(go.Bar(
    x=df_pivot['Name'],
    y=df_pivot['NYT'],
    name='NYT'
))

# Add PPG scores
fig.add_trace(go.Bar(
    x=df_pivot['Name'],
    y=df_pivot['PPG'],
    name='PPG'
))

# Update layout for grouped bars
fig.update_layout(
    barmode='group',
    width=1200, height=800, 
    title='Frequency of Topics (NYT vs PPG)',
    xaxis_title='Name',
    yaxis_title='Frequency',
    template='plotly_white'
)

fig.write_image('output/dtm/topics_per_media.png')
fig.show()