In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from google.colab import drive
# Plotting pretty figures and avoid blurry images
%config InlineBackend.figure_format = 'retina'
# Larger scale for plots in notebooks
sns.set_context('talk')

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Enable multiple cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to the project folder on the mounted Drive, so that
# file names given without a directory (such as the CSV read later) resolve there.
%cd /content/drive/MyDrive/Data Vanguard/Source Analysis/

/content/drive/MyDrive/Data Vanguard/Source Analysis


In [None]:
# Load the citation table in this cell so it runs on a fresh kernel: without this
# assignment `all` is still Python's builtin function here and `.info()` raises
# AttributeError. The file name is resolved against the current working directory.
# NOTE: the variable name `all` shadows the builtin; it is kept because the other
# cells of this notebook refer to the table by that name.
all = pd.read_csv('uyghur sources.csv')

# Summarise the columns: names, non-null counts and dtypes.
all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486 entries, 0 to 485
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   citation_name     486 non-null    object
 1   publisher         486 non-null    object
 2   publication_date  486 non-null    int64 
 3   publication_type  486 non-null    object
 4   publisher_hq      486 non-null    object
 5   citation_comment  486 non-null    object
dtypes: int64(1), object(5)
memory usage: 22.9+ KB


In [None]:
# Read the citation dataset; the file name is resolved against the current
# working directory.
# NOTE: the name `all` shadows Python's builtin `all()` for the rest of the session.
all = pd.read_csv('uyghur sources.csv')

# Preview the first five rows.
all.head(5)

Unnamed: 0,citation_name,publisher,publication_date,publication_type,publisher_hq,citation_comment
0,"241 ^ ""Short Form of the China Tribunal's Jud...",China Tribunal,2020,NGO,UK,Alleged ties to Falun Gong
1,"404 ^ ""Turkey continues arresting Uyghurs dur...",Stockholm Centre for Freedom,2021,Media,Sweden,Alleged ties to Gülen Movement
2,"8 ^ Adrian Zenz (July 2019). ""Break Their Roo...",The Journal of Political Risk,2019,Journal,USA,"Authored by Adrian Zenz, Newlines Institute, W..."
3,"74 ^ Zenz, Adrian. ""Brainwashing, Police Guard...",The Journal of Political Risk,2019,Journal,USA,"Authored by Adrian Zenz, Newlines Institute, W..."
4,"76 ^ Zenz, Adrian (1 July 2020). ""China's Own ...",Foreign Policy,2020,Media,USA,"Authored by Adrian Zenz, Newlines Institute, W..."


In [None]:
# Tally how many citations fall in each publication year, most frequent first.
all.publication_date.value_counts()

2021    173
2020    134
2019    105
2018     26
2014      8
2017      7
2015      6
2012      3
2009      3
2007      2
1990      2
2004      2
2005      2
2022      2
2011      2
2016      2
1998      1
1993      1
2010      1
2001      1
2002      1
2003      1
1986      1
Name: publication_date, dtype: int64

In [None]:
# Citations per publication year, flattened into a two-column frame
# (publication_date, unique_values) for plotting.
date_df = (
    all['publication_date']
    .value_counts()
    .rename_axis('publication_date')
    .reset_index(name='unique_values')
)
date_df

Unnamed: 0,publication_date,unique_values
0,2021,173
1,2020,134
2,2019,105
3,2018,26
4,2014,8
5,2017,7
6,2015,6
7,2012,3
8,2009,3
9,2007,2


In [None]:
import plotly.express as px

# Bar chart of citation counts per publication year; each bar is annotated
# with its count.
fig = px.bar(
    date_df,
    x="publication_date",
    y='unique_values',
    text='unique_values',
    title='Number of Articles per Year in Citations',
)
fig.show()

In [None]:
# Citations per publisher headquarters, flattened into a two-column frame
# (publisher_hq, unique_values) for plotting.
country_df = (
    all['publisher_hq']
    .value_counts()
    .rename_axis('publisher_hq')
    .reset_index(name='unique_values')
)
country_df

Unnamed: 0,publisher_hq,unique_values
0,USA,248
1,UK,76
2,Canada,43
3,Australia,23
4,Qatar,19
5,France,16
6,Israel,13
7,China (Hong Kong),6
8,International,5
9,Germany,4


In [None]:
# Full tally of citations per publisher headquarters value, most frequent first.
all.publisher_hq.value_counts()

USA                  248
UK                    76
Canada                43
Australia             23
Qatar                 19
France                16
Israel                13
China (Hong Kong)      6
International          5
Germany                4
China (Taiwan)         4
Japan                  3
Belgium                3
India                  3
Singapore              3
China (Mainland)       2
New Zealand            2
Philippines            2
Turkey                 2
Sweden                 1
Journal                1
Russia                 1
Jordan                 1
Saudi Arabia           1
China                  1
Ireland                1
Switzerland            1
Netherlands            1
Name: publisher_hq, dtype: int64

In [None]:
# Tally of citations per publication type, most frequent first.
all.publication_type.value_counts()

Media           414
Journal          21
Agency           15
Thinktank        12
NGO              10
Book              7
Organisation      3
Book Review       1
Product           1
Letter            1
Travel Guide      1
Name: publication_type, dtype: int64

In [None]:
import plotly.express as px

# Bubble chart with one marker per publisher_hq value: the marker's height and
# size both encode the citation count, which is also printed next to the marker.
fig = px.scatter(
    country_df,
    x="publisher_hq",
    y="unique_values",
    size="unique_values",
    color="publisher_hq",
    hover_name="publisher_hq",
    text='unique_values',
    size_max=60,
    # Possessive "Whose", not the contraction "Who's" (= "who is").
    title="Whose voice is being represented on Wikipedia?",
)
fig.show()

In [None]:
# Per-year slices of the citation table for 2019, 2020 and 2021, each paired
# with the subset of rows whose publisher_hq is 'USA'. No figure is drawn here.
year_2019 = all.loc[all['publication_date'] == 2019]
year_2019_us = year_2019.loc[year_2019['publisher_hq'] == 'USA']

year_2020 = all.loc[all['publication_date'] == 2020]
year_2020_us = year_2020.loc[year_2020['publisher_hq'] == 'USA']

year_2021 = all.loc[all['publication_date'] == 2021]
year_2021_us = year_2021.loc[year_2021['publisher_hq'] == 'USA']

# Combined 2019-2021 window (both end years included) and its USA-only subset.
last_three = all[all['publication_date'].between(2019, 2021)]
last_three_us = last_three.loc[last_three['publisher_hq'] == 'USA']

# Preview the 2019 slice.
year_2019.head()

Unnamed: 0,citation_name,publisher,publication_date,publication_type,publisher_hq,citation_comment
2,"8 ^ Adrian Zenz (July 2019). ""Break Their Roo...",The Journal of Political Risk,2019,Journal,USA,"Authored by Adrian Zenz, Newlines Institute, W..."
3,"74 ^ Zenz, Adrian. ""Brainwashing, Police Guard...",The Journal of Political Risk,2019,Journal,USA,"Authored by Adrian Zenz, Newlines Institute, W..."
6,"288 ^ Fiskejö, Magnus (8 April 2019). ""China'...",Inside Higher Ed.,2019,Media,USA,"Authored by Magnus Fiskejö, Reppy Institute, C..."
22,"85 ^ ""Foreign Ministry Spokesperson Hua Chunyi...",Foreign Ministry of China,2019,Agency,China,Government funded
23,3 ^ Congressional Research Service (18 June 20...,Congressional Research Service,2019,Agency,USA,Government funded


In [None]:
# Treemap of the USA-only 2019-2021 citations, nested as
# publication_date -> publisher_hq -> publication_type and coloured by publication type.
tree = px.treemap(
    last_three_us,
    path=['publication_date', 'publisher_hq', 'publication_type'],
    color='publication_type',
)

# Centred title, fonts, a fixed 1400x1400 canvas, and hover interaction disabled.
tree.update_layout(
    title={
        'text': "Last Three Years Breakdown In the USA by Publication Type",
        'x': 0.5,
        'font': {'family': 'Arial', 'size': 35},
    },
    font={'size': 25, 'family': 'Verdana'},
    hovermode=False,
    width=1400,
    height=1400,
)

# Print each tile's label together with its value.
tree.data[0].textinfo = 'label + value'
tree.show()

In [None]:
# Tally of citations per publisher, most frequent first.
all.publisher.value_counts()

Radio Free Asia            24
Reuters                    23
CNN                        23
The Guardian               23
Al Jazeera                 19
                           ..
Catholic News Agency        1
The Heritage Foundation     1
Al Araby                    1
The Moscow Times            1
Byline Times                1
Name: publisher, Length: 181, dtype: int64

In [None]:
# Keep only the citations whose publisher_hq is 'USA'.
usa_sources = all.loc[all['publisher_hq'] == 'USA']

In [None]:
# Summarise the USA-only subset: row count, column names, non-null counts and dtypes.
usa_sources.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 248 entries, 2 to 483
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   citation_name     248 non-null    object
 1   publisher         248 non-null    object
 2   publication_date  248 non-null    int64 
 3   publication_type  248 non-null    object
 4   publisher_hq      248 non-null    object
 5   citation_comment  248 non-null    object
dtypes: int64(1), object(5)
memory usage: 13.6+ KB


In [None]:
# Tally of USA-only citations per publication type, most frequent first.
usa_sources.publication_type.value_counts()

Media           209
Agency           10
Thinktank         9
Journal           8
NGO               6
Book              2
Book Review       1
Organisation      1
Product           1
Letter            1
Name: publication_type, dtype: int64

In [None]:
# Tally of USA-only citations per publisher, most frequent first.
usa_sources.publisher.value_counts()

Radio Free Asia                             24
CNN                                         23
The New York Times                          15
Associated Press                            13
The Diplomat                                13
                                            ..
Inside Higher Ed.                            1
MIT Technology Review                        1
boston.com                                   1
The Weekly Standard                          1
US Senate Committee on Foreign Relations     1
Name: publisher, Length: 90, dtype: int64

In [None]:
# Tally of USA-only citations per citation_comment label, most frequent first.
usa_sources.citation_comment.value_counts()

No known association                                       199
Government funded                                           42
Authored by Adrian Zenz, Newlines Institute, Washington      3
Authored by Magnus Fiskejö, Reppy Institute, Cornell         2
Christian fundamentalist                                     2
Name: citation_comment, dtype: int64

In [None]:

# Citations per publisher across the whole table, as a two-column frame
# (publisher, unique_values).
pub_all = (
    all['publisher']
    .value_counts()
    .rename_axis('publisher')
    .reset_index(name='unique_values')
)
pub_all

# The same tabulation restricted to the USA-only subset.
pub_df = (
    usa_sources['publisher']
    .value_counts()
    .rename_axis('publisher')
    .reset_index(name='unique_values')
)
pub_df

Unnamed: 0,publisher,unique_values
0,Radio Free Asia,24
1,Reuters,23
2,CNN,23
3,The Guardian,23
4,Al Jazeera,19
...,...,...
176,Catholic News Agency,1
177,The Heritage Foundation,1
178,Al Araby,1
179,The Moscow Times,1


Unnamed: 0,publisher,unique_values
0,Radio Free Asia,24
1,CNN,23
2,The New York Times,15
3,Associated Press,13
4,The Diplomat,13
...,...,...
85,Inside Higher Ed.,1
86,MIT Technology Review,1
87,boston.com,1
88,The Weekly Standard,1


In [None]:
# Top 20 publishers by citation count, for the whole table and for the USA subset.
#
# sort_values() returns a new frame rather than sorting in place, so its result
# has to be assigned (or chained) for the top-20 selection to use it. The
# original code discarded the sorted frames and sliced the unsorted inputs.
# A stable sort (mergesort) keeps tied publishers in their incoming order.
top20all = (
    pub_all
    .sort_values('unique_values', ascending=False, kind='mergesort')
    .head(20)
)
top20all

top20 = (
    pub_df
    .sort_values('unique_values', ascending=False, kind='mergesort')
    .head(20)
)
top20

Unnamed: 0,publisher,unique_values
0,Radio Free Asia,24
2,CNN,23
3,The Guardian,23
1,Reuters,23
4,Al Jazeera,19
...,...,...
98,iNews,1
97,Defense One,1
96,US News and World Report,1
95,US Department of Commerce,1


Unnamed: 0,publisher,unique_values
0,Radio Free Asia,24
1,CNN,23
2,The New York Times,15
3,Associated Press,13
4,The Diplomat,13
...,...,...
56,LAist,1
57,ABC-CLIO,1
58,US News and World Report,1
59,US Chamber of Commerce,1


In [None]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Bar chart of the top-20 publishers among USA-only citations; each bar is
# annotated with its count.
fig = px.bar(
    top20,
    x="publisher",
    y='unique_values',
    text='unique_values',
    title="Who is Behind USA Coverage?",
)
fig.show()

In [None]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Bar chart of the top-20 publishers across all citations; each bar is
# annotated with its count.
fig = px.bar(
    top20all,
    x="publisher",
    y='unique_values',
    text='unique_values',
    title="Who is Behind Global Coverage?",
)
fig.show()

In [None]:
import plotly.express as px

# Tabulate the USA-only citations per citation_comment label once, and hand
# plotly a frame whose columns are referenced by name. The original passed the
# un-aggregated `usa_sources` frame together with separately computed Series,
# plus `labels={'value'}`, which is a set rather than the column -> display-name
# mapping that the `labels` argument expects.
usa_comment_df = (
    usa_sources['citation_comment']
    .value_counts()
    .rename_axis('citation_comment')
    .reset_index(name='unique_values')
)

fig = px.pie(
    usa_comment_df,
    names='citation_comment',
    values='unique_values',
    title='State Funding Dominating the "Free Press"',
)
# Show the raw count inside each slice. The original also issued an earlier
# update_traces(textposition='outside', textinfo='percent+label'), but this call
# overrode both of its settings, so only this one had any effect.
fig.update_traces(textposition='inside', textinfo='value')
fig.show()

In [None]:
import plotly.express as px

# Tabulate all citations per citation_comment label once, and hand plotly a
# frame whose columns are referenced by name. The original passed the
# un-aggregated `all` frame together with separately computed Series, plus
# `labels={'citation_comment'}`, which is a set rather than the
# column -> display-name mapping that the `labels` argument expects.
all_comment_df = (
    all['citation_comment']
    .value_counts()
    .rename_axis('citation_comment')
    .reset_index(name='unique_values')
)

fig = px.pie(
    all_comment_df,
    names='citation_comment',
    values='unique_values',
    title='State Funding Dominating the World Press Too',
)
# Put each slice's percentage and label outside the pie.
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()

In [None]:
# Keep only the citations labelled 'No known association'.
no_association = all.loc[all['citation_comment'] == 'No known association']

In [None]:
# Citations per publisher among the 'No known association' rows, as a
# two-column frame (publisher, unique_values).
pub_noassoc = (
    no_association['publisher']
    .value_counts()
    .rename_axis('publisher')
    .reset_index(name='unique_values')
)
pub_noassoc

Unnamed: 0,publisher,unique_values
0,Reuters,23
1,CNN,23
2,The Guardian,23
3,Al Jazeera,19
4,The New York Times,15
...,...,...
147,Stuff,1
148,Strategic Insights,1
149,boston.com,1
150,The Irish Times,1


In [None]:
# Restrict to publishers with at least 5 'No known association' citations and
# chart them; each bar is annotated with its count.
pub_large = pub_noassoc.loc[pub_noassoc['unique_values'] >= 5]
fig = px.bar(
    pub_large,
    x="publisher",
    y='unique_values',
    text='unique_values',
    title="What About the Citations without a Known Association?",
)
fig.show()
fig.show()