## Analysis File: Atlas Network - 2023

https://mybinder.org/v2/gh/mojeffski/atlas_analysis_test/master

### Table of Contents
1. Loading 4 google-sheets
2. Combining sheets
3. Checking for missing values
4. Analysis

**importing libraries**

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import dtale
from collections import Counter

import watermark

In [3]:
# This cell is only used to refresh requirements.txt,
# which is necessary for updating the file on Binder.
# The watermark magics below are wrapped in a string literal, so they are
# not executed; the cell merely echoes the string as its output.

"""
%load_ext watermark
%watermark -v -m -p pandas,numpy,networkx,dtale,collections,watermark

%watermark -u -n -t -z
"""

'\n%load_ext watermark\n%watermark -v -m -p pandas,numpy,networkx,dtale,collections,watermark\n\n%watermark -u -n -t -z\n'

## 1. Loading 4 google-sheets

In [None]:
# URL prefix shared by every Google Sheets workbook; the spreadsheet ID is
# appended directly after it.
google_path = 'https://docs.google.com/spreadsheets/d/'
# URL suffix that requests the workbook as an .xlsx export.
excel_export = '/export?format=xlsx'

### google-sheet-codes:

In [None]:
# Google Sheets spreadsheet IDs, one per workbook. Each ID is later placed
# between `google_path` and `excel_export` to build the download URL.

# ATLAS_CONTENT_FINAL_2023
atlas_all_content = '1ga9sBiBQPPRRxoyLnp13ywzLDPA0g3TmuTloDv_Pgw0'

# atlas_think_tank_founding_dates
atlas_founding_dates = '1rgx5XO2B_TeWsEdDDD-OJtbynEKR0Nb41mRZ-fBcu_g'

# cleaned_atlas_project
atlas_network_files = '1zIrHwGjn_VGvZRYzYC9Lt-hPHup_Dz9JuqVeL7uXFsE'

atlas_main_employer = '131mPB129qhutrQaiPAc48zt7IpBhfjSB'

denial_petitions = '1hlCSW1ZzdPmuwfhQ_Wo_f-ZTVEfz0kHv'

mises = '1oq2joQuBviOOQMXHkezcoG5dAEG9xjbOc5zNuDnixxk'

# Same spreadsheet ID as `mises`; only the tab that is read differs.
mises_authors = mises

In [None]:
# Load plan for every workbook.
# Each entry maps a workbook key to a two-element list:
#   [0] a one-element list holding the Google Sheets spreadsheet ID
#   [1] the list of tab names that shall be imported from that workbook
ALL_dfs = {
    'l_atlas_all_content_analysis': [
        [atlas_all_content],
        [
            'EUROPE_FULL',
            'NORTH_AMERICA',
            'AFRICA_MENA_FULL',
            'OCEANIA_FULL',
            'LATIN_AMERICA_FULL',
        ],
    ],
    'l_atlas_founding_dates': [
        [atlas_founding_dates],
        [
            'africa_mena',
            'latin_america',
            'western_europe',
            'southern_europe',
            'eastern_central_europe',
            'nordic_exsoviet',
            'uk_ireland',
            'north_america',
            'oceania_asia',
        ],
    ],
    'l_network_files': [
        [atlas_network_files],
        [
            'africa_mena',
            'eastern_central_europe',
            'latin_america',
            'nordic_exsoviet',
            'north_america',
            'oceania_asia',
            'uk_ireland',
            'southern_europe',
            'western_europe',
        ],
    ],
    'main_employer': [[atlas_main_employer], ['main_employer_info_170723']],
    'denial_petitions': [[denial_petitions], ['All_petition_signatures']],
    'mises': [[mises], ["Mises_all_individuals"]],
    'mises_authors': [[mises_authors], ["edges_mises"]],
}

In [None]:
# One empty container per workbook key. The loading cell fills each container
# with {tab name: DataFrame}. The comprehension creates a separate dict for
# every key, so the containers never share state.
ALL_dfs_dict = {
    workbook: {}
    for workbook in (
        'l_atlas_all_content_analysis',
        'l_atlas_founding_dates',
        'l_network_files',
        'main_employer',
        'denial_petitions',
        'mises',
        'mises_authors',
    )
}

# Lookup from regional tab name to broad region.
# Note: 'africa_mena' has no entry here.
continents = {
    european_region: 'europe'
    for european_region in (
        'eastern_central_europe',
        'southern_europe',
        'western_europe',
        'nordic_exsoviet',
        'uk_ireland',
    )
}
continents.update(
    {
        'north_america': 'north_america',
        'latin_america': 'latin_america',
        'oceania_asia': 'oceania_asia',
    }
)

### 1.2.  import google sheets as individual dataframes

In [None]:
# Download each workbook once and read its configured tabs into DataFrames,
# storing them as ALL_dfs_dict[workbook key][tab name].
#
# The tab name is taken from `sheet` directly instead of being stored as a
# `.name` attribute on the DataFrame: attribute assignment on a DataFrame
# writes into a column when a column of that name exists, so a tab containing
# a 'name' column would have had that column overwritten.

# Tabs that are stored as-is, without the REGION_SHEET columns.
# NOTE(review): 'edges_mises' is not listed, so it receives REGION_SHEET
# columns like the regional tabs — confirm whether that is intended.
sheets_without_region = {
    'main_employer_info_170723',
    'All_petition_signatures',
    'Mises_all_individuals',
}

for key, val in ALL_dfs.items():
    # val[0] is a one-element list holding the spreadsheet ID, val[1] the tab names.
    # The context manager closes the workbook handle once all tabs are read.
    with pd.ExcelFile(f"{google_path}{val[0][0]}{excel_export}") as xls:
        print('NEXT SHEET:', key)
        for sheet in val[1]:
            file = pd.read_excel(xls, sheet, header=0)
            if sheet in sheets_without_region:
                print(sheet)
            else:
                # Tag every row with the tab it came from and its broad region;
                # tab names missing from `continents` are kept unchanged by replace().
                file['REGION_SHEET'] = sheet
                file['REGION_SHEET_BROAD'] = file['REGION_SHEET'].replace(continents)
                print(sheet, ':  Nr. of rows:', len(file))
            ALL_dfs_dict[key][sheet] = file

In [None]:
# Replace each {tab name: DataFrame} container with one DataFrame that stacks
# all of its tabs (fresh 0..n-1 index).
# Entries that are no longer dicts are skipped, so re-running this cell is
# harmless; without the guard a second run would call `.values()` on a
# DataFrame and raise a TypeError.
for key, val in ALL_dfs_dict.items():
    if not isinstance(val, dict):
        continue  # already combined by an earlier run of this cell
    combined_df = pd.concat(list(val.values()), ignore_index=True)
    # Assigning to an existing key does not resize the dict, so it is safe
    # to do while iterating over it.
    ALL_dfs_dict[key] = combined_df

In [None]:
#ALL_dfs_dict['l_atlas_all_content_analysis']

## Overviews of missing values:

In [None]:
# This concerns missing values in the content file:

In [None]:
def count_missing_by_region(frame, column, label):
    """Count the missing values of one column per REGION_SHEET.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that contains `column` and a 'REGION_SHEET' column.
    column : str
        Name of the column whose null entries are counted.
    label : str
        Name given to the resulting count column.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'REGION_SHEET' and `label` (integer null counts).
    """
    return (
        frame[column]
        .isnull()
        .groupby(frame['REGION_SHEET'])
        .sum()
        .astype(int)
        .reset_index(name=label)
    )


content_sheet = ALL_dfs_dict['l_atlas_all_content_analysis']

# Authors
df_empty_authors = count_missing_by_region(content_sheet, 'authors', 'empty_author_fields')
# Years
df_empty_years = count_missing_by_region(content_sheet, 'year', 'empty_years')
# Hot-Topics
df_empty_hot_topics = count_missing_by_region(content_sheet, 'new_hot_topics', 'empty_hot_topics')
# Think tanks
df_empty_thinktanks = count_missing_by_region(content_sheet, 'thinktank', 'empty_thinktanks')

# Concatenate into one overview indexed by REGION_SHEET, one column per check.
content_empty_overview = pd.concat(
    [
        counts.set_index('REGION_SHEET')
        for counts in (
            df_empty_authors,
            df_empty_hot_topics,
            df_empty_years,
            df_empty_thinktanks,
        )
    ],
    axis=1,
)

In [None]:
# Display the per-region counts of missing values in the content file.
content_empty_overview