# Main Folder Level

This notebook runs the current workflow for the years **1816-1826**.

Results are saved in the folder *'Fourth Notebook results'* on Google Drive.

In [None]:
import spacy
import re
import pandas as pd
import os, json
import time
spacy.cli.download("en_core_web_md")
spacy.require_gpu()
import errno
from os import path
# from tqdm import tqdm


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [None]:
# Load the 'en_core_web_md' spaCy pipeline once; the resulting `nlp` object is
# the module-level name that run_one_file calls on each text.
nlp = spacy.load('en_core_web_md')

In [None]:
# Mount Google Drive at /content/drive; the input and output paths used later
# in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Decide whether an article mentions "Chili"/"Chile" as a recognised named
# entity (and therefore plausibly the country) rather than as an ordinary word.
def is_it_a_country(text):
    """Return "Yes" if a relevant entity in `text` names Chili/Chile, else "No".

    `text` must expose `.ents`, an iterable of entities that each have a
    `.label_` and a `.text` attribute (in this notebook, the result of
    calling `nlp` on a string). Only entities labelled NORP, GPE, LOC or
    ORG are considered, and the spelling check is a case-sensitive
    substring test.
    """
    relevant_labels = ("NORP", "GPE", "LOC", "ORG")
    spellings = ("Chili", "Chile")
    mentioned = any(
        ent.label_ in relevant_labels
        and any(spelling in ent.text for spelling in spellings)
        for ent in text.ents
    )
    return "Yes" if mentioned else "No"

# Replacement callback for re.sub. Despite its name, it returns the text
# captured by the pattern's first group converted to LOWER case.
def upper_repl(match):
    """Return capture group 1 of `match`, lower-cased."""
    captured = match.group(1)
    return captured.lower()

In [None]:
# This function processes one JSON file and measures how much of it is NOT
# Chile-related.
def run_one_file(filename):
    """Return the fraction of entries in `filename` without a Chile entity.

    The JSON file is read with pandas and transposed; column 1 of the
    transposed table is used as the text of each entry.
    NOTE(review): assumes column 1 holds each article's text as a string —
    confirm against the JSON layout.

    Relies on the notebook-level names `nlp` (the loaded spaCy pipeline),
    `is_it_a_country` and `upper_repl`.

    Args:
        filename: Path of the JSON file to analyse.

    Returns:
        The share of rows classified "No" by `is_it_a_country`. A value
        below 1 means the file DOES contain Chile-related content. Returns 0
        when no row is classified "No" (for example a file holding a single
        title that is about "Chili").
    """
    # Read a JSON file and make it into a table (dataframe) in pandas.
    dataframe = pd.read_json(filename).T

    # Normalise every spelling of 'chili' / 'chile' - case insensitive - to
    # 'Chili' / 'Chile' so the case-sensitive entity check can find them.
    dataframe = dataframe.replace('(?i)chili', 'Chili', regex=True)
    dataframe = dataframe.replace('(?i)chile', 'Chile', regex=True)

    # Lower-case tokens that merely contain 'Chili' next to an r/d/t (common
    # errors such as chiliren/chiliden/chiliten for "children") so they are not
    # mistaken for the country. The cleaned text must be written back to the
    # column: rebinding a loop variable would leave the dataframe untouched.
    dataframe[1] = dataframe[1].apply(
        lambda data: re.sub(
            r"(\S*[rdt]\S*Chili\S*|\bChili\S*[rdt]\S*)", upper_repl, data
        )
    )

    # Prepare the content of the JSON file for named entity recognition
    dataframe['spacyprep'] = dataframe[1].apply(nlp)

    # Determine if Chile is found in the content
    dataframe['country'] = dataframe['spacyprep'].apply(is_it_a_country)

    # Count the rows without Chile-related content explicitly instead of
    # indexing value_counts() inside a bare `except`, which would also hide
    # unrelated errors.
    total_rows = len(dataframe.index)
    no_rows = int((dataframe['country'] == 'No').sum())
    if no_rows == 0:
        # Every row (possibly the only one) is Chile-related.
        return 0

    # If the ratio is less than 1, there ARE Chile-related entries in the file.
    return no_rows / total_rows


In [None]:

def create_sheet(main, sub, table, output_path):
  """Write `table` to an Excel sheet at `output_path` unless it already exists.

  Args:
    main: Name of the main folder being processed. Kept for interface
      compatibility; the destination directory is derived from `output_path`.
    sub: Label used in the success message (the year folder in this notebook).
    table: Object providing `to_excel(path, index=...)`, e.g. a DataFrame.
    output_path: Full path of the .xlsx file to create.

  Raises:
    OSError: If the destination directory cannot be created.
  """
  # This checks the sheet file itself, not its folder.
  sheet_exists = os.path.exists(output_path)
  print(sheet_exists)

  # Ensure the directory that will actually receive the sheet exists,
  # including missing parents. An already existing directory is fine; any
  # other failure is raised instead of being silently ignored.
  parent_dir = os.path.dirname(output_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  if sheet_exists:
    # Never overwrite a sheet produced by an earlier run.
    print('Sheet already exists. Use non-aggregated version for re-creating sheets')
    print('\n')
  else:
    table.to_excel(output_path, index=True)
    print('Excel sheet for {} successfully created'.format(sub))

        
      





In [None]:
# The piece of code below runs the above functions through every year-level
# folder of one main folder.
# Enter the name of the main folder when prompted.
FIRST_YEAR, LAST_YEAR = 1816, 1826  # inclusive range of years to process

main_folder = input('Enter the main folder to run: ')

folder_path = "/content/drive/My Drive/Notebook Files/{}".format(main_folder)
print(folder_path)
# Keep only sub-folders whose name starts with a year inside the range. Both
# bounds are inclusive; names that do not start with digits (hidden or helper
# folders) are skipped instead of crashing int(). Sorting makes the run order
# deterministic, since os.listdir returns entries in arbitrary order.
years = sorted(
    folder for folder in os.listdir(folder_path)
    if folder[:4].isdecimal() and FIRST_YEAR <= int(folder[:4]) <= LAST_YEAR
)
print (years)

for year in years:

  file_path = folder_path + '/{}'.format(year)
  json_files = [pos_json for pos_json in
              os.listdir(file_path) if pos_json.endswith('.json')]
  print('The folder {} has {} json files'.format(file_path, len(json_files)))

  # Ratio of non-Chile entries for every JSON file, in the order of json_files
  result = [run_one_file(file_path + '/' + json_file) for json_file in json_files]

  # Create a table that shows name of each file and result of that file
  result_dict = {'File Name': json_files, 'Result': result}
  result_table = pd.DataFrame(result_dict)
  # Create a table that shows name and result of files that have Chile content
  # (a result of exactly 1 means no entry in the file mentions Chile)
  yes_result = result_table[result_table['Result'] != 1]
  out = '/content/drive/My Drive/Fourth Notebook results/{}/{}.xlsx'.format(main_folder, year)
  create_sheet(main_folder, year, yes_result, out)


#MCLN 
#LVMR



Enter the main folder to run: MCLN
/content/drive/My Drive/Notebook Files/MCLN
['1825', '1826']
The folder /content/drive/My Drive/Notebook Files/MCLN/1825 has 314 json files
True
Sheet already exists. Use non-aggregated version for re-creating sheets


The folder /content/drive/My Drive/Notebook Files/MCLN/1826 has 312 json files
False
Excel sheet for 1826 successfully created


In [None]:
# `yes_result` is reassigned on every iteration of the year loop, so this shows
# only the table for the last year that was processed.
print (yes_result)

      File Name    Result
0    03_07.json  0.888889
1    02_07.json  0.888889
3    02_10.json  0.800000
4    10_27.json  0.909091
7    05_10.json  0.800000
..          ...       ...
304  04_18.json  0.909091
305  06_07.json  0.944444
307  02_02.json  0.888889
310  03_02.json  0.916667
311  03_18.json  0.909091

[145 rows x 2 columns]


In [None]:
# Final confirmation message; the year range in the text is hard-coded and
# `main_folder` is the name entered at the prompt earlier in the notebook.
print('A run of folder {} for years 1816-1826 is complete! '.format(main_folder))

A run of folder MCLN for years 1816-1826 is complete! 
