# Corpus Analysis from OpenRefine

In [14]:
import pandas as pd
import os
import time
import glob
import requests
from tqdm import tqdm
import datetime

## Step 1. Create folder for the process

In [21]:
# Get the current date in "YYYY-MM-DD" format
current_date = datetime.datetime.now().strftime("%Y-%m-%d")

# Define the folder name based on the current date
folder_name = f"data_OR_{current_date}"

# Get the script's directory (where this code is executed)
script_directory = os.getcwd()

# Create the full path for the new folder in the script's directory
folder_path = os.path.join(script_directory, folder_name)

# Check if the folder already exists and create it if not
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    print(f"Folder '{folder_name}' created in '{script_directory}'")
else:
    print(f"Folder '{folder_name}' already exists in '{script_directory}'")

Folder 'data_OR_2023-09-06' already exists in '/Users/carboni/Documents/UNIGE/pynotebook/OpenRefine_fusion_Local'


## Step 2. Download datasets

### Step 2.1 LOC

#### Online

In [12]:
# Define the URL of the file to download
file_url = 'https://jdp.visualcontagions.net/data/loc.csv'

In [23]:
# Extract the file name from the URL
file_name = os.path.basename(file_url)
file_path = os.path.join(folder_path, file_name)

# Stream the file download with a progress bar
response = requests.get(file_url, stream=True)

# Check if the request was successful
if response.status_code == 200:
    # Get the total file size (in bytes)
    total_size = int(response.headers.get('content-length', 0))

    # Create a progress bar using tqdm
    progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)

    # Open a file for writing
    with open(file_path, 'wb') as file:
        for data in response.iter_content(chunk_size=1024):
            # Write the downloaded data to the file
            file.write(data)

            # Update the progress bar
            progress_bar.update(len(data))

    # Close the progress bar
    progress_bar.close()

    print(f"File '{file_name}' downloaded and saved in '{folder_path}'")
else:
    # Handle errors
    print(f"Download failed with status code {response.status_code}")


100%|██████████| 504M/504M [00:08<00:00, 57.8MB/s] 

File 'loc.csv' downloaded and saved in '/Users/carboni/Documents/UNIGE/pynotebook/OpenRefine_fusion_Local/data_OR_2023-09-06'





In [29]:
loc_dataset = f"{folder_path}/{file_name}"
loc = pd.read_csv(loc_dataset)

#### Local

In [2]:
### it is better to store LOC instead of downloading it each time. 
### Once stored you can uncomment the code 
### (only the part with 1 #)
### and use it to copy the file in the processing folder


import shutil

### Specify the source file path (original location of the CSV file)
### e.g. source_file_path = '/Users/carboni/Documents/UNIGE/LOC.csv'

#source_file_path = 'file_path_of_LOC'

### Specify the new filename for the copied file
# new_filename = 'loc.csv'

### Create the full destination file path by combining the folder and filename
# destination_file_path = os.path.join(destination_folder, new_filename)

### Copy the file from the source to the destination
#shutil.copy(source_file_path, destination_file_path)

#print(f"File '{new_filename}' copied to '{destination_folder}'")


### Step 2.2. Everything Else

**List of the projects from OpenRefine**

In [31]:
projects_dict = {
    '1878246014108': 'Backup_2022_02_03_PDF_cleaned',
    '2374512373252': 'BackupCBT_2022_02_03_IIIF_Only',
    '2332843665824': 'Princeton_Blue_Mountain',
    '1688128007940': 'Journaux_Est-Asiatique',
    '1695291554765': 'Marie',
    '1914921870678': 'Adrien',
    '1695103319397': 'ag',
    '2370932517189': 'ag2',
    '2010224589348': 'Der-Spiegel',
    '2332878816656': 'digiteca',
    '2141702759150': 'l_artiste',
    '2330813936766': 'art_mode',
    '1897293512341': 'BNF_1',
    '1792589433060': 'BNF_2',
    '2216922237200': 'locomotion_automobile',
    '1929627493981': 'Project_Translatio'
}

In [32]:
# Define the API endpoint and parameters
url = 'http://129.194.213.75/command/core/export-rows'
format = 'csv'
engine = '{"facets":[],"mode":"row-based"}'

# Dictionary of project IDs to project names

# the folder is already specified in step 1
#folder_path = 'project_refine'

# Iterate over the projects and their names
for project_id, project_name in projects_dict.items():
    # Set the project parameter
    params = {
        'project': project_id,
        'format': format,
        'engine': engine
    }

    # Specify the file name for each project
    file_name = f'{project_name}.csv'
    file_path = os.path.join(folder_path, file_name)

    # Send the POST request
    response = requests.post(url, params=params)

    # Check the response status and save the CSV data to a file
    if response.status_code == 200:
        # The request was successful, save the CSV data to the file
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"CSV data for '{project_name}' (ID: {project_id}) saved as '{file_name}' in '{folder_path}'")
    else:
        # Handle errors
        print(f"Request for '{project_name}' (ID: {project_id}) failed with status code {response.status_code}")
        print(response.text)


CSV data for 'Backup_2022_02_03_PDF_cleaned' (ID: 1878246014108) saved as 'Backup_2022_02_03_PDF_cleaned.csv' in '/Users/carboni/Documents/UNIGE/pynotebook/OpenRefine_fusion_Local/data_OR_2023-09-06'
CSV data for 'BackupCBT_2022_02_03_IIIF_Only' (ID: 2374512373252) saved as 'BackupCBT_2022_02_03_IIIF_Only.csv' in '/Users/carboni/Documents/UNIGE/pynotebook/OpenRefine_fusion_Local/data_OR_2023-09-06'
CSV data for 'Princeton_Blue_Mountain' (ID: 2332843665824) saved as 'Princeton_Blue_Mountain.csv' in '/Users/carboni/Documents/UNIGE/pynotebook/OpenRefine_fusion_Local/data_OR_2023-09-06'
CSV data for 'Journaux_Est-Asiatique' (ID: 1688128007940) saved as 'Journaux_Est-Asiatique.csv' in '/Users/carboni/Documents/UNIGE/pynotebook/OpenRefine_fusion_Local/data_OR_2023-09-06'
CSV data for 'Marie' (ID: 1695291554765) saved as 'Marie.csv' in '/Users/carboni/Documents/UNIGE/pynotebook/OpenRefine_fusion_Local/data_OR_2023-09-06'
CSV data for 'Adrien' (ID: 1914921870678) saved as 'Adrien.csv' in '/Use

## Step 3. Merge and Analyse

In [64]:
keep_col = ['Media URL','City','Country','wkt', 'normalized_date', 'Title', 'Journal Type']

In [76]:
# List all CSV files in the folder
all_files = glob.glob(folder_path + "/*.csv")

# Iterate over the files and delete those with 'CorpusCombined' in the filename
for filename in all_files:
    if 'CorpusCombined' in filename:
        os.remove(filename)
        print(f"Deleted '{filename}'")

Deleted '/Users/carboni/Documents/UNIGE/pynotebook/OpenRefine_fusion_Local/data_OR_2023-09-06/CorpusCombined_2023-09-06.csv'


### Step 3.1 Create 2 datasets, one with LOC and one without

In [93]:
all_files = glob.glob(folder_path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, delimiter=',', low_memory=False)
    df = df[keep_col]
    li.append(df)

df_merged = pd.concat(li, axis=0, ignore_index=True)

In [94]:
all_files = glob.glob(folder_path + "/*.csv")
li = []

for filename in all_files:
    # Skip processing 'loc.csv'
    if 'loc.csv' in filename:
        continue    
    df = pd.read_csv(filename, index_col=None, header=0, delimiter=',', low_memory=False)    
    df = df[keep_col]    
    li.append(df)
df_merged_sans_loc = pd.concat(li, axis=0, ignore_index=True)

In [95]:
df_merged_sans_loc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471673 entries, 0 to 471672
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Media URL        471672 non-null  object
 1   City             469653 non-null  object
 2   Country          471619 non-null  object
 3   wkt              471093 non-null  object
 4   normalized_date  471420 non-null  object
 5   Title            471673 non-null  object
 6   Journal Type     328902 non-null  object
dtypes: object(7)
memory usage: 25.2+ MB


In [96]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3536894 entries, 0 to 3536893
Data columns (total 7 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   Media URL        object
 1   City             object
 2   Country          object
 3   wkt              object
 4   normalized_date  object
 5   Title            object
 6   Journal Type     object
dtypes: object(7)
memory usage: 188.9+ MB


### Step 3.2 Save the content of the dataset without LOC

In [98]:
merged_name = f"CorpusCombined_{current_date}.csv"
merged_path = os.path.join(folder_path, merged_name)
df_merged_sans_loc.to_csv(merged_path, index=False)
#df_merged.to_csv(merged_path, index=False)

### Number of Journal and issues

In [99]:
df_merged["Title"].nunique()

4528

In [100]:
df_merged["Media URL"].nunique()

3536673

### Cities and Countries

In [101]:
df_merged["City"].nunique()

1189

In [102]:
df_merged["Country"].nunique()

51

### Journal type

In [42]:
df_merged["Journal Type"].nunique()

83

### Earliest and latest date

In [43]:
df_merged['normalized_date'] =  pd.to_datetime(df_merged['normalized_date'])

In [44]:
df_merged['normalized_date'].min()

Timestamp('1801-12-22 00:00:00')

In [45]:
df_merged['normalized_date'].max()

Timestamp('2021-12-01 00:00:00')