# Corpus from OpenRefine

This notebook downloads and merges all the sources tagged with #corpusVC in OpenRefine.

In [None]:
import pandas as pd
import os
import time
import glob
import requests
from tqdm import tqdm
import datetime
import plotly.express as px
import shutil

## Step 1. Create folder for the process

In [None]:
# Today's date as "YYYY-MM-DD"; it stamps the working folder name and is
# reused later for the merged file name.
current_date = datetime.date.today().isoformat()

# The working folder is named after the run date
folder_name = "data_OR_" + current_date

# The folder is created under the current working directory, i.e. wherever
# the notebook kernel is running from.
script_directory = os.getcwd()
folder_path = os.path.join(script_directory, folder_name)

# Create the folder on the first run of the day and report either way
if os.path.exists(folder_path):
    print(f"Folder '{folder_name}' already exists in '{script_directory}'")
else:
    os.makedirs(folder_path)
    print(f"Folder '{folder_name}' created in '{script_directory}'")

## Step 2. Download datasets

### Step 2.1 LOC

#### Online

In [None]:
# URL of the LOC CSV file; the next cell streams it into the dated working
# folder under the name taken from the last path component ("loc.csv").
file_url = 'https://jdp.visualcontagions.net/data/loc.csv'

In [None]:
# Name the local copy after the last path component of the URL and store it
# inside the dated working folder.
file_name = os.path.basename(file_url)
file_path = os.path.join(folder_path, file_name)

# Stream the body so the file is written chunk by chunk instead of being held
# in memory. The timeout is (connect, read) in seconds: without one, requests
# waits indefinitely on a stalled server. The `with` block releases the
# connection even if the download is interrupted.
with requests.get(file_url, stream=True, timeout=(10, 60)) as response:
    if response.status_code == 200:
        # Total size in bytes for the progress bar; 0 when the server does
        # not send a Content-Length header.
        total_size = int(response.headers.get('content-length', 0))

        # Both the output file and the progress bar are closed on exit,
        # including when an exception interrupts the transfer.
        with open(file_path, 'wb') as file, \
                tqdm(total=total_size, unit='B', unit_scale=True) as progress_bar:
            for data in response.iter_content(chunk_size=1024):
                file.write(data)
                progress_bar.update(len(data))

        print(f"File '{file_name}' downloaded and saved in '{folder_path}'")
    else:
        # Report the failure; nothing is written to disk in this case
        print(f"Download failed with status code {response.status_code}")


#### Local

In [None]:
# Alternative to the online download: use a LOC.csv that is already on disk.
# NOTE(review): absolute, machine-specific path — adjust before running on
# another machine.
loc_source_file = "/Users/carboni/Documents/UNIGE/pynotebook/OpenRefine_fusion_Local/LOC/LOC.csv"
# Moves (does not copy) the file into the working folder, so the file is no
# longer at loc_source_file afterwards.
shutil.move(loc_source_file, folder_path)

### Step 2.2. Everything Else

**List of the projects from OpenRefine**

In [None]:
# Endpoint that returns the metadata of every project on the OpenRefine server
api_url = 'http://129.194.213.75/command/core/get-all-project-metadata'

# Certificate verification is switched off (use with caution).
# NOTE(review): the URL above is plain HTTP, so this flag only matters if the
# endpoint is ever moved to HTTPS.
verify_ssl = False

# A timeout keeps the cell from hanging forever if the server does not answer
response = requests.get(api_url, verify=verify_ssl, timeout=60)

if response.status_code != 200:
    print(f"API request failed with status code: {response.status_code}")
    print(response.text)
    # Stop here: carrying on would either fail on an undefined `api_response`
    # or silently reuse a stale one left over from an earlier run of this cell.
    raise RuntimeError(
        f"OpenRefine metadata request failed with status code {response.status_code}"
    )

# Parse the JSON response; project metadata is keyed by project ID
api_response = response.json()
projects_data = api_response.get('projects', {})

# Keep only the projects tagged 'corpusVC', mapping project ID -> project name.
# `or []` covers projects whose 'tags' entry is missing or null.
projects_dict = {
    project_id: project_info.get('name', '')
    for project_id, project_info in projects_data.items()
    if 'corpusVC' in (project_info.get('tags') or [])
}

print(projects_dict)

In [None]:
# OpenRefine export endpoint and the options sent with every request
url = 'http://129.194.213.75/command/core/export-rows'
# Named `export_format` so the builtin format() is not shadowed for the rest
# of the notebook session.
export_format = 'csv'
# Engine configuration with no facets, in row-based mode
# (presumably this exports every row of the project — confirm against the
# OpenRefine API documentation).
engine = '{"facets":[],"mode":"row-based"}'

# Export every project selected in `projects_dict` (project ID -> name) into
# the working folder created in step 1, one CSV per project.
for project_id, project_name in projects_dict.items():
    params = {
        'project': project_id,
        'format': export_format,
        'engine': engine,
    }

    # Project names are free text: a path separator inside a name would make
    # open() point into a sub-directory that does not exist.
    safe_name = project_name.replace('/', '_').replace('\\', '_')
    file_name = f'{safe_name}.csv'
    file_path = os.path.join(folder_path, file_name)

    # (connect, read) timeout in seconds so one stalled export cannot block
    # the whole loop indefinitely.
    response = requests.post(url, params=params, timeout=(10, 300))

    if response.status_code == 200:
        # The request was successful, save the CSV data to the file
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"CSV data for '{project_name}' (ID: {project_id}) saved as '{file_name}' in '{folder_path}'")
    else:
        # Report the failure and move on to the next project
        print(f"Request for '{project_name}' (ID: {project_id}) failed with status code {response.status_code}")
        print(response.text)


## Step 3. Merge and Analyse

In [None]:
# Columns retained from every source CSV when building the merged datasets.
# Each file is indexed with this list, so a file missing any of these columns
# raises a KeyError during the merge.
keep_col = ['Media URL','City','Country','wkt', 'normalized_date', 'Title', 'Journal Type']

In [None]:
# List all CSV files in the working folder
all_files = glob.glob(os.path.join(folder_path, "*.csv"))

# Remove merged outputs left by an earlier run: the merged file is saved in
# this same folder, so it would otherwise be read back in as a source.
for filename in all_files:
    # Test the file name only: matching on the full path would hit every CSV
    # whenever a parent directory name happens to contain 'CorpusCombined'.
    if 'CorpusCombined' in os.path.basename(filename):
        os.remove(filename)
        print(f"Deleted '{filename}'")

### Step 3.1 Create 2 datasets, one with LOC and one without

In [None]:
# Dataset WITH LOC: merge every CSV found in the working folder

all_files = glob.glob(os.path.join(folder_path, "*.csv"))

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, delimiter=',', low_memory=False)
    # Keep the shared columns and tag each row with the file it came from.
    # assign() returns a new frame instead of writing into a column selection
    # (the pattern that triggers pandas' SettingWithCopyWarning), and
    # os.path.basename handles the platform's path separator.
    df = df[keep_col].assign(source=os.path.basename(filename))
    li.append(df)

# Fail with an explicit message rather than pandas' generic concat error
if not li:
    raise ValueError(f"No CSV files found in '{folder_path}'")

df_merged = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Dataset WITHOUT LOC: merge every CSV in the working folder except the LOC file

all_files = glob.glob(os.path.join(folder_path, "*.csv"))
li = []

for filename in all_files:
    base_name = os.path.basename(filename)
    # Skip the LOC file. The comparison is made on the file name only and
    # ignores case, because the online download is saved as 'loc.csv' while
    # the local copy is named 'LOC.csv'.
    if base_name.lower() == 'loc.csv':
        continue
    df = pd.read_csv(filename, index_col=None, header=0, delimiter=',', low_memory=False)
    # Keep the shared columns and tag each row with its source file;
    # assign() returns a new frame instead of writing into a column selection.
    df = df[keep_col].assign(source=base_name)
    li.append(df)

# Fail with an explicit message rather than pandas' generic concat error
if not li:
    raise ValueError(f"No non-LOC CSV files found in '{folder_path}'")

df_merged_sans_loc = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Write the LOC-free merge to 'combined.csv' without the index column.
# The path is relative, so the file lands in the current working directory
# rather than in the dated folder.
df_merged_sans_loc.to_csv('combined.csv', index=False)

In [None]:
# Lighter version of the LOC-free merge: keep only the first row encountered
# for each distinct 'Title' value.
combined_light = df_merged_sans_loc.drop_duplicates(subset='Title', keep="first")

In [None]:
# Write the de-duplicated dataset to 'combined_light.csv' in the current
# working directory, without the index column.
combined_light.to_csv('combined_light.csv', index=False)

In [None]:
# Print a summary (columns, non-null counts, dtypes, memory usage) of the
# dataset without LOC.
df_merged_sans_loc.info()

In [None]:
# Print the same summary for the dataset that includes LOC
df_merged.info()

### Step 3.2 Save the content of the dataset without LOC

In [None]:
# Save the LOC-free merge inside the dated folder as
# "CorpusCombined_<date>.csv", without the index column.
# (To save the LOC-inclusive dataset instead, write df_merged to the same path.)
merged_name = "CorpusCombined_" + current_date + ".csv"
merged_path = os.path.join(folder_path, merged_name)
df_merged_sans_loc.to_csv(merged_path, index=False)

### Number of journals and issues

In [None]:
# Number of distinct 'Title' values in the merged dataset (LOC included);
# missing values are not counted by nunique() by default.
df_merged["Title"].nunique()

In [None]:
# Number of distinct 'Media URL' values in the merged dataset (LOC included)
df_merged["Media URL"].nunique()

### Cities and Countries

In [None]:
# Number of distinct 'City' values in the merged dataset (LOC included)
df_merged["City"].nunique()

In [None]:
# Number of distinct 'Country' values in the merged dataset (LOC included)
df_merged["Country"].nunique()

In [None]:
# Count how many rows each country has in the merged dataset and turn the
# result into a two-column table.
country_counts = (
    df_merged['Country']
    .value_counts()
    .reset_index()
)
country_counts.columns = ['Country', 'Count']

# Icicle chart with one block per country, sized by its row count
fig = px.icicle(
    country_counts,
    path=['Country'],
    values='Count',
    title='Country Frequency in DataFrame',
)

# Render the chart in the notebook
fig.show()

### Journal type

In [None]:
# Number of distinct 'Journal Type' values in the merged dataset (LOC included)
df_merged["Journal Type"].nunique()

### Earliest and latest date

In [None]:
# Convert 'normalized_date' to pandas datetimes in place so that min()/max()
# below compare dates rather than raw strings.
# NOTE(review): `errors` is left at its default, so a value that cannot be
# parsed raises instead of becoming NaT — confirm this is intended.
df_merged['normalized_date'] =  pd.to_datetime(df_merged['normalized_date'])

In [None]:
# Earliest date in the merged dataset (LOC included)
df_merged['normalized_date'].min()

In [None]:
# Latest date in the merged dataset (LOC included)
df_merged['normalized_date'].max()

In [None]:
# Save the LOC-inclusive dataset (with 'normalized_date' already converted to
# datetimes by the cell above) without the index column.
# NOTE(review): absolute, machine-specific path — adjust before running on
# another machine.
df_merged.to_csv('/Users/carboni/Downloads/merged.csv', index=False)