In [27]:
import pandas as pd 

df = pd.read_csv('authors_species_en_wiki.csv')

df.head()

# From https://quarry.wmcloud.org/query/85509, August 2024

Unnamed: 0,Species,Author
0,Aardvark,Lee Daniel Crocker
1,Aardwolf,Lee Daniel Crocker
2,Almond,Lee Daniel Crocker
3,Albertosaurus,Arco Scheepen
4,Agapanthus_africanus,216.99.203.xxx


In [28]:
import requests
import pandas as pd

# Define the SPARQL query for all species
query = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?name ?wikipedia_id ?wikidata_id WHERE {
  ?wikidata_id wdt:P31 wd:Q16521 .
  ?wikidata_id wdt:P105 wd:Q7432 .
  ?wikipedia_id schema:about ?wikidata_id.
  ?wikipedia_id schema:name ?name . 
  ?wikipedia_id schema:inLanguage "en" .
  FILTER(CONTAINS(STR(?wikipedia_id), "wikipedia"))
}
"""

# Set up the request parameters
url = "https://qlever.cs.uni-freiburg.de/api/wikidata"
params = {
    'query': query
}

# Send the request
response = requests.get(url, params=params)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    result = response.json()
    
    # Convert to DataFrame
    if 'results' in result and 'bindings' in result['results']:
        rows = []
        for binding in result['results']['bindings']:
            row = {}
            for key, value in binding.items():
                row[key] = value['value']
            rows.append(row)
        
        wikidata_df = pd.DataFrame(rows)
        print("Query successful! Results:")
        print(wikidata_df.head())
        
        # Extract species name from Wikipedia URL
        wikidata_df['species_name'] = wikidata_df['wikipedia_id'].apply(
            lambda x: x.split('/')[-1] if '/' in x else x
        )
    else:
        print("No results found in the response")
else:
    print(f"Query failed with status code {response.status_code}")
    print(response.text)

Query successful! Results:
                      name                                       wikipedia_id  \
0       "Aquifex aeolicus"  https://en.wikipedia.org/wiki/%22Aquifex_aeoli...   
1        "Bufo" scorteccii  https://en.wikipedia.org/wiki/%22Bufo%22_scort...   
2      "Centrolene" azulae  https://en.wikipedia.org/wiki/%22Centrolene%22...   
3  "Centrolene" guanacarum  https://en.wikipedia.org/wiki/%22Centrolene%22...   
4      "Centrolene" medemi  https://en.wikipedia.org/wiki/%22Centrolene%22...   

                               wikidata_id  
0  http://www.wikidata.org/entity/Q4034249  
1   http://www.wikidata.org/entity/Q841369  
2   http://www.wikidata.org/entity/Q859454  
3   http://www.wikidata.org/entity/Q669304  
4   http://www.wikidata.org/entity/Q859505  


In [29]:
# First, let's examine both dataframes to understand their structure
print("Authors dataframe shape:", df.shape)
print("Wikidata dataframe shape:", wikidata_df.shape)

# Check the first few rows of each dataframe
print("\nAuthors dataframe head:")
print(df.head())

print("\nWikidata dataframe head:")
print(wikidata_df.head())

# Check for any potential issues with the join keys
print("\nUnique species in authors dataframe:", df['Species'].nunique())
print("Unique species_name in wikidata dataframe:", wikidata_df['species_name'].nunique())

# Perform the inner join
merged_df = pd.merge(
    df, 
    wikidata_df, 
    left_on='Species', 
    right_on='species_name', 
    how='inner'
)

# Check the result
print("\nMerged dataframe shape:", merged_df.shape)
print("\nMerged dataframe head:")
print(merged_df.head())

# Check how many species were matched
print("\nNumber of species matched:", merged_df['Species'].nunique())

Authors dataframe shape: (329502, 2)
Wikidata dataframe shape: (351757, 4)

Authors dataframe head:
                Species              Author
0              Aardvark  Lee Daniel Crocker
1              Aardwolf  Lee Daniel Crocker
2                Almond  Lee Daniel Crocker
3         Albertosaurus       Arco Scheepen
4  Agapanthus_africanus      216.99.203.xxx

Wikidata dataframe head:
                      name                                       wikipedia_id  \
0       "Aquifex aeolicus"  https://en.wikipedia.org/wiki/%22Aquifex_aeoli...   
1        "Bufo" scorteccii  https://en.wikipedia.org/wiki/%22Bufo%22_scort...   
2      "Centrolene" azulae  https://en.wikipedia.org/wiki/%22Centrolene%22...   
3  "Centrolene" guanacarum  https://en.wikipedia.org/wiki/%22Centrolene%22...   
4      "Centrolene" medemi  https://en.wikipedia.org/wiki/%22Centrolene%22...   

                               wikidata_id                 species_name  
0  http://www.wikidata.org/entity/Q4034249       

KeyboardInterrupt: 

In [9]:
# Query to get Wikidata IDs for bird species
bird_query = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?wikidata_id WHERE {
  ?wikidata_id wdt:P31 wd:Q16521 .
  ?wikidata_id wdt:P105 wd:Q7432 .
  ?wikidata_id wdt:P171+ wd:Q5113
}
"""

# Set up the request parameters
url = "https://qlever.cs.uni-freiburg.de/api/wikidata"
params = {
    'query': bird_query
}

# Send the request
response = requests.get(url, params=params)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    result = response.json()
    
    # Convert to DataFrame
    if 'results' in result and 'bindings' in result['results']:
        rows = []
        for binding in result['results']['bindings']:
            row = {}
            for key, value in binding.items():
                row[key] = value['value']
            rows.append(row)
        
        bird_df = pd.DataFrame(rows)
        print("Bird query successful! Results:")
        print(f"Number of bird species: {len(bird_df)}")
        print(bird_df.head())
        
        # Create a set of bird Wikidata IDs for faster lookup
        bird_wikidata_ids = set(bird_df['wikidata_id'])
    else:
        print("No results found in the response")
else:
    print(f"Query failed with status code {response.status_code}")
    print(response.text)

Bird query successful! Results:
Number of bird species: 21004
                                 wikidata_id
0    http://www.wikidata.org/entity/Q1000262
1    http://www.wikidata.org/entity/Q1000977
2    http://www.wikidata.org/entity/Q1001580
3  http://www.wikidata.org/entity/Q100158068
4    http://www.wikidata.org/entity/Q1001586


In [16]:
# Query to get Wikidata IDs for mammal species (Q7377 is the Wikidata ID for mammals)
mammal_query = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?wikidata_id WHERE {
  ?wikidata_id wdt:P31 wd:Q16521 .
  ?wikidata_id wdt:P105 wd:Q7432 .
  ?wikidata_id wdt:P171+ wd:Q7377
}
"""

# Query to get Wikidata IDs for reptile species (Q10908 is the Wikidata ID for reptiles)
reptile_query = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?wikidata_id WHERE {
  ?wikidata_id wdt:P31 wd:Q16521 .
  ?wikidata_id wdt:P105 wd:Q7432 .
  ?wikidata_id wdt:P171+ wd:Q10908
}
"""

# Query to get Wikidata IDs for fish species (Q127282 is the Wikidata ID for actinopterygii)
fish_query = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?wikidata_id WHERE {
  ?wikidata_id wdt:P31 wd:Q16521 .
  ?wikidata_id wdt:P105 wd:Q7432 .
  ?wikidata_id wdt:P171+ wd:Q127282
}
"""

# Function to execute a query and return a set of Wikidata IDs
def execute_query_and_get_ids(query):
    url = "https://qlever.cs.uni-freiburg.de/api/wikidata"
    params = {'query': query}
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        result = response.json()
        if 'results' in result and 'bindings' in result['results']:
            return {binding['wikidata_id']['value'] for binding in result['results']['bindings']}
    
    return set()

# Execute queries for different taxonomic groups
mammal_wikidata_ids = execute_query_and_get_ids(mammal_query)
reptile_wikidata_ids = execute_query_and_get_ids(reptile_query)
fish_wikidata_ids = execute_query_and_get_ids(fish_query)

# Print the number of species in each taxonomic group
print(f"Number of mammal species: {len(mammal_wikidata_ids)}")
print(f"Number of reptile species: {len(reptile_wikidata_ids)}")
print(f"Number of fish species: {len(fish_wikidata_ids)}")

# Create a dictionary to store taxonomic groups
taxonomic_groups = {
    'Birds': bird_wikidata_ids,
    'Mammals': mammal_wikidata_ids,
    'Reptiles': reptile_wikidata_ids,
    'Bony Fish': fish_wikidata_ids
}

Number of mammal species: 13364
Number of reptile species: 12199
Number of fish species: 60747


In [17]:
# Filter the merged dataframe to identify species in each taxonomic group
# and calculate statistics for top editors

# Function to filter merged_df by taxonomic group
def get_taxonomic_group_data(group_name, wikidata_ids):
    # Filter merged_df to include only species in the taxonomic group
    group_df = merged_df[merged_df['wikidata_id'].isin(wikidata_ids)]
    
    # Calculate top editors for this taxonomic group
    top_editors = group_df['Author'].value_counts().reset_index()
    top_editors.columns = ['Author', 'Count']
    top_editors = top_editors.sort_values('Count', ascending=False).head(20)
    
    return {
        'group_name': group_name,
        'total_species': len(group_df['Species'].unique()),
        'total_edits': len(group_df),
        'top_editors': top_editors.to_dict('records')
    }

# Calculate statistics for each taxonomic group
taxonomic_stats = {}
for group_name, wikidata_ids in taxonomic_groups.items():
    taxonomic_stats[group_name] = get_taxonomic_group_data(group_name, wikidata_ids)

# Print some basic statistics
for group_name, stats in taxonomic_stats.items():
    print(f"\n{group_name}:")
    print(f"  Total species: {stats['total_species']}")
    print(f"  Total edits: {stats['total_edits']}")
    print(f"  Top 5 editors:")
    for i, editor in enumerate(stats['top_editors'][:5]):
        print(f"    {i+1}. {editor['Author']}: {editor['Count']} edits")


Birds:
  Total species: 10091
  Total edits: 10113
  Top 5 editors:
    1. Polbot: 5285 edits
    2. Jimfbleak: 1399 edits
    3. Pvmoutside: 470 edits
    4. Big iron: 192 edits
    5. Stavenn: 135 edits

Mammals:
  Total species: 4847
  Total edits: 4869
  Top 5 editors:
    1. Polbot: 2086 edits
    2. Exlibris: 337 edits
    3. UtherSRG: 115 edits
    4. WolfmanSF: 107 edits
    5. Geekgecko: 79 edits

Reptiles:
  Total species: 6637
  Total edits: 6651
  Top 5 editors:
    1. Polbot: 4829 edits
    2. Darkfrog24: 305 edits
    3. Shyamal: 108 edits
    4. Stevey7788: 102 edits
    5. Micromesistius: 96 edits

Bony Fish:
  Total species: 17591
  Total edits: 17614
  Top 5 editors:
    1. Amit6: 1789 edits
    2. Polbot: 1769 edits
    3. Wilhelmina Will: 1737 edits
    4. Phil Fish: 1218 edits
    5. Lumpsucker: 808 edits


In [39]:
import json
from IPython.display import HTML

# Convert the taxonomic_stats dictionary to a JSON string for use in JavaScript
taxonomic_stats_json = json.dumps(taxonomic_stats)

# Create a dictionary to store sample species for each author in each taxonomic group
author_species_samples = {}

# For each taxonomic group
for group_name, group_stats in taxonomic_stats.items():
    author_species_samples[group_name] = {}
    
    # Get the top editors for this group
    top_editors = [editor['Author'] for editor in group_stats['top_editors']]
    
    # Filter the merged dataframe to include only species in this taxonomic group
    if group_name == 'All Species':
        group_df = merged_df
    else:
        group_df = merged_df[merged_df['wikidata_id'].isin(taxonomic_groups[group_name])]
    
    # For each top editor, get a sample of up to 10 species they edited
    for editor in top_editors:
        editor_species = group_df[group_df['Author'] == editor]['Species'].unique()
        author_species_samples[group_name][editor] = editor_species[:100].tolist()

# Convert the author_species_samples dictionary to a JSON string
author_species_samples_json = json.dumps(author_species_samples)

# Create HTML content with updated styles and structure
html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Wikipedia Species Editors</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1200px;
            margin: 0 auto;
            background-color: white;
            padding: 20px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        .main-content {
            display: flex;
            gap: 20px;
        }
        .left-panel {
            flex: 2;
        }
        .right-panel {
            flex: 1;
            padding: 20px;
            background-color: #f8f9fa;
            border-radius: 8px;
            min-height: 400px;
        }
        h1, h2 {
            color: #333;
            text-align: center;
        }
        .controls {
            margin-bottom: 20px;
            text-align: center;
        }
        select {
            padding: 8px 12px;
            font-size: 16px;
            border-radius: 4px;
            border: 1px solid #ddd;
        }
        .stats-container {
            display: flex;
            justify-content: center;
            gap: 40px;
            margin-bottom: 20px;
        }
        .stats {
            text-align: center;
        }
        .stats-value {
            font-size: 24px;
            font-weight: bold;
            color: #2c3e50;
        }
        .stats-label {
            font-size: 14px;
            color: #7f8c8d;
        }
        .chart-container {
            height: 400px;
            margin-top: 20px;
        }
        .bar {
            fill: #3498db;
            transition: fill 0.3s;
        }
        .bar:hover {
            fill: #2980b9;
        }
        .editor-table {
            width: 100%;
            border-collapse: collapse;
            margin-top: 20px;
        }
        .editor-table th, .editor-table td {
            padding: 8px 12px;
            text-align: left;
            border-bottom: 1px solid #ddd;
        }
        .editor-table th {
            background-color: #f2f2f2;
        }
        .editor-table tr:hover {
            background-color: #f5f5f5;
        }
        .editor-links {
            display: flex;
            gap: 10px;
        }
        .editor-link {
            color: #3498db;
            text-decoration: none;
            padding: 2px 6px;
            border-radius: 3px;
            font-size: 0.9em;
        }
        .editor-link:hover {
            background-color: #3498db;
            color: white;
        }
        .species-list {
            list-style-type: none;
            padding: 0;
            margin: 0;
        }
        .species-list li {
            padding: 8px 0;
            border-bottom: 1px solid #eee;
        }
        .species-list li:last-child {
            border-bottom: none;
        }
        .species-link {
            color: #2c3e50;
            text-decoration: none;
        }
        .species-link:hover {
            color: #3498db;
            text-decoration: underline;
        }
        #selected-editor-info {
            display: none;
        }
        .footer {
            margin-top: 40px;
            padding-top: 20px;
            border-top: 1px solid #eee;
            text-align: center;
            color: #666;
            font-size: 0.9em;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>English Wikipedia Species Article Creators</h1>
        
        <div class="controls">
            <label for="taxonomic-group">Select Taxonomic Group: </label>
            <select id="taxonomic-group">
                <option value="All Species">All Species</option>
                <option value="Birds">Birds</option>
                <option value="Mammals">Mammals</option>
                <option value="Reptiles">Reptiles</option>
                <option value="Bony Fish">Bony Fish</option>
                <option value="Insects">Insects</option>
                <option value="Plants">Plants</option>
            </select>
        </div>
        
        <div class="stats-container">
            <div class="stats">
                <div class="stats-value" id="total-species">-</div>
                <div class="stats-label">Total Species</div>
            </div>

        </div>
        
        <div class="main-content">
            <div class="left-panel">
                <div id="chart" class="chart-container"></div>
                
                <table class="editor-table">
                    <thead>
                        <tr>
                            <th>Rank</th>
                            <th>Editor</th>
                            <th>Links</th>
                            <th>Edits</th>
                            <th>Percentage</th>
                        </tr>
                    </thead>
                    <tbody id="editor-table-body">
                        <!-- Table rows will be populated by JavaScript -->
                    </tbody>
                </table>
            </div>
            
            <div class="right-panel">
                <div id="selected-editor-info">
                    <h2>Created Species Articles (sample)</h2>
                    <h3 id="selected-editor-name"></h3>
                    <ul id="species-list" class="species-list">
                        <!-- Species list will be populated by JavaScript -->
                    </ul>
                </div>
            </div>
        </div>
        
        <div class="footer">
            <p>Data source: English Wikipedia via Quarry (Query 85509), August 2024</p>
            <p>Species taxonomic classification from Wikidata</p>
            <p>Demo dashboard for the Wikimedia Research Fund 2025 application</p>

        </div>
    </div>

    <script src="https://d3js.org/d3.v7.min.js"></script>
    <script>
"""

# Add the JavaScript part
js_content = f"""
        // Parse the taxonomic stats data
        var taxonomicStats = {taxonomic_stats_json};
        
        // Parse the author species samples data
        var authorSpeciesSamples = {author_species_samples_json};
        
        // Function to update the visualization based on the selected taxonomic group
        function updateVisualization() {{
            var selectedGroup = document.getElementById('taxonomic-group').value;
            var groupData = taxonomicStats[selectedGroup];
            
            // Update stats
            document.getElementById('total-species').textContent = groupData.total_species;
            
            // Update chart
            updateChart(groupData.top_editors.slice(0, 10));
            
            // Update table
            updateTable(groupData.top_editors, groupData.total_edits, selectedGroup);
            
            // Hide the species list when changing groups
            document.getElementById('selected-editor-info').style.display = 'none';
        }}
        
        // Function to update the chart
        function updateChart(topEditors) {{
            // Clear previous chart
            d3.select('#chart').html('');
            
            // Set up dimensions
            var margin = {{top: 20, right: 30, bottom: 90, left: 60}};
            var width = document.getElementById('chart').clientWidth - margin.left - margin.right;
            var height = document.getElementById('chart').clientHeight - margin.top - margin.bottom;
            
            // Create SVG
            var svg = d3.select('#chart')
                .append('svg')
                .attr('width', width + margin.left + margin.right)
                .attr('height', height + margin.top + margin.bottom)
                .append('g')
                .attr('transform', 'translate(' + margin.left + ',' + margin.top + ')');
            
            // Create scales
            var x = d3.scaleBand()
                .domain(topEditors.map(function(d) {{ return d.Author; }}))
                .range([0, width])
                .padding(0.2);
            
            var y = d3.scaleLinear()
                .domain([0, d3.max(topEditors, function(d) {{ return d.Count; }})])
                .nice()
                .range([height, 0]);
            
            // Create axes
            svg.append('g')
                .attr('transform', 'translate(0,' + height + ')')
                .call(d3.axisBottom(x))
                .selectAll('text')
                .attr('transform', 'rotate(-45)')
                .style('text-anchor', 'end')
                .attr('dx', '-.8em')
                .attr('dy', '.15em');
            
            svg.append('g')
                .call(d3.axisLeft(y));
            
            // Create tooltip
            var tooltip = d3.select('body')
                .append('div')
                .attr('class', 'tooltip')
                .style('opacity', 0);
            
            // Create bars
            svg.selectAll('.bar')
                .data(topEditors)
                .enter()
                .append('rect')
                .attr('class', 'bar')
                .attr('x', function(d) {{ return x(d.Author); }})
                .attr('y', function(d) {{ return y(d.Count); }})
                .attr('width', x.bandwidth())
                .attr('height', function(d) {{ return height - y(d.Count); }})
                .on('mouseover', function(event, d) {{
                    tooltip.transition()
                        .duration(200)
                        .style('opacity', .9);
                    tooltip.html('<strong>' + d.Author + '</strong><br>' + d.Count + ' edits')
                        .style('left', (event.pageX + 10) + 'px')
                        .style('top', (event.pageY - 28) + 'px');
                }})
                .on('mouseout', function() {{
                    tooltip.transition()
                        .duration(500)
                        .style('opacity', 0);
                }})
                .on('click', function(event, d) {{
                    showSpeciesList(d.Author, document.getElementById('taxonomic-group').value);
                }});
            
            // Add labels
            svg.append('text')
                .attr('transform', 'rotate(-90)')
                .attr('y', 0 - margin.left)
                .attr('x', 0 - (height / 2))
                .attr('dy', '1em')
                .style('text-anchor', 'middle')
                .text('Number of Edits');
        }}
        
        // Function to update the table
        function updateTable(topEditors, totalEdits, selectedGroup) {{
            var tableBody = document.getElementById('editor-table-body');
            tableBody.innerHTML = '';
            
            topEditors.forEach(function(editor, index) {{
                var percentage = ((editor.Count / totalEdits) * 100).toFixed(2);
                var row = document.createElement('tr');
                
                row.innerHTML = 
                    '<td>' + (index + 1) + '</td>' +
                        '<td>' + editor.Author + '</td>' + 
                    '<td class="editor-links">' +
                    
                        '<a href="https://en.wikipedia.org/wiki/User:' + encodeURIComponent(editor.Author) + '" ' +
                        'class="editor-link" target="_blank">Profile</a>' +
                        '<a href="#" class="editor-link" onclick="showSpeciesList(\\'' + editor.Author + '\\', \\'' + selectedGroup + '\\'); return false;">Species</a>' +
                    '</td>' +
                    '<td>' + editor.Count + '</td>' +
                    '<td>' + percentage + '%</td>';
                
                tableBody.appendChild(row);
            }});
        }}
        
        // Function to show the species list
        function showSpeciesList(editorName, taxonomicGroup) {{
            var editorInfo = document.getElementById('selected-editor-info');
            var editorNameElement = document.getElementById('selected-editor-name');
            var speciesList = document.getElementById('species-list');
            
            // Show the editor info section
            editorInfo.style.display = 'block';
            
            // Set the editor name
            editorNameElement.textContent = editorName;
            
            // Clear the species list
            speciesList.innerHTML = '';
            
            // Get the species edited by this editor in the selected taxonomic group
            var speciesSamples = authorSpeciesSamples[taxonomicGroup][editorName] || [];
            
            // Add each species to the list
            if (speciesSamples.length > 0) {{
                speciesSamples.forEach(function(species) {{
                    var listItem = document.createElement('li');
                    var link = document.createElement('a');
                    link.href = 'https://en.wikipedia.org/wiki/' + encodeURIComponent(species);
                    link.className = 'species-link';
                    link.textContent = species;
                    link.target = '_blank';
                    listItem.appendChild(link);
                    speciesList.appendChild(listItem);
                }});
            }} else {{
                var listItem = document.createElement('li');
                listItem.textContent = 'No species data available for this editor.';
                speciesList.appendChild(listItem);
            }}
        }}
        
        // Initialize the visualization
        document.addEventListener('DOMContentLoaded', function() {{
            // Set up event listener for dropdown
            document.getElementById('taxonomic-group').addEventListener('change', updateVisualization);
            
            // Initial update
            updateVisualization();
        }});
"""

# Combine HTML and JavaScript content
full_html_content = html_content + js_content + """
    </script>
</body>
</html>
"""

# Save the HTML content to a file
with open('species_editors_visualization.html', 'w') as f:
    f.write(full_html_content)

# Display a message confirming the file was created
print("HTML visualization file 'species_editors_visualization.html' has been created successfully.")

# Try to display the HTML in the notebook
try:
    display(HTML(full_html_content))
    print("HTML preview displayed above.")
except Exception as e:
    print(f"Could not display HTML preview: {e}")
    print("Please open the HTML file in a web browser to view the interactive visualization.")

HTML visualization file 'species_editors_visualization.html' has been created successfully.


Rank,Editor,Links,Edits,Percentage


HTML preview displayed above.


In [24]:
import os

# Check if the HTML file was created
if os.path.exists('species_editors_visualization.html'):
    file_size = os.path.getsize('species_editors_visualization.html')
    print(f"The HTML file was created successfully. File size: {file_size} bytes.")
    
    # Print instructions for using the HTML file
    print("\nInstructions for using the enhanced visualization:")
    print("1. The HTML file 'species_editors_visualization.html' can be hosted on GitHub Pages.")
    print("2. The visualization now has a two-panel layout:")
    print("   - Left panel: Chart and table of editors")
    print("   - Right panel: List of species edited by selected editor")
    print("3. Each editor in the table has two links:")
    print("   - 'Profile': Opens their Wikipedia user page in a new tab")
    print("   - 'Species': Shows their edited species in the right panel")
    print("4. The species list in the right panel:")
    print("   - Shows up to 10 sample species for the selected editor")
    print("   - Each species name is a clickable link to its Wikipedia article")
    print("   - Updates automatically when selecting different editors")
    print("5. The visualization includes all taxonomic groups:")
    print("   - All Species (combined view)")
    print("   - Birds")
    print("   - Mammals")
    print("   - Reptiles")
    print("   - Fish")
    print("   - Insects")
    print("   - Plants")
    print("6. Interactive features:")
    print("   - Clicking on bars in the chart shows the editor's species")
    print("   - Hovering over bars shows edit count tooltips")
    print("   - Changing taxonomic groups updates all visualizations")
    print("7. Source information:")
    print("   - Data from English Wikipedia via Quarry (Query 85509), August 2024")
    print("   - Species taxonomic classification from Wikidata")
else:
    print("Error: The HTML file was not created.")

The HTML file was created successfully. File size: 53620 bytes.

Instructions for using the enhanced visualization:
1. The HTML file 'species_editors_visualization.html' can be hosted on GitHub Pages.
2. The visualization now has a two-panel layout:
   - Left panel: Chart and table of editors
   - Right panel: List of species edited by selected editor
3. Each editor in the table has two links:
   - 'Profile': Opens their Wikipedia user page in a new tab
   - 'Species': Shows their edited species in the right panel
4. The species list in the right panel:
   - Shows up to 10 sample species for the selected editor
   - Each species name is a clickable link to its Wikipedia article
   - Updates automatically when selecting different editors
5. The visualization includes all taxonomic groups:
   - All Species (combined view)
   - Birds
   - Mammals
   - Reptiles
   - Fish
   - Insects
   - Plants
6. Interactive features:
   - Clicking on bars in the chart shows the editor's species
   - Hov

In [25]:
# Check a sample of the links to make sure they're formatted correctly
sample_editor = list(taxonomic_stats['All Species']['top_editors'])[0]
sample_species = author_species_samples['All Species'][sample_editor['Author']][0]

print("Sample link verification:")
print(f"1. Editor profile link: https://en.wikipedia.org/wiki/User:{sample_editor['Author']}")
print(f"2. Species article link: https://en.wikipedia.org/wiki/{sample_species}")

# Also verify that we have species samples for all top editors
top_editor_count = len(taxonomic_stats['All Species']['top_editors'])
editors_with_samples = len(author_species_samples['All Species'])
print(f"\nVerification of species samples:")
print(f"Number of top editors: {top_editor_count}")
print(f"Number of editors with species samples: {editors_with_samples}")

Sample link verification:
1. Editor profile link: https://en.wikipedia.org/wiki/User:Wilhelmina Will
2. Species article link: https://en.wikipedia.org/wiki/Artemisia_kruhsiana

Verification of species samples:
Number of top editors: 20
Number of editors with species samples: 20


In [26]:
print("Instructions for hosting on GitHub Pages:")
print("1. Create a new repository on GitHub")
print("2. Upload the 'species_editors_visualization.html' file")
print("3. Go to repository Settings > Pages")
print("4. Under 'Source', select 'main' branch")
print("5. The visualization will be available at: https://[username].github.io/[repository-name]/species_editors_visualization.html")
print("\nNote: The visualization includes:")
print("- Interactive taxonomic group selection")
print("- Direct links to Wikipedia user profiles")
print("- Clickable species lists with links to Wikipedia articles")
print("- Complete statistics for all taxonomic groups")
print("- Source attribution and data provenance information")

Instructions for hosting on GitHub Pages:
1. Create a new repository on GitHub
2. Upload the 'species_editors_visualization.html' file
3. Go to repository Settings > Pages
4. Under 'Source', select 'main' branch
5. The visualization will be available at: https://[username].github.io/[repository-name]/species_editors_visualization.html

Note: The visualization includes:
- Interactive taxonomic group selection
- Direct links to Wikipedia user profiles
- Clickable species lists with links to Wikipedia articles
- Complete statistics for all taxonomic groups
- Source attribution and data provenance information


In [23]:
import os

# Check if the HTML file was created
if os.path.exists('species_editors_visualization.html'):
    file_size = os.path.getsize('species_editors_visualization.html')
    print(f"The HTML file was created successfully. File size: {file_size} bytes.")
    
    # Print instructions for using the HTML file
    print("\nInstructions for using the enhanced visualization:")
    print("1. The HTML file 'species_editors_visualization.html' can be hosted on GitHub Pages.")
    print("2. The visualization now has a two-panel layout:")
    print("   - Left panel: Chart and table of editors")
    print("   - Right panel: List of species edited by selected editor")
    print("3. Each editor in the table has two links:")
    print("   - 'Profile': Opens their Wikipedia user page in a new tab")
    print("   - 'Species': Shows their edited species in the right panel")
    print("4. The species list in the right panel:")
    print("   - Shows up to 10 sample species for the selected editor")
    print("   - Each species name is a clickable link to its Wikipedia article")
    print("   - Updates automatically when selecting different editors")
    print("5. The visualization includes all taxonomic groups:")
    print("   - All Species (combined view)")
    print("   - Birds")
    print("   - Mammals")
    print("   - Reptiles")
    print("   - Fish")
    print("   - Insects")
    print("   - Plants")
    print("6. Interactive features:")
    print("   - Clicking on bars in the chart shows the editor's species")
    print("   - Hovering over bars shows edit count tooltips")
    print("   - Changing taxonomic groups updates all visualizations")
else:
    print("Error: The HTML file was not created.")

The HTML file was created successfully. File size: 53620 bytes.

Instructions for using the enhanced visualization:
1. The HTML file 'species_editors_visualization.html' can be hosted on GitHub Pages.
2. The visualization now has a two-panel layout:
   - Left panel: Chart and table of editors
   - Right panel: List of species edited by selected editor
3. Each editor in the table has two links:
   - 'Profile': Opens their Wikipedia user page in a new tab
   - 'Species': Shows their edited species in the right panel
4. The species list in the right panel:
   - Shows up to 10 sample species for the selected editor
   - Each species name is a clickable link to its Wikipedia article
   - Updates automatically when selecting different editors
5. The visualization includes all taxonomic groups:
   - All Species (combined view)
   - Birds
   - Mammals
   - Reptiles
   - Fish
   - Insects
   - Plants
6. Interactive features:
   - Clicking on bars in the chart shows the editor's species
   - Hov

In [21]:
import os

# Check if the HTML file was created
if os.path.exists('species_editors_visualization.html'):
    file_size = os.path.getsize('species_editors_visualization.html')
    print(f"The HTML file was created successfully. File size: {file_size} bytes.")
    
    # Print instructions for using the HTML file
    print("\nInstructions for using the enhanced visualization:")
    print("1. The HTML file 'species_editors_visualization.html' can be hosted on GitHub Pages.")
    print("2. To do this, push the file to a GitHub repository and enable GitHub Pages in the repository settings.")
    print("3. The visualization allows users to select a taxonomic group from the dropdown menu:")
    print("   - All Species (combined stats across all taxonomic groups)")
    print("   - Birds")
    print("   - Mammals")
    print("   - Reptiles")
    print("   - Fish")
    print("   - Insects")
    print("   - Plants")
    print("4. For each taxonomic group, the visualization displays:")
    print("   - Total number of species")
    print("   - Total number of edits")
    print("   - A bar chart showing the top 10 editors by number of edits")
    print("   - A table listing all top editors with their rank, name, number of edits, and percentage of total edits")
    print("5. New interactive features:")
    print("   - Clicking on an editor's name (in the table or chart) opens a modal showing a sample of species they edited")
    print("   - Hovering over bars in the chart shows tooltips with editor information")
    print("   - The 'All Species' option provides a comprehensive view across all taxonomic groups")
else:
    print("Error: The HTML file was not created.")

The HTML file was created successfully. File size: 53620 bytes.

Instructions for using the enhanced visualization:
1. The HTML file 'species_editors_visualization.html' can be hosted on GitHub Pages.
2. To do this, push the file to a GitHub repository and enable GitHub Pages in the repository settings.
3. The visualization allows users to select a taxonomic group from the dropdown menu:
   - All Species (combined stats across all taxonomic groups)
   - Birds
   - Mammals
   - Reptiles
   - Fish
   - Insects
   - Plants
4. For each taxonomic group, the visualization displays:
   - Total number of species
   - Total number of edits
   - A bar chart showing the top 10 editors by number of edits
   - A table listing all top editors with their rank, name, number of edits, and percentage of total edits
5. New interactive features:
   - Clicking on an editor's name (in the table or chart) opens a modal showing a sample of species they edited
   - Hovering over bars in the chart shows tooltip

In [19]:
# Query to get Wikidata IDs for insect species (Q1390 is the Wikidata ID for insects)
insect_query = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?wikidata_id WHERE {
  ?wikidata_id wdt:P31 wd:Q16521 .
  ?wikidata_id wdt:P105 wd:Q7432 .
  ?wikidata_id wdt:P171+ wd:Q1390
}
"""

# Query to get Wikidata IDs for plant species (Q756 is the Wikidata ID for plants)
plant_query = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?wikidata_id WHERE {
  ?wikidata_id wdt:P31 wd:Q16521 .
  ?wikidata_id wdt:P105 wd:Q7432 .
  ?wikidata_id wdt:P171+ wd:Q756
}
"""

# Execute queries for insects and plants
insect_wikidata_ids = execute_query_and_get_ids(insect_query)
plant_wikidata_ids = execute_query_and_get_ids(plant_query)

# Print the number of species in each new taxonomic group
print(f"Number of insect species: {len(insect_wikidata_ids)}")
print(f"Number of plant species: {len(plant_wikidata_ids)}")

# Add the new taxonomic groups to our dictionary
taxonomic_groups['Insects'] = insect_wikidata_ids
taxonomic_groups['Plants'] = plant_wikidata_ids

# Calculate statistics for the new taxonomic groups
for group_name in ['Insects', 'Plants']:
    wikidata_ids = taxonomic_groups[group_name]
    taxonomic_stats[group_name] = get_taxonomic_group_data(group_name, wikidata_ids)

# Add "All Species" group
all_species_data = get_taxonomic_group_data('All Species', set().union(*taxonomic_groups.values()))
taxonomic_stats['All Species'] = all_species_data

# Print some basic statistics for the new groups
for group_name in ['Insects', 'Plants', 'All Species']:
    stats = taxonomic_stats[group_name]
    print(f"\n{group_name}:")
    print(f"  Total species: {stats['total_species']}")
    print(f"  Total edits: {stats['total_edits']}")
    print(f"  Top 5 editors:")
    for i, editor in enumerate(stats['top_editors'][:5]):
        print(f"    {i+1}. {editor['Author']}: {editor['Count']} edits")

Number of insect species: 1238390
Number of plant species: 736623

Insects:
  Total species: 133471
  Total edits: 133502
  Top 5 editors:
    1. Wilhelmina Will: 39087 edits
    2. Ruigeroeland: 38701 edits
    3. Qbugbot: 14280 edits
    4. Thine Antique Pen: 8409 edits
    5. Simuliid: 4839 edits

Plants:
  Total species: 62769
  Total edits: 62837
  Top 5 editors:
    1. Polbot: 9143 edits
    2. IceCreamAntisocial: 4500 edits
    3. Gderrin: 4265 edits
    4. Hughesdarren: 3346 edits
    5. Joseph Laferriere: 2795 edits

All Species:
  Total species: 235406
  Total edits: 235586
  Top 5 editors:
    1. Wilhelmina Will: 41005 edits
    2. Ruigeroeland: 38706 edits
    3. Polbot: 24041 edits
    4. Qbugbot: 14290 edits
    5. Thine Antique Pen: 8475 edits
