In [None]:
import pandas as pd

# Input: a tab-separated file that is read without a header row.
file_path = 'constraint-corrections-oneOf.tsv'

# Column labels, applied positionally to the TSV fields (one label per field).
col_names = [
    "constraint.statement",
    "revision.id.url",
    "subject.t0",
    "predicate.t0",
    "object.t0",
    "follows.symbol",
    "subject.t1",
    "predicate.t1",
    "object.t1",
    "cud.action",
    "V11",
    "V12",
    "V13",
    "V14",
]

# header=None: the first line of the file is data, so the labels above are used.
df = pd.read_csv(
    file_path,
    sep='\t',
    names=col_names,
    header=None,
)

In [None]:
# Preview only the first rows; rendering the whole frame bloats the notebook output.
df.head()

# Extract Users

In [None]:
# Derive lookup keys from the URL-like cells: the trailing digits of the revision
# URL and the trailing P-number of the t0 predicate (each terminated by '>').
# expand=False returns the single capture group directly as a Series.
df['revision_id'] = df['revision.id.url'].str.extract(r".*/(\d+)>", expand=False)
df['property_id'] = df['predicate.t0'].str.extract(r".*/(P\d+)>", expand=False)

# Link to the property page at that specific revision.
property_page_prefix = "https://www.wikidata.org/w/index.php?title=Property:"
df['result_string'] = property_page_prefix + df['property_id'] + "&oldid=" + df['revision_id']

# Placeholder column, created as empty strings.
df['Full_User_URL'] = ''

In [None]:
# Reorder columns: derived columns first, followed by the rest.
# The remaining columns are selected by name rather than by slicing off the last
# four positions, so re-running this cell is a no-op instead of duplicating the
# derived columns and dropping the last four original ones.
leading_columns = ['revision_id', 'property_id', 'result_string', 'Full_User_URL']
new_order = leading_columns + [col for col in df.columns if col not in leading_columns]
df = df[new_order]

In [None]:
# Preview only the first rows; rendering the whole frame bloats the notebook output.
df.head()

In [None]:
import requests
import pandas as pd
from tqdm import tqdm
import time
import numpy as np

# Wikidata API base URL
wikidata_api_url = "https://www.wikidata.org/w/api.php"
output_file = "users_one_of.tsv"

# Seconds to wait on a single API call; without a timeout one stalled
# connection would block the whole run indefinitely.
REQUEST_TIMEOUT_SECONDS = 30
# Number of rows between intermediate backups of the DataFrame.
SAVE_EVERY_N_ROWS = 1000000


def _fetch_revision_user(session, revision_id):
    """Look up the user recorded for one revision via the API.

    Args:
        session: ``requests.Session`` used to send the request (reuses the
            connection across calls).
        revision_id: Value sent as the ``revids`` query parameter.

    Returns:
        The ``user`` field of the first revision found in the response, or
        ``None`` when the request fails, the status is not 200, the body is
        not valid JSON, or no user is present.
    """
    params = {
        'action': 'query',
        'prop': 'revisions',
        'revids': revision_id,
        'rvprop': 'user|timestamp',
        'format': 'json'
    }
    try:
        response = session.get(
            wikidata_api_url, params=params, timeout=REQUEST_TIMEOUT_SECONDS
        )
        if response.status_code != 200:
            return None
        json_data = response.json()
    except (requests.RequestException, ValueError):
        # Best effort: a failed lookup leaves the row unresolved instead of
        # aborting the run and losing everything fetched so far.
        return None

    # Navigate through the JSON response to find the user
    pages = json_data.get('query', {}).get('pages', {})
    for page_data in pages.values():
        revisions = page_data.get('revisions', [])
        if revisions:
            user = revisions[0].get('user')
            if user:
                return user
    return None


unresolved_rows = 0

# Process each row and update the DataFrame.
# Rows are addressed by label 0..len(df)-1, i.e. this relies on the default
# integer index.
with requests.Session() as session:
    for i in tqdm(range(len(df)), desc="Processing rows"):
        current_url = df.loc[i, 'Full_User_URL']
        if not (pd.isna(current_url) or current_url == ''):
            continue  # already filled in

        revision_id = df.loc[i, 'revision_id']
        # A missing revision id cannot be looked up; skip the request.
        user = None if pd.isna(revision_id) else _fetch_revision_user(session, revision_id)

        if user:
            # Construct full user URL and store it on the row
            df.at[i, 'Full_User_URL'] = f"https://www.wikidata.org/wiki/User:{user}"
        else:
            unresolved_rows += 1

        # Optional throttle to avoid overwhelming the server (disabled):
        # if (i+1) % 1000 == 0:
        #     time.sleep(max(1, np.random.normal(3, 1)))

        # Save progress to output file periodically
        if (i + 1) % SAVE_EVERY_N_ROWS == 0:
            print("saving backup")
            df.to_csv(output_file, sep='\t', index=False)

# Final save of the complete DataFrame to the output file
df.to_csv(output_file, sep='\t', index=False)
print("Rows left without a user:", unresolved_rows)
print("Processing complete. Output saved to:", output_file)

In [None]:
import pandas as pd

def analyze_user_urls(df, column_name='Full_User_URL', top_n=20):
    """
    Rank the most frequent values of a user-URL column and compute their share.

    Missing values and empty strings are both treated as "no user" and are
    never ranked. This keeps the result the same whether unresolved rows are
    held as '' or as NaN. The share is still computed against the total number
    of rows, unresolved ones included.

    Args:
        df: DataFrame containing the column to analyze.
        column_name: Name of the column holding the user URLs.
        top_n: Maximum number of users to return.

    Returns:
        A DataFrame indexed by user URL with columns 'Count' (occurrences) and
        'Share' (percentage of all rows), ordered by descending count; or
        ``None`` (after printing an error) when the column does not exist.
    """
    if column_name not in df.columns:
        print(f"Error: Column '{column_name}' not found in DataFrame.")
        return None

    urls = df[column_name]
    total_urls = len(urls)

    # Drop NaN first so the comparison below only ever sees real values.
    resolved = urls.dropna()
    resolved = resolved[resolved != '']
    user_counts = resolved.value_counts()

    top_users = user_counts.head(top_n)
    top_users_df = pd.DataFrame({'Count': top_users})
    top_users_df['Share'] = top_users_df['Count'] / total_urls * 100

    return top_users_df

def print_top_users(df_analysis):
    """
    Print one summary line per user from an analysis table.

    Args:
        df_analysis: DataFrame indexed by user with 'Count' and 'Share'
            columns, or ``None`` (in which case nothing is printed).
    """
    if df_analysis is None:
        return
    print("Top Users Analysis:")
    # Iterate the columns directly: iterrows() would upcast each mixed
    # int/float row to float and print counts as e.g. "3.0".
    for user, count, share in zip(
        df_analysis.index, df_analysis['Count'], df_analysis['Share']
    ):
        print(f"User: {user}, Count: {count}, Share: {share:.2f}%")

# Rank the values of df's 'Full_User_URL' column (top 20 by default) and print
# one summary line per user.

top_users_analysis = analyze_user_urls(df)  # None when the column is missing
print_top_users(top_users_analysis)  # prints nothing when given None