In [5]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Tokenizer and sequence-classification weights are both loaded from the same
# name/path handed to from_pretrained.
model_path = "REA_GenderIdentification_v1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU,
# and report which one was picked.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Running on device: {device}")

# nn.Module.to moves the parameters and returns the same module object.
model = model.to(device)

# Load data
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk. Chunked inference is what raises the mixed-types
# DtypeWarning and can leave one column holding both parsed numbers and raw
# strings, which would then be written back out when the frame is saved.
data_path = "FinalDataFrame.csv"
data = pd.read_csv(data_path, low_memory=False)

# Fail early, with an explicit message, when the name column is absent.
if 'Covered_Recipient_First_Name' not in data.columns:
    raise ValueError("The column 'Covered_Recipient_First_Name' is not found in the dataset.")

# Preprocess column: missing first names become "" (so later string methods
# are safe to call) and every name is lower-cased in place.
data['Covered_Recipient_First_Name'] = data['Covered_Recipient_First_Name'].fillna("").str.lower()

# Define batch size (number of names sent through the model per forward pass)
batch_size = 32

# Classify every *distinct* non-blank name exactly once. The name column repeats
# the same first names many times; running the model per row repeats identical
# work, whereas a per-name lookup also guarantees that one name always receives
# one label regardless of which batch it happened to fall into.
name_series = data['Covered_Recipient_First_Name']
unique_names = [name for name in name_series.unique() if name.strip() != ""]

label_by_name = {}
for start in range(0, len(unique_names), batch_size):
    batch_names = unique_names[start:start + batch_size]

    # Tokenize (padded to the longest name in the batch) and move to device
    inputs = tokenizer(batch_names, return_tensors="pt", padding=True, truncation=True).to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Highest-scoring class per name. Class index 1 is reported as "M" and any
    # other index as "F" -- label order is assumed from the checkpoint;
    # TODO confirm against model.config.id2label.
    batch_predictions = torch.argmax(outputs.logits, dim=1).cpu().tolist()
    for name, prediction in zip(batch_names, batch_predictions):
        label_by_name[name] = "M" if prediction == 1 else "F"

# One entry per row, in row order; blank/whitespace-only names stay None
predictions = [label_by_name.get(name) for name in name_series]

# Add predictions to the DataFrame
data['Gender'] = predictions

# Save the output to a new CSV file
output_path = "FinalDataFrame_with_Gender.csv"
data.to_csv(output_path, index=False)

print(f"Gender predictions saved to {output_path}")


Running on device: cuda


  data = pd.read_csv(data_path)


Gender predictions saved to FinalDataFrame_with_Gender.csv


In [2]:
import pandas as pd

# Reference table A: rows lacking either a first name or a gender are discarded.
data_A = pd.read_csv('US1.csv').dropna(subset=['FirstName', 'Gender'])

# Reference table B: same filtering, with the name held in the 'Name' column.
data_B = pd.read_csv('data.csv').dropna(subset=['Name', 'Gender'])

# Lower-case the name columns only; the Gender values are left as read.
data_A['FirstName'] = data_A['FirstName'].str.lower()
data_B['Name'] = data_B['Name'].str.lower()

# Build the name -> gender lookup. Table A is inserted first and table B second,
# so an entry from B replaces the one from A whenever both contain a name.
name_to_gender = {}
for source_names, source_genders in (
    (data_A['FirstName'], data_A['Gender']),
    (data_B['Name'], data_B['Gender']),
):
    name_to_gender.update(zip(source_names, source_genders))

# Gender prediction function
def predict_gender(name):
    """Return the gender stored in ``name_to_gender`` for ``name``.

    The lookup is done on the lower-cased form of ``name``. A name that has no
    entry in the mapping yields the string 'Unknown'.
    """
    key = name.lower()
    if key in name_to_gender:
        return name_to_gender[key]
    return 'Unknown'

# Read the third dataset and process
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than per chunk, which avoids the mixed-types DtypeWarning and keeps a
# column from ending up as a mix of parsed numbers and strings before it is
# written back out below.
data_C = pd.read_csv('FinalDataFrame.csv', low_memory=False)
# Rows without a first name cannot be looked up, so they are dropped.
data_C = data_C.dropna(subset=['Covered_Recipient_First_Name'])

# Look up each lower-cased first name; names absent from the dictionary are
# labelled 'Unknown' by predict_gender.
data_C['Gender'] = data_C['Covered_Recipient_First_Name'].str.lower().apply(predict_gender)

# Calculate unmatched rate (share of rows still labelled 'Unknown').
# An empty frame reports 0.0 instead of raising ZeroDivisionError.
total_count = len(data_C)
unmatched_count = int((data_C['Gender'] == 'Unknown').sum())
unmatched_rate = unmatched_count / total_count if total_count else 0.0
print("Unrecognized rate:", unmatched_rate)

# Save the results as a new CSV file, including the Gender column
data_C.to_csv('FinalDataFrame_with_Gender.csv', index=False)


  data_C = pd.read_csv('FinalDataFrame.csv')


Unrecognized rate: 0.017446011102844005


In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# The tokenizer and the sequence-classification model are loaded from the same
# name/path.
model_path = "REA_GenderIdentification_v1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Running on device: {device}")

# nn.Module.to returns the module itself after moving its parameters.
model = model.to(device)

# Load both reference tables, discarding rows that lack a name or a gender.
data_A = pd.read_csv('US1.csv').dropna(subset=['FirstName', 'Gender'])
data_B = pd.read_csv('data.csv').dropna(subset=['Name', 'Gender'])

# Normalise the name columns to lower case (Gender values are not modified).
data_A['FirstName'] = data_A['FirstName'].str.lower()
data_B['Name'] = data_B['Name'].str.lower()

# Name -> gender lookup: seeded from table A, then overlaid with table B so
# that B's value wins for any name present in both tables.
name_to_gender = {}
name_to_gender.update(zip(data_A['FirstName'], data_A['Gender']))
name_to_gender.update(zip(data_B['Name'], data_B['Gender']))

# Gender prediction function
def predict_gender(name):
    """Look up ``name`` (case-insensitively) in ``name_to_gender``.

    Returns the mapped gender value, or the string 'Unknown' when the
    lower-cased name is not a key of the mapping.
    """
    lowered = name.lower()
    try:
        return name_to_gender[lowered]
    except KeyError:
        return 'Unknown'

# Read the third dataset and process
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than per chunk, avoiding the mixed-types DtypeWarning and mixed-type
# columns in the frame that is saved at the end of this cell.
data_C = pd.read_csv('FinalDataFrame.csv', low_memory=False)
# Rows without a first name are dropped; the surviving index is therefore
# non-contiguous, which is why only label-based (.loc) write-back is used below.
data_C = data_C.dropna(subset=['Covered_Recipient_First_Name'])

# Convert Covered_Recipient_First_Name to lowercase and predict gender from the
# dictionary first; names it does not contain come back as 'Unknown'.
data_C['Covered_Recipient_First_Name'] = data_C['Covered_Recipient_First_Name'].str.lower()
data_C['Gender'] = data_C['Covered_Recipient_First_Name'].apply(predict_gender)

# Find unmatched rows (independent copy, keeps data_C's index labels)
unmatched_rows = data_C[data_C['Gender'] == 'Unknown'].copy()

# If there are unmatched rows, use the model to predict gender
if len(unmatched_rows) > 0:
    # Define batch size (names per forward pass)
    batch_size = 32

    # Classify each distinct non-blank name once instead of once per row:
    # repeated names would otherwise trigger identical forward passes.
    unmatched_names = unmatched_rows['Covered_Recipient_First_Name']
    names_to_classify = [name for name in unmatched_names.unique() if name.strip() != ""]

    model_gender = {}
    for start in range(0, len(names_to_classify), batch_size):
        batch_names = names_to_classify[start:start + batch_size]

        # Tokenize names (padded to the longest in the batch) and move to device
        inputs = tokenizer(batch_names, return_tensors="pt", padding=True, truncation=True).to(device)

        # Model prediction; gradients are not needed for inference
        with torch.no_grad():
            outputs = model(**inputs)

        # Highest-scoring class per name. Class index 1 is reported as "M" and
        # any other index as "F" -- label order is assumed from the checkpoint;
        # TODO confirm against model.config.id2label.
        batch_predictions = torch.argmax(outputs.logits, dim=1).cpu().tolist()
        for name, prediction in zip(batch_names, batch_predictions):
            model_gender[name] = "M" if prediction == 1 else "F"

    # One label per unmatched row, in row order. Whitespace-only names were
    # never sent to the model and keep the 'Unknown' label.
    unmatched_predictions = [model_gender.get(name, 'Unknown') for name in unmatched_names]

    # Update the Gender column in unmatched rows
    unmatched_rows['Gender'] = unmatched_predictions

    # Write back only the Gender cells of the affected rows, addressed by index
    # label. DataFrame.update would instead realign and rewrite every column of
    # data_C just to change this one.
    data_C.loc[unmatched_rows.index, 'Gender'] = unmatched_predictions

# Calculate unmatched rate (share of rows still labelled 'Unknown').
# An empty frame reports 0.0 instead of raising ZeroDivisionError.
total_count = len(data_C)
unmatched_count = int((data_C['Gender'] == 'Unknown').sum())
unmatched_rate = unmatched_count / total_count if total_count else 0.0
print("Unrecognized rate:", unmatched_rate)

# Save the results as a new CSV file, including the Gender column
output_path = "FinalDataFrame_with_Gender_Predictions.csv"
data_C.to_csv(output_path, index=False)

print(f"Gender predictions saved to {output_path}")


Running on device: cuda


  data = pd.read_csv(data_path)


Gender predictions saved to FinalDataFrame_with_Gender_Predictions.csv


In [4]:
import pandas as pd

# Read the saved predictions. low_memory=False turns off chunked dtype
# inference, which is what produces the mixed-types warning.
newDf = pd.read_csv(
    'FinalDataFrame_with_Gender_Predictions.csv',
    low_memory=False,
)

# Show the first ten rows as plain text.
first_rows = newDf.head(10)
print(first_rows)

        Covered_Recipient_Type Covered_Recipient_First_Name  \
0  Covered Recipient Physician                          ben   
1  Covered Recipient Physician                         sana   
2  Covered Recipient Physician                      tauseef   
3  Covered Recipient Physician                    annabelle   
4  Covered Recipient Physician                    annabelle   
5  Covered Recipient Physician                       robert   
6  Covered Recipient Physician                       thomas   
7  Covered Recipient Physician                        renli   
8  Covered Recipient Physician                       lovrdu   
9  Covered Recipient Physician                       lovrdu   

  Covered_Recipient_Middle_Name Covered_Recipient_Last_Name Recipient_City  \
0                           NaN                         RAD         FRESNO   
1                           NaN                     QURESHI         POMONA   
2                           NaN                     QURESHI    LOS ANGEL

In [3]:
# Collect the column labels of newDf into a plain Python list and print it.
columns = list(newDf.columns)
print(columns)

['Covered_Recipient_Type', 'Covered_Recipient_First_Name', 'Covered_Recipient_Middle_Name', 'Covered_Recipient_Last_Name', 'Recipient_City', 'Recipient_State', 'Recipient_Country', 'Recipient_Province', 'Covered_Recipient_Primary_Type_1', 'Covered_Recipient_Specialty_1', 'Covered_Recipient_License_State_code1', 'Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name', 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name', 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State', 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country', 'Total_Amount_of_Payment_USDollars', 'Date_of_Payment', 'Number_of_Payments_Included_in_Total_Amount', 'Form_of_Payment_or_Transfer_of_Value', 'Nature_of_Payment_or_Transfer_of_Value', 'Physician_Ownership_Indicator', 'Third_Party_Payment_Recipient_Indicator', 'Charity_Indicator', 'Contextual_Information', 'Dispute_Status_for_Publication', 'Related_Product_Indicator', 'Covered_or_Noncovered_Indicator_1', 'Indicate_Drug_or_B

In [6]:
# Columns to show side by side: recipient name, payment amount, assigned gender.
selected_columns = [
    'Covered_Recipient_First_Name',
    'Covered_Recipient_Last_Name',
    'Total_Amount_of_Payment_USDollars',
    'Gender',
]

# Widen the text display so each row is printed on a single line, and cap the
# number of columns rendered per row.
for option_name, option_value in (('display.width', 1000), ('display.max_columns', 10)):
    pd.set_option(option_name, option_value)

# Print the first ten rows restricted to the selected columns.
preview = newDf[selected_columns].head(10)
print(preview)

  Covered_Recipient_First_Name Covered_Recipient_Last_Name  Total_Amount_of_Payment_USDollars Gender
0                          ben                         RAD                              24.16      M
1                         sana                     QURESHI                              22.01      F
2                      tauseef                     QURESHI                              24.34      M
3                    annabelle                      QUIZON                              26.14      F
4                    annabelle                      QUIZON                             108.49      F
5                       robert                       QUINT                              25.75      M
6                       thomas                       QUINN                              29.79      M
7                        renli                        QIAO                              23.22      F
8                       lovrdu                     PYREDDY                              22.