In [1]:
import os

import pandas as pd
import gspread
from fuzzywuzzy import fuzz, process

In [2]:
# Path to the service-account JSON key used for authentication.
# It can be overridden through the GSPREAD_SERVICE_ACCOUNT_FILE environment
# variable so the key location does not have to be edited into the notebook;
# when the variable is unset the original file name is used.
SERVICE_ACCOUNT_FILE = os.environ.get(
    'GSPREAD_SERVICE_ACCOUNT_FILE',
    'accesspysheet-bd152702637a.json',
)
# Authenticate with the Google Sheets API
gc = gspread.service_account(filename=SERVICE_ACCOUNT_FILE)

In [3]:
# Full URL of the Google Sheet to analyse (split only for readability;
# the two literals concatenate into the exact original URL)
SHEET_URL = (
    "https://docs.google.com/spreadsheets/d/"
    "1h4HlBY1_vOAuFmWQxTgHRr3075wgfxfroIWmHZm4b98/edit?gid=0#gid=0"
)
# Open the spreadsheet through the authenticated client
sh = gc.open_by_url(SHEET_URL)

In [4]:

# Select the first worksheet
worksheet = sh.get_worksheet(0)

# Extract all data as a list of lists (one inner list per sheet row)
data = worksheet.get_all_values()

# Without a header row there are no column names to build the frame from.
# Fail here with a clear message instead of an opaque IndexError on data[0]
# or a column-less frame that only breaks in a later cell.
if not data or not data[0]:
    raise ValueError(
        "The first worksheet is empty; expected a header row followed by data rows."
    )

# The first row supplies the column names; the remaining rows are the records
header, *rows = data
df = pd.DataFrame(rows, columns=header)



In [5]:
# Parse the 'Submitted at' column into datetimes. Values that do not match
# the given format become NaT rather than raising (errors='coerce').
submitted_raw = df['Submitted at']
df['Submitted at'] = pd.to_datetime(
    submitted_raw,
    errors='coerce',
    format='%Y-%m-%d %H:%M:%S',
)

In [6]:
# Confirm the conversion took effect (expected output: 'datetime64[ns]')
submitted_dtype = df['Submitted at'].dtype
print(submitted_dtype)


datetime64[ns]


In [7]:
# Keep only the first row for each distinct value in the first column
first_column = df.columns[0]
df = df.drop_duplicates(subset=first_column, keep='first')

In [8]:
# Cast the working-title column to str so every value can be compared as text
title_column = 'What is your working project title?'
df[title_column] = df[title_column].astype(str)

In [9]:
def find_fuzzy_duplicates(df, column_index, threshold=90, scorer=None):
    """Return the rows whose value in one column closely matches another row's.

    Every unordered pair of rows is scored exactly once; both rows of any
    pair whose score reaches ``threshold`` are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search.
    column_index : int
        Zero-based position of the column to compare (so ``3`` selects the
        fourth column).
    threshold : int or float, default 90
        Minimum score, inclusive, for two values to count as duplicates.
    scorer : callable, optional
        ``scorer(a, b) -> number`` used to compare two values. Defaults to
        ``fuzz.ratio``.

    Returns
    -------
    pandas.DataFrame
        The flagged rows of ``df``, each appearing once, ordered by their
        first appearance in a matching pair. Empty (same columns) when no
        pair reaches the threshold.
    """
    if scorer is None:
        scorer = fuzz.ratio

    values = list(df.iloc[:, column_index])
    n_rows = len(values)

    # A dict keeps insertion order and uniqueness, so each flagged row
    # position is recorded once, in the order it was first seen.
    flagged = {}
    for i in range(n_rows):
        # Start at i + 1: each unordered pair is compared once and a row is
        # never compared with itself.
        for j in range(i + 1, n_rows):
            if scorer(values[i], values[j]) >= threshold:
                flagged[i] = None
                flagged[j] = None

    # De-duplicate by row position, not by row content: two rows that are
    # identical in every column are still two distinct flagged rows and
    # must both be returned.
    return df.iloc[list(flagged)]

In [10]:
# Flag rows whose 'working title' values score at least 90 against another row.
# Position 3 is the zero-based index of the column being compared.
TITLE_COLUMN_POSITION = 3
duplicate_pairs = find_fuzzy_duplicates(df, TITLE_COLUMN_POSITION, threshold=90)
duplicate_pairs

Unnamed: 0,Submission ID,Respondent ID,Submitted at,What is your working project title?,Project Leader,Other Project Members?,What is your email address?,What is your best contact number?,Which Department do you belong to? (Please choose ONE),What is your staffing role? (Please choose ONE),...,Protocol Required Elements,Protocol Required Elements (Check here to view document and download),New Study Submission Checklist (optional info),New Study Submission Checklist (optional info) (Check here to view document and download),Request Preview to PHI for Trinity Health Network on EPIC (good for data scrubbing prior to IRB approval),Request Preview to PHI for Trinity Health Network on EPIC (good for data scrubbing prior to IRB approval) (Check here to view document and download),Will you submit this project for the upcoming SAMC's Annual Research Poster Day?,Published or submitting for peer reviewed publication?,"Please submit supporting documents (PDF, DOC, PPT) that you are reporting","For verification purposes, please sign below using mouse or touch (mobile)"


# Total Count for Research Day Submission

In [11]:
# Count the rows whose value in the column at position 84 is exactly "YES"
is_yes = df.iloc[:, 84] == "YES"
count_yes = int(is_yes.sum())
print(count_yes)

63


In [19]:
# Columns driving the breakdown, looked up by position:
# 8 = department, 11 and 14 = the two category columns
department_col, activity_col, project_type_col = df.columns[[8, 11, 14]]

# Number of rows for every (department, category, category) combination
grouped_df = (
    df.groupby([department_col, activity_col, project_type_col])
    .size()
    .reset_index(name='Total Count')
)

# Spread the two category columns across the column axis, one row per department
pivot_df = grouped_df.pivot_table(
    values='Total Count',
    index=department_col,
    columns=[activity_col, project_type_col],
    fill_value=0,
)

pivot_df

Please select the category of scholarly activity that you wish to report performed under SAMC GME?,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Research
"Is this project a Quality Improvement (QI), Research, or Case Report (Please choose ONE)",Case Report,QI,Research,Unnamed: 4_level_1
Which Department do you belong to? (Please choose ONE),Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Emergency Medicine,4.0,0.0,3.0,0.0
Family Medicine,6.0,4.0,0.0,0.0
Internal Medicine,24.0,17.0,2.0,0.0
Transitional Medicine,2.0,0.0,0.0,4.0


In [None]:
# Keep only the rows answered "YES" for the research poster day question
poster_day_col = 'Will you submit this project for the upcoming SAMC\'s Annual Research Poster Day?'
answered_yes = df[poster_day_col] == 'YES'
df_yes = df[answered_yes]

# Columns driving the breakdown, looked up by position:
# 8 = department, 11 and 14 = the two category columns
department_col, activity_col, project_type_col = df_yes.columns[[8, 11, 14]]

# Number of rows for every (department, category, category) combination
grouped_df_yes = (
    df_yes.groupby([department_col, activity_col, project_type_col])
    .size()
    .reset_index(name='Total Count')
)

# Spread the two category columns across the column axis, one row per department
pivot_df_yes = grouped_df_yes.pivot_table(
    values='Total Count',
    index=department_col,
    columns=[activity_col, project_type_col],
    fill_value=0,
)

# Row-wise sum gives each department's overall count
pivot_df_yes['Total'] = pivot_df_yes.sum(axis=1)

pivot_df_yes

In [22]:
# Cast every cell to int so the counts display as whole numbers
pivot_df_yes = pivot_df_yes.astype(dtype=int)
pivot_df_yes

Please select the category of scholarly activity that you wish to report performed under SAMC GME?,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Research,Total
"Is this project a Quality Improvement (QI), Research, or Case Report (Please choose ONE)",Case Report,QI,Research,Unnamed: 4_level_1,Unnamed: 5_level_1
Which Department do you belong to? (Please choose ONE),Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Emergency Medicine,4,0,3,0,7
Family Medicine,6,2,0,0,8
Internal Medicine,24,16,2,0,42
Transitional Medicine,2,0,0,4,6
