In [1]:
import pandas as pd
from pathlib import Path

In [2]:
 # The path to our CSV file
data = Path("Resources/Kaplan-Meier Patient_Graft Survival Data_Clean.xlsx", sheet_name='1 Year')

# Read our Crowdfunding data into pandas
transplant_df = pd.read_excel(data)
transplant_df.head(10)

Unnamed: 0,Transplant Year,Organ,Age Group,N,Graft Survival Rate [95% CI],Patient Survival Rate [95% CI]
0,1988,Heart/Lung,1-5 Years,1,*,*
1,1988,Heart/Lung,11-17 Years,6,*,*
2,1988,Heart/Lung,18-34,31,"54.84 [35.97, 70.26]","54.84 [35.97, 70.26]"
3,1988,Heart/Lung,35-49,26,"42.31 [23.47, 60.02]","42.31 [23.47, 60.02]"
4,1988,Heart/Lung,50-64,6,*,*
5,1988,Heart/Lung,6-10 Years,3,*,*
6,1988,Heart,<1 Year,34,"64.71 [46.30, 78.18]","67.40 [48.88, 80.46]"
7,1988,Heart,1-5 Years,19,"78.95 [53.19, 91.53]","78.95 [53.19, 91.53]"
8,1988,Heart,11-17 Years,41,"78.05 [62.06, 87.92]","80.49 [64.75, 89.73]"
9,1988,Heart,18-34,182,"87.36 [81.60, 91.42]","90.05 [84.68, 93.62]"


In [3]:
# Filter data to only show kidney data
kidney_df = transplant_df[transplant_df['Organ'] == 'Kidney']

# Display the filtered DataFrame
kidney_df.head()

Unnamed: 0,Transplant Year,Organ,Age Group,N,Graft Survival Rate [95% CI],Patient Survival Rate [95% CI]
17,1988,Kidney,<1 Year,7,*,*
18,1988,Kidney,1-5 Years,81,"76.54 [65.73, 84.34]","91.12 [82.28, 95.67]"
19,1988,Kidney,11-17 Years,271,"85.19 [80.37, 88.91]","96.65 [93.66, 98.24]"
20,1988,Kidney,18-34,2076,"86.15 [84.58, 87.57]","95.63 [94.66, 96.43]"
21,1988,Kidney,35-49,2579,"83.17 [81.67, 84.57]","93.57 [92.55, 94.45]"


In [4]:
def remove_square_brackets(text):
    import re
    return re.sub(r'\[.*?\]', '', text)

# Columns to remove brackets from
columns_to_modify = ['Graft Survival Rate [95% CI]', 'Patient Survival Rate [95% CI]']

# Apply the function to the desired columns in kidney_df
kidney_df[columns_to_modify] = kidney_df[columns_to_modify].applymap(remove_square_brackets)

# Print the modified DataFrame
kidney_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Transplant Year,Organ,Age Group,N,Graft Survival Rate [95% CI],Patient Survival Rate [95% CI]
17,1988,Kidney,<1 Year,7,*,*
18,1988,Kidney,1-5 Years,81,76.54,91.12
19,1988,Kidney,11-17 Years,271,85.19,96.65
20,1988,Kidney,18-34,2076,86.15,95.63
21,1988,Kidney,35-49,2579,83.17,93.57


In [5]:
def is_number(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

# List of column names to check for asterisks
columns_to_check = ['Graft Survival Rate [95% CI]', 'Patient Survival Rate [95% CI]']

# Boolean indexing to filter out rows with asterisks in the specified columns
for column in columns_to_check:
    kidney_df = kidney_df[kidney_df[column].apply(is_number)]

# Print the modified DataFrame
kidney_df

Unnamed: 0,Transplant Year,Organ,Age Group,N,Graft Survival Rate [95% CI],Patient Survival Rate [95% CI]
18,1988,Kidney,1-5 Years,81,76.54,91.12
19,1988,Kidney,11-17 Years,271,85.19,96.65
20,1988,Kidney,18-34,2076,86.15,95.63
21,1988,Kidney,35-49,2579,83.17,93.57
22,1988,Kidney,50-64,1654,80.53,88.41
...,...,...,...,...,...,...
1690,2021,Kidney,18-34,2428,97.39,99.36
1691,2021,Kidney,35-49,5159,95.89,98.07
1692,2021,Kidney,50-64,8016,93.24,95.59
1693,2021,Kidney,6-10 Years,127,97.60,100.00


In [6]:
column_values = kidney_df['Transplant Year'].unique()
print(column_values)

[1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001
 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
 2016 2017 2018 2019 2020 2021]


In [7]:
# Export to clean Excel file
kidney_df.to_excel('Resources/cleaned_kidney_data.xlsx', index=False)