In [2]:
import pandas as pd

Sometimes, CSV files contain placeholder values that should be treated as NaN (e.g. '#VALUE!', '#DIV/0!' or '?'). An easy way to replace them while loading is the `na_values` argument of `pd.read_csv`:

In [3]:
# Baseline load with pandas defaults: the '?' placeholders are not recognised
# as missing values, so they stay in the frame as literal '?' entries.
df = pd.read_csv(filepath_or_buffer='risk_factors_cervical_cancer.csv')
print(df)

     Age Number of sexual partners First sexual intercourse  \
0     18                       4.0                     15.0   
1     15                       1.0                     14.0   
2     34                       1.0                        ?   
3     52                       5.0                     16.0   
4     46                       3.0                     21.0   
5     42                       3.0                     23.0   
6     51                       3.0                     17.0   
7     26                       1.0                     26.0   
8     45                       1.0                     20.0   
9     44                       3.0                     15.0   
10    44                       3.0                     26.0   
11    27                       1.0                     17.0   
12    45                       4.0                     14.0   
13    44                       2.0                     25.0   
14    43                       2.0                     

In [4]:
# Reload the same file, this time declaring '?' as a missing-value marker so
# that pandas parses every '?' cell as NaN.
df = pd.read_csv(
    'risk_factors_cervical_cancer.csv',
    na_values=['?'],
)
print(df)

     Age  Number of sexual partners  First sexual intercourse  \
0     18                        4.0                      15.0   
1     15                        1.0                      14.0   
2     34                        1.0                       NaN   
3     52                        5.0                      16.0   
4     46                        3.0                      21.0   
5     42                        3.0                      23.0   
6     51                        3.0                      17.0   
7     26                        1.0                      26.0   
8     45                        1.0                      20.0   
9     44                        3.0                      15.0   
10    44                        3.0                      26.0   
11    27                        1.0                      17.0   
12    45                        4.0                      14.0   
13    44                        2.0                      25.0   
14    43                 

In [5]:
# Take an independent deep copy before adding a column, so `df` itself is left
# unchanged (a plain `new_df = df` would only bind a second name to the same
# object, and the new column would show up in `df` as well).
new_df = df.copy(deep=True)
# Derived column: age minus age at first intercourse; NaN inputs give NaN.
new_df['Years since first encounter'] = new_df['Age'].sub(
    new_df['First sexual intercourse']
)

In [6]:
# Display the augmented frame. The derived 'Years since first encounter'
# column is NaN wherever 'First sexual intercourse' is missing (e.g. row 2).
new_df

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy,Years since first encounter
0,18,4.0,15.0,1.0,0.0,0.000000,0.00,0.0,0.00,0.0,...,,0,0,0,0,0,0,0,0,3.0
1,15,1.0,14.0,1.0,0.0,0.000000,0.00,0.0,0.00,0.0,...,,0,0,0,0,0,0,0,0,1.0
2,34,1.0,,1.0,0.0,0.000000,0.00,0.0,0.00,0.0,...,,0,0,0,0,0,0,0,0,
3,52,5.0,16.0,4.0,1.0,37.000000,37.00,1.0,3.00,0.0,...,,1,0,1,0,0,0,0,0,36.0
4,46,3.0,21.0,4.0,0.0,0.000000,0.00,1.0,15.00,0.0,...,,0,0,0,0,0,0,0,0,25.0
5,42,3.0,23.0,2.0,0.0,0.000000,0.00,0.0,0.00,0.0,...,,0,0,0,0,0,0,0,0,19.0
6,51,3.0,17.0,6.0,1.0,34.000000,3.40,0.0,0.00,1.0,...,,0,0,0,0,1,1,0,1,34.0
7,26,1.0,26.0,3.0,0.0,0.000000,0.00,1.0,2.00,1.0,...,,0,0,0,0,0,0,0,0,0.0
8,45,1.0,20.0,5.0,0.0,0.000000,0.00,0.0,0.00,0.0,...,,1,0,1,1,0,0,0,0,25.0
9,44,3.0,15.0,,1.0,1.266973,2.80,0.0,0.00,,...,,0,0,0,0,0,0,0,0,29.0
