# Data Pre-Processing

### Define Functions

In [1]:
# load setup.py file
%run setup.py
# change number of lines in dataframe
pd.set_option('display.max_rows', None)

In [2]:
def read_csv_files_from_folder(filepath,folder):
    # Step 1: Create a path to all the .csv files in the folder
    csv_path = os.path.join(filepath, folder, "*.csv")

    # Step 2: Use the glob module to get a list of all the .csv files in the folder
    csv_files = glob.glob(csv_path)
    
    # Step 3: Create an empty list to store the dataframes
    df_list = []

    # Step 4: Loop through all the .csv files and append them to the dataframe list
    for file_path in csv_files:
        df = pd.read_csv(file_path,index_col=None,encoding='latin-1')
        df['Conference (short)'] = os.path.basename(file_path)
        df['Conference (short)'] = df['Conference (short)'].str.split('_').str[0]
        df_list.append(df)

    # Step 5: Concatenate all the dataframes in the list into a single dataframe
    final_df = pd.concat(df_list, ignore_index=True)
    
    # Step 6: Delete unnesseary columns
    final_df = final_df.drop(["Field"], axis=1)
    try:
        final_df = final_df.drop(["Key Note Speaker"], axis=1)
    except:
        pass

    # Step 7: Return the final dataframe
    return final_df

def Name_preprocessing(df):
    # First Name
    df['First name'] = df['Full name'].str.split(' ').str[0].str.lower()
    df['First name'] = df["First name"].str.normalize('NFKD')\
       .str.encode('ascii', errors='ignore')\
       .str.decode('utf-8')
    print("Number of rows where the first name only contains abbreviation: ", df[df['First name'].str.contains('^[a-z]\.$')].shape[0])
    print("These rows will be removed")
    df = df[~df['First name'].str.contains('^[a-z]\.$')]
    print("There are now:",df.shape[0],"rows")
       
    # Middel and Last Name
    df['Middle name'] = df['Full name'].str.split(' ').str[1]
    df['Last name (temp)'] = df['Full name'].str.split(' ').str[2]
    df['Last name'] = np.where(df['Last name (temp)'].isnull(), df['Middle name'], df['Last name (temp)'])
    df['Middle name'] = np.where(df['Last name (temp)'].isnull(), np.nan, df['Middle name'])
    df = df.drop(["Last name (temp)"], axis=1)
    return df

def Middle_name_clean(df):
    # Find all rows with different middle names
    row_to_remove = df[df.duplicated(['First name','Last name'], keep=False)].sort_values(['First name','Last name']).drop_duplicates(subset=['First name','Middle name','Last name'], keep=False).drop_duplicates(subset=['First name','Last name'], keep=False)
    rows_to_change = df[df.duplicated(['First name','Last name'], keep=False)].sort_values(['First name','Last name']).drop_duplicates(subset=['First name','Middle name','Last name'], keep=False)
    rows_to_change = rows_to_change[~rows_to_change.isin(row_to_remove)].dropna(how='all').sort_values(['First name','Last name','Middle name'])

    for i in range(len(rows_to_change)-1):
        if (rows_to_change.iloc[i]['First name'] == rows_to_change.iloc[i+1]['First name']) and (rows_to_change.iloc[i]['Last name'] == rows_to_change.iloc[i+1]['Last name']):
            if pd.isnull(rows_to_change.iloc[i+1]['Middle name']):
                rows_to_change.iloc[i+1,rows_to_change.columns.get_loc('Middle name')] = rows_to_change.iloc[i]['Middle name']
    
    # Overwrite the Middle name
    df.loc[rows_to_change.index,'Middle name'] = rows_to_change['Middle name']
    
    df['First name'] = df['First name'].str.capitalize()

    # if middle name is nan then combine first and last name with space between, otherwise if middle name is not nan then combine first, middle and last name with space between
    df['Full name'] = np.where(df['Middle name'].isnull(), df['First name'] + ' ' + df['Last name'], df['First name'] + ' ' + df['Middle name'] + ' ' + df['Last name'])
    return df



#############################################
# Gender Data
#############################################
name_df = pd.read_csv('Dimension Tables/wgnd_2_0_name-gender-code.csv')
# Keep only US or DK names
name_df = name_df[(name_df['code'] == 'US') | (name_df['code'] == 'DK')]
# drop duplicates
name_df = name_df.drop_duplicates(subset=['name',"gender"], keep='first')
# for all duplicated value keep the row with the max "wgt" value
name_df = name_df.sort_values('wgt', ascending=False).drop_duplicates(subset=['name'], keep='first')

print("Number of rows:", len(name_df))
name_df.head()

Number of rows: 97795


Unnamed: 0,name,code,gender,wgt
758,aaban,US,M,1.0
1857914,lexxy,US,F,1.0
1857667,lexia,US,F,1.0
1857563,lexi,US,F,1.0
1857549,lexey,US,F,1.0


## Key Note Speakers

#### Load and combine all .csv files in the folder

In [14]:
df = read_csv_files_from_folder(filepath=filepath,folder='Invited Speakers')
print("Number of rows:", len(df))
df.head()

Number of rows: 2104


Unnamed: 0,Full name,Year,Sex,Conference (short)
0,Atsushi Asada,1995,0,ASPDAC
1,Jim Meadlock,1995,0,ASPDAC
2,John Darringer,1995,0,ASPDAC
3,Tatsuo Izawa,1997,0,ASPDAC
4,Daniel D. Gajski,1997,0,ASPDAC


#### Remove rows

In [15]:
# Only include data after 2003
df = df[(df['Year'] >= 2003) & (df['Year'] <= 2022)]
df = df[~df['Conference (short)'].isin(["ICIP","CVPR"])]
df = df[df['Full name'] != "Wei Wei"]

#### Clean the name columns

In [16]:
df = Name_preprocessing(df)
df.head()

Number of rows where the first name only contains abbreviation:  40
These rows will be removed
There are now: 1488 rows


Unnamed: 0,Full name,Year,Sex,Conference (short),First name,Middle name,Last name
22,Glovanni De WIichcli,2003,0,ASPDAC,glovanni,De,WIichcli
23,Tadahiro Ohm,2003,0,ASPDAC,tadahiro,,Ohm
24,Ycrvant Zorian,2003,0,ASPDAC,ycrvant,,Zorian
25,Gary L. Baldwi,2004,0,ASPDAC,gary,L.,Baldwi
26,Rudy Lauwereins,2004,0,ASPDAC,rudy,,Lauwereins


#### Add gender data

In [17]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code', 'wgt'], axis=1)

print("Number of rows with null value in gender:", df["gender"].isnull().sum())

# Drop all rows with nan value in gender column #TODO
df = df.dropna(subset=['gender'])
print("Number of rows after removal of unknown genders:", len(df))

df.head()

Number of rows with null value in gender: 154
Number of rows after removal of unknown genders: 1334


Unnamed: 0,Full name,Year,Sex,Conference (short),First name,Middle name,Last name,gender
3,Gary L. Baldwi,2004,0,ASPDAC,gary,L.,Baldwi,M
4,Rudy Lauwereins,2004,0,ASPDAC,rudy,,Lauwereins,M
5,Rajeev Madhavan,2005,0,ASPDAC,rajeev,,Madhavan,M
6,Jan M. Rabaey,2005,0,ASPDAC,jan,M.,Rabaey,M
8,Alberto Sangiovanni-Vincentelli,2006,0,ASPDAC,alberto,,Sangiovanni-Vincentelli,M


##### Clean up gender columns

In [18]:
df['Sex'].replace(np.nan, 'unknown', inplace=True)
df['Sex'].replace('x','unknown',inplace=True)
df['Sex'].replace('0','M',inplace=True)
df['Sex'].replace(0,'M',inplace=True)
df['Sex'].replace('1','F',inplace=True)
df['Sex'].replace(1,'F',inplace=True)

In [19]:
print('Number of rows where "Sex" is F and "gender" is M:',len(np.where((df['Sex'] == 'F') & (df['gender']=='M'))[0]))
print('Number of rows where "Sex" is M and "gender" is F:',len(np.where((df['Sex'] == 'M') & (df['gender']=='F'))[0]))

# Overwrite the value in column 'gender' with the 'Sex' column
df.loc[df['Sex'] == 'F', 'gender'] = 'F'  
df.loc[df['Sex'] == 'M', 'gender'] = 'M'   

# Drop unnecessary columns
df = df.drop(['Sex'],axis=1)

Number of rows where "Sex" is F and "gender" is M: 2
Number of rows where "Sex" is M and "gender" is F: 6


#### Fix unique names

In [20]:
print("Before: There are ",df['Full name'].nunique()," unique names")
df = Middle_name_clean(df)
print("After: There are ",df['Full name'].nunique()," unique names")

Before: There are  1136  unique names
After: There are  1124  unique names


#### Save factTable

In [22]:
df.to_csv(os.path.join(filepath,"factInvited_init.csv"), index=False)

## Proceedings

#### Load and combine all .csv files in the folder

In [25]:
# Append all files in Data folder as a datafrom, and add a column for the file name without the extension and folder name
df = read_csv_files_from_folder(filepath=filepath,folder='Proceedings')
print("Number of rows:", len(df))
df.tail()

Number of rows: 466168


Unnamed: 0,Full name,Year,Conference (short)
466163,ZoltÃ¡n Kmetty,2019.0,ic2s2
466164,Zoya Khan,2019.0,ic2s2
466165,Zuzana Sasovova,2019.0,ic2s2
466166,Ãkos Jakobi,2019.0,ic2s2
466167,ÃaÄrÄ± Yoltar,2019.0,ic2s2


#### Remove rows

In [26]:
# Only include data after 2003
df = df[(df['Year'] >= 2003) & (df['Year'] <= 2022)]
df = df[~df['Conference (short)'].isin(["ICIP","CVPR"])]

#### Clean the names

In [27]:
df = Name_preprocessing(df)
df.head()

Number of rows where the first name only contains abbreviation:  1828
These rows will be removed
There are now: 275118 rows


Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name
2405,Hiroto Yasuura,2003.0,ASPDAC,hiroto,,Yasuura
2406,Yazdan Aghaghiri,2003.0,ASPDAC,yazdan,,Aghaghiri
2407,Farzan Fallah,2003.0,ASPDAC,farzan,,Fallah
2408,Massoud Pedram,2003.0,ASPDAC,massoud,,Pedram
2409,Satoshi Komatsu,2003.0,ASPDAC,satoshi,,Komatsu


#### Add gender data

In [5]:
# merge df and name_df by "First name" and "name" and include "Gender column"
df = pd.merge(df, name_df, how='left', left_on='First name', right_on='name')

# Drop unnecessary columns
df = df.drop(['name', 'code', 'wgt'], axis=1)

print("Number of rows with null value in gender:", df["gender"].isnull().sum())

# Drop all rows with nan value in gender column #TODO
df = df.dropna(subset=['gender'])
print("Number of rows after removal of unknown genders:", len(df))

df.head()

Number of rows with null value in gender: 135609
Number of rows after removal of unknown genders: 324479


Unnamed: 0,Full name,Year,Conference (short),First name,Middle name,Last name,gender
0,Isao Shirakawa,1995.0,ASPDAC,isao,,Shirakawa,M
5,TingTing Hwang,1995.0,ASPDAC,tingting,,Hwang,F
6,Sanjay Dhar,1995.0,ASPDAC,sanjay,,Dhar,M
7,Dave J. Gurney,1995.0,ASPDAC,dave,J.,Gurney,M
8,Michel Thill,1995.0,ASPDAC,michel,,Thill,M


#### Fix Unique names

In [6]:
print("Before: There are ",df['Full name'].nunique()," unique names")
df = Middle_name_clean(df)
print("After: There are ",df['Full name'].nunique()," unique names")

Before: There are  107340  unique names
After: There are  106655  unique names


#### Save factTable

In [7]:
df.to_csv(os.path.join(filepath,"factProceedings_init.csv"), index=False)

## Add Columns
NB: Do not complete this step until after the Get_Paper_info.ipynb has been runned

In [3]:
# Load files
df_Invited = pd.read_csv(os.path.join(filepath,'factInvited.csv'))
df_Proceedings = pd.read_csv(os.path.join(filepath,'factProceedings.csv'))

In [31]:
# Remove Wei Wei from df_Invited
df_Invited = df_Invited[df_Invited['Full name'] != "Wei Wei"]

In [7]:
# Add age
df_Invited['Age'] = np.where(df_Invited['First year paper'] == 0, 0, np.where(df_Invited['Year'] - df_Invited['First year paper'] < 0, 0, df_Invited['Year'] - df_Invited['First year paper']))
df_Proceedings['Age'] = np.where(df_Proceedings['First year paper'] == 0, 0, np.where(df_Proceedings['Year'] - df_Proceedings['First year paper'] < 0, 0, df_Proceedings['Year'] - df_Proceedings['First year paper']))

In [5]:
# replace nan values in Year with 2019
df_Proceedings['Year'].replace(np.nan, 2019, inplace=True)

In [8]:
# Change column types to int
def change_column_types_to_int(df, columns):
    for column in columns:
        df[column] = df[column].astype(int)
    return df

df_Invited = change_column_types_to_int(df_Invited,['Year','Paper Count','Max Paper Count','First year paper','Age'])
df_Proceedings = change_column_types_to_int(df_Proceedings,['Year','Paper Count','Max Paper Count','First year paper','Age'])

In [9]:
# Add field
df_conference = pd.read_csv(os.path.join(filepath,'Dimension Tables','Conference_Field_Name.csv'))

df_Invited = df_Invited.merge(df_conference, how='left', on='Conference (short)')
df_Proceedings = df_Proceedings.merge(df_conference, how='left', on='Conference (short)')

df_Invited = df_Invited.drop(['Subcategori Topic'],axis=1)
df_Proceedings = df_Proceedings.drop(['Subcategori Topic'],axis=1)

In [10]:
# Add Productivity column
df_Invited['Productivity'] = np.divide(df_Invited['Paper Count'],df_Invited['Age'])
df_Invited['Productivity'] = df_Invited['Productivity'].replace([np.inf, -np.inf,np.nan], 0)

df_Proceedings['Productivity'] = np.divide(df_Proceedings['Paper Count'],df_Proceedings['Age'])
df_Proceedings['Productivity'] = df_Proceedings['Productivity'].replace([np.inf, -np.inf,np.nan], 0)

In [11]:
# Remove conference
df_Invited = df_Invited[~df_Invited['Conference (short)'].isin(["ECCV","ICIP","CVPR"])]
df_Proceedings = df_Proceedings[~df_Proceedings['Conference (short)'].isin(["ECCV","ICIP","CVPR"])]

In [12]:
# Only include data after 2003
df_Invited = df_Invited[df_Invited['Year'] >= 2003]
df_Proceedings = df_Proceedings[df_Proceedings['Year'] >= 2003]

df_Invited = df_Invited[df_Invited['Year'] <= 2022]
df_Proceedings = df_Proceedings[df_Proceedings['Year'] <= 2022]

In [16]:
df_Invited['Conference (short)'].replace("ic2s2", "IC2S2", inplace=True)

In [32]:
# Save tables
df_Invited.to_csv(os.path.join(filepath,'factInvited.csv'), index=False)
df_Proceedings.to_csv(os.path.join(filepath,'factProceedings.csv'), index=False)