In [53]:
import pandas as pd
from collections import defaultdict
import csv

In [54]:
#before reading in the data, we need to make sure there are no whitespaces
#leading or trailing the text. 

#Coverters: Dict of functions for converting values in certain columns. Keys can 
#either be integers or column labels 


def strip(text):
    try:
        return text.strip()
    except AttributeError:
        return text
    
def stripReplace(text):
    try:
        return text.strip().replace('.','').replace('0', 'none')
    except AttributeError:
        return text    

data = pd.read_csv('faculty.csv', sep =',',
                    names = ["name", "degree", "title", "email"],
                    header = 0,
                    converters = {'name' : strip,
                                  'degree' : stripReplace,
                                  'title' : strip,
                                  'email' : strip})
data.head()

Unnamed: 0,name,degree,title,email
0,Scarlett L. Bellamy,ScD,Associate Professor of Biostatistics,bellamys@mail.med.upenn.edu
1,Warren B. Bilker,PhD,Professor of Biostatistics,warren@upenn.edu
2,Matthew W Bryan,PhD,Assistant Professor of Biostatistics,bryanma@upenn.edu
3,Jinbo Chen,PhD,Associate Professor of Biostatistics,jinboche@upenn.edu
4,Susan S Ellenberg,PhD,Professor of Biostatistics,sellenbe@upenn.edu


In [55]:
degrees = list(data.degree)
Alldegrees = []
for item in degrees:
    Alldegrees.extend(item.split())
    
degree_count = defaultdict(int)
for degree in Alldegrees:
    degree_count[degree] += 1
    
degree_count

defaultdict(int,
            {'BSEd': 1,
             'JD': 1,
             'MA': 1,
             'MD': 1,
             'MPH': 2,
             'MS': 2,
             'PhD': 31,
             'ScD': 6,
             'none': 1})

In [56]:
freqs = pd.DataFrame.from_dict(degree_count, orient='index')
freqs

Unnamed: 0,0
none,1
PhD,31
MD,1
MPH,2
JD,1
BSEd,1
MS,2
ScD,6
MA,1


In [57]:
data.title.values[24] = data.title.values[24].replace(' is ', ' of ')
data.title.value_counts()

Professor of Biostatistics              13
Assistant Professor of Biostatistics    12
Associate Professor of Biostatistics    12
Name: title, dtype: int64

In [58]:
email_list = list(data.email.values)

In [81]:
domains = []
for item in email_list:
    domains.append(item.split('@')[-1])
    
set(domains)

{'cceb.med.upenn.edu', 'email.chop.edu', 'mail.med.upenn.edu', 'upenn.edu'}

In [39]:
with open('emails.csv', 'w') as fp:
    for email in data.email.values:
        fp.write('%s\n' % email)

In [94]:
faculty_dict = {}
for index, row in data.iterrows():
    faculty_dict[row['name'].split()[-1]] = row.degree, row.title, row.email
faculty_dict

{'Bellamy': ('ScD',
  'Associate Professor of Biostatistics',
  'bellamys@mail.med.upenn.edu'),
 'Bilker': ('PhD', 'Professor of Biostatistics', 'warren@upenn.edu'),
 'Bryan': ('PhD', 'Assistant Professor of Biostatistics', 'bryanma@upenn.edu'),
 'Chen': ('PhD', 'Associate Professor of Biostatistics', 'jinboche@upenn.edu'),
 'Ellenberg': ('PhD',
  'Professor of Biostatistics',
  'jellenbe@mail.med.upenn.edu'),
 'Feng': ('PhD', 'Assistant Professor of Biostatistics', 'ruifeng@upenn.edu'),
 'French': ('PhD',
  'Associate Professor of Biostatistics',
  'bcfrench@mail.med.upenn.edu'),
 'Gimotty': ('PhD', 'Professor of Biostatistics', 'pgimotty@upenn.edu'),
 'Guo': ('PhD', 'Professor of Biostatistics', 'wguo@mail.med.upenn.edu'),
 'Hsu': ('PhD',
  'Assistant Professor of Biostatistics',
  'hsu9@mail.med.upenn.edu'),
 'Hubbard': ('PhD',
  'Associate Professor of Biostatistics',
  'rhubb@mail.med.upenn.edu'),
 'Hwang': ('PhD',
  'Associate Professor of Biostatistics',
  'whwang@mail.med.upenn

In [89]:
faculty_dict = {}
for index, row in data.iterrows():
    faculty_dict[row['name'].split()[0], row['name'].split()[-1]] = row.degree, row.title, row.email
faculty_dict

{('A.', 'Localio'): ('JD MA MPH MS PhD',
  'Associate Professor of Biostatistics',
  'rlocalio@upenn.edu'),
 ('Alisa', 'Stephens'): ('PhD',
  'Assistant Professor of Biostatistics',
  'alisaste@mail.med.upenn.edu'),
 ('Andrea', 'Troxel'): ('ScD',
  'Professor of Biostatistics',
  'atroxel@mail.med.upenn.edu'),
 ('Benjamin', 'French'): ('PhD',
  'Associate Professor of Biostatistics',
  'bcfrench@mail.med.upenn.edu'),
 ('Dawei', 'Xie'): ('PhD',
  'Assistant Professor of Biostatistics',
  'dxie@upenn.edu'),
 ('Haochang', 'Shou'): ('PhD',
  'Assistant Professor of Biostatistics',
  'hshou@mail.med.upenn.edu'),
 ('Hongzhe', 'Li'): ('PhD', 'Professor of Biostatistics', 'hongzhe@upenn.edu'),
 ('J.', 'Landis'): ('BSEd MS PhD',
  'Professor of Biostatistics',
  'jrlandis@mail.med.upenn.edu'),
 ('Jason', 'Roy'): ('PhD',
  'Associate Professor of Biostatistics',
  'jaroy@mail.med.upenn.edu'),
 ('Jinbo', 'Chen'): ('PhD',
  'Associate Professor of Biostatistics',
  'jinboche@upenn.edu'),
 ('Jonas'

In [102]:
faculty_dict = {}
for index, row in data.iterrows():
    faculty_dict[row['name'].split()[0], row['name'].split()[-1]] = row.degree, row.title, row.email

sorted(faculty_dict.items(), key=lambda key: key[0][1])

[(('Scarlett', 'Bellamy'),
  ('ScD',
   'Associate Professor of Biostatistics',
   'bellamys@mail.med.upenn.edu')),
 (('Warren', 'Bilker'),
  ('PhD', 'Professor of Biostatistics', 'warren@upenn.edu')),
 (('Matthew', 'Bryan'),
  ('PhD', 'Assistant Professor of Biostatistics', 'bryanma@upenn.edu')),
 (('Jinbo', 'Chen'),
  ('PhD', 'Associate Professor of Biostatistics', 'jinboche@upenn.edu')),
 (('Susan', 'Ellenberg'),
  ('PhD', 'Professor of Biostatistics', 'sellenbe@upenn.edu')),
 (('Jonas', 'Ellenberg'),
  ('PhD', 'Professor of Biostatistics', 'jellenbe@mail.med.upenn.edu')),
 (('Rui', 'Feng'),
  ('PhD', 'Assistant Professor of Biostatistics', 'ruifeng@upenn.edu')),
 (('Benjamin', 'French'),
  ('PhD',
   'Associate Professor of Biostatistics',
   'bcfrench@mail.med.upenn.edu')),
 (('Phyllis', 'Gimotty'),
  ('PhD', 'Professor of Biostatistics', 'pgimotty@upenn.edu')),
 (('Wensheng', 'Guo'),
  ('PhD', 'Professor of Biostatistics', 'wguo@mail.med.upenn.edu')),
 (('Yenchih', 'Hsu'),
  ('Ph