In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("raw-email.csv",
                usecols=['Subject', 'Body', 'From: (Name)', 'From: (Address)'],
                #index_col='Subject',
                dtype='str')
df = df.rename(columns={'From: (Name)': 'Sender', 'From: (Address)': 'Email'})

# make all lowercase
df.columns = df.columns.str.strip().str.lower()
df = df.apply(lambda x: x.astype(str).str.lower())
#df.index = df.index.str.lower()
df = df.astype('string')
df

Unnamed: 0,subject,body,sender,email
0,"strivescan's ""colleges that change lives"" virt...",join our special event! ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌...,st. john's college admissions,admissions@sjc.edu
1,wooster is coming to you virtually.,connect with us soon. <https://eur06.safeli...,the college of wooster,admissions@wooster.edu
2,experience a bard college at simon's rock class!,"dear madeleine, would you like to experienc...",agabay@simons-rock.edu,agabay@simons-rock.edu
3,faculty relationships run deep,<https://mx.technolutions.net/proxy/k5ih0w...,lafayette college,admissions@lafayette.edu
4,how honors work can transform your college exp...,<http://elink.alaska.edu/o/3/tvrrnu5ez3dpvfe6...,university of alaska fairbanks,uaf-admissions@alaska.edu
...,...,...,...,...
2649,virtual event: religious & spiritual life at h...,join us to learn more about the religious and ...,johns hopkins university,gotojhu@jhu.edu
2650,explore cornell cals,get a glimpse of cornell cals from your comput...,college of agriculture and life sciences admis...,cals_admissions@cornell.edu
2651,looking for a college that will change your life?,ctcl virtual days <https://nam11.safelinks....,st. olaf college,admissions@stolaf.edu
2652,pre-health advantages at rochester,<https://mx.technolutions.net/proxy/teeczf7ia...,university of rochester admissions,admit@admissions.rochester.edu


In [2]:
# Remove all that aren't from an actual college or university
df = df[df.email.str.contains('.edu')]
df

Unnamed: 0,subject,body,sender,email
0,"strivescan's ""colleges that change lives"" virt...",join our special event! ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌...,st. john's college admissions,admissions@sjc.edu
1,wooster is coming to you virtually.,connect with us soon. <https://eur06.safeli...,the college of wooster,admissions@wooster.edu
2,experience a bard college at simon's rock class!,"dear madeleine, would you like to experienc...",agabay@simons-rock.edu,agabay@simons-rock.edu
3,faculty relationships run deep,<https://mx.technolutions.net/proxy/k5ih0w...,lafayette college,admissions@lafayette.edu
4,how honors work can transform your college exp...,<http://elink.alaska.edu/o/3/tvrrnu5ez3dpvfe6...,university of alaska fairbanks,uaf-admissions@alaska.edu
...,...,...,...,...
2649,virtual event: religious & spiritual life at h...,join us to learn more about the religious and ...,johns hopkins university,gotojhu@jhu.edu
2650,explore cornell cals,get a glimpse of cornell cals from your comput...,college of agriculture and life sciences admis...,cals_admissions@cornell.edu
2651,looking for a college that will change your life?,ctcl virtual days <https://nam11.safelinks....,st. olaf college,admissions@stolaf.edu
2652,pre-health advantages at rochester,<https://mx.technolutions.net/proxy/teeczf7ia...,university of rochester admissions,admit@admissions.rochester.edu


In [3]:
import re
s = 'My name is Conrad, and blahblah@admissions.harvard.edu is my email.'
domain = re.search(r"(?<=@)[\w.-_]+(?=.edu)", s)
print(domain.group())

print(df.dtypes)

admissions.harvard
subject    string
body       string
sender     string
email      string
dtype: object


In [11]:
# Get school from email domain
#df['school'] = df['email'].str.extract('(?<=@)([\w.-_]+)(?=.edu)').astype('string')
#print(df['school'].dtype)


# Alternate method for parsing school from email address.
# #1: Reverse email address
# #2: Regex: start after first '.', match all characters up until next '.' or '@'
# #3: Reverse match. 

#df['school'] = re.match(r"(?<=\.)[\w_-]+(?=(\.|@))", df['email'].str[::-1]).group()[::-1]
df['email_rev'] = df['email'].str[::-1]
df['school_rev'] = df['email_rev'].str.extract(pat=r'(?<=\.)([\w_-]+)(?=(\.|@))').astype('string')
df['school'] = df['school_rev'].str[::-1]
df = df.drop(['email_rev', 'school_rev'], axis=1)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['email_rev'] = df['email'].str[::-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['school_rev'] = df['email_rev'].str.extract(pat=r'(?<=\.)([\w_-]+)(?=(\.|@))').astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['school'] = df['school_rev'].str[::-1]


Unnamed: 0,subject,body,sender,email,school
0,"strivescan's ""colleges that change lives"" virt...",join our special event! ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌...,st. john's college admissions,admissions@sjc.edu,sjc
1,wooster is coming to you virtually.,connect with us soon. <https://eur06.safeli...,the college of wooster,admissions@wooster.edu,wooster
2,experience a bard college at simon's rock class!,"dear madeleine, would you like to experienc...",agabay@simons-rock.edu,agabay@simons-rock.edu,simons-rock
3,faculty relationships run deep,<https://mx.technolutions.net/proxy/k5ih0w...,lafayette college,admissions@lafayette.edu,lafayette
4,how honors work can transform your college exp...,<http://elink.alaska.edu/o/3/tvrrnu5ez3dpvfe6...,university of alaska fairbanks,uaf-admissions@alaska.edu,alaska
...,...,...,...,...,...
2649,virtual event: religious & spiritual life at h...,join us to learn more about the religious and ...,johns hopkins university,gotojhu@jhu.edu,jhu
2650,explore cornell cals,get a glimpse of cornell cals from your comput...,college of agriculture and life sciences admis...,cals_admissions@cornell.edu,cornell
2651,looking for a college that will change your life?,ctcl virtual days <https://nam11.safelinks....,st. olaf college,admissions@stolaf.edu,stolaf
2652,pre-health advantages at rochester,<https://mx.technolutions.net/proxy/teeczf7ia...,university of rochester admissions,admit@admissions.rochester.edu,rochester


In [None]:
schools = df.school.unique()
print(schools)

In [None]:
print(len(schools))

In [None]:
schools_df = pd.DataFrame(index=schools)
schools_df.index.name = 'school'
schools_df['num_emails'] = None
schools_df

for school in schools:
    schools_df['num_emails'] = df[df.school == 'school'].shape[0]
schools_df.head(167)

In [None]:
# Alternate method for parsing school from email address.
# hello@admit.uw.edu
txt = 'hello@admit.uw.edu'
# reverse email
# ude.wu.timda@ollhe
txt = txt[::-1]
print(txt)
# regex: parse from 1st '.' to next '.' or '@'
# wu
import re
txt = re.search(r"(?<=\.)[\w_-]+(?=(\.|@))", txt)
print(txt.group())
# reverse string
# uw
txt = txt.group()[::-1]
print(txt)

In [None]:
# Alternate method for parsing school from email address.
# #1: Reverse email address
# #2: Regex: start after first '.', match all characters up until next '.' or '@'
# #3: Reverse match. 
txt = 'hello@admit.uw.edu'
txt = re.search(r"(?<=\.)[\w_-]+(?=(\.|@))", txt[::-1]).group()[::-1]

print(txt)