In [24]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
import spacy
from collections import Counter
import copy
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package punkt to /Users/manoj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/manoj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using the enron.csv file created in the previous notebook ,
we are extracting the emails of only top 5,10,15 authors(sorted according to no of emails per author) for our analysis 
and dropping any mails that have no text.

In [55]:
#Put in the file path to the dataset created from extractingauthors.ipynb
df = pd.read_csv("./enron.csv")
df = df.drop(["Email Folder"], axis=1)
#We need only the top 20 authors ordered by number of emails found in either the
#sent folder or _sent_mail folder

#Add top_authors = df.value_counts(["Folder"])[:X] for the number of authors required
# Change X to 5,10,15 to test with 5, 10, 15 authors
top_authors = df.value_counts(["Folder"])[:5]
df = df.loc[df["Folder"].isin(list(top_authors.index.get_level_values(0)))].drop(["Unnamed: 0"], axis=1).reset_index(drop=True)
df = df[df["Text"]!=" "]
df = df[df["Text"]!="\n"]
df = df.dropna()

In [56]:
df["Folder"].value_counts()

mann-k          8167
kaminski-v      5926
dasovich-j      4805
germany-c       4571
shackleton-s    4003
Name: Folder, dtype: int64

Random Sampling equal number of emails from each author 

In [19]:
def uniform_distribution(samples_per_author, df):
    df3 = pd.DataFrame(columns=["Author", "Folder", "File", "Text", "Raw Text"]) 
    for folder in df["Folder"].value_counts().index:
        df3 = df3.append(df[df["Folder"]==folder].sample(n=samples_per_author), ignore_index=True)
    return df3

In [58]:
# Change the number of samples per author here
df = uniform_distribution(4000, df)
print(df["Folder"].value_counts())
df

mann-k          4000
kaminski-v      4000
dasovich-j      4000
germany-c       4000
shackleton-s    4000
Name: Folder, dtype: int64


Unnamed: 0,Author,Folder,File,Text,Raw Text,Message ID
0,Kay,mann-k,540.0,Thanks thanks thanks\n\n,Message-ID: <21864199.1075846041637.JavaMail.e...,21864199.1075846041637.
1,Kay,mann-k,3623.0,I heard a rumor that there is a new leader for...,Message-ID: <237189.1075846007044.JavaMail.eva...,237189.1075846007044.
2,Kay,mann-k,2175.0,I won't do anything about this until you reach...,Message-ID: <19081360.1075846081129.JavaMail.e...,19081360.1075846081129.
3,Kay,mann-k,513.0,"Heather,\n\nDid you want to send this? Anythi...",Message-ID: <29170173.1075845930936.JavaMail.e...,29170173.1075845930936.
4,Kay,mann-k,3263.0,FYI.\n,Message-ID: <9690619.1075846108468.JavaMail.ev...,9690619.1075846108468.
...,...,...,...,...,...,...
19995,Sara,shackleton-s,1587.0,per my voice mail\n,Message-ID: <8208851.1075844556442.JavaMail.ev...,8208851.1075844556442.
19996,Sara,shackleton-s,812.0,Would someone please provide me with an explan...,Message-ID: <30780824.1075844536582.JavaMail.e...,30780824.1075844536582.
19997,Sara,shackleton-s,5506.0,Please schedule 30 minutes with Laurel to disc...,Message-ID: <22516364.1075844908943.JavaMail.e...,22516364.1075844908943.
19998,Sara,shackleton-s,2077.0,Thanks for the catch! Sara\n,Message-ID: <1832774.1075844568746.JavaMail.ev...,1832774.1075844568746.


In the next three cells , we have written function to extract the stylometric features(a combination of lexical
, structural and syntatic features)of a particular email using regex matching.

In [27]:
#Feature Extraction - Manoj
#extract feature - email length in characters. Exclude all whitespace. 
def feature_email_length_characters(text):
    if type(text) == str:
        text = text.strip()
        text = re.sub(r"\W", "", text)
        length = len(text)
        return length
    else:
        return np.NaN

#extract digit density - ratio of number of digits to number of characters
def feature_digit_density(text):
    if type(text) == str:
        text = text.strip()
        text = re.sub(r"\W", "", text)
        total_length = len(text)
        number_digits = len(re.findall(r"\d", text))
        try:
            return (number_digits/total_length)
        except ZeroDivisionError as e:
            return 0
    if type(text) == float:
        return np.NaN

#extract space density - ratio of space to number of characters
def feature_space_density(text):
    if type(text) == str:
        number_space = len(re.findall(r"[\s\n\t]", text))
        text = text.strip()
        text = re.sub(r"\W", "", text)
        total_length = len(text)
        try:
            return (number_space/total_length)
        except ZeroDivisionError as e:
            return 0
    if type(text) == float:
        return np.NaN

#extract number of paragraphs
def feature_paragraph(text):
    if type(text) == str:
        number_paragraphs = len(re.findall(r"\n\n", text))
        return max(1.0, number_paragraphs)
    if type(text) == float:
        return np.NaN
    
# extract number of sentences in paragraphs
def feature_average_characters_paragraph(text):
    if type(text) == str:
        number_paragraphs = len(re.findall(r"\n\n", text))
        if not number_paragraphs:
            return max(1, len(re.findall(r"[.?!]\W", text)))
        else:
            paragraphs = re.findall(r"(?:.+\n)+\n", text)
            length = 0
            for paragraph in paragraphs:
                length += max(1, len(re.findall(r"[.?!]\W", paragraph)))
            return length/number_paragraphs
    if type(text) == float:
        return np.NaN
    

#extract farewell words
def feature_farewell_words(text):
    if type(text) == str:
        try:
            words = text.split()
            for word in reversed(words):
                if re.search(r"\w+", word):
                    last_word = re.search(r"\w+", word).group().lower()
                    return last_word
                else:
                    continue
            return "\n"
        except IndexError as e:
            return np.NaN
    if type(text) == float:
        return np.NaN

def feature_freq_farewell_words(farewell_words, text):
    if type(text) == str:
        if text in farewell_words:
            return text
        else:
            return "Other"
    if type(text) == float:
        return np.NaN
        

#extract last punctuation
def feature_ending_punctuation(text):
    if type(text) == str:
        if re.search(r"[\!\,\.\?\:\'\"]", text):
            try:
                last_punc = re.findall(r"[\!\,\.\?\:\'\"]", text)[-1]
                return last_punc
            except IndexError as e:
                last_punc = len(re.findall(r"[\!\,\.\?\:\'\"]", text))
                if len == 1:
                    return last_punc[0]
        else:
            return "None"
    if type(text) == float:
        return np.NaN

#extract most common used punctuation in the email
def feature_most_used_punctuation(text):
    if type(text) == str:
        if re.search(r"[\!\,\.\?\:\'\"]", text):
            punc = re.findall(r"[\!\,\.\?\:\'\"]", text)
            most_used_punc = Counter(punc).most_common(1)[0][0]
            return most_used_punc
        else:
            return "None"
    if type(text) == float:
        return np.NaN
    
#extract subjectivity and polarity
def feature_subjectivity(text):
    if type(text) == str:
        blob = TextBlob(text)
        return blob.sentiment.subjectivity
    if type(text) == float:
        return np.NaN

#extract polarity
def feature_polarity(text):
    if type(text) == str:
        blob = TextBlob(text)
        return blob.sentiment.polarity
    if type(text) == float:
        return np.NaN

def feature_most_pos(text):
    if type(text) == str:
        blob = TextBlob(text)
        final_pos = []
        for word, pos in blob.tags:
            if word not in stopwords.words("english"):
                final_pos.append(pos)
        count_pos = Counter(final_pos)
        if count_pos.most_common():
            return count_pos.most_common()[0][0]
        else:
            return "Other"
    if type(text) == float:
        return np.NaN

def feature_get_greeting(text):
    if type(text) == str:
        if re.match(r"^\w+", text):
            greeting_word = re.match(r"^\w+", text).group()
            return greeting_word
        else:
            return "None"
    if type(text) == float:
        return np.NaN

def feature_most_common_word(text):
    if type(text) == str:
        blob = TextBlob(text)
        words = []
        for word, pos in blob.tags:
            words.append(word)
        count_word = Counter(words)
        if count_word.most_common():
            return count_word.most_common()[0][0]
        else:
            return np.NaN
    if type(text) == float:
        return np.NaN

def feature_freq_most_common_word(text):
    if type(text) == str:
        blob = TextBlob(text)
        words = []
        for word, pos in blob.tags:
            words.append(word)
        count_word = Counter(words)
        if count_word.most_common():
            return count_word.most_common()[0][1]
        else:
            return 0
    if type(text) == float:
        return np.NaN

def feature_number_words(text):
    if type(text) == str:
        blob = TextBlob(text)
        return len(blob.words)
    if type(text) == float:
        return np.NaN

In [28]:
#Feature Extraction - Sairaj
#Average length of words
def avg_length(text):
    if type(text) == str:
        list1 = text.split()
        word_len = 0
        for text in list1:
            strip_text=text.strip()
            if re.search(r"\w+",strip_text):
                strip_text = re.search(r"\w+",strip_text).group()
                word_len += len(strip_text)
        try:
            avg_word_len = word_len/len(list1)
        except ZeroDivisionError as e:
            return np.NaN
        return avg_word_len
    else:
        return np.NaN

#Average Sentence Length
def avg_sentence_length(text):
    if type(text) == str:
        list1 = re.findall(r"[^\.\?\!]+",text)
        sent_len = 0 
        for le in list1:
            sent_len += len(le)
        try:
            return sent_len/len(list1) 
        except:
            return np.NaN      
    else:
        return np.NaN

#Number of short words to overall number of words
def feature_short_word_ratio(text):
    if type(text) == str:
        list1 = text.split()
        short_word = 0
        if len(list1) >= 1:
            for word in list1:
                strip_text = word.strip()
                if re.search(r"\w+",strip_text):
                    strip_text = re.search(r"\w+",strip_text).group()
                    word_len = len(strip_text)
                    if word_len < 4:
                        short_word +=1
            try:
                short_word_rat = short_word/len(list1)
                return short_word_rat
            except:
                return 0
        else:
            return 0  
    else:
        return np.NaN

#Frequency of punctuation
def punctuation_frequency(text):
    if type(text) == str:
        if re.search(r"[\!\,\.\?\:\'\"]", text):
            punc = re.findall(r"[\!\,\.\?\:\'\"]", text)
            freq_punc = len(punc)
            return freq_punc
        else:
            return 0
    if type(text) == float:
        return np.NaN

#Punctuation after greeting
def punctuation_greeting(text):
    if type(text) == str:
        if re.search(r"^\w+([\,\:\?\!\-])\n", text):
            punc = re.search(r"^\w+([\,\:\?\!\-])\n", text).group(1)
            return punc
        else:
            return "None"
    if type(text) == float:
        return np.NaN

def feature_number_special_characters(text):
    if type(text) == str:
        special_characters = re.findall(r"[\@\#\$\%\^\&\~\`\*\(\)\<\>\\\[\]\{\}\|]", text)
        return len(special_characters)
    if type(text) == float:
        return np.NaN

def feature_max_special_character(text):
    if type(text) == str:
        special_characters = re.findall(r"[\@\#\$\%\^\&\~\`\*\(\)\<\>\\\[\]\{\}\|]", text)
        special_char_count = Counter(special_characters)
        if special_char_count.most_common():
            max_special_char = special_char_count.most_common()[0][0]
            return max_special_char
        else:
            return "None"
    if type(text) == float:
        return np.NaN

def feature_freq_max_special_character(text):
    if type(text) == str:
        special_characters = re.findall(r"[\@\#\$\%\^\&\~\`\*\(\)\<\>\\\[\]\{\}\|]", text)
        special_char_count = Counter(special_characters)
        if special_char_count.most_common():
            freq_max_special_char = special_char_count.most_common()[0][1]
            return freq_max_special_char
        else:
            return "0"
    if type(text) == float:
        return np.NaN

In [29]:
#Feature Extraction - Jaydeep
  
def check_single_sentence(clean_text):
    if type(clean_text) == str:
        ending_punc = re.findall(r"[.?!]", clean_text)
        ending_punc_count = Counter(ending_punc)
        single_sentence = False
        if(ending_punc_count):
            max_ep_char = max(ending_punc_count, key=ending_punc_count.get)
            max_ep_value = max(ending_punc_count.values())
        else:
            max_ep_char = ''
            max_ep_value = 0
        if max_ep_value<=1:
            single_sentence = True
    elif type(clean_text) == float:
        return np.NaN 
    return single_sentence

In the next few cells ,we have extracted the features for the text part of all emails in the dataset using the functions defined in the above cells.

In [30]:
email_length = df["Text"].apply(lambda row: feature_email_length_characters(row))
email_length.dropna()

0        1046
1           8
2        1955
3         172
4         400
         ... 
53194      98
53195      23
53196     119
53197     168
53198       8
Name: Text, Length: 52210, dtype: int64

In [31]:
digit_density = df["Text"].apply(lambda row: feature_digit_density(row))
digit_density.dropna()

0        0.009560
1        0.000000
2        0.009719
3        0.075581
4        0.010000
           ...   
53194    0.000000
53195    0.000000
53196    0.067227
53197    0.005952
53198    0.000000
Name: Text, Length: 52210, dtype: float64

In [32]:
space_density = df["Text"].apply(lambda row: feature_space_density(row))
space_density.dropna()

0        0.278203
1        0.375000
2        0.267519
3        0.267442
4        0.282500
           ...   
53194    0.295918
53195    0.347826
53196    0.310924
53197    0.351190
53198    0.500000
Name: Text, Length: 52210, dtype: float64

In [33]:
number_paragraphs = df["Text"].apply(lambda row: feature_paragraph(row))
number_paragraphs.dropna()

0        1.0
1        1.0
2        3.0
3        1.0
4        1.0
        ... 
53194    1.0
53195    1.0
53196    1.0
53197    1.0
53198    1.0
Name: Text, Length: 52210, dtype: float64

In [34]:
average_sentences_paragraph = df["Text"].apply(lambda row: feature_average_characters_paragraph(row))
average_sentences_paragraph.dropna()

0        22.0
1         1.0
2        13.0
3         3.0
4         7.0
         ... 
53194     2.0
53195     2.0
53196     2.0
53197     7.0
53198     1.0
Name: Text, Length: 52210, dtype: float64

In [35]:
farewell_words = df["Text"].apply(lambda row: feature_farewell_words(row))
raw_freq_farewell_words = farewell_words.value_counts()
print(raw_freq_farewell_words)
raw_freq_farewell_words = list(raw_freq_farewell_words[raw_freq_farewell_words>20].index)
print(raw_freq_farewell_words)
freq_farewell_words = []
for word in raw_freq_farewell_words:
    tokens = nlp(word)
    for token in tokens:
        if token.pos_ not in ["PROPN"]:
            freq_farewell_words.append(token.text)

farewell_words = farewell_words.apply(lambda row: feature_freq_farewell_words(freq_farewell_words, row))
farewell_words.dropna()

vince       3974
kay         3019
thanks      2425
sara        1570
kate        1445
            ... 
3300           1
90011682       1
bases          1
tossed         1
splits         1
Name: Text, Length: 5284, dtype: int64
['vince', 'kay', 'thanks', 'sara', 'kate', 'jeff', 'sally', 'best', 'you', '713', 'pl', 'fyi', 'it', 'me', 'fax', 'http', 'ss', 'this', 'know', 'the', '853', 'ckm', 'today', 'susan', 'to', 'questions', 'help', 'eric', 'that', 'kaminski', 'e', 'now', 'week', 'mark', 'there', 'time', 'tomorrow', 'out', 'in', 'john', 'soon', 'is', 'up', 'call', '\n', 'one', 'tonight', 'think', 'on', 'day', 'i', 'weekend', 'done', 'them', 'a', 'again', 'well', 'yet', '3', 'list', 'yes', 'ok', 'do', 'good', 'comments', 'night', 'of', 'work', 'am', 'agreement', 'and', 'too', 'for', '2', 'go', 'here', 'sue', 'love', '4', 'below', 'pm', 'deal', 'be', '20', 'though', 'please', 'meeting', 'later', 'monday', 'much', 'us', 'attached', '1', 'changed', 'morning', 'back', 'will', 'with', 'email'

0          Other
1            you
2          Other
3         thanks
4          Other
          ...   
53194    morning
53195       good
53196         10
53197      Other
53198      Other
Name: Text, Length: 52210, dtype: object

In [36]:
greeting_words = df["Text"].apply(lambda row: feature_get_greeting(row))
greeting_words.dropna()

0             Hey
1           Thank
2               a
3           Frank
4             don
           ...   
53194         are
53195    whatever
53196          we
53197           i
53198        lisa
Name: Text, Length: 52210, dtype: object

In [37]:
most_common_word = df["Text"].apply(lambda row: feature_most_common_word(row))
most_common_word.dropna()

0             the
1           Thank
2             the
3             the
4              to
           ...   
53194         you
53195    whatever
53196          at
53197           i
53198        lisa
Name: Text, Length: 52083, dtype: object

In [38]:
most_common_word[most_common_word.isna()==True]
df.iloc[1714]

Author                                                     John
Folder                                                 arnold-j
File                                                       57.0
Message ID                              12437191.1075852711582.
Text          your guys are probably seeing this as well, bu...
Raw Text      Message-ID: <12437191.1075852711582.JavaMail.e...
Name: 1732, dtype: object

In [39]:
subjectivity = df["Text"].apply(lambda row: feature_subjectivity(row))
polarity = df["Text"].apply(lambda row: feature_polarity(row))
print(subjectivity.dropna())
print(polarity.dropna())

0        0.493526
1        0.000000
2        0.423773
3        0.100000
4        0.402626
           ...   
53194    0.000000
53195    0.600000
53196    0.000000
53197    0.506944
53198    0.000000
Name: Text, Length: 52210, dtype: float64
0        0.135994
1        0.000000
2        0.019855
3        0.066667
4       -0.008687
           ...   
53194    0.000000
53195    0.700000
53196    0.000000
53197    0.198611
53198    0.000000
Name: Text, Length: 52210, dtype: float64


In [40]:
freq_most_common_word = df["Text"].apply(lambda row: feature_freq_most_common_word(row))
freq_most_common_word.dropna()

0        14
1         1
2        15
3         3
4         5
         ..
53194     4
53195     1
53196     3
53197     5
53198     1
Name: Text, Length: 52210, dtype: int64

In [41]:
pos = df["Text"].apply(lambda row: feature_most_pos(row))
pos

0         NN
1        NNP
2         NN
3         NN
4         NN
        ... 
53194     NN
53195    WDT
53196     NN
53197     VB
53198     JJ
Name: Text, Length: 52210, dtype: object

In [42]:
pos.dropna()

0         NN
1        NNP
2         NN
3         NN
4         NN
        ... 
53194     NN
53195    WDT
53196     NN
53197     VB
53198     JJ
Name: Text, Length: 52210, dtype: object

In [43]:
last_punc = df["Text"].apply(lambda row: feature_ending_punctuation(row))
last_punc.dropna()

0           .
1           .
2           .
3           ,
4           .
         ... 
53194       ?
53195       .
53196       .
53197       ?
53198    None
Name: Text, Length: 52210, dtype: object

In [44]:
freq_punc = df["Text"].apply(lambda row: feature_most_used_punctuation(row))
freq_punc.dropna()

0           .
1           .
2           .
3           ,
4           .
         ... 
53194       ?
53195       .
53196       .
53197       .
53198    None
Name: Text, Length: 52210, dtype: object

In [45]:
avg_len = df["Text"].apply(lambda row: avg_length(row))
avg_len

0        4.250000
1        4.000000
2        4.304933
3        3.902439
4        4.061224
           ...   
53194    3.920000
53195    4.600000
53196    3.382353
53197    3.274510
53198    4.000000
Name: Text, Length: 52210, dtype: float64

In [46]:
avg_sent_len = df["Text"].apply(lambda row: avg_sentence_length(row))
avg_sent_len

0        56.291667
1         5.500000
2        61.268293
3        57.250000
4        64.750000
           ...    
53194    42.333333
53195    10.333333
53196    52.333333
53197    28.625000
53198    13.000000
Name: Text, Length: 52210, dtype: float64

In [47]:
short_word_ratio = df["Text"].apply(lambda row: feature_short_word_ratio(row))
short_word_ratio.dropna()

0        0.475410
1        0.500000
2        0.403587
3        0.487805
4        0.438776
           ...   
53194    0.560000
53195    0.200000
53196    0.617647
53197    0.607843
53198    0.000000
Name: Text, Length: 52210, dtype: float64

In [48]:
punc_freq = df["Text"].apply(lambda row: punctuation_frequency(row))
punc_freq

0        30
1         1
2        60
3         9
4        11
         ..
53194     3
53195     2
53196     3
53197     9
53198     0
Name: Text, Length: 52210, dtype: int64

In [49]:
punc_greet = df["Text"].apply(lambda row: punctuation_greeting(row))
punc_greet.dropna()

0           :
1        None
2        None
3           :
4        None
         ... 
53194    None
53195    None
53196    None
53197    None
53198    None
Name: Text, Length: 52210, dtype: object

In [50]:
number_words = df["Text"].apply(lambda row: feature_number_words(row))
number_words

0        247
1          2
2        456
3         42
4        100
        ... 
53194     25
53195      5
53196     34
53197     52
53198      2
Name: Text, Length: 52210, dtype: int64

In [51]:
number_special_characters = df["Text"].apply(lambda row: feature_number_special_characters(row))
print(number_special_characters.dropna())
max_special_character = df["Text"].apply(lambda row: feature_max_special_character(row))
print(max_special_character.dropna())
freq_max_special_character = df["Text"].apply(lambda row: feature_freq_max_special_character(row))
print(freq_max_special_character.dropna())

single_sentence = df["Text"].apply(lambda row: check_single_sentence(row))
print(single_sentence.dropna())

0        1
1        0
2        6
3        3
4        0
        ..
53194    0
53195    0
53196    0
53197    0
53198    0
Name: Text, Length: 52210, dtype: int64
0           $
1        None
2           $
3           %
4        None
         ... 
53194    None
53195    None
53196    None
53197    None
53198    None
Name: Text, Length: 52210, dtype: object
0        1
1        0
2        6
3        2
4        0
        ..
53194    0
53195    0
53196    0
53197    0
53198    0
Name: Text, Length: 52210, dtype: object
0        False
1         True
2        False
3        False
4        False
         ...  
53194    False
53195    False
53196    False
53197    False
53198     True
Name: Text, Length: 52210, dtype: bool


In [52]:
df.loc[0, ["Text", "File", "Folder"]]

Text      Hey:\nHaven't had the best of months.  Like yo...
File                                                   36.0
Folder                                             arnold-j
Name: 0, dtype: object

Here we are appending all the extracted features into the dataframe

In [53]:
#Combine everything into one dataset

df["Email Length"] = email_length
df["Digit Density"] = digit_density
df["Space Density"] = space_density
df["Number of Paragraphs"] = number_paragraphs
df["Average Sentences per Paragraph"] = average_sentences_paragraph
df["Farewell Words"] = farewell_words
df["Freq Punc"] = freq_punc
df["Last Punc"] = last_punc
df["Average Word Length"] = avg_len
df["Average Sentence Length"] = avg_sent_len
df["Short Word Ratio"] = short_word_ratio
df["Punc Frequency"] = punc_freq
df["Punc after Greeting"] = punc_greet
df["Number Words"] = number_words
df["Subjectivity"] = subjectivity
df["Polarity"] = polarity
df["Most Common POS"] = pos
df["Single Sentence"] = single_sentence
df["Greeting"] = greeting_words
df["Most Common Word"] = most_common_word
df["Freq Most Common Word"] = freq_most_common_word
df["Total Special Character Count"] = number_special_characters
df["Max Occurring Special Char"] = max_special_character
df["Count of Max Special Char"] = freq_max_special_character

df

Unnamed: 0,Author,Folder,File,Message ID,Text,Raw Text,Email Length,Digit Density,Space Density,Number of Paragraphs,...,Subjectivity,Polarity,Most Common POS,Single Sentence,Greeting,Most Common Word,Freq Most Common Word,Total Special Character Count,Max Occurring Special Char,Count of Max Special Char
0,John,arnold-j,36.0,33491127.1075857594966.,Hey:\nHaven't had the best of months. Like yo...,Message-ID: <33491127.1075857594966.JavaMail.e...,1046,0.009560,0.278203,1.0,...,0.493526,0.135994,NN,False,Hey,the,14,1,$,1
1,John,arnold-j,667.0,6384662.1075857656041.,Thank you.\n\n,Message-ID: <6384662.1075857656041.JavaMail.ev...,8,0.000000,0.375000,1.0,...,0.000000,0.000000,NNP,True,Thank,Thank,1,0,,0
2,John,arnold-j,759.0,21884118.1075857658063.,a couple of observations from here:\ncash/futu...,Message-ID: <21884118.1075857658063.JavaMail.e...,1955,0.009719,0.267519,3.0,...,0.423773,0.019855,NN,False,a,the,15,6,$,6
3,John,arnold-j,313.0,11352651.1075857600972.,"Frank:\nThe $5,000,000 extra VAR disappears in...",Message-ID: <11352651.1075857600972.JavaMail.e...,172,0.075581,0.267442,1.0,...,0.100000,0.066667,NN,False,Frank,the,3,3,%,2
4,John,arnold-j,710.0,25732708.1075857656969.,don't care about the front. i think its vulne...,Message-ID: <25732708.1075857656969.JavaMail.e...,400,0.010000,0.282500,1.0,...,0.402626,-0.008687,NN,False,don,to,5,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53194,Matthew,lenhart-m,226.0,14926455.1075845207288.,are you going out this weekend? i think you s...,Message-ID: <14926455.1075845207288.JavaMail.e...,98,0.000000,0.295918,1.0,...,0.000000,0.000000,NN,False,are,you,4,0,,0
53195,"Lenhart, Matthew",lenhart-m,318.0,9871308.1075845209613.,whatever. ride would be good.\n\n,Message-ID: <9871308.1075845209613.JavaMail.ev...,23,0.000000,0.347826,1.0,...,0.600000,0.700000,WDT,False,whatever,whatever,1,0,,0
53196,"Lenhart, Matthew",lenhart-m,752.0,9553008.1075862009396.,we are meeting these people sunday morning at ...,Message-ID: <9553008.1075862009396.JavaMail.ev...,119,0.067227,0.310924,1.0,...,0.000000,0.000000,NN,False,we,at,3,0,,0
53197,"Lenhart, Matthew",lenhart-m,351.0,5044717.1075845210498.,i forgot you had a wedding. i don't go up to ...,Message-ID: <5044717.1075845210498.JavaMail.ev...,168,0.005952,0.351190,1.0,...,0.506944,0.198611,VB,False,i,i,5,0,,0


In [54]:
df.to_csv("Enron_29_Features.csv")