In [1]:
import requests
import pandas
from dateutil import parser
host = 'http://18.188.56.207:9200/'
requests.get(host + '_cat/indices/enron').content

b'yellow open enron WtsTmsTpRw6kcHbnT4BA7Q 1 1 251734 32 505.4mb 505.4mb\n'

In [3]:
doc = {
    "size": 10000,
    "from": 0,
    "query" : {
        "match_all" : {}
    }
}
import json
r=requests.get(host + 'enron/_search', data=json.dumps(doc), headers={'Content-Type':'application/json'})
print(len(r.json()['hits']['hits']))

10000


In [4]:
def elasticsearch_results_to_df(results):
    '''
    A function that will take the results of a requests.get 
    call to Elasticsearch and return a pandas.DataFrame object 
    with the results 
    '''
    hits = results.json()['hits']['hits']
    data = pandas.DataFrame([i['_source'] for i in hits], index = [i['_id'] for i in hits])
    data['date'] = data['date'].apply(parser.parse)
    return(data)

def print_df_row(row):
    '''
    A function that will take a row of the data frame and print it out
    '''
    print('____________________')
    print('RE: %s' % row.get('subject',''))
    print('At: %s' % row.get('date',''))
    print('From: %s' % row.get('sender',''))
    print('To: %s' % row.get('recipients',''))
    print('CC: %s' % row.get('cc',''))
    print('BCC: %s' % row.get('bcc',''))
    print('Body:\n%s' % row.get('text',''))
    print('____________________')


In [5]:
# Put elasticsearch results into a pandas.DataFrame object
df = elasticsearch_results_to_df(r)
df.head()
# print(df)
# print_df_row(df.iloc[0])

Unnamed: 0,sender,recipients,cc,text,bcc,date,subject
78,sally.beck@enron.com,sharron.westbrook@enron.com,christina.valdez@enron.com,I have an ENW budget review meeting with EGM (...,christina.valdez@enron.com,2001-10-10 23:30:04+00:00,RE: Time change
79,david.port@enron.com,greg.whalley@enron.com,rick.buy@enron.com vince.kaminski@enron.com,Here is a draft position report we talked abou...,rick.buy@enron.com vince.kaminski@enron.com,2001-04-05 14:26:00+00:00,Position Report
80,noreply@ccomad3.uu.commissioner.com,dutch.quigley@enron.com,,\n[IMAGE] \t\n Battle your friends for glory i...,,2001-10-24 10:45:24+00:00,Commissioner.COM E-Reports for UHFFL 10/24/01
81,debra.perlingiere@enron.com,kaye.ellis@enron.com,,FINE WITH ME\n\nDebra Perlingiere\nEnron North...,,2001-02-06 15:10:00+00:00,"Re: Vacation Day, Friday, February 16"
82,dale.surbey@enron.com,kate.bruges@enron.com,anjam.ahmad@enron.com vince.kaminski@enron.com,"Kate,\n\nHas the paperwork been finalized yet?...",anjam.ahmad@enron.com vince.kaminski@enron.com,2000-07-26 10:54:00+00:00,Re: Sharad Agnihotri


In [5]:
# Query For a full text match in the "text" field
# Uses the "match" query: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
doc = {
    "query": {
        "match" : {
            "text" : "important reporting"
        } 
    },
    "from" : 0, # Starting message to return. 
    "size" : 2000, # Return this many messages. Can't be more than 10,000
}
r=requests.get(host + 'enron/_search',
               data=json.dumps(doc), headers={'Content-Type':'application/json'})
r.raise_for_status()
print("Found %s messages matching the query, of " % r.json()['hits']['total'])
df = elasticsearch_results_to_df(r)
print("Returned %s messages" % df.shape[0])
print_df_row(df.iloc[0])

Found {'value': 10000, 'relation': 'gte'} messages matching the query, of 
Returned 2000 messages
____________________
RE: RE: Reporting replication issue is now fixed
At: 2001-10-09 17:11:18+00:00
From: lynn.blair@enron.com
To: jennifer.lowry@enron.com  group.dl-ets@enron.com
CC: nan
BCC: nan
Body:
	Jennifer, how long has this been a problem?  Is there a concern we have caused
	customers problems in nominating due to bad information?  Thanks. Lynn

 -----Original Message-----
From: 	Lowry, Jennifer   
Sent:	Tuesday, October 09, 2001 10:11 AM
To:	DL-ETS TMS Modification Group
Subject:	Reporting replication issue is now fixed


Yesterday we noticed a problem where reports were not reporting on the correct cycle, or were not picking up information between cycles.  As it turns out, an important table was not being replicated from the application database to the reporting database.  

I was told that this problem has been fixed, and on first inspection of the tables, everything looks corre

In [6]:
# Query For a text match in the "text" or "subject" fields. Uses the multi-match query:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html
doc = {
  "query": {
    "multi_match" : {
      "query":    "settlement", 
      "fields": [ "subject", "text" ] 
    }
  }
}
r=requests.get(host + 'enron/_search',
               data=json.dumps(doc), headers={'Content-Type':'application/json'})
r.raise_for_status()
print("Found %s messages matching the query, of " % r.json()['hits']['total'])
df = elasticsearch_results_to_df(r)
print("Returned %s messages" % df.shape[0])
print_df_row(df.iloc[9])

Found {'value': 4102, 'relation': 'eq'} messages matching the query, of 
Returned 10 messages
____________________
RE: Status of final statements and replacement invoices
At: 2001-10-24 01:34:50+00:00
From: thailu@ercot.com
To: jackson.amie@enron.com  pratka.amy@enron.com  mitrey.andy@enron.com  williams.angela@enron.com  garza.beth@enron.com  palmer.bill@enron.com  cooper.bob@enron.com  edwards.brady@enron.com  green.brenda@enron.com  smith.carl@enron.com  smith.carl@enron.com  carey.dan@enron.com  sarti.dan@enron.com  leger.dana@enron.com  wessels.david@enron.com  pawlik.debbie@enron.com  bailey.debra@enron.com  dyc.dennis@enron.com  slover.eric@enron.com  nitschmann.frances@enron.com  herndon.gary@enron.com  geissler.ginger@enron.com  striedel.james@enron.com  holland.janet@enron.com  jeffrey.miller@enron.com  doyas.jenny@enron.com  burt.jerry@enron.com  barker.joe@enron.com  favalora.joe@enron.com  forney.john@enron.com  fitzmaurice.kathy@enron.com  minear.kelly@enron.com  koliba.k

In [7]:
# "OR" query for two phrase matches. Generally you get fancy query parsing with this:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
doc = {
    "query": {
        "query_string" : {
            "default_field" : "text",
            "query" : "(reach a settlement) OR (continue to pursue)" 
        }
    }
}
r=requests.get(host + 'enron/_search',
               data=json.dumps(doc), headers={'Content-Type':'application/json'})
r.raise_for_status()
print("Found %s messages matching the query" % r.json()['hits']['total']['value'])
df = elasticsearch_results_to_df(r)
print("Returned %s messages" % df.shape[0])
print_df_row(df.iloc[0])

Found 10000 messages matching the query
Returned 10 messages
____________________
RE: Settlement Conversation Recap
At: 2002-05-02 18:26:52+00:00
From: michael.bridges@enron.com
To: koikosp@talgov.com
CC: chris.germany@enron.com
BCC: chris.germany@enron.com
Body:
Hi Pete,

Wanted to recap our conversation yesterday, update you on our timing and get you my contact information.

Regarding a possible settlement, it appears to me that this is a concept that you have considered and are willing to pursue.  We will have a valuation of the contract for you Monday.  It is my suggestion that you review the proposal, make sure that you agree with the outstanding transactions and complete your own valuation.  Once we have agreed on the universe of transactions, you and I will discuss and finalize a settlement amount that we are comfortable will be approved by the creditor committee and bankruptcy judge.

From this point, Enron will begin the process of filing a motion for settlement and schedule a

In [8]:
df.iloc[9]

sender                                    w..cantrell@enron.com
recipients    d..steffes@enron.com  leslie.lawner@enron.com ...
cc                                                          NaN
text          \nNGI's Daily Gas Price Index \npublished : Au...
bcc                                                         NaN
date                                  2001-08-31 14:44:25+00:00
subject       NGI Article:  ALJ Advocates Money Settlement i...
Name: 95720, dtype: object

In [9]:
# Do a count of all documents in the database by month
doc = {
    "aggs" : {
        "aggregation_var_name" : {
            "date_histogram" : {
                "field" : "date",
                "interval" : "month"
            }
        }
    }
}
r=requests.get(host + 'enron/_search',
               data=json.dumps(doc), headers={'Content-Type':'application/json'})
r.raise_for_status()
def aggregation_to_df(response,var_name='aggregation_var_name'):
    r = response.json()
    r['aggregations'][var_name]
    df = pandas.DataFrame(r['aggregations'][var_name]['buckets'])
    df['date'] = df['key_as_string'].apply(parser.parse)
    df = df[(df['date'] >= '1999-01-1') & (df['date'] < '2002-07-01')]
    df = df[['date','doc_count']]
    return df
df = aggregation_to_df(r)
print(df)

                         date  doc_count
228 1999-01-01 00:00:00+00:00         65
229 1999-02-01 00:00:00+00:00         43
230 1999-03-01 00:00:00+00:00         50
231 1999-04-01 00:00:00+00:00         45
232 1999-05-01 00:00:00+00:00        338
233 1999-06-01 00:00:00+00:00        327
234 1999-07-01 00:00:00+00:00        446
235 1999-08-01 00:00:00+00:00        509
236 1999-09-01 00:00:00+00:00        588
237 1999-10-01 00:00:00+00:00        643
238 1999-11-01 00:00:00+00:00        594
239 1999-12-01 00:00:00+00:00       1248
240 2000-01-01 00:00:00+00:00       2142
241 2000-02-01 00:00:00+00:00       2471
242 2000-03-01 00:00:00+00:00       3001
243 2000-04-01 00:00:00+00:00       2991
244 2000-05-01 00:00:00+00:00       3718
245 2000-06-01 00:00:00+00:00       4820
246 2000-07-01 00:00:00+00:00       4493
247 2000-08-01 00:00:00+00:00       6200
248 2000-09-01 00:00:00+00:00       6872
249 2000-10-01 00:00:00+00:00       8273
250 2000-11-01 00:00:00+00:00      10617
251 2000-12-01 0

In [10]:
# Do a count of all documents matching a query by month
doc = {
    "query": {
        "match" : {
            "text" : "important reporting"
        }
    },
    "aggs" : {
        "aggregation_var_name" : {
            "date_histogram" : {
                "field" : "date",
                "interval" : "month"
            }
        }
    }
}
r=requests.get(host + 'enron/_search',
               data=json.dumps(doc), headers={'Content-Type':'application/json'})
r.raise_for_status()
df = aggregation_to_df(r)
print(df)

                         date  doc_count
228 1999-01-01 00:00:00+00:00         12
229 1999-02-01 00:00:00+00:00          7
230 1999-03-01 00:00:00+00:00          2
231 1999-04-01 00:00:00+00:00          8
232 1999-05-01 00:00:00+00:00         12
233 1999-06-01 00:00:00+00:00         13
234 1999-07-01 00:00:00+00:00         21
235 1999-08-01 00:00:00+00:00         22
236 1999-09-01 00:00:00+00:00         14
237 1999-10-01 00:00:00+00:00         15
238 1999-11-01 00:00:00+00:00         31
239 1999-12-01 00:00:00+00:00         55
240 2000-01-01 00:00:00+00:00         89
241 2000-02-01 00:00:00+00:00        104
242 2000-03-01 00:00:00+00:00        116
243 2000-04-01 00:00:00+00:00         92
244 2000-05-01 00:00:00+00:00        157
245 2000-06-01 00:00:00+00:00        176
246 2000-07-01 00:00:00+00:00        180
247 2000-08-01 00:00:00+00:00        268
248 2000-09-01 00:00:00+00:00        312
249 2000-10-01 00:00:00+00:00        403
250 2000-11-01 00:00:00+00:00        625
251 2000-12-01 0

In [11]:
# Search an exact match in a specific feild
doc = {
    "query": {
        "match" : {
            "recipients" : "stephen.schwarzbach@enron.com"
        } 
    },
}
r=requests.get(host + 'enron/_search',
               data=json.dumps(doc), headers={'Content-Type':'application/json'})
r.raise_for_status()
print("Found %s messages matching the query, of " % r.json()['hits']['total']['value'])
df = elasticsearch_results_to_df(r)
print("Returned %s messages" % df.shape[0])
print_df_row(df.iloc[0])

Found 16 messages matching the query, of 
Returned 10 messages
____________________
RE: FW: Information Request
At: 2001-11-16 20:18:21+00:00
From: tracy.geaccone@enron.com
To: stephen.schwarzbach@enron.com
CC: nan
BCC: nan
Body:


 -----Original Message-----
From: 	Walker, Blake  
Sent:	Friday, November 16, 2001 1:57 PM
To:	Geaccone, Tracy
Subject:	RE: Information Request

Tracy,

Attached is the info request. I have populated 3 worksheets:

EGS IS
EGS Burn
EGS Exp

These are rough numbers and need polishing!

Blake

 




____________________
