In [1]:
import pandas as pd

from projects.common import get_project_data_frame
from projects.common.analysis import shows_per_date
from correlate_tweets import METAPHORS_URL_TEMPLATE

We want to see the columns loaded into the following tables and join
them into one table. First, let's examine the so-called "project data frame," which contains only annotated instances of metaphor; examples that were excluded from the analysis are not in it.

In [8]:
# Fill the year into the URL template imported from correlate_tweets.
metaphors_url = METAPHORS_URL_TEMPLATE.format(2016)
# Load the 2016 "project data frame" from that URL.
df2016 = get_project_data_frame(metaphors_url)
# Print the column index, then preview the first rows as the cell's output.
print(df2016.columns)
df2016.head()

Index(['Unnamed: 0', 'start_localtime', 'start_time', 'stop_time',
       'runtime_seconds', 'network', 'program_name', 'iatv_id', 'facet_word',
       'conceptual_metaphor', 'spoken_by', 'subjects', 'objects',
       'active_passive', 'text', 'tense', 'repeat', 'repeat_index'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,start_localtime,start_time,stop_time,runtime_seconds,network,program_name,iatv_id,facet_word,conceptual_metaphor,spoken_by,subjects,objects,active_passive,text,tense,repeat,repeat_index
0,0,2016-09-16 17:00:00,2016-09-17 00:00:00,2016-09-17 01:01:00,3660.0,FOXNEWSW,The O'Reilly Factor,FOXNEWSW_20160917_000000_The_OReilly_Factor,hit,political strategy is a physical impact,,Donald Trump,News Shows,Active,"BREAKING TONIGHT, ONE OF THE MOST DRAMATIC ME...",present,False,
1,1,2016-09-27 21:00:00,2016-09-28 04:00:00,2016-09-28 05:01:01,3661.0,CNNW,Anderson Cooper 360,CNNW_20160928_040000_Anderson_Cooper_360,hit,criticism is a physical attack,,Donald Trump,Hillary Clinton,active,YOU THINK HE POURPFULLY DIDN'T PIVOT 0 HIT H...,past,False,
2,2,2016-09-27 21:00:00,2016-09-28 04:00:00,2016-09-28 05:01:01,3661.0,CNNW,Anderson Cooper 360,CNNW_20160928_040000_Anderson_Cooper_360,hit,political election is a fight,,Clinton Campaign,Donald Trump,Active,"FROM VENEZUELA, WINNER OF TRUMP'S MISS UNIVE...",past,False,
3,3,2016-09-27 21:00:00,2016-09-28 04:00:00,2016-09-28 05:01:01,3661.0,CNNW,Anderson Cooper 360,CNNW_20160928_040000_Anderson_Cooper_360,hit,political election is a fight,,Hillary Clinton,Donald Trump,Active,AND CLINTON HIT TRUMP FOR VOICING SUPPORT FO...,past,False,
4,4,2016-09-19 21:00:00,2016-09-20 04:00:00,2016-09-20 05:01:00,3660.0,MSNBCW,The Rachel Maddow Show,MSNBCW_20160920_040000_The_Rachel_Maddow_Show,hit,,Rachel Maddow,political trial,New Jersey,active,"ACTUALLY, NEVER MIND. PERHAPS NOT MAKING THE ...",present,False,


In [5]:
# Daily index covering 2016-09-01 through 2016-11-30 (both endpoints included).
date_index = pd.date_range('2016-09-01', '2016-11-30', freq='D')
iatv_corpus_name = 'Viomet Sep-Nov 2016'
# First call: default arguments, i.e. counts not split by network.
spd = shows_per_date(date_index, iatv_corpus_name)
print('shows per date not by network:', spd.head())

# Second call: one column per network. Note that `spd` is rebound here, so
# after this cell it refers to the by-network result only.
# NOTE(review): in the recorded output the first result shows NaN (dtype
# object) on 2016-09-04, while this one shows 0.0 for every network on that
# date -- presumably dates without shows are not zero-filled in the default
# path; confirm in shows_per_date.
spd = shows_per_date(date_index, iatv_corpus_name, by_network=True)
print('shows per date by network:', spd.head())

shows per date not by network: 2016-09-01      6
2016-09-02      5
2016-09-03      2
2016-09-04    NaN
2016-09-05      2
Freq: D, dtype: object
shows per date by network:             MSNBCW  CNNW  FOXNEWSW
2016-09-01     2.0   2.0       2.0
2016-09-02     1.0   2.0       2.0
2016-09-03     1.0   0.0       1.0
2016-09-04     0.0   0.0       0.0
2016-09-05     0.0   1.0       1.0


Looking at the current implementation of `shows_per_date`, we have these lines that read out a two-tuple from Mongo:

```python
prog_dates = set(
    [
        (d.program_name, d.start_localtime.date())
        for d in docs
    ]
)
```

and these that read out a three-tuple when we want daily episodes by network:

```python
prog_dates = set(
    [
        (d.program_name, d.network, d.start_localtime.date())
        for d in docs
    ]
)
```

So it seems the fields we need would already be in the "project df" if the project df contained all instances, not just the included ones. In that case, make the project df contain all instances. Then, at the point of loading the annotations, take only figurative non-repeats from the project df. When it's time to count episodes, it appears we only need to swap out a line of code to read from the project df instead of Mongo.

But how has the project dataframe been made in the past? Using the
ProjectExporter in `projects/common/export_project.py`, which can be run as an executable. It currently does this to read from the DB:

```python
self.project = Project.objects.get(name=project_name)

self.keyed_instances = (
    (facet.word, instance)
    for facet in self.project.facets
    for instance in facet.instances
    if instance.include
)
self.column_names =\
    IATV_DOCUMENT_COLUMNS + \
    ['facet_word'] + \
    INSTANCE_COLUMNS
```

Plan: get rid of `if instance.include` at this stage. Instead, apply that filter in a new function `get_metaphor_instances` that reads out the desired fields where `include == True` from this new table.

In [10]:
from projects.common.analysis import shows_per_date

Now let's try our new `get_metaphor_instances` and `shows_per_date`, which take a project data frame as input.

In [15]:
# Load the project export from a CSV given as a relative path, so this
# resolves against the kernel's working directory.
proj_df = get_project_data_frame('Data/viomet-sep-nov-2016.csv')

# Print the column index, then preview the first rows as the cell's output.
# The printed columns include `include`, which the next step filters on.
print(proj_df.columns)
proj_df.head()

Index(['Unnamed: 0', 'start_localtime', 'start_time', 'stop_time', 'network',
       'program_name', 'iatv_id', 'facet_word', 'subjects', 'objects', 'text',
       'include', 'repeat', 'repeat_index'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,start_localtime,start_time,stop_time,network,program_name,iatv_id,facet_word,subjects,objects,text,include,repeat,repeat_index
0,0,2016-09-16 17:00:00,2016-09-17 00:00:00,2016-09-17 01:01:00,FOXNEWSW,The O'Reilly Factor,FOXNEWSW_20160917_000000_The_OReilly_Factor,hit,Donald Trump,News Shows,"BREAKING TONIGHT, ONE OF THE MOST DRAMATIC ME...",True,False,
1,1,2016-10-10 19:00:00,2016-10-11 02:00:00,2016-10-11 03:01:01,MSNBCW,The Last Word With Lawrence O'Donnell,MSNBCW_20161011_020000_The_Last_Word_With_Lawr...,hit,,,"EVERYBODY HIT THE GROUND, WHICH WE USUALLY D...",False,False,
2,2,2016-11-23 16:00:00,2016-11-24 00:00:00,2016-11-24 01:01:01,CNNW,Erin Burnett OutFront,CNNW_20161124_000000_Erin_Burnett_OutFront,hit,,,"WHEN A BULLY HITS YOU, YOU HIT THAT BULLY RI...",False,False,
3,3,2016-11-23 16:00:00,2016-11-24 00:00:00,2016-11-24 01:01:01,CNNW,Erin Burnett OutFront,CNNW_20161124_000000_Erin_Burnett_OutFront,hit,,,Reporter: THEY SAY KIM JONG UN'S JEEM PROBAB...,False,False,
4,4,2016-11-23 16:00:00,2016-11-24 00:00:00,2016-11-24 01:01:01,CNNW,Erin Burnett OutFront,CNNW_20161124_000000_Erin_Burnett_OutFront,hit,,,"FIRST OF ALL, JIM, IN THIS PARTICULAR CASE N...",False,False,


In [18]:
# Keep only the rows whose `include` flag is set, i.e. the annotated instances.
insts = proj_df.loc[proj_df['include']]
print(insts.shape[0])  # total number of metaphorical instances found
print(proj_df.shape[0])  # total number of times violence signals used

917
4138
