# Preparing NYT and FT dataframes

In [1]:
from datetime import datetime

import blaze
import pandas as pd

## NYT

In [2]:
# Point blaze at every NYT CSV matching the glob below.
# 8 csv files previously built on another machine. Sentiment included.
nyt_data_blaze = blaze.Data('../nytimes/nyt_data/data_*.csv', delimiter=',')

In [3]:
# Sanity check: total number of rows across the matched NYT files.
len(nyt_data_blaze)

934654

In [4]:
# Convert the blaze data object into a pandas DataFrame.
nyt_sentiment_data_df = blaze.odo(nyt_data_blaze, pd.DataFrame)

In [5]:
# Select the article id, publication date, term, sentence and sentiment
# columns, rename `pub_date` to `date`, and tag every row with its source.
nyt_sent_data_df = (
    nyt_sentiment_data_df[['article_id', 'pub_date', 'term', 'sentence', 'sentiment']]
    .copy()
    .rename(index=str, columns={'pub_date': 'date'})
    .assign(source='nyt')
)

In [6]:
# Preview the first rows of the prepared NYT frame.
nyt_sent_data_df.head()

Unnamed: 0,article_id,date,term,sentence,sentiment,source
0,4fd20d7d8eb7c8105d77daf9,1999-01-16,new venture,Keeneland's new venture came as a surprise to ...,pos,nyt
1,4fd20db88eb7c8105d77e511,1999-01-03,manager,The plan Presley and his hucksterish business ...,neg,nyt
2,5482d92a38f0d8403d9cd045,2008-07-08,Twitch,In the male toadfish the swimbladder has been ...,pos,nyt
3,5456ab1438f0d86ef22a094e,2009-10-04,executive,Mr. Pozycki is the chief executive of SJP Prop...,pos,nyt
4,5456ab1438f0d86ef22a094e,2009-10-04,executive,Mr. Pozycki 59 is the chief executive of SJP P...,pos,nyt


## FT

In [7]:
# Point blaze at every FT CSV matching the glob below.
# 8 csv files previously built on another machine. Sentiment included.
# NOTE(review): the line above is identical to the NYT cell's comment, and the
# glob lives under ../nytimes/ -- confirm the file count and the FT data location.
ft_data_blaze = blaze.Data('../nytimes/new_ft_data_*.csv', delimiter=',')

In [8]:
# Sanity check: total number of rows across the matched FT files.
len(ft_data_blaze)

749475

In [9]:
# Convert the blaze data object into a pandas DataFrame.
ft_sentiment_data_df = blaze.odo(ft_data_blaze, pd.DataFrame)

In [10]:
# Select the article id, date, search term, sentence and sentiment columns,
# rename `search_term` to `term`, and tag every row with its source.
ft_sent_data_df = (
    ft_sentiment_data_df[['article_id', 'date', 'search_term', 'sentence', 'sentiment']]
    .copy()
    .rename(index=str, columns={'search_term': 'term'})
    .assign(source='ft')
)

In [11]:
%%time

ft_sent_data_df['date'] = ft_sent_data_df.date.apply(lambda x: pd.to_datetime(x, format='%Y%m%d'))

CPU times: user 2min 21s, sys: 3.56 s, total: 2min 25s
Wall time: 2min 16s


In [12]:
# Preview the first rows of the prepared FT frame (dates now parsed).
ft_sent_data_df.head()

Unnamed: 0,article_id,date,term,sentence,sentiment,source
0,40429008280,2004-04-29,microsoft,and Microsoft.,pos,ft
1,40429008280,2004-04-29,google,"has pulled off a series of acquisitions, inclu...",pos,ft
2,40429008280,2004-04-29,google,Earlier this year it dropped the Google search...,pos,ft
3,40429008280,2004-04-29,google,Yahoo could now be better placed than Google t...,pos,ft
4,40429008280,2004-04-29,google,user spends nearly three hours a month on the ...,pos,ft


## Together

In [13]:
# Stack the NYT rows on top of the FT rows and renumber the index from 0.
# pd.concat is used instead of DataFrame.append, which is deprecated and
# removed in pandas 2.0; the result is the same.
sent_data_df = pd.concat([nyt_sent_data_df, ft_sent_data_df], ignore_index=True)

In [14]:
# Sanity check: row count of the combined frame.
len(sent_data_df)

1684129

In [15]:
# Preview the first rows of the combined frame.
sent_data_df.head()

Unnamed: 0,article_id,date,term,sentence,sentiment,source
0,4fd20d7d8eb7c8105d77daf9,1999-01-16,new venture,Keeneland's new venture came as a surprise to ...,pos,nyt
1,4fd20db88eb7c8105d77e511,1999-01-03,manager,The plan Presley and his hucksterish business ...,neg,nyt
2,5482d92a38f0d8403d9cd045,2008-07-08,Twitch,In the male toadfish the swimbladder has been ...,pos,nyt
3,5456ab1438f0d86ef22a094e,2009-10-04,executive,Mr. Pozycki is the chief executive of SJP Prop...,pos,nyt
4,5456ab1438f0d86ef22a094e,2009-10-04,executive,Mr. Pozycki 59 is the chief executive of SJP P...,pos,nyt


In [16]:
def diff_month(d1, d2):
    """Return the number of calendar months from ``d1`` to ``d2``.

    Only the ``year`` and ``month`` attributes of the arguments are read, so
    the day of month is ignored. The result is negative when ``d2`` falls in
    an earlier month than ``d1``.
    """
    year_gap = d2.year - d1.year
    month_gap = d2.month - d1.month
    return 12 * year_gap + month_gap

In [17]:
%%time

sent_data_df['n_months'] = sent_data_df.date.apply(lambda x: diff_month(datetime(1999, 1, 1), x.date()))

CPU times: user 8.12 s, sys: 78.8 ms, total: 8.2 s
Wall time: 8.11 s


In [18]:
# Preview the combined frame with the new `n_months` column.
sent_data_df.head()

Unnamed: 0,article_id,date,term,sentence,sentiment,source,n_months
0,4fd20d7d8eb7c8105d77daf9,1999-01-16,new venture,Keeneland's new venture came as a surprise to ...,pos,nyt,0
1,4fd20db88eb7c8105d77e511,1999-01-03,manager,The plan Presley and his hucksterish business ...,neg,nyt,0
2,5482d92a38f0d8403d9cd045,2008-07-08,Twitch,In the male toadfish the swimbladder has been ...,pos,nyt,114
3,5456ab1438f0d86ef22a094e,2009-10-04,executive,Mr. Pozycki is the chief executive of SJP Prop...,pos,nyt,129
4,5456ab1438f0d86ef22a094e,2009-10-04,executive,Mr. Pozycki 59 is the chief executive of SJP P...,pos,nyt,129


## Save results

In [19]:
# Write the combined frame to CSV in the working directory; index=False leaves
# the row index out of the file.
sent_data_df.to_csv('nyt_ft_sent_data.csv', index=False)