# Numerical Data - Cleanup

<br>
**Author** Mary Letey <br>
**Date** 4/16/18 <br>
**File** megaset.ipynb <br>
**Purpose** Creating a mega-dataset aggregating all previous data sets, sorted by date for the last 5 years

<br><br>

In [1]:
# Imports
import numpy as np
import pandas as pd
import datetime
from dateutil import parser
import matplotlib.pylab as plt
%matplotlib inline

In [61]:
def cleanup(df,name):
    dates = list(df["date"]);
    newdates = []
    for i in range(0,len(dates)):
        mystr = dates[i]
        newdates.append(parser.parse(mystr))
    df["date"] = newdates
    df.sort_values(df.columns[1])
    title = name+"-title"
    body = name+"-body"
    
    titles = list(df["title"])
    bodies = list(df["body"])
    
    df = pd.DataFrame({'date': newdates, title: titles, body: bodies})
    columnsTitles=["date",title,body]
    df = df.reindex(columns = columnsTitles)
    return df

In [62]:
def combine(df1,df2,name):
    
    title = name+"-title"
    body = name+"-body"
    
    date1 = list(df1["date"])
    title1 = list(df1[title])
    body1 = list(df1[body])
    date2 = list(df2["date"])
    title2 = list(df2[title])
    body2 = list(df2[body])
    
    newdate = date1 + date2
    newtitle = title1 + title2
    newbody = body1 + body2
    
    df = pd.DataFrame({'date': newdate, title: newtitle, body: newbody})
    
    return df

In [69]:
dell_wsj_df = pd.read_csv('../data/nlp/dell_5yrs_wsj.csv')
dell_wsj_df = cleanup_wsj(dell_wsj_df, [15,16])
columnsTitles=["date","title","body"]
dell_wsj_df = dell_wsj_df.reindex(columns = columnsTitles)
title = "dell"+"-title"
body = "dell"+"-body"
dates = list(dell_wsj_df["date"])
titles = list(dell_wsj_df["title"])
bodies = list(dell_wsj_df["body"])
dell_wsj_df = pd.DataFrame({'date': dates, title: titles, body: bodies})
columnsTitles=["date",title,body]
dell_wsj_df = dell_wsj_df.reindex(columns = columnsTitles)
dell_wsj_df.head()

Unnamed: 0,date,dell-title,dell-body
0,2013-02-06 05:44:00,Lenovo’s Diplomatic Response to Dell Buyout,BLOOMBERGDell Inc.’s $24.4 billion deal to tak...
1,2013-04-22 12:50:00,Blackstone Ends Pursuit of Dell,Blackstone Group LP has ended its pursuit of D...
2,2013-02-05 20:50:00,Dell to Sell Itself for $24.4 Billion,Dell Inc. on Tuesday struck a $24.4 billion de...
3,2013-04-23 22:00:00,Dell Approves $91.1 Million in Retention Bonuses,Dell Inc. disclosed Tuesday that its board app...
4,2013-05-06 18:57:00,Dell Deal Negotiator Walks Fine Line,"Last August, Alex Mandl was hosting dinner gue..."


In [112]:
# -- dell 
df = pd.read_csv("../data/nlp/dell_5yrs_seekingalpha.csv");
dfdell = cleanup(df,"dell")
dfdell = combine(dfdell, dell_wsj_df,"dell")
dfdell.sort_values(dfdell.columns[1])

# -- hp 
df = pd.read_csv("../data/nlp/hp_5yrs_fool.csv");
dfhp = cleanup(df,"hp")
df = pd.read_csv("../data/nlp/hp_5yrs_seekingalpha.csv")
df = cleanup(df,"hp")
dfhp = combine(dfhp,df,"hp")

# -- ibm 
df = pd.read_csv("../data/nlp/ibm_5yrs_fool.csv");
dfibm = cleanup(df,"ibm")
df = pd.read_csv("../data/nlp/ibm_5yrs_seekingalpha.csv")
df = cleanup(df,"ibm")
dfibm = combine(dfibm,df,"ibm")

# -- seagate
df = pd.read_csv("../data/nlp/seagate_5yrs_fool.csv");
dfseagate = cleanup(df,"seagate")
df = pd.read_csv("../data/nlp/seagate_5yrs_seekingalpha.csv")
df = cleanup(df,"seagate")
dfseagate = combine(dfseagate,df,"seagate")

# -- western_digital
df = pd.read_csv("../data/nlp/western_digital_5yrs_fool.csv");
dfwestern_digital = cleanup(df,"western-digital")
dfwestern_digital.sort_values(dfwestern_digital.columns[0])

total_df = pd.merge(dfdell, dfhp, on="date")
total_df = pd.merge(total_df, dfibm, on="date")
total_df = pd.merge(total_df, dfseagate, on="date")
#total_df = pd.merge(dfdell, dfwestern_digital, on="date")


In [113]:
dfhp.head()

Unnamed: 0,date,hp-body,hp-title
0,2013-04-05 20:30:00,Warren Buffett became the greatest investor of...,Warren Buffett Doesn't Buy Junk Stocks (but Ma...
1,2013-02-01 11:30:00,On this day in economic and financial history....,Lessons of the Past for Boeing and HP Today
2,2013-04-18 13:36:00,"Like most investors, you probably aim for the ...",5 Leading Companies in Employee Relations
3,2013-03-05 18:00:00,"Chances are if you have kids, or you're just a...",DreamWorks Animation: Where Innovation and Ima...
4,2013-03-03 11:00:00,On this day in economic and financial history ...,Origin Stories of 3 of the World's Largest Com...


In [106]:
dfdell.head()

Unnamed: 0,date,dell-body,dell-title
0,2013-03-18 00:00:00,(At Least Not At These Prices)Despite differen...,"Dude, You're Not Getting DELL...Or AUTO"
1,2013-03-14 00:00:00,The media is currently focused on the saga of ...,Critical Analysis Of Dell Buyout Plan Shows Hi...
2,2013-03-14 00:00:00,Let me go through a EBIT multiple valuation me...,The Value Of Dell Using The EBIT Multiple Valu...
3,2018-02-07 00:00:00,Henry Blodget is pulling on exactly the right ...,Dell LBO Deal Structure - Like Buying A Rental...
4,2013-03-12 08:55:00,"In the view of many investors, billionaire Car...",Billionaire Carl Icahn Moving Into Multilevel-...


In [107]:
total_df.head()

Unnamed: 0,date,dell-body,dell-title,hp-body,hp-title,ibm-body,ibm-title,seagate-body,seagate-title
0,2018-06-05,"Last week, the session border controller (SBC)...",Sonus And AudioCodes: Your 1-2 Punch For Poten...,"Last week, the session border controller (SBC)...",Sonus And AudioCodes: Your 1-2 Punch For Poten...,"Let's get to right down to the ""meat on the bo...",IBM: Bet On Big Blue's Boom?,"OK, now that I have your attention, we can tal...",Who Buys Micron?
1,2018-06-05,"Last week, the session border controller (SBC)...",Sonus And AudioCodes: Your 1-2 Punch For Poten...,"Aruba, a Hewlett Packard Enterprise (NYSE:HPE)...",Hewlett Packard Enterprise's Aruba Leverages H...,"Let's get to right down to the ""meat on the bo...",IBM: Bet On Big Blue's Boom?,"OK, now that I have your attention, we can tal...",Who Buys Micron?


In [109]:
date = list(total_df["date"])
print(len(date))

2
