In [293]:
# imports
import pandas as pd
import numpy as np
import datetime
import math
from dateutil import parser
from functools import reduce

In [294]:
# Load every raw price CSV into its own DataFrame.
# The Dow file is the only one read with a tab separator; the others use
# read_csv's default delimiter.
dow_df = pd.read_csv('./datafiles/numerical-initial/dow-data.csv', sep='\t')
(
    dell_df,
    hp_df,
    ibm_df,
    nasdaq_df,
    sandp500_df,
    seagate_df,
    westdig_df,
) = map(
    pd.read_csv,
    (
        './datafiles/numerical-initial/dell-data.csv',
        './datafiles/numerical-initial/hp-data.csv',
        './datafiles/numerical-initial/ibm-data.csv',
        './datafiles/numerical-initial/nasdaq-data.csv',
        './datafiles/numerical-initial/sandp500-data.csv',
        './datafiles/numerical-initial/seagate-data.csv',
        './datafiles/numerical-initial/westdig-data.csv',
    ),
)

In [295]:
# Discard rows in which *every* column is NaN; partially filled rows are kept.
(
    dell_df,
    dow_df,
    hp_df,
    ibm_df,
    nasdaq_df,
    sandp500_df,
    seagate_df,
    westdig_df,
) = [
    frame.dropna(axis=0, how='all')
    for frame in (
        dell_df,
        dow_df,
        hp_df,
        ibm_df,
        nasdaq_df,
        sandp500_df,
        seagate_df,
        westdig_df,
    )
]

In [296]:
# Multipliers for the magnitude suffix that may end a volume string.
_VOLUME_SUFFIX_MULTIPLIERS = {'k': 10**3, 'K': 10**3, 'M': 10**6, 'B': 10**9}


def _parse_volume(raw_volume):
    """Return one 'Volume' cell as a float, or None when it cannot be read."""
    if isinstance(raw_volume, str):
        text = raw_volume.strip()
        multiplier = _VOLUME_SUFFIX_MULTIPLIERS.get(text[-1:], 1)
        if multiplier != 1:
            text = text[:-1]
        try:
            return float(text) * multiplier
        except ValueError:
            return None
    # Non-string cells: keep real numbers, map NaN / None / other types to None.
    try:
        number = float(raw_volume)
    except (TypeError, ValueError):
        return None
    return None if math.isnan(number) else number


def df_cleanup(df):
    """Convert 'Volume' to numbers, parse 'Date', and drop two helper columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Volume', 'Date', 'SMAVG(15)' and 'Day'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``, without 'SMAVG(15)' and
        'Day', where

        * 'Volume' is float64: strings such as '1.5M' or '250k' are scaled by
          their suffix, plain numbers are kept, and anything unreadable
          becomes NaN;
        * 'Date' holds the result of ``dateutil.parser.parse`` on each value.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    Exception
        Whatever ``parser.parse`` raises for a 'Date' value it cannot parse.
    """
    cleaned = df.drop(columns=['SMAVG(15)', 'Day'])

    # Build the column on df's own index: rows removed earlier (e.g. by
    # dropna) leave gaps, and a fresh 0..n-1 index would misalign the values.
    cleaned['Volume'] = pd.Series(
        [_parse_volume(raw_volume) for raw_volume in df['Volume']],
        index=df.index,
        dtype='float64',
    )
    cleaned['Date'] = [parser.parse(date) for date in df['Date']]
    return cleaned

# Run df_cleanup over every frame, rebinding each name to its cleaned result.
dell_df, hp_df, ibm_df, seagate_df, westdig_df = [
    df_cleanup(frame)
    for frame in (dell_df, hp_df, ibm_df, seagate_df, westdig_df)
]

dow_df, nasdaq_df, sandp500_df = [
    df_cleanup(frame)
    for frame in (dow_df, nasdaq_df, sandp500_df)
]

In [297]:
def rename(df, company_name):
    """Prefix every column except 'Date' with ``company_name`` and an underscore.

    The labels are replaced on ``df`` itself, and that same object is returned.
    """
    df.columns = [
        column if column == 'Date' else f'{company_name}_{column}'
        for column in df.columns
    ]
    return df

# Tag every non-'Date' column of each frame with that frame's short prefix.
(
    dell_df,
    hp_df,
    ibm_df,
    seagate_df,
    westdig_df,
    dow_df,
    nasdaq_df,
    sandp500_df,
) = [
    rename(frame, prefix)
    for frame, prefix in (
        (dell_df, 'dell'),
        (hp_df, 'hp'),
        (ibm_df, 'ibm'),
        (seagate_df, 'seagate'),
        (westdig_df, 'westdig'),
        (dow_df, 'dow'),
        (nasdaq_df, 'nasdaq'),
        (sandp500_df, 'sandp'),
    )
]

In [298]:
# Outer-join all frames on 'Date', folding left to right, then write the
# combined frame (index included) to CSV.
dfs = [dell_df, hp_df, ibm_df, seagate_df, westdig_df, dow_df, nasdaq_df, sandp500_df]
mega_df = dfs[0]
for next_df in dfs[1:]:
    mega_df = pd.merge(mega_df, next_df, on=['Date'], how='outer')
mega_df.to_csv('./data-cleanup/combined_data_set.csv')

In [299]:
# Load the Dell WSJ CSV from the scraper directory.
dell_wsj_df = pd.read_csv(filepath_or_buffer='./scraper/dell_5yrs_wsj.csv')

In [300]:
# Fragments stripped from a raw 'date' string, in this order, before parsing.
_WSJ_DATE_NOISE = ('Updated ', ' ET', ' | WSJ Pro')


def _parse_wsj_date(date_str):
    """Strip the known decorations from one raw date string and parse it."""
    for fragment in _WSJ_DATE_NOISE:
        date_str = date_str.replace(fragment, '')
    return parser.parse(date_str)


def cleanup_wsj(df, drop_rows=(15, 16)):
    """Drop unwanted rows and turn the 'date' column into parsed datetimes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'date' column of strings. It is not modified.
    drop_rows : iterable of index labels, optional
        Row labels removed before parsing. Defaults to ``(15, 16)``, the
        labels that were previously hard-coded (presumably rows of the Dell
        file that cannot be parsed; TODO confirm). Pass ``()`` or ``None``
        to keep every row.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped rows. Surviving rows keep their
        original index labels, and each row's 'date' is parsed from that
        row's own string.

    Raises
    ------
    KeyError
        If a label in ``drop_rows`` is not present in ``df.index``.
    Exception
        Whatever ``parser.parse`` raises for a string it cannot parse.
    """
    labels = [] if drop_rows is None else list(drop_rows)
    cleaned = df.drop(labels)

    # Assign a plain list (positional) rather than a freshly indexed Series:
    # after the drop the index has gaps, and index alignment would attach the
    # wrong date to every row after the first dropped label.
    cleaned['date'] = [_parse_wsj_date(date_str) for date_str in cleaned['date']]
    return cleaned

In [301]:
# Clean the WSJ frame, then write the cleaned frame (index included) to CSV.
dell_wsj_df = cleanup_wsj(df=dell_wsj_df)
dell_wsj_df.to_csv(path_or_buf='./data-cleanup/cleaned-dell-wsj.csv')

In [302]:
# Preview only the first rows instead of rendering the whole frame.
dell_wsj_df.head(10)

Unnamed: 0,title,date,body
0,Lenovo’s Diplomatic Response to Dell Buyout,2013-02-06 05:44:00,BLOOMBERGDell Inc.’s $24.4 billion deal to tak...
1,Blackstone Ends Pursuit of Dell,2013-04-22 12:50:00,Blackstone Group LP has ended its pursuit of D...
2,Dell to Sell Itself for $24.4 Billion,2013-02-05 20:50:00,Dell Inc. on Tuesday struck a $24.4 billion de...
3,Dell Approves $91.1 Million in Retention Bonuses,2013-04-23 22:00:00,Dell Inc. disclosed Tuesday that its board app...
4,Dell Deal Negotiator Walks Fine Line,2013-05-06 18:57:00,"Last August, Alex Mandl was hosting dinner gue..."
5,Michael Dell Has Dreamliner on Order,2013-01-23 19:38:00,Add Michael Dell to the list of planned custom...
6,Dell Laptop Does Flips To Try to Be A Thick Ta...,2013-04-02 21:00:00,Welcome to another in our series of reviews on...
7,Blackstone’s Letter to Dell: ‘We Will Not be P...,2013-04-19 07:31:00,"Blackstone Group, which ended its pursuit of D..."
8,Criador da Dell fecha capital para proteger se...,2013-02-06 00:03:00,"By Ben Worthen e Anupreeta DaFeb. 6, 2013 12:0..."
9,The Hidden Deal Maker Behind the Dell Buyout,2013-02-05 19:30:00,The buyout of Dell Inc. is a breakout moment f...
