In [68]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import torch
import torchvision
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.utils import save_image
from torchvision.datasets import MNIST, CIFAR10

from sklearn import preprocessing, model_selection
from datetime import datetime, timedelta
import calendar

import os
import gc
import sys

CPU times: user 587 µs, sys: 58 µs, total: 645 µs
Wall time: 649 µs


In [69]:
# Unpacks the date column to multiple features

def process_dates(df, column='Date', dform='%Y-%m-%d', year=True, month=True, 
                  day=True, weekday=True, inplace=False, drop_date=False):
    # Extracts desired features from date
    dates = df[column]
        
    date_dict = {'year': [], 'month': [], 'day': [], 'weekday': []}
    
    for i in range(len(df)):
        date = datetime.strptime(dates[i], dform)
        if year:
            date_dict['year'].append(date.year)
        if month:
            date_dict['month'].append(date.month)
        if day:
            date_dict['day'].append(date.day)
        if weekday:
            date_dict['weekday'].append(date.weekday())
    
    for feature in date_dict:
        if len(date_dict[feature]) == 0:
            del date_dict[feature]
            
    if drop_date:
        df.drop(column, inplace=True, axis=1)
    
    if inplace:
        for key in date_dict:
            df[key] = date_dict[key]
        return None
            
    return date_dict


def process_datetimes(df, column='datetime', dform='%Y-%m-%d %H:%M', year=True, 
                      month=True, day=True, weekday=True, hour=True, 
                      minute=True, second=False, inplace=False, 
                      drop_date=False):
    # Extracts desired features from date
    datetimes = df[column]
        
    datetime_dict = {
        'year': [], 'month': [], 'day': [], 'weekday': [], 
        'hour': [], 'minute': [], 'second': []
    }
    
    for i in range(len(df)):
        datetime_ = datetime.strptime(datetimes[i], dform)
        datetime_ = datetime_ - timedelta(hours=4)
        if year:
            datetime_dict['year'].append(datetime_.year)
        if month:
            datetime_dict['month'].append(datetime_.month)
        if day:
            datetime_dict['day'].append(datetime_.day)
        if weekday:
            datetime_dict['weekday'].append(datetime_.weekday())
        if hour:
            datetime_dict['hour'].append(datetime_.hour)
        if minute:
            datetime_dict['minute'].append(datetime_.minute)
        if second:
            datetime_dict['second'].append(datetime_.second)
    
#     for feature in datetime_dict:
#         if len(datetime_dict[feature]) == 0:
#             del datetime_dict[feature]
            
    if drop_date:
        df.drop(column, inplace=True, axis=1)
    
    if inplace:
        for key in datetime_dict:
            if len(datetime_dict[key]) > 0:
                df[key] = datetime_dict[key]
        return None
            
    return datetime_dict


def create_series(df, col, out='Target', inplace=False):
    
    if not inplace:
        df = df.copy()
    
    next_list = []
    for i in range(1, len(df)):
        next_list.append(df.iloc[i][col])
        
    df.drop(len(df) - 1, inplace=True)
    df[out] = next_list
    
    if not inplace:
        return df
    return None


def create_bin_series(df, col, out='Target', inplace=False):
    if not inplace:
        df = df.copy()
    
    next_list = []
    for i in range(1, len(df)):
        next_list.append(df.iloc[i][col] - df.iloc[i-1][col])
        
    for i in range(len(next_list)):
        if next_list[i] >= 0:
            next_list[i] = 1
        else:
            next_list[i] = 0
        
    df.drop(len(df) - 1, inplace=True)
    df[out] = next_list
    
    if not inplace:
        return df
    return None

In [8]:
data_path = '/Users/michaeldjaballah/Data/Crypto/raw_import2/raw'
all_filenames = os.listdir(data_path)
new_filenames = []
for i in range(len(all_filenames)):
    if all_filenames[i][0] == '2':
        new_filenames.append(all_filenames[i])
all_filenames = new_filenames

In [42]:
# all but csv
filename[:4]
# minutes
filename[14:16]
# date and time without seconds
filename[:16]

renaming_dict = {}

for filename in all_filenames:
    timename = filename[:16]
    if filename[15] == '1':
        timename = timename[:15] + '0'
    elif filename[15] == '6':
        timename = timename[:15] + '5'
    renaming_dict[timename] = filename

In [44]:
%%time
dataframe_dict = {}

for timename in renaming_dict:
    dataframe_dict[timename] = pd.read_csv(data_path + '/' + renaming_dict[timename])

CPU times: user 32.3 s, sys: 2.48 s, total: 34.8 s
Wall time: 39 s


In [57]:
# test_df = dataframe_dict[timename].copy()
# test_df['datetime'] = [timename] * len(test_df)
# process_datetimes(test_df, inplace=True, drop_date=True)
# test_df

In [75]:
%%time
dform='%Y-%m-%d %H:%M'

# date = datetime.strptime(dates[i], dform)

for timename in renaming_dict:
    df = dataframe_dict[timename]
    df.drop('Unnamed: 10', axis=1, inplace=True)
    df['datetime'] = [timename] * len(df)
    process_datetimes(df, inplace=True, drop_date=True)

CPU times: user 1min 36s, sys: 609 ms, total: 1min 36s
Wall time: 1min 37s


In [81]:
sum([sys.getsizeof(dataframe_dict[name]) for name in dataframe_dict])

1269643172

In [83]:
1269643172/(1024 * 1024)

1210.826084136963

In [87]:
%%time
df_list = [dataframe_dict[timename] for timename in dataframe_dict]
big_df = pd.concat(df_list)

CPU times: user 5.46 s, sys: 133 ms, total: 5.59 s
Wall time: 5.61 s


In [103]:
big_df.head(10)

Unnamed: 0,#,Name,Symbol,Market Cap,Price,Circulating Supply,Volume (24h),% 1h,% 24h,% 7d,year,month,day,weekday,hour,minute
0,1,Bitcoin,BTC,"$126,314,012,094","$6,896.05","18,316,862 BTC","$43,352,916,378",0.11%,-5.09%,2.27%,2020,4,10,4,17,55
1,2,Ethereum,ETH,"$17,636,306,349",$159.66,"110,465,037 ETH","$17,955,924,594",0.53%,-6.13%,12.58%,2020,4,10,4,17,55
2,3,XRP,XRP,"$8,325,384,980",$0.189304,"43,978,966,311 XRP *","$2,107,571,588",0.53%,-4.35%,5.75%,2020,4,10,4,17,55
3,4,Tether,USDT,"$6,362,759,432",$1.00,"6,361,032,509 USDT *","$55,956,566,575",0.22%,0.26%,-0.19%,2020,4,10,4,17,55
4,5,Bitcoin Cash,BCH,"$4,314,224,998",$234.77,"18,376,481 BCH","$4,245,854,728",0.66%,-8.62%,-0.32%,2020,4,10,4,17,55
5,6,Bitcoin SV,BSV,"$3,474,348,788",$189.08,"18,375,290 BSV","$3,457,956,063",0.12%,-11.78%,6.63%,2020,4,10,4,17,55
6,7,Litecoin,LTC,"$2,735,715,645",$42.43,"64,480,532 LTC","$3,807,862,439",0.48%,-8.05%,5.21%,2020,4,10,4,17,55
7,8,EOS,EOS,"$2,312,147,765",$2.51,"921,679,807 EOS *","$3,635,038,426",0.17%,-7.65%,7.87%,2020,4,10,4,17,55
8,9,Binance Coin,BNB,"$2,171,948,813",$13.96,"155,536,713 BNB *","$411,246,860",0.48%,-6.81%,5.11%,2020,4,10,4,17,55
9,10,Tezos,XTZ,"$1,356,901,619",$1.92,"706,552,810 XTZ *","$202,719,606",-0.64%,-8.75%,13.76%,2020,4,10,4,17,55


In [111]:
%%time
comp_col = []

for i in range(len(big_df)):
    p = big_df.iloc[i]
    comp =''
    for attr in ['year', 'month', 'day', 'hour', 'minute']:
        comp_ = str(p[attr])
        if len(comp_) < 2:
            comp += '0' + comp_
        else:
            comp += comp_
    comp = int(comp)
    comp_col.append(comp)

CPU times: user 5min 41s, sys: 505 ms, total: 5min 42s
Wall time: 5min 42s


In [113]:
big_df['sort'] = comp_col

In [117]:
%%time
big_df.sort_values('sort', ignore_index=True, inplace=True)

CPU times: user 515 ms, sys: 78.9 ms, total: 593 ms
Wall time: 616 ms


In [118]:
big_df

Unnamed: 0,#,Name,Symbol,Market Cap,Price,Circulating Supply,Volume (24h),% 1h,% 24h,% 7d,year,month,day,weekday,hour,minute,sort
0,2,Ethereum,ETH,"$20,934,619,876",$190.22,"110,053,295 ETH","$15,384,861,352",-1.79%,-4.70%,-14.84%,2020,3,11,2,13,25,202003111325
1,74,Ren,REN,"$61,686,791",$0.072466,"851,253,651 REN *","$10,302,859",-1.29%,2.56%,12.17%,2020,3,11,2,13,25,202003111325
2,73,BitTorrent,BTT,"$64,844,357",$0.000306,"212,116,500,000 BTT *","$72,507,799",-0.47%,-3.56%,-16.38%,2020,3,11,2,13,25,202003111325
3,72,v.systems,VSYS,"$64,928,586",$0.033416,"1,943,023,245 VSYS *","$3,672,293",-2.45%,-7.56%,-38.21%,2020,3,11,2,13,25,202003111325
4,71,Bytecoin,BCN,"$66,245,314",$0.000360,"184,066,828,814 BCN","$6,940",0.09%,8.32%,-20.19%,2020,3,11,2,13,25,202003111325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1967591,131,Electroneum,ETN,"$19,637,779",$0.001951,"10,064,293,701 ETN","$85,713",-1.62%,-1.11%,-10.11%,2020,4,14,1,17,25,202004141725
1967592,130,TrueChain,TRUE,"$20,230,771",$0.254234,"79,575,543 TRUE *","$34,003,535",-0.26%,1.79%,-11.47%,2020,4,14,1,17,25,202004141725
1967593,129,Elastos,ELA,"$20,459,851",$1.11,"18,394,684 ELA","$1,396,131",0.22%,-1.19%,-12.51%,2020,4,14,1,17,25,202004141725
1967594,139,MX Token,MX,"$18,618,723",$0.096331,"193,278,361 MX *","$8,732,880",-0.90%,3.93%,0.34%,2020,4,14,1,17,25,202004141725


In [121]:
%%time
sys.getsizeof(big_df)

CPU times: user 30.5 s, sys: 92.3 ms, total: 30.6 s
Wall time: 30.7 s


1283810020

In [122]:
1283810020/(1024**2)

1224.3366432189941

In [123]:
save_path = '/Users/michaeldjaballah/Data/Crypto/processed/'

In [124]:
# big_df.to_csv(save_path + 'ri2.csv')