# Bitcoin data cleaning


Dataset obtained from https://www.kaggle.com/datasets/mczielinski/bitcoin-historical-data, where there was over 4 million entries of data. Due to large amount of data, the size of the file was 300,000 KB, and made it too big to push to github. We used the fear and greed index dataset as a restriction, where it starts from 2018-02-01. So we will only use the data from 2018-02-01, to match the time frame. 

After cleaning the data, we then export it in to the file data

In [1]:
# Importing the libraries to be used:
from datetime import datetime
import numpy as np
import pandas as pd
import requests
import os

In [2]:
def date_convertor(timestamp):
    return datetime.fromtimestamp(float(timestamp))

In [3]:
bitcoin_data = pd.read_csv("data/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv", sep=",",parse_dates = ["Timestamp"], date_parser = date_convertor, index_col=False) 

In [4]:
bitcoin_clean = bitcoin_data.dropna()
bitcoin_clean = bitcoin_clean[1900000:]
print(bitcoin_clean)

                  Timestamp      Open      High       Low     Close  \
3098119 2017-11-25 02:03:00   8185.00   8185.00   8180.11   8185.00   
3098120 2017-11-25 02:04:00   8185.00   8185.00   8185.00   8185.00   
3098121 2017-11-25 02:05:00   8185.00   8185.14   8185.00   8185.14   
3098122 2017-11-25 02:06:00   8182.01   8182.01   8182.01   8182.01   
3098123 2017-11-25 02:07:00   8185.13   8185.14   8185.13   8185.14   
...                     ...       ...       ...       ...       ...   
4857372 2021-03-30 19:56:00  58714.31  58714.31  58686.00  58686.00   
4857373 2021-03-30 19:57:00  58683.97  58693.43  58683.97  58685.81   
4857374 2021-03-30 19:58:00  58693.43  58723.84  58693.43  58723.84   
4857375 2021-03-30 19:59:00  58742.18  58770.38  58742.18  58760.59   
4857376 2021-03-30 20:00:00  58767.75  58778.18  58755.97  58778.18   

         Volume_(BTC)  Volume_(Currency)  Weighted_Price  
3098119      0.520570        4259.672912     8182.705868  
3098120      0.182275        

In [5]:
timestamp = np.array(bitcoin_clean["Timestamp"])

In [6]:
bitcoin_day = []
start_index = 0
prev = timestamp[0]
for i in range(timestamp.shape[0]):
    if str(prev)[:10] not in str(timestamp[i])[:10]:
        if str(prev)[:10] >= "2018-01-31":
            bitcoin_day.append(bitcoin_clean.iloc[i])
        prev = timestamp[i]

In [7]:
bitcoin_pd = pd.DataFrame(bitcoin_day)
#bitcoin_pd["Timestamp"] = bitcoin_pd["Timestamp"].dt.strftime('%m-%d-%Y')
#print(bitcoin_pd["Timestamp"].iloc[0])

In [8]:
bitcoin_pd = bitcoin_pd.reset_index()
bitcoin_pd = bitcoin_pd.drop(columns=["index", "Timestamp"])
print(bitcoin_pd)

          Open      High       Low     Close  Volume_(BTC)  Volume_(Currency)  \
0     10092.62  10109.93  10092.62  10109.93      0.270443        2733.799973   
1      8630.00   8630.00   8594.12   8630.00      9.315129       80242.200123   
2      8408.72   8419.79   8400.10   8414.00      1.044576        8789.768163   
3      9044.01   9044.01   9030.29   9031.93      2.555955       23095.214725   
4      8278.56   8286.74   8254.07   8254.07      0.271247        2241.175148   
...        ...       ...       ...       ...           ...                ...   
1149  52449.23  52537.81  52449.23  52527.35      2.742287      143985.370640   
1150  54821.66  54856.01  54821.66  54856.01      0.261643       14346.416748   
1151  56095.81  56158.59  56095.81  56149.01      0.898928       50459.981106   
1152  55361.25  55417.15  55361.25  55401.67      1.314104       72804.822995   
1153  57069.04  57072.89  57033.66  57051.13      0.857266       48912.428151   

      Weighted_Price  
0   

 # Extracting Fear and Greed Index

In [9]:
response = requests.get("https://api.alternative.me/fng/?limit=1600&format=csv&date_format=us")

In [10]:
fngIndex = []
startIndex = 0
endIndex = 0
count = 0
label = ["Date", "fng_value", "fng_classification"]
for i in response.iter_lines():
    line = i.decode("utf-8").strip("\t\n").split(",")
    if "03-30-2021" == line[0]:
        startIndex = count
    if "02-01-2018" == line[0]:
        endIndex = count+1
    count+=1
    fngIndex.append(line)
fngIndex = fngIndex[startIndex:endIndex]

In [11]:
fngArr = np.array(fngIndex[::-1])
fngPd = pd.DataFrame(fngArr, columns = label)
print(fngPd)

            Date fng_value fng_classification
0     02-01-2018        30               Fear
1     02-02-2018        15       Extreme Fear
2     02-03-2018        40               Fear
3     02-04-2018        24       Extreme Fear
4     02-05-2018        11       Extreme Fear
...          ...       ...                ...
1146  03-26-2021        54            Neutral
1147  03-27-2021        65              Greed
1148  03-28-2021        74              Greed
1149  03-29-2021        72              Greed
1150  03-30-2021        72              Greed

[1151 rows x 3 columns]


# Combining Fear and Greed w/ Bitcoin data

In [12]:
merged_df = pd.concat([bitcoin_pd, fngPd], axis = 1, join = "inner").dropna()

In [13]:
print(merged_df)

          Open      High       Low     Close  Volume_(BTC)  Volume_(Currency)  \
0     10092.62  10109.93  10092.62  10109.93      0.270443        2733.799973   
1      8630.00   8630.00   8594.12   8630.00      9.315129       80242.200123   
2      8408.72   8419.79   8400.10   8414.00      1.044576        8789.768163   
3      9044.01   9044.01   9030.29   9031.93      2.555955       23095.214725   
4      8278.56   8286.74   8254.07   8254.07      0.271247        2241.175148   
...        ...       ...       ...       ...           ...                ...   
1146  54673.79  54677.14  54638.56  54638.56      0.348995       19077.618823   
1147  54342.42  54342.42  54272.46  54279.85      0.480468       26086.504869   
1148  52129.52  52157.81  52046.67  52050.00      9.465353      492683.589000   
1149  52449.23  52537.81  52449.23  52527.35      2.742287      143985.370640   
1150  54821.66  54856.01  54821.66  54856.01      0.261643       14346.416748   

      Weighted_Price       

# Converting to CSV

In [14]:
if not(os.path.exists("data/bitstamp_2018-02-01_to_2021_03_31.csv")):
    bitcoin_pd.to_csv("data/bitstamp_2018-02-01_to_2021_03_31.csv")
else:
    print("Need to remove old file bitcoin")

Need to remove old file bitcoin


In [15]:
if not(os.path.exists("data/merge_bitcoin_n_fear.csv")):
    merged_df.to_csv("data/merge_bitcoin_n_fear.csv") 
else:
    print("Need to remove old file merge file")

Need to remove old file merge file
