Extracting Bitcoin data from https://coincodex.com/crypto/bitcoin/, preprocessing it, and saving it to a MongoDB database

In [1]:
# import libraries
import os

import certifi
import numpy as np
import pandas as pd
from pymongo import MongoClient

In [9]:
# Location of the raw Bitcoin price CSV. This absolute path is machine-specific:
# change the username/directories to match your own environment before running.
file_path = '/Users/moritzrichter/Desktop/cryptocurrency_analysis/data/Bitcoin_(BTC)/bitcoin_2019-04-16_2024-04-14.csv'

# Parse the CSV at that location into the working DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows as a quick sanity check that the load worked
df.head(5)

Unnamed: 0,Start,End,Open,High,Low,Close,Volume,Market Cap
0,2024-04-13,2024-04-14,67033.18,67890.08,61480.0,64255.94,170782100000.0,1309283000000.0
1,2024-04-12,2024-04-13,70120.03,71173.45,66012.76,67208.21,156471000000.0,1367866000000.0
2,2024-04-11,2024-04-12,70436.71,71247.81,69587.99,70112.05,163489100000.0,1386264000000.0
3,2024-04-10,2024-04-11,69125.17,71078.17,67590.15,70513.6,158897300000.0,1361469000000.0
4,2024-04-09,2024-04-10,71618.12,71714.42,68442.77,69198.22,132405500000.0,1380411000000.0


In [10]:
# Checking for null values: count the missing entries in each column.
# Only the last expression of a cell is displayed, so the per-column counts
# are the single expression kept here (all zeros means nothing is missing).
df.isna().sum()

Start         0
End           0
Open          0
High          0
Low           0
Close         0
Volume        0
Market Cap    0
dtype: int64

In [11]:
# Target column layout for the analysis frame
desired_columns = ['Name', 'Date', 'High', 'Low', 'Open', 'Close', 'Volume', 'Marketcap']

# Build the frame in one non-mutating chain so the cell can be re-run safely:
# an in-place insert of 'Name' raises on a second run because the column
# already exists, whereas rename/assign simply reproduce the same result.
df = (
    df
    # 'End' is used as the 'Date' column; 'Market Cap' is renamed to 'Marketcap'
    .rename(columns={'End': 'Date', 'Market Cap': 'Marketcap'})
    # "Name" column with all values set to "Bitcoin"
    .assign(Name='Bitcoin')
    # Keep only the desired columns, in the desired order
    .loc[:, desired_columns]
    # Independent copy so later column assignments do not warn about
    # writing to a slice of another DataFrame
    .copy()
)

# Save the modified DataFrame to a new CSV (row index not written)
df.to_csv('modified_data.csv', index=False)

In [13]:
# Show the first five rows to confirm the new column names and order
df.head(5)

Unnamed: 0,Name,Date,High,Low,Open,Close,Volume,Marketcap
0,Bitcoin,2024-04-14,67890.08,61480.0,67033.18,64255.94,170782100000.0,1309283000000.0
1,Bitcoin,2024-04-13,71173.45,66012.76,70120.03,67208.21,156471000000.0,1367866000000.0
2,Bitcoin,2024-04-12,71247.81,69587.99,70436.71,70112.05,163489100000.0,1386264000000.0
3,Bitcoin,2024-04-11,71078.17,67590.15,69125.17,70513.6,158897300000.0,1361469000000.0
4,Bitcoin,2024-04-10,71714.42,68442.77,71618.12,69198.22,132405500000.0,1380411000000.0


In [14]:
# Convert the 'Date' column to datetime format.
# assign() returns a new frame with 'Date' replaced in its existing position,
# which avoids a SettingWithCopyWarning when `df` came from a column selection
# and keeps the cell safe to re-run.
df = df.assign(Date=pd.to_datetime(df['Date']))

In [16]:
# The analysis covers the last five years of data. List the distinct dates
# present to inspect that range (no rows are filtered out by this cell).
dates = df.loc[:, 'Date'].unique()
print(dates)

<DatetimeArray>
['2024-04-14 00:00:00', '2024-04-13 00:00:00', '2024-04-12 00:00:00',
 '2024-04-11 00:00:00', '2024-04-10 00:00:00', '2024-04-09 00:00:00',
 '2024-04-08 00:00:00', '2024-04-07 00:00:00', '2024-04-06 00:00:00',
 '2024-04-05 00:00:00',
 ...
 '2019-04-26 00:00:00', '2019-04-25 00:00:00', '2019-04-24 00:00:00',
 '2019-04-23 00:00:00', '2019-04-22 00:00:00', '2019-04-21 00:00:00',
 '2019-04-20 00:00:00', '2019-04-19 00:00:00', '2019-04-18 00:00:00',
 '2019-04-17 00:00:00']
Length: 1825, dtype: datetime64[ns]


In [18]:
# Create the Bitcoin DataFrame as an independent copy of `df`.
# A plain `bitcoin_df = df` only binds a second name to the same object, so
# any later in-place change to `bitcoin_df` would also modify `df`.
bitcoin_df = df.copy()
bitcoin_df

Unnamed: 0,Name,Date,High,Low,Open,Close,Volume,Marketcap
0,Bitcoin,2024-04-14,67890.080000,61480.000000,67033.180000,64255.940000,1.707821e+11,1.309283e+12
1,Bitcoin,2024-04-13,71173.450000,66012.760000,70120.030000,67208.210000,1.564710e+11,1.367866e+12
2,Bitcoin,2024-04-12,71247.810000,69587.990000,70436.710000,70112.050000,1.634891e+11,1.386264e+12
3,Bitcoin,2024-04-11,71078.170000,67590.150000,69125.170000,70513.600000,1.588973e+11,1.361469e+12
4,Bitcoin,2024-04-10,71714.420000,68442.770000,71618.120000,69198.220000,1.324055e+11,1.380411e+12
...,...,...,...,...,...,...,...,...
1820,Bitcoin,2019-04-21,5372.559483,5314.995062,5318.985321,5363.756655,1.208157e+10,9.440771e+10
1821,Bitcoin,2019-04-20,5349.185233,5253.519639,5308.334719,5317.962150,1.072242e+10,9.345605e+10
1822,Bitcoin,2019-04-19,5330.747550,5268.381876,5271.352176,5309.256996,1.106357e+10,9.346719e+10
1823,Bitcoin,2019-04-18,5309.767053,5247.411896,5274.409619,5271.109695,1.171812e+10,9.299844e+10


In [19]:
# Output location for the five-year Bitcoin CSV (machine-specific absolute path)
file_path = '/Users/moritzrichter/Desktop/cryptocurrency_analysis/data/Bitcoin_(BTC)/bitcoin(fiveyears).csv'

# Write the DataFrame to that path; index=False keeps the row indices out of the file
bitcoin_df.to_csv(path_or_buf=file_path, index=False)

# Storing the data in MongoDB (DB name: Finale)

In [11]:
# Read in the MongoDB server location as client.
# The connection string embeds a username and password, so it is read from the
# MONGODB_URI environment variable rather than being hardcoded in the notebook.
# A KeyError on this line means MONGODB_URI has not been set.
# NOTE(review): credentials that were previously committed in this cell should be rotated.
client = MongoClient(os.environ["MONGODB_URI"], tlsCAFile=certifi.where())

In [12]:
# List the names of the databases visible through this client connection
client.list_database_names()

['Bitcoin_db',
 'Cardano_DB',
 'Ethereum_db',
 'Finale',
 'Tether_DB',
 'XRP_DB',
 'admin',
 'local']

In [13]:
# Get a handle on the 'Finale' database, then on its 'Bitcoin' collection
db = client.get_database('Finale')
collection = db.get_collection('Bitcoin')

In [14]:
# Convert the DataFrame into a list of per-row dicts ready to add to MongoDB.
# reset_index() without inplace still adds the row number to every record as an
# 'index' field, but leaves `bitcoin_df` itself unchanged, so re-running this
# cell gives the same records instead of stacking extra index columns.
bitcoin_df_dict = bitcoin_df.reset_index().to_dict("records")

In [15]:
# Insert the list of record dicts into the collection as documents
collection.insert_many(bitcoin_df_dict)

<pymongo.results.InsertManyResult at 0x15bb6b587c0>