Extracting Ethereum data from https://coincodex.com/crypto/ethereum/, preprocessing it, and saving it to a MongoDB database

In [2]:
# import libraries
import os

import certifi
import numpy as np
import pandas as pd
from pymongo import MongoClient

In [3]:
# Location of the raw Ethereum CSV on disk.
# Adjust the username and path to match your local environment.
file_path = '/Users/moritzrichter/Desktop/cryptocurrency_analysis/data/Ethereum_(ETH)/ethereum_2019-04-16_2024-04-14.csv'

# Read the CSV at that location into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick sanity check that the load worked
df.head(n=5)

Unnamed: 0,Start,End,Open,High,Low,Close,Volume,Market Cap
0,2024-04-13,2024-04-14,3231.1639,3297.6372,2874.401,3024.3586,32796790000.0,384236700000.0
1,2024-04-12,2024-04-13,3515.9494,3559.298,3207.709,3245.0984,20597450000.0,413222700000.0
2,2024-04-11,2024-04-12,3544.0727,3619.6398,3487.4192,3516.6996,23359330000.0,425841400000.0
3,2024-04-10,2024-04-11,3505.5251,3559.9659,3426.2983,3547.5273,25494310000.0,421861200000.0
4,2024-04-09,2024-04-10,3702.1783,3719.8724,3480.2572,3510.2155,27414390000.0,432872600000.0


In [4]:
# Checking for null values: number of missing entries in each column.
# (`isnull` is an alias of `isna`, so one call is enough; a second, non-final
# expression in the cell would be evaluated and its result thrown away.)
df.isna().sum()

Start         0
End           0
Open          0
High          0
Low           0
Close         0
Volume        0
Market Cap    0
dtype: int64

In [5]:
# Target column layout for the cleaned frame
desired_columns = ['Name', 'Date', 'High', 'Low', 'Open', 'Close', 'Volume', 'Marketcap']

# Build the cleaned frame in one chain so the cell can be re-run safely:
#   * 'End' is used as the 'Date' column and 'Market Cap' becomes 'Marketcap'
#   * a constant "Name" column is added with every value set to "Ethereum"
#   * only the desired columns are kept, in the desired order
# `.copy()` makes the result an independent frame, so later column
# assignments on `df` do not operate on a slice of the loaded data.
df = (
    df.rename(columns={'End': 'Date', 'Market Cap': 'Marketcap'})
      .assign(Name='Ethereum')
      .loc[:, desired_columns]
      .copy()
)

# Save the modified DataFrame to a new CSV (row index is not written)
df.to_csv('modified_data.csv', index=False)

In [6]:
# Preview the first rows of the renamed and reordered frame
df.head()

Unnamed: 0,Name,Date,High,Low,Open,Close,Volume,Marketcap
0,Ethereum,2024-04-14,3297.6372,2874.401,3231.1639,3024.3586,32796790000.0,384236700000.0
1,Ethereum,2024-04-13,3559.298,3207.709,3515.9494,3245.0984,20597450000.0,413222700000.0
2,Ethereum,2024-04-12,3619.6398,3487.4192,3544.0727,3516.6996,23359330000.0,425841400000.0
3,Ethereum,2024-04-11,3559.9659,3426.2983,3505.5251,3547.5273,25494310000.0,421861200000.0
4,Ethereum,2024-04-10,3719.8724,3480.2572,3702.1783,3510.2155,27414390000.0,432872600000.0


In [7]:
# Parse the 'Date' column into pandas datetime values, then store the
# parsed values back under the same column name
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

In [8]:
# The analysis covers the last five years of data. This cell only lists the
# distinct dates present in the frame so the covered range can be inspected;
# no rows are filtered here.
dates = df['Date'].unique()
print(dates)

<DatetimeArray>
['2024-04-14 00:00:00', '2024-04-13 00:00:00', '2024-04-12 00:00:00',
 '2024-04-11 00:00:00', '2024-04-10 00:00:00', '2024-04-09 00:00:00',
 '2024-04-08 00:00:00', '2024-04-07 00:00:00', '2024-04-06 00:00:00',
 '2024-04-05 00:00:00',
 ...
 '2019-04-26 00:00:00', '2019-04-25 00:00:00', '2019-04-24 00:00:00',
 '2019-04-23 00:00:00', '2019-04-22 00:00:00', '2019-04-21 00:00:00',
 '2019-04-20 00:00:00', '2019-04-19 00:00:00', '2019-04-18 00:00:00',
 '2019-04-17 00:00:00']
Length: 1825, dtype: datetime64[ns]


In [9]:
# Create the Ethereum DataFrame used for export and storage.
# No filter is applied here, so it holds every row of `df`. It is taken as an
# independent copy (not a second name for the same object) so that in-place
# changes to one frame cannot silently alter the other.
ethereum_df = df.copy()
ethereum_df

Unnamed: 0,Name,Date,High,Low,Open,Close,Volume,Marketcap
0,Ethereum,2024-04-14,3297.637200,2874.401000,3231.163900,3024.358600,3.279679e+10,3.842367e+11
1,Ethereum,2024-04-13,3559.298000,3207.709000,3515.949400,3245.098400,2.059745e+10,4.132227e+11
2,Ethereum,2024-04-12,3619.639800,3487.419200,3544.072700,3516.699600,2.335933e+10,4.258414e+11
3,Ethereum,2024-04-11,3559.965900,3426.298300,3505.525100,3547.527300,2.549431e+10,4.218612e+11
4,Ethereum,2024-04-10,3719.872400,3480.257200,3702.178300,3510.215500,2.741439e+10,4.328726e+11
...,...,...,...,...,...,...,...,...
1820,Ethereum,2019-04-21,177.250340,173.480841,174.860856,175.282926,4.891869e+09,1.851742e+10
1821,Ethereum,2019-04-20,175.010126,171.873865,174.901697,174.800910,5.053026e+09,1.838350e+10
1822,Ethereum,2019-04-19,176.325867,168.036023,168.402685,174.767110,4.994551e+09,1.833714e+10
1823,Ethereum,2019-04-18,170.759335,167.288903,169.485605,168.445130,4.252172e+09,1.779057e+10


In [10]:
# Destination for the processed CSV (note: this rebinds `file_path`)
file_path = '/Users/moritzrichter/Desktop/cryptocurrency_analysis/data/Ethereum_(ETH)/ethereum(fiveyears).csv'

# Write the DataFrame to that destination; `index=False` keeps the row
# indices out of the file
ethereum_df.to_csv(path_or_buf=file_path, index=False)

# Storing the data in MongoDB (database name: Finale)

In [11]:
# Read in the MongoDB server location as a client.
# The connection string contains the username and password, so it is taken
# from the MONGODB_URI environment variable instead of being written into the
# notebook (e.g. "mongodb+srv://<user>:<password>@<cluster-host>/<db>?retryWrites=true&w=majority").
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string"
    )

# certifi supplies the CA bundle used to verify the server's TLS certificate
client = MongoClient(mongodb_uri, tlsCAFile=certifi.where())

In [12]:
# Find databases: list the names of the databases visible to this client
client.list_database_names()

['Bitcoin_db',
 'Cardano_DB',
 'Ethereum_db',
 'Finale',
 'Tether_DB',
 'XRP_DB',
 'admin',
 'local']

In [13]:
# Connect to the 'Finale' database on MongoDB and select the collection that
# receives this notebook's data. The notebook processes Ethereum prices, so
# the records go to an 'Ethereum' collection rather than 'Bitcoin'.
db = client['Finale']
collection = db['Ethereum']

In [14]:
# Convert the Ethereum DataFrame into a list of per-row dicts for MongoDB.
# `reset_index()` (not in place) turns the row index into an 'index' field on
# each record while leaving `ethereum_df` untouched, so the cell can be re-run.
# The name `bitcoin_df_dict` is kept because the insert cell reads it.
bitcoin_df_dict = ethereum_df.reset_index().to_dict("records")

In [15]:
# Insert the prepared records into the selected collection in one call.
# NOTE(review): running this cell more than once will attempt the insert
# again — confirm whether repeated runs are acceptable for this collection.
collection.insert_many(bitcoin_df_dict)

<pymongo.results.InsertManyResult at 0x15bb6b587c0>