Extracting Polkadot data from https://coincodex.com/crypto/polkadot/, preprocessing it, and saving it to a MongoDB database

In [1]:
# import libraries
import os
from getpass import getpass

import certifi
import numpy as np
import pandas as pd
from pymongo import MongoClient

In [5]:
# Location of the raw Polkadot price CSV on the local machine.
# Change this to wherever the file lives in your environment.
file_path = '/Users/moritzrichter/Desktop/cryptocurrency_analysis/data/Polkadot_(DOT)/polkadot_2019-04-16_2024-04-14.csv'

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick sanity check on the load
df.head(n=5)

Unnamed: 0,Start,End,Open,High,Low,Close,Volume,Market Cap
0,2024-04-13,2024-04-14,7.241344,7.322462,5.830095,6.452436,1414697000.0,6883850000.0
1,2024-04-12,2024-04-13,8.398041,8.485889,6.637766,7.236212,728838400.0,7936690000.0
2,2024-04-11,2024-04-12,8.390784,8.517427,8.201845,8.381547,647681200.0,8250478000.0
3,2024-04-10,2024-04-11,8.648202,8.689573,8.192059,8.415371,736863300.0,8330455000.0
4,2024-04-09,2024-04-10,9.03972,9.095918,8.642528,8.667247,640497200.0,8796045000.0


In [6]:
# Check for null values: number of missing entries in each column.
# `isnull` is an alias of `isna`, and only the cell's last expression is
# displayed, so a single call is all that is needed here.
df.isna().sum()

Start         0
End           0
Open          0
High          0
Low           0
Close         0
Volume        0
Market Cap    0
dtype: int64

In [7]:
# Tag every row with the coin name, use the period end ('End') as the row's
# 'Date', rename 'Market Cap' to 'Marketcap', and keep only the desired
# columns in the desired order.
# Written as one non-mutating chain so the cell can be re-run safely: an
# in-place `df.insert(0, 'Name', ...)` raises ValueError on a second run
# because the 'Name' column already exists.
desired_columns = ['Name', 'Date', 'High', 'Low', 'Open', 'Close', 'Volume', 'Marketcap']
df = (
    df.assign(Name='Polkadot')
    .rename(columns={'End': 'Date', 'Market Cap': 'Marketcap'})
    .loc[:, desired_columns]
    .copy()  # own the data so later column assignments don't warn about writing to a slice
)

# Save the reshaped DataFrame to a CSV in the current working directory
df.to_csv('modified_data.csv', index=False)

In [8]:
# Preview the first rows to check the column names and their order
df.head()

Unnamed: 0,Name,Date,High,Low,Open,Close,Volume,Marketcap
0,Polkadot,2024-04-14,7.322462,5.830095,7.241344,6.452436,1414697000.0,6883850000.0
1,Polkadot,2024-04-13,8.485889,6.637766,8.398041,7.236212,728838400.0,7936690000.0
2,Polkadot,2024-04-12,8.517427,8.201845,8.390784,8.381547,647681200.0,8250478000.0
3,Polkadot,2024-04-11,8.689573,8.192059,8.648202,8.415371,736863300.0,8330455000.0
4,Polkadot,2024-04-10,9.095918,8.642528,9.03972,8.667247,640497200.0,8796045000.0


In [9]:
# Convert the 'Date' column to pandas datetime values.
# `assign` returns a new frame instead of writing into `df`, so the cell does
# not emit SettingWithCopyWarning when `df` is a column selection of another frame.
df = df.assign(Date=pd.to_datetime(df['Date']))

In [10]:
# List the distinct dates present in the data to see which range it covers.
# NOTE: nothing is filtered here; this cell only inspects the dates.
dates = df['Date'].unique()
print(dates)

<DatetimeArray>
['2024-04-14 00:00:00', '2024-04-13 00:00:00', '2024-04-12 00:00:00',
 '2024-04-11 00:00:00', '2024-04-10 00:00:00', '2024-04-09 00:00:00',
 '2024-04-08 00:00:00', '2024-04-07 00:00:00', '2024-04-06 00:00:00',
 '2024-04-05 00:00:00',
 ...
 '2020-08-31 00:00:00', '2020-08-30 00:00:00', '2020-08-29 00:00:00',
 '2020-08-28 00:00:00', '2020-08-27 00:00:00', '2020-08-26 00:00:00',
 '2020-08-25 00:00:00', '2020-08-24 00:00:00', '2020-08-23 00:00:00',
 '2020-08-22 00:00:00']
Length: 1332, dtype: datetime64[ns]


In [11]:
# Keep the prepared data under a coin-specific name. No date filter is applied
# in this cell; the frame holds every row currently in `df`.
# `.copy()` gives polkadot_df its own data, so later in-place changes to one
# frame cannot silently alter the other.
polkadot_df = df.copy()
polkadot_df

Unnamed: 0,Name,Date,High,Low,Open,Close,Volume,Marketcap
0,Polkadot,2024-04-14,7.322462,5.830095,7.241344,6.452436,1.414697e+09,6.883850e+09
1,Polkadot,2024-04-13,8.485889,6.637766,8.398041,7.236212,7.288384e+08,7.936690e+09
2,Polkadot,2024-04-12,8.517427,8.201845,8.390784,8.381547,6.476812e+08,8.250478e+09
3,Polkadot,2024-04-11,8.689573,8.192059,8.648202,8.415371,7.368633e+08,8.330455e+09
4,Polkadot,2024-04-10,9.095918,8.642528,9.039720,8.667247,6.404972e+08,8.796045e+09
...,...,...,...,...,...,...,...,...
1327,Polkadot,2020-08-26,5.739077,4.501397,4.571726,5.469447,7.595200e+08,4.561539e+09
1328,Polkadot,2020-08-25,4.766288,3.820481,3.990420,4.552361,2.745461e+08,0.000000e+00
1329,Polkadot,2020-08-24,4.486948,3.731285,4.486948,3.974245,3.623563e+08,0.000000e+00
1330,Polkadot,2020-08-23,4.518678,2.822951,2.867354,4.477365,1.810368e+08,0.000000e+00


In [13]:
# Destination for the prepared Polkadot dataset
file_path = '/Users/moritzrichter/Desktop/cryptocurrency_analysis/data/Polkadot_(DOT)/polkadot(fiveyears).csv'

# Write the DataFrame to that CSV, leaving the row index out of the file
polkadot_df.to_csv(path_or_buf=file_path, index=False)

# Storing the data in MongoDB (DB name: Finale)

In [11]:
# Connect to the MongoDB server.
# The connection URI contains the username and password, so it is read from the
# MONGODB_URI environment variable (or prompted for) rather than stored in the notebook.
mongo_uri = os.environ.get("MONGODB_URI") or getpass("MongoDB connection URI: ")
client = MongoClient(mongo_uri, tlsCAFile=certifi.where())

In [12]:
# List the names of the databases available on the connected server
client.list_database_names()

['Bitcoin_db',
 'Cardano_DB',
 'Ethereum_db',
 'Finale',
 'Tether_DB',
 'XRP_DB',
 'admin',
 'local']

In [13]:
# Select the 'Finale' database and the collection that will hold this coin's rows.
# MongoDB creates the collection on first insert if it does not exist yet.
db = client['Finale']
# This notebook prepares Polkadot data, so it must not write into the 'Bitcoin' collection.
collection = db['Polkadot']

In [14]:
# Convert the prepared Polkadot frame into a list of per-row dicts for MongoDB.
# `bitcoin_df` is never defined in this notebook; the prepared data is `polkadot_df`.
# reset_index() returns a new frame, so polkadot_df itself is left untouched and
# re-running this cell is safe. As before, each record carries an 'index'
# field holding the row number.
# NOTE: the `bitcoin_df_dict` name is kept because the insert cell reads it.
bitcoin_df_dict = polkadot_df.reset_index().to_dict("records")

In [15]:
# Insert all records into the selected collection in a single call.
# NOTE(review): nothing here removes or de-duplicates existing documents, so
# re-running this cell presumably inserts the same rows again — confirm before re-running.
collection.insert_many(bitcoin_df_dict)

<pymongo.results.InsertManyResult at 0x15bb6b587c0>