In [1]:
# import libraries for getting data from database
from pymongo import MongoClient

# EDA
import pandas as pd

We'll first pull the data from the MongoDB into a Pandas dataframe and do EDA.

EDA will consist of: 

1. Initial look
2. Drop null columns, drop columns with all entries the same (such as the creator of the collection, which is 1 artist)
3. Examine datatypes & look at certain rows & entries in detail; specifically, look at entries that are of type dictionary, pull out the necessary values, and add them as columns to our DF. The columns with dictionary values that we will look at in detail are: 
    - a. payment_token
    - b. asset 
    - c. transaction
    - d. seller 
    - e. winner_account 
4. Lastly, look at columns that have few non-nulls
    - a. asset & asset_bundle (only 3 rows are asset_bundles, which is too few to be useful, so we drop the asset_bundle column) 
    - b. from_account
    - c. dev_fee_payment_event
    - d. to_account
    - e. is_private
    - f. transaction_from_account_username
    - g. seller_username
    - h. winner_account_username

## Get Data from DB

In [2]:
# Connect using pymongo's default settings (no host, port or URI is passed here).
client = MongoClient()

In [3]:
client.list_database_names()

['admin', 'books', 'config', 'events', 'local', 'outings', 'pak']

In [4]:
# Select the `pak` database and, inside it, the `salesPoets` collection
# that is loaded into a DataFrame below.
db = client['pak']
salesPoets = db['salesPoets']

In [5]:
# Count the documents on the server rather than pulling every document into a
# Python list just to take its length. An empty filter matches all documents.
db.salesPoets.count_documents({})

10050

In [6]:
# Fetch every document in the collection (empty filter) and build one
# DataFrame row per document; nested sub-documents stay as dict-valued cells.
salesPoets_data = salesPoets.find({})
sales_df = pd.DataFrame(list(salesPoets_data))
sales_df.head(n=3)

Unnamed: 0,_id,approved_account,asset,asset_bundle,auction_type,bid_amount,collection_slug,contract_address,created_date,custom_event_name,...,is_private,owner_account,payment_token,quantity,seller,starting_price,to_account,total_price,transaction,winner_account
0,61b41aebd6ab32dbd9a84ea0,,"{'id': 44596334, 'token_id': '1', 'num_sales':...",,,,lostpoets,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,2021-12-11T03:27:23.942099,,...,False,,"{'id': 1, 'symbol': 'ETH', 'address': '0x00000...",1,"{'user': {'username': '0xBub'}, 'profile_img_u...",,,365000000000000000,{'block_hash': '0x180bac5e21b88d73f73d41a27bed...,"{'user': {'username': 'nikolas17'}, 'profile_i..."
1,61b41aebd6ab32dbd9a84ea1,,"{'id': 59286920, 'token_id': '5594', 'num_sale...",,,,lostpoets,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,2021-12-11T03:20:03.826890,,...,,,"{'id': 2, 'symbol': 'WETH', 'address': '0xc02a...",1,"{'user': {'username': 'adamludwin'}, 'profile_...",,,500000000000000000,{'block_hash': '0xce55c3a9e03678eb14230114a239...,"{'user': {'username': '858'}, 'profile_img_url..."
2,61b41aebd6ab32dbd9a84ea2,,"{'id': 59336121, 'token_id': '8839', 'num_sale...",,,,lostpoets,0x7be8076f4ea4a4ad08075c2508e481d6c946d12b,2021-12-11T02:47:15.343322,,...,False,,"{'id': 1, 'symbol': 'ETH', 'address': '0x00000...",1,"{'user': {'username': 'DirtySderty'}, 'profile...",,,1000000000000000000,{'block_hash': '0x24bd21c21379fd46e5cc513b939d...,"{'user': {'username': 'Iamchef'}, 'profile_img..."


## EDA

### 1. Initial look

In [7]:
sales_df.columns

Index(['_id', 'approved_account', 'asset', 'asset_bundle', 'auction_type',
       'bid_amount', 'collection_slug', 'contract_address', 'created_date',
       'custom_event_name', 'dev_fee_payment_event',
       'dev_seller_fee_basis_points', 'duration', 'ending_price', 'event_type',
       'from_account', 'id', 'is_private', 'owner_account', 'payment_token',
       'quantity', 'seller', 'starting_price', 'to_account', 'total_price',
       'transaction', 'winner_account'],
      dtype='object')

In [8]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10050 entries, 0 to 10049
Data columns (total 27 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   _id                          10050 non-null  object
 1   approved_account             0 non-null      object
 2   asset                        10047 non-null  object
 3   asset_bundle                 3 non-null      object
 4   auction_type                 0 non-null      object
 5   bid_amount                   0 non-null      object
 6   collection_slug              10050 non-null  object
 7   contract_address             10050 non-null  object
 8   created_date                 10050 non-null  object
 9   custom_event_name            0 non-null      object
 10  dev_fee_payment_event        9637 non-null   object
 11  dev_seller_fee_basis_points  10050 non-null  int64 
 12  duration                     0 non-null      object
 13  ending_price                 0 

### 2. Drop null columns, drop columns with all entries the same

OK, great, let's drop the columns that are entirely null so that we can see the rest a bit better.

In [9]:
# Remove, in place, every column whose values are all null.
sales_df.dropna(axis='columns', how='all', inplace=True)

In [10]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10050 entries, 0 to 10049
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   _id                          10050 non-null  object
 1   asset                        10047 non-null  object
 2   asset_bundle                 3 non-null      object
 3   collection_slug              10050 non-null  object
 4   contract_address             10050 non-null  object
 5   created_date                 10050 non-null  object
 6   dev_fee_payment_event        9637 non-null   object
 7   dev_seller_fee_basis_points  10050 non-null  int64 
 8   event_type                   10050 non-null  object
 9   from_account                 6 non-null      object
 10  id                           10050 non-null  int64 
 11  is_private                   8423 non-null   object
 12  payment_token                10050 non-null  object
 13  quantity                     10

Let's find the columns where the value is the same and drop those as well.

In [11]:
sales_df.contract_address.nunique()

1

In [12]:
sales_df._id.nunique()

10050

In [13]:
sales_df.collection_slug.nunique()

1

In [14]:
sales_df.created_date.nunique()

10050

In [15]:
sales_df.dev_seller_fee_basis_points.nunique()

1

In [16]:
sales_df.dev_seller_fee_basis_points.iloc[0]

1000

In [17]:
sales_df.event_type.nunique()

1

In [18]:
sales_df.event_type.iloc[0]

'successful'

In [19]:
sales_df.id.nunique()

10050

In [20]:
sales_df.is_private.unique()

array([False, None, True], dtype=object)

In [21]:
sales_df.quantity.unique()

array(['1', '2', '5', '3', '8', '14', '4', '50', '10', '12', '7', '30',
       '6', '20', '16', '40', '27', '9', '11'], dtype=object)

In [22]:
sales_df.total_price.nunique()

1383

In [23]:
# These four columns hold one single value across all rows (nunique() == 1
# in the checks above), so they carry no information for the analysis.
sales_df.drop(
    columns=['event_type', 'dev_seller_fee_basis_points',
             'collection_slug', 'contract_address'],
    inplace=True,
)

In [24]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10050 entries, 0 to 10049
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   _id                    10050 non-null  object
 1   asset                  10047 non-null  object
 2   asset_bundle           3 non-null      object
 3   created_date           10050 non-null  object
 4   dev_fee_payment_event  9637 non-null   object
 5   from_account           6 non-null      object
 6   id                     10050 non-null  int64 
 7   is_private             8423 non-null   object
 8   payment_token          10050 non-null  object
 9   quantity               10050 non-null  object
 10  seller                 10050 non-null  object
 11  to_account             6 non-null      object
 12  total_price            10050 non-null  object
 13  transaction            10050 non-null  object
 14  winner_account         10050 non-null  object
dtypes: int64(1), object

### 3. Examine datatypes & specific columns/values

We look at created_date to do a sanity check on the scrape.

In [25]:
sales_df.created_date.iloc[103]

'2021-12-09T06:13:07.671768'

In [26]:
type(sales_df.created_date.iloc[0])

str

Now, let's look at a couple of rows & specific entries in detail.

The entries we are going to look at are in the columns whose values are dictionaries, which means we can extract more detail from them. 

In [27]:
sales_df.iloc[0]

_id                                               61b41aebd6ab32dbd9a84ea0
asset                    {'id': 44596334, 'token_id': '1', 'num_sales':...
asset_bundle                                                          None
created_date                                    2021-12-11T03:27:23.942099
dev_fee_payment_event                                                 None
from_account                                                          None
id                                                              2427455930
is_private                                                           False
payment_token            {'id': 1, 'symbol': 'ETH', 'address': '0x00000...
quantity                                                                 1
seller                   {'user': {'username': '0xBub'}, 'profile_img_u...
to_account                                                            None
total_price                                             365000000000000000
transaction              

**a. payment_token**

In [28]:
sales_df.payment_token.iloc[5]

{'id': 1,
 'symbol': 'ETH',
 'address': '0x0000000000000000000000000000000000000000',
 'image_url': 'https://storage.opensea.io/files/6f8e2979d428180222796ff4a33ab929.svg',
 'name': 'Ether',
 'decimals': 18,
 'eth_price': '1.000000000000000',
 'usd_price': '3983.769999999999982000'}

We want to pull the symbol and the usd_price out of this dictionary into new columns; then we can drop the payment_token column. 

In [29]:
# Pull the token's ticker symbol (e.g. 'ETH', 'WETH') out of each payment_token
# dict; a row with a null payment_token gets the placeholder 'unknown'.
sales_df['payment_token_symbol'] = sales_df['payment_token'].apply(
    lambda token: token['symbol'] if not pd.isnull(token) else 'unknown'
)

In [30]:
# Pull the token's USD price out of each payment_token dict. The value is kept
# exactly as stored in the dict (a string in the sample shown above); a row
# with a null payment_token gets the placeholder 'unknown'.
sales_df['payment_token_usd_price'] = sales_df['payment_token'].apply(
    lambda token: token['usd_price'] if not pd.isnull(token) else 'unknown'
)

In [31]:
sales_df.head(1)

Unnamed: 0,_id,asset,asset_bundle,created_date,dev_fee_payment_event,from_account,id,is_private,payment_token,quantity,seller,to_account,total_price,transaction,winner_account,payment_token_symbol,payment_token_usd_price
0,61b41aebd6ab32dbd9a84ea0,"{'id': 44596334, 'token_id': '1', 'num_sales':...",,2021-12-11T03:27:23.942099,,,2427455930,False,"{'id': 1, 'symbol': 'ETH', 'address': '0x00000...",1,"{'user': {'username': '0xBub'}, 'profile_img_u...",,365000000000000000,{'block_hash': '0x180bac5e21b88d73f73d41a27bed...,"{'user': {'username': 'nikolas17'}, 'profile_i...",ETH,3983.77


In [32]:
# The needed fields now live in their own columns; discard the raw dict column.
sales_df.drop(columns=['payment_token'], inplace=True)

**b. asset**

In [33]:
sales_df.asset.iloc[4]

{'id': 67491120,
 'token_id': '22655',
 'num_sales': 2,
 'background_color': None,
 'image_url': 'https://lh3.googleusercontent.com/nlVFhb7yVUDIXgl9SbV0bGQYpMBzvPccp6vasAky1Ltke9n-SJrhxKnsK94flUN_BL6ILm11XCMrgUIQSnKrICvzvZVN2wf2u3pRaw',
 'image_preview_url': 'https://lh3.googleusercontent.com/nlVFhb7yVUDIXgl9SbV0bGQYpMBzvPccp6vasAky1Ltke9n-SJrhxKnsK94flUN_BL6ILm11XCMrgUIQSnKrICvzvZVN2wf2u3pRaw=s250',
 'image_thumbnail_url': 'https://lh3.googleusercontent.com/nlVFhb7yVUDIXgl9SbV0bGQYpMBzvPccp6vasAky1Ltke9n-SJrhxKnsK94flUN_BL6ILm11XCMrgUIQSnKrICvzvZVN2wf2u3pRaw=s128',
 'image_original_url': 'https://d1xxei964ioe0z.cloudfront.net/full/7e2f64d90eeac43d9cab7072db5cb11524c81a6c44e2cdaa5f2302ee01a873ce.png',
 'animation_url': None,
 'animation_original_url': None,
 'name': 'Poet #22655',
 'description': None,
 'external_link': None,
 'asset_contract': {'address': '0x4b3406a41399c7fd2ba65cbc93697ad9e7ea61e5',
  'asset_contract_type': 'non-fungible',
  'created_date': '2021-09-25T05:35:23.49023

In [34]:
sales_df.asset.iloc[4]['token_id']

'22655'

In [35]:
# Token id of the sold asset, kept as the string stored in the asset dict.
# Rows with a null asset get the placeholder '0'.
sales_df['asset_token_id'] = sales_df['asset'].apply(
    lambda asset: asset['token_id'] if not pd.isnull(asset) else '0'
)

In [36]:
sales_df['asset_token_id'].value_counts()

1        5125
16030       4
16422       4
23407       4
11052       4
         ... 
1202        1
7328        1
1205        1
22452       1
15477       1
Name: asset_token_id, Length: 4306, dtype: int64

In [37]:
sales_df.asset.iloc[4]['num_sales']

2

In [38]:
# Number of sales recorded on the asset dict; rows with a null asset get 0.
sales_df['asset_num_sales'] = sales_df['asset'].apply(
    lambda asset: asset['num_sales'] if not pd.isnull(asset) else 0
)

In [39]:
sales_df['asset_num_sales'].value_counts()

14807    5125
1        3588
2        1125
3         187
4          20
0           3
5           2
Name: asset_num_sales, dtype: int64

In [40]:
sales_df.asset.iloc[4]['id']

67491120

In [41]:
# Numeric `id` field of the asset dict; rows with a null asset get 0.
sales_df['asset_id'] = sales_df['asset'].apply(
    lambda asset: asset['id'] if not pd.isnull(asset) else 0
)

In [42]:
sales_df['asset_id'].value_counts()

44596334    5125
60191059       4
60405635       4
67925902       4
59398797       4
            ... 
59268584       1
59308876       1
59268606       1
67315009       1
59985090       1
Name: asset_id, Length: 4306, dtype: int64

In [43]:
sales_df.asset.iloc[4]['image_url']

'https://lh3.googleusercontent.com/nlVFhb7yVUDIXgl9SbV0bGQYpMBzvPccp6vasAky1Ltke9n-SJrhxKnsK94flUN_BL6ILm11XCMrgUIQSnKrICvzvZVN2wf2u3pRaw'

In [44]:
# URL of the asset's image; rows with a null asset get the placeholder 'unknown'.
sales_df['asset_image_url'] = sales_df['asset'].apply(
    lambda asset: asset['image_url'] if not pd.isnull(asset) else 'unknown'
)

In [45]:
sales_df.head(1)

Unnamed: 0,_id,asset,asset_bundle,created_date,dev_fee_payment_event,from_account,id,is_private,quantity,seller,to_account,total_price,transaction,winner_account,payment_token_symbol,payment_token_usd_price,asset_token_id,asset_num_sales,asset_id,asset_image_url
0,61b41aebd6ab32dbd9a84ea0,"{'id': 44596334, 'token_id': '1', 'num_sales':...",,2021-12-11T03:27:23.942099,,,2427455930,False,1,"{'user': {'username': '0xBub'}, 'profile_img_u...",,365000000000000000,{'block_hash': '0x180bac5e21b88d73f73d41a27bed...,"{'user': {'username': 'nikolas17'}, 'profile_i...",ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...


In [46]:
# The fields we need from `asset` are extracted; discard the raw dict column.
sales_df.drop(columns=['asset'], inplace=True)

**c. transaction**

In [47]:
sales_df.transaction.iloc[4]

{'block_hash': '0xf6bab2cf41a53489b2b8c456664f4654f72930cc6c902d00c5fee7e4be8fa176',
 'block_number': '13781198',
 'from_account': {'user': {'username': 'Iamchef'},
  'profile_img_url': 'https://storage.googleapis.com/opensea-static/opensea-profile/32.png',
  'address': '0xd7cd27993eae6c0e32657852305441e9e44360e9',
  'config': ''},
 'id': 220361608,
 'timestamp': '2021-12-11T02:40:13',
 'to_account': {'user': {'username': 'OpenSea-Orders'},
  'profile_img_url': 'https://storage.googleapis.com/opensea-static/opensea-profile/22.png',
  'address': '0x7be8076f4ea4a4ad08075c2508e481d6c946d12b',
  'config': 'verified'},
 'transaction_hash': '0x1dfbc7e2c2d2b108926808a5334961f73a4bdaba6c37116c43a36dd9de7b8920',
 'transaction_index': '217'}

In [48]:
sales_df.transaction.iloc[12]['from_account']['user']['username']

'ram-che'

In [49]:
# First hop into the nested transaction dict: keep the whole `from_account`
# entry for now (it is unpacked further in the following cells).
# A null transaction yields the placeholder 'unknown'.
sales_df['transaction_from_account'] = sales_df['transaction'].apply(
    lambda txn: txn['from_account'] if not pd.isnull(txn) else 'unknown'
)

In [50]:
# Second hop: take the `user` entry out of each from_account dict.
# Anything that is not a dict -- None/NaN, or the 'unknown' placeholder string
# written by the previous step -- maps to 'unknown'. Checking for a dict
# (rather than only for null) avoids subscripting the placeholder string,
# which would raise a TypeError.
sales_df['transaction_from_account_user'] = sales_df['transaction_from_account'].apply(
    lambda account: account['user'] if isinstance(account, dict) else 'unknown'
)

In [51]:
# Final hop: read `username` from each user dict.
# A missing user (None/NaN) or the 'unknown' placeholder carried over from the
# previous step maps to 'unknown'; the dict check prevents a TypeError from
# subscripting that placeholder string. A user dict whose username is None
# still yields None here.
sales_df['transaction_from_account_username'] = sales_df['transaction_from_account_user'].apply(
    lambda user: user['username'] if isinstance(user, dict) else 'unknown'
)

In [52]:
# Discard the two intermediate columns used to reach the username.
sales_df.drop(
    columns=['transaction_from_account_user', 'transaction_from_account'],
    inplace=True,
)

In [53]:
sales_df.transaction.iloc[15]['to_account']['user']['username']

'OpenSea-Orders'

In [54]:
# First hop into the nested transaction dict: keep the whole `to_account`
# entry for now; a null transaction yields the placeholder 'unknown'.
sales_df['transaction_to_account'] = sales_df['transaction'].apply(
    lambda txn: txn['to_account'] if not pd.isnull(txn) else 'unknown'
)

In [55]:
# Second hop: take the `user` entry out of each to_account dict.
# Non-dict values (None/NaN, or the 'unknown' placeholder string from the
# previous step) map to 'unknown'; checking for a dict avoids the TypeError
# that subscripting the placeholder string would raise.
sales_df['transaction_to_account_user'] = sales_df['transaction_to_account'].apply(
    lambda account: account['user'] if isinstance(account, dict) else 'unknown'
)

In [56]:
# Final hop: read `username` from each user dict.
# A missing user (None/NaN) or the 'unknown' placeholder carried over from the
# previous step maps to 'unknown'; the dict check prevents a TypeError from
# subscripting that placeholder string.
sales_df['transaction_to_account_username'] = sales_df['transaction_to_account_user'].apply(
    lambda user: user['username'] if isinstance(user, dict) else 'unknown'
)

In [57]:
# Discard the two intermediate columns and the raw `transaction` dict column,
# which has now been fully mined.
sales_df.drop(
    columns=['transaction_to_account_user', 'transaction_to_account', 'transaction'],
    inplace=True,
)

In [58]:
sales_df['transaction_to_account_username'] .value_counts()

OpenSea-Orders    10025
unknown              25
Name: transaction_to_account_username, dtype: int64

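Almost every value is `OpenSea-Orders` (the rest are unknown), so this column carries no information; we're going to drop it as well.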

In [59]:
# Nearly constant column (see the value counts above); remove it.
sales_df.drop(columns=['transaction_to_account_username'], inplace=True)

**d. seller**

In [60]:
sales_df.seller.iloc[97]

{'user': {'username': '666luckerr'},
 'profile_img_url': 'https://storage.googleapis.com/opensea-static/opensea-profile/1.png',
 'address': '0xfddda4259c41908dea4f3ec0a51f0e9bbb0893dd',
 'config': ''}

In [61]:
# First hop into the seller dict: keep its `user` entry for now;
# a null seller yields the placeholder 'unknown'.
sales_df['seller_user'] = sales_df['seller'].apply(
    lambda seller: seller['user'] if not pd.isnull(seller) else 'unknown'
)

In [62]:
# Read `username` from each seller user dict.
# A missing user (None/NaN) or the 'unknown' placeholder written by the
# previous step maps to 'unknown'; checking for a dict avoids the TypeError
# that subscripting the placeholder string would raise. A user dict whose
# username is None still yields None here.
sales_df['seller_username'] = sales_df['seller_user'].apply(
    lambda user: user['username'] if isinstance(user, dict) else 'unknown'
)

In [63]:
# Discard the intermediate column and the raw `seller` dict column.
sales_df.drop(columns=['seller_user', 'seller'], inplace=True)

**e. winner_account**

In [64]:
sales_df.winner_account.iloc[4]

{'user': {'username': 'Iamchef'},
 'profile_img_url': 'https://storage.googleapis.com/opensea-static/opensea-profile/32.png',
 'address': '0xd7cd27993eae6c0e32657852305441e9e44360e9',
 'config': ''}

In [65]:
# First hop into the winner_account dict: keep its `user` entry for now;
# a null winner_account yields the placeholder 'unknown'.
sales_df['winner_account_user'] = sales_df['winner_account'].apply(
    lambda winner: winner['user'] if not pd.isnull(winner) else 'unknown'
)

In [66]:
# Read `username` from each winner user dict.
# A missing user (None/NaN) or the 'unknown' placeholder written by the
# previous step maps to 'unknown'; checking for a dict avoids the TypeError
# that subscripting the placeholder string would raise. A user dict whose
# username is None still yields None here.
sales_df['winner_account_username'] = sales_df['winner_account_user'].apply(
    lambda user: user['username'] if isinstance(user, dict) else 'unknown'
)

In [67]:
# Discard the intermediate column and the raw `winner_account` dict column.
sales_df.drop(columns=['winner_account_user', 'winner_account'], inplace=True)

In [68]:
sales_df.head(3)

Unnamed: 0,_id,asset_bundle,created_date,dev_fee_payment_event,from_account,id,is_private,quantity,to_account,total_price,payment_token_symbol,payment_token_usd_price,asset_token_id,asset_num_sales,asset_id,asset_image_url,transaction_from_account_username,seller_username,winner_account_username
0,61b41aebd6ab32dbd9a84ea0,,2021-12-11T03:27:23.942099,,,2427455930,False,1,,365000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,nikolas17,0xBub,nikolas17
1,61b41aebd6ab32dbd9a84ea1,,2021-12-11T03:20:03.826890,,,2427391259,,1,,500000000000000000,WETH,3972.7,5594,2,59286920,https://lh3.googleusercontent.com/ko_eY806byoe...,adamludwin,adamludwin,858
2,61b41aebd6ab32dbd9a84ea2,,2021-12-11T02:47:15.343322,,,2427093401,False,1,,1000000000000000000,ETH,3983.77,8839,1,59336121,https://lh3.googleusercontent.com/yNssfCMygvRC...,Iamchef,DirtySderty,Iamchef


In [69]:
sales_df.winner_account_username.nunique()

2453

### 4. Columns with too many null values

In [70]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10050 entries, 0 to 10049
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   _id                                10050 non-null  object
 1   asset_bundle                       3 non-null      object
 2   created_date                       10050 non-null  object
 3   dev_fee_payment_event              9637 non-null   object
 4   from_account                       6 non-null      object
 5   id                                 10050 non-null  int64 
 6   is_private                         8423 non-null   object
 7   quantity                           10050 non-null  object
 8   to_account                         6 non-null      object
 9   total_price                        10050 non-null  object
 10  payment_token_symbol               10050 non-null  object
 11  payment_token_usd_price            10050 non-null  object
 12  asse

**a. asset & asset_bundle**

In [71]:
sales_df[pd.notnull(sales_df.asset_bundle)]

Unnamed: 0,_id,asset_bundle,created_date,dev_fee_payment_event,from_account,id,is_private,quantity,to_account,total_price,payment_token_symbol,payment_token_usd_price,asset_token_id,asset_num_sales,asset_id,asset_image_url,transaction_from_account_username,seller_username,winner_account_username
1649,61b41b06d6ab32dbd9a85511,"{'maker': {'user': {'username': 'xxyzz'}, 'pro...",2021-11-20T17:26:25.504649,"{'asset': None, 'asset_bundle': None, 'event_t...",,2056727815,True,8,,0,ETH,3983.77,0,0,0,unknown,,xxyzz,
3635,61b41b2bd6ab32dbd9a85cd3,{'maker': {'user': {'username': 'Strongheart'}...,2021-11-02T05:55:26.503712,"{'asset': None, 'asset_bundle': None, 'event_t...",,1655102337,False,8,,7000000000000000000,ETH,3983.77,0,0,0,unknown,HalfLifeXxVault,Strongheart,HalfLifeXxVault
5955,61b41b55d6ab32dbd9a865e3,{'maker': {'user': {'username': 'Strongheart'}...,2021-10-13T21:20:58.732398,"{'asset': None, 'asset_bundle': None, 'event_t...",,1359212707,False,5,,5750000000000000000,ETH,3983.77,0,0,0,unknown,mexpex,Strongheart,mexpex


In [72]:
sales_df.asset_bundle.iloc[1649]

{'maker': {'user': {'username': 'xxyzz'},
  'profile_img_url': 'https://storage.googleapis.com/opensea-static/opensea-profile/24.png',
  'address': '0xdc5cdb19d6ab7dff6e6b155c6d27ebe349579524',
  'config': ''},
 'slug': 'd-exf',
 'assets': [{'id': 59269612,
   'token_id': '1777',
   'num_sales': 0,
   'background_color': None,
   'image_url': 'https://lh3.googleusercontent.com/5mpIjvIP9ruMlySj4zlgTr5LeNJiw_Lzq3sMEXPUx-NECTebWFe3-VB8sgFgxlqUyujAo3TEU4Y4X71NQX8jfZERozPF3fWLsR7-',
   'image_preview_url': 'https://lh3.googleusercontent.com/5mpIjvIP9ruMlySj4zlgTr5LeNJiw_Lzq3sMEXPUx-NECTebWFe3-VB8sgFgxlqUyujAo3TEU4Y4X71NQX8jfZERozPF3fWLsR7-=s250',
   'image_thumbnail_url': 'https://lh3.googleusercontent.com/5mpIjvIP9ruMlySj4zlgTr5LeNJiw_Lzq3sMEXPUx-NECTebWFe3-VB8sgFgxlqUyujAo3TEU4Y4X71NQX8jfZERozPF3fWLsR7-=s128',
   'image_original_url': 'https://d1xxei964ioe0z.cloudfront.net/full/b346580a37c08c6dc1ac5272e92bd1fb15ac68a8438fc71c0e95f66feb466669.png',
   'animation_url': None,
   'animation_o

With only 3 non-null rows, there is not enough here to warrant keeping it, so let's drop this column too. 

In [73]:
# Only three rows have a bundle; remove the column.
sales_df.drop(columns=['asset_bundle'], inplace=True)

**b. from_account**

In [74]:
sales_df[pd.notnull(sales_df.from_account)]

Unnamed: 0,_id,created_date,dev_fee_payment_event,from_account,id,is_private,quantity,to_account,total_price,payment_token_symbol,payment_token_usd_price,asset_token_id,asset_num_sales,asset_id,asset_image_url,transaction_from_account_username,seller_username,winner_account_username
3330,61b41b26d6ab32dbd9a85ba2,2021-11-05T06:29:14.639950,"{'asset': None, 'asset_bundle': None, 'event_t...","{'user': {'username': 'FYeah'}, 'profile_img_u...",1701961702,False,1,"{'user': {'username': None}, 'profile_img_url'...",441000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,,FYeah,
5091,61b41b45d6ab32dbd9a86283,2021-10-21T07:22:30.093491,"{'asset': None, 'asset_bundle': None, 'event_t...","{'user': {'username': None}, 'profile_img_url'...",1469592134,False,1,"{'user': {'username': 'DB'}, 'profile_img_url'...",670000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,DB,,DB
5313,61b41b49d6ab32dbd9a86361,2021-10-19T07:00:00.657831,"{'asset': None, 'asset_bundle': None, 'event_t...","{'user': {'username': 'gyro_vlt'}, 'profile_im...",1438774598,False,1,"{'user': None, 'profile_img_url': 'https://sto...",719000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,unknown,gyro_vlt,unknown
7944,61b41b78d6ab32dbd9a86da8,2021-10-05T20:56:47.815319,"{'asset': None, 'asset_bundle': None, 'event_t...","{'user': {'username': 'nate_nus'}, 'profile_im...",1254456301,False,1,"{'user': {'username': 'KaldoLove'}, 'profile_i...",1000000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,KaldoLove,nate_nus,KaldoLove
8704,61b41b88d6ab32dbd9a870a0,2021-10-02T17:05:27.055131,"{'asset': None, 'asset_bundle': None, 'event_t...","{'user': {'username': 'Blue_Kid'}, 'profile_im...",1207377365,False,1,"{'user': {'username': 'mjt123'}, 'profile_img_...",989000000000000000,ETH,3983.77,18811,1,62116764,https://lh3.googleusercontent.com/KiDza55cZ89q...,mjt123,Blue_Kid,mjt123
9049,61b41b8ed6ab32dbd9a871f9,2021-09-30T22:57:37.377598,"{'asset': None, 'asset_bundle': None, 'event_t...","{'user': {'username': None}, 'profile_img_url'...",1183970011,False,1,"{'user': {'username': 'Kulture_'}, 'profile_im...",1050000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,Kulture_,,Kulture_


Only 6 rows have a value here, which is not enough to warrant keeping the column.

In [75]:
# Only six rows are populated; remove the column.
sales_df.drop(columns=['from_account'], inplace=True)

**c. dev_fee_payment_event**

In [76]:
sales_df[pd.notnull(sales_df.dev_fee_payment_event)]

Unnamed: 0,_id,created_date,dev_fee_payment_event,id,is_private,quantity,to_account,total_price,payment_token_symbol,payment_token_usd_price,asset_token_id,asset_num_sales,asset_id,asset_image_url,transaction_from_account_username,seller_username,winner_account_username
298,61b41aeed6ab32dbd9a84fca,2021-12-05T18:59:45.104653,"{'asset': None, 'asset_bundle': None, 'event_t...",2354882018,False,1,,400000000000000000,ETH,3983.769999999999982000,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,larryp_eth,punk42069,larryp_eth
299,61b41aeed6ab32dbd9a84fcb,2021-12-05T17:31:42.954420,"{'asset': None, 'asset_bundle': None, 'event_t...",2353544989,False,1,,1020000000000000000,ETH,3983.769999999999982000,17774,1,61226810,https://lh3.googleusercontent.com/h-vSHcgf_qMn...,unknown,,unknown
300,61b41aefd6ab32dbd9a84fcc,2021-12-05T17:27:12.853093,"{'asset': None, 'asset_bundle': None, 'event_t...",2353479286,False,1,,399000000000000000,ETH,3983.769999999999982000,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,TaipeiBatman,loopx,TaipeiBatman
301,61b41aefd6ab32dbd9a84fcd,2021-12-05T17:25:44.415360,"{'asset': None, 'asset_bundle': None, 'event_t...",2353458550,False,1,,1100000000000000000,ETH,3983.769999999999982000,2226,1,59270141,https://lh3.googleusercontent.com/2UqDJoje13ys...,TaipeiBatman,A_V1,TaipeiBatman
302,61b41aefd6ab32dbd9a84fce,2021-12-05T17:21:11.210842,"{'asset': None, 'asset_bundle': None, 'event_t...",2353393355,False,1,,395000000000000000,ETH,3983.769999999999982000,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,TaipeiBatman,JonnyB,TaipeiBatman
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10045,61b41ba0d6ab32dbd9a875dd,2021-09-28T05:43:19.935340,"{'asset': None, 'asset_bundle': None, 'event_t...",1140292383,False,1,,2800000000000000000,ETH,3983.769999999999982000,17480,1,61049557,https://lh3.googleusercontent.com/1RP4PasyvmF3...,ModeratsArt,Winniee,ModeratsArt
10046,61b41ba0d6ab32dbd9a875de,2021-09-28T05:38:25.800408,"{'asset': None, 'asset_bundle': None, 'event_t...",1140224886,False,1,,1199000000000000000,ETH,3983.769999999999982000,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,youaintmyfry,head-trip,youaintmyfry
10047,61b41ba0d6ab32dbd9a875df,2021-09-28T05:34:39.947798,"{'asset': None, 'asset_bundle': None, 'event_t...",1140172188,False,1,,2000000000000000000,ETH,3983.769999999999982000,16379,1,60380137,https://lh3.googleusercontent.com/ZILAvYcwulP9...,head-trip,MANDYxTHEJUNGLE,head-trip
10048,61b41ba0d6ab32dbd9a875e0,2021-09-28T05:33:37.656539,"{'asset': None, 'asset_bundle': None, 'event_t...",1140157076,False,1,,1000000000000000000,ETH,3983.769999999999982000,10542,1,59374117,https://lh3.googleusercontent.com/3K2T9Yp_UdO9...,totvault,chiggun,totvault


In [77]:
sales_df.dev_fee_payment_event.iloc[298]

{'asset': None,
 'asset_bundle': None,
 'event_type': 'payout',
 'event_timestamp': '2021-12-06T01:48:24',
 'auction_type': None,
 'total_price': None,
 'payment_token': {'id': 1,
  'symbol': 'ETH',
  'address': '0x0000000000000000000000000000000000000000',
  'image_url': 'https://storage.opensea.io/files/6f8e2979d428180222796ff4a33ab929.svg',
  'name': 'Ether',
  'decimals': 18,
  'eth_price': '1.000000000000000',
  'usd_price': '3983.769999999999982000'},
 'transaction': {'block_hash': '0x0c965c24be97864a0dc43265daab5b7a4c246826b9fe725a43756e5f8e1f4a94',
  'block_number': '13749472',
  'from_account': None,
  'id': 216441852,
  'timestamp': None,
  'to_account': None,
  'transaction_hash': '0x8f894aeacea2206207d8afae89889760788182ad31eaa99361407aa0c0a90127',
  'transaction_index': '68'},
 'created_date': '2021-12-06T01:49:19.460184',
 'quantity': None}

Again, not enough info here to keep.

In [78]:
# The payout-event dict adds nothing we want to keep; remove the column.
sales_df.drop(columns=['dev_fee_payment_event'], inplace=True)

**d. to_account**

In [79]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10050 entries, 0 to 10049
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   _id                                10050 non-null  object
 1   created_date                       10050 non-null  object
 2   id                                 10050 non-null  int64 
 3   is_private                         8423 non-null   object
 4   quantity                           10050 non-null  object
 5   to_account                         6 non-null      object
 6   total_price                        10050 non-null  object
 7   payment_token_symbol               10050 non-null  object
 8   payment_token_usd_price            10050 non-null  object
 9   asset_token_id                     10050 non-null  object
 10  asset_num_sales                    10050 non-null  int64 
 11  asset_id                           10050 non-null  int64 
 12  asse

In [80]:
sales_df[pd.notnull(sales_df.to_account)]

Unnamed: 0,_id,created_date,id,is_private,quantity,to_account,total_price,payment_token_symbol,payment_token_usd_price,asset_token_id,asset_num_sales,asset_id,asset_image_url,transaction_from_account_username,seller_username,winner_account_username
3330,61b41b26d6ab32dbd9a85ba2,2021-11-05T06:29:14.639950,1701961702,False,1,"{'user': {'username': None}, 'profile_img_url'...",441000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,,FYeah,
5091,61b41b45d6ab32dbd9a86283,2021-10-21T07:22:30.093491,1469592134,False,1,"{'user': {'username': 'DB'}, 'profile_img_url'...",670000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,DB,,DB
5313,61b41b49d6ab32dbd9a86361,2021-10-19T07:00:00.657831,1438774598,False,1,"{'user': None, 'profile_img_url': 'https://sto...",719000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,unknown,gyro_vlt,unknown
7944,61b41b78d6ab32dbd9a86da8,2021-10-05T20:56:47.815319,1254456301,False,1,"{'user': {'username': 'KaldoLove'}, 'profile_i...",1000000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,KaldoLove,nate_nus,KaldoLove
8704,61b41b88d6ab32dbd9a870a0,2021-10-02T17:05:27.055131,1207377365,False,1,"{'user': {'username': 'mjt123'}, 'profile_img_...",989000000000000000,ETH,3983.77,18811,1,62116764,https://lh3.googleusercontent.com/KiDza55cZ89q...,mjt123,Blue_Kid,mjt123
9049,61b41b8ed6ab32dbd9a871f9,2021-09-30T22:57:37.377598,1183970011,False,1,"{'user': {'username': 'Kulture_'}, 'profile_im...",1050000000000000000,ETH,3983.77,1,14807,44596334,https://lh3.googleusercontent.com/9GgkzN-7si-y...,Kulture_,,Kulture_


Again, not enough info here to keep.

In [81]:
# Only six rows are populated; remove the column.
sales_df.drop(columns=['to_account'], inplace=True)

**e. is_private**

In [82]:
sales_df.is_private.value_counts()

False    8396
True       27
Name: is_private, dtype: int64

In [83]:
# Rows where is_private is missing get the explicit label 'unknown'
# (the column then holds True, False and 'unknown').
sales_df['is_private'] = sales_df['is_private'].fillna('unknown')

**f. transaction_from_account_username**

In [84]:
# Usernames that were stored as None inside the user dict are still null here;
# label them 'unknown' like the other missing cases.
sales_df['transaction_from_account_username'] = (
    sales_df['transaction_from_account_username'].fillna('unknown')
)

**g. seller_username**

In [85]:
# Replace remaining null seller usernames with the label 'unknown'.
sales_df['seller_username'] = sales_df['seller_username'].fillna('unknown')

**h. winner_account_username**

In [86]:
sales_df.winner_account_username.iloc[0:5]

0    nikolas17
1          858
2      Iamchef
3      Iamchef
4      Iamchef
Name: winner_account_username, dtype: object

In [87]:
# Replace remaining null winner usernames with the label 'unknown'.
sales_df['winner_account_username'] = (
    sales_df['winner_account_username'].fillna('unknown')
)

## Saving File

Finally, let's save this to a CSV so that we can continue the EDA and model it in the next notebook.

In [88]:
# Write the cleaned frame to sales.csv in the working directory,
# leaving out the RangeIndex so it does not become a column.
sales_df.to_csv(path_or_buf='sales.csv', index=False)