# Capstone: NFTs
## Part IV: Similarity Score Model

### 1. Imports

In [6]:
import pandas as pd
import numpy as np
import json
from scipy import sparse

# nltk imports:
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

# scikit-learn imports:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

### 2. Read In and Inspect Data

In [7]:
indiv = pd.read_csv('../data/clean_individual.csv')

In [8]:
trans = pd.read_csv('../data/clean_transactions.csv')

In [9]:
trans.head()

Unnamed: 0,usd,date,trans,punk_id,accessories,type
0,401.264,2021-08-30,Sold,9001,"['Clown Nose', 'Bandana', 'Nerd Glasses']",female
1,383.662,2021-08-30,Sold,9001,"['Clown Nose', 'Bandana', 'Nerd Glasses']",female
2,19.23,2021-02-17,Sold,9001,"['Clown Nose', 'Bandana', 'Nerd Glasses']",female
3,2.085,2020-10-03,Sold,9001,"['Clown Nose', 'Bandana', 'Nerd Glasses']",female
4,0.085,2019-05-06,Sold,9001,"['Clown Nose', 'Bandana', 'Nerd Glasses']",female


In [10]:
indiv.head()

Unnamed: 0,punk_id,type,accessories,3d glasses,bandana,beanie,big beard,big shades,black lipstick,blonde bob,...,total_withdrawn_bids,avg_usd_bid,max_usd_bid,total_offers,total_withdrawn_offers,avg_usd_offer,max_usd_offer,total_transfers,recent_usd_sale,rarity_score
0,0,female,"['Green Eye Shadow', 'Earring', 'Blonde Bob']",0,0,0,0,0,0,1,...,4.0,1029.904667,1140.0,0.0,0.0,,,0.0,,114.832417
1,1,male,"['Smile', 'Mohawk']",0,0,0,0,0,0,0,...,4.0,449.666,742.652,0.0,0.0,,,0.0,,70.169684
2,2,female,['Wild Hair'],0,0,0,0,0,0,0,...,2.0,61.223,149.433,0.0,0.0,,,1.0,,56.01781
3,3,male,"['Wild Hair', 'Nerd Glasses', 'Pipe']",0,0,0,0,0,0,0,...,1.0,95.708,95.708,0.0,0.0,,,0.0,,76.289503
4,4,male,"['Big Shades', 'Wild Hair', 'Earring', 'Goat']",0,0,0,0,1,0,0,...,1.0,,,0.0,0.0,,,0.0,,88.738357


In [11]:
indiv.describe()

Unnamed: 0,punk_id,3d glasses,bandana,beanie,big beard,big shades,black lipstick,blonde bob,blonde short,blue eye shadow,...,total_withdrawn_bids,avg_usd_bid,max_usd_bid,total_offers,total_withdrawn_offers,avg_usd_offer,max_usd_offer,total_transfers,recent_usd_sale,rarity_score
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,3398.0,3398.0,10000.0,10000.0,4032.0,4032.0,10000.0,3052.0,10000.0
mean,4999.5,0.0286,0.0481,0.0044,0.0146,0.0535,0.0617,0.0147,0.0129,0.0266,...,0.6268,251.174313,527.119069,3.189,0.4918,2696.513,9298.732,0.5335,462.895212,104.0
std,2886.89568,0.166688,0.213988,0.06619,0.119951,0.22504,0.240622,0.120355,0.112849,0.160919,...,1.28759,1001.368989,4605.873032,6.91292,1.364595,56065.89,286284.0,0.903747,9637.335349,143.795619
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1e-05,1e-05,0.0,0.0,-1.0,-1.0,0.0,1e-05,19.949661
25%,2499.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,41.4605,45.8055,0.0,0.0,147.7243,223.9715,0.0,89.51375,65.124199
50%,4999.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,109.45425,136.1275,0.0,0.0,352.0361,548.6745,0.0,237.85,88.471016
75%,7499.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,265.175482,360.38275,3.0,0.0,574.8121,979.3845,1.0,411.16575,114.107407
max,9999.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,17.0,21910.26475,87640.0,95.0,42.0,2440237.0,17080000.0,10.0,532410.0,10336.243742


### 3. Create New Dataframe and Drop Unnecessary Columns


In [12]:
nearest_neighbors = indiv[['punk_id', 'accessories', 'type']]

In [13]:
nearest_neighbors

Unnamed: 0,punk_id,accessories,type
0,0,"['Green Eye Shadow', 'Earring', 'Blonde Bob']",female
1,1,"['Smile', 'Mohawk']",male
2,2,['Wild Hair'],female
3,3,"['Wild Hair', 'Nerd Glasses', 'Pipe']",male
4,4,"['Big Shades', 'Wild Hair', 'Earring', 'Goat']",male
...,...,...,...
9995,9995,"['Purple Eye Shadow', 'Straight Hair Dark']",female
9996,9996,"['Cigarette', 'Earring', 'Crazy Hair', 'Smile']",male
9997,9997,"['Front Beard', 'Cap Forward']",zombie
9998,9998,"['Wild White Hair', 'Black Lipstick', 'Clown E...",female


### 4. Create Cleaner Function

In [14]:
# citation: Breakfast Hour - NLP Practice I

def regex_cleaner(words):
    
    # set token
    my_tokenizer = RegexpTokenizer("[\w']+|\$[\d\.]+")
    
    # tokenize words
    clean_words = my_tokenizer.tokenize(words.lower())
    
    # return words
    return ' '.join(clean_words)

In [15]:
nearest_neighbors['clean_accessories'] = nearest_neighbors['accessories'].map(regex_cleaner)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nearest_neighbors['clean_accessories'] = nearest_neighbors['accessories'].map(regex_cleaner)


In [16]:
nearest_neighbors['clean_accessories'] = nearest_neighbors['type'] + " " + nearest_neighbors['clean_accessories']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nearest_neighbors['clean_accessories'] = nearest_neighbors['type'] + " " + nearest_neighbors['clean_accessories']


In [17]:
nearest_neighbors

Unnamed: 0,punk_id,accessories,type,clean_accessories
0,0,"['Green Eye Shadow', 'Earring', 'Blonde Bob']",female,female 'green eye shadow' 'earring' 'blonde bob'
1,1,"['Smile', 'Mohawk']",male,male 'smile' 'mohawk'
2,2,['Wild Hair'],female,female 'wild hair'
3,3,"['Wild Hair', 'Nerd Glasses', 'Pipe']",male,male 'wild hair' 'nerd glasses' 'pipe'
4,4,"['Big Shades', 'Wild Hair', 'Earring', 'Goat']",male,male 'big shades' 'wild hair' 'earring' 'goat'
...,...,...,...,...
9995,9995,"['Purple Eye Shadow', 'Straight Hair Dark']",female,female 'purple eye shadow' 'straight hair dark'
9996,9996,"['Cigarette', 'Earring', 'Crazy Hair', 'Smile']",male,male 'cigarette' 'earring' 'crazy hair' 'smile'
9997,9997,"['Front Beard', 'Cap Forward']",zombie,zombie 'front beard' 'cap forward'
9998,9998,"['Wild White Hair', 'Black Lipstick', 'Clown E...",female,female 'wild white hair' 'black lipstick' 'clo...


### 5. Create Recommendations Function for Similarity Scores

In [18]:
# adapted from: https://goodboychan.github.io/python/datacamp/natural_language_processing/2020/07/17/04-TF-IDF-and-similarity-scores.html

def get_recommendations(punk_id):
    
    # get indices
    indices = pd.Series(nearest_neighbors.index, index=nearest_neighbors['punk_id'])
        
    # get accessories list column
    acc_list = nearest_neighbors['clean_accessories']
        
    # instantiate tfidf
    tfidf = TfidfVectorizer()

    # construct the TF-IDF matrix
    tfidf_matrix = tfidf.fit_transform(acc_list)

    # generate the cosine similarity matrix
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    
    # get the index of the punk that matches the punk_id
    idx = indices[punk_id]
    
    # get the pairwsie similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # sort the punks based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    return certain_types_scores(sim_scores, punk_id)

In [19]:
def certain_types_scores(sim_scores, punk_id):
    
    ids_and_scores = []
    
    # only want to return similar types
    if (nearest_neighbors['type'].iloc[punk_id] == 'alien') | (nearest_neighbors['type'].iloc[punk_id] == 'ape') | (nearest_neighbors['type'].iloc[punk_id] == 'zombie'):
        for i, val in enumerate(sim_scores):
            # if it is the same as punk_id entered, don't include
            if (nearest_neighbors['punk_id'].iloc[punk_id] == nearest_neighbors['punk_id'].iloc[sim_scores[i][0]]):
                pass         
            # if it is not the same type, don't include    
            elif (nearest_neighbors['type'].iloc[sim_scores[i][0]] == 'female') | (nearest_neighbors['type'].iloc[sim_scores[i][0]] == 'male'):
                pass
            else:
                ids_and_scores.append(sim_scores[i])
    else:
        for i, val in enumerate(sim_scores):
            # if it is the same as punk_id entered, don't include
            if  (nearest_neighbors['punk_id'].iloc[punk_id] == nearest_neighbors['punk_id'].iloc[sim_scores[i][0]]):
                pass
            # if it is not the same type, don't include
            elif (nearest_neighbors['type'].iloc[sim_scores[i][0]] == 'alien') | (nearest_neighbors['type'].iloc[sim_scores[i][0]] == 'ape') | (nearest_neighbors['type'].iloc[sim_scores[i][0]] == 'zombie') :
                pass
            else:
                ids_and_scores.append(sim_scores[i])
                
    # get the ids and scores for 10 most similar punks
    ids_and_scores = ids_and_scores[0:10]
    
    # transform tuples to lists        
    list_version = [list(ids_and_scores[i]) for i in range(len(ids_and_scores))]
    
    # round to 4 digits
    for i in range(len(list_version)):
        list_version[i][1] = round(list_version[i][1], 4)
    
    print('completed ' + str(punk_id))
    
    return list_version

In [20]:
print(get_recommendations(5577))

completed 5577
[[9280, 0.8699], [2491, 0.6668], [4156, 0.6317], [8498, 0.6217], [2924, 0.6073], [6965, 0.594], [6145, 0.5871], [8219, 0.5854], [372, 0.5691], [5795, 0.5617]]


### 6. Add Nearest Neighbors Column

Add neighbors column to `nearest_neighbors` dataframe:

In [21]:
# nearest_neighbors['neighbors'] = nearest_neighbors['punk_id'].apply(get_recommendations)

Note: Running the cell above takes a long time so, best to just save it immediately and read in data to view dataframe and avoid re running the cell.

In [22]:
# nearest_neighbors.to_csv('../data/nearest_neighbors.csv')

In [23]:
nearest_neighbors = pd.read_csv('../data/nearest_neighbors.csv', index_col = 0)

In [24]:
nearest_neighbors

Unnamed: 0,punk_id,accessories,type,clean_accessories,neighbors
0,0,"['Green Eye Shadow', 'Earring', 'Blonde Bob']",female,female 'green eye shadow' 'earring' 'blonde bob',"[[184, 0.964], [3582, 0.964], [9431, 0.964], [..."
1,1,"['Smile', 'Mohawk']",male,male 'smile' 'mohawk',"[[7591, 0.923], [24, 0.8664], [3288, 0.8664], ..."
2,2,['Wild Hair'],female,female 'wild hair',"[[200, 1.0], [566, 1.0], [3412, 1.0], [97, 0.8..."
3,3,"['Wild Hair', 'Nerd Glasses', 'Pipe']",male,male 'wild hair' 'nerd glasses' 'pipe',"[[723, 0.8287], [3937, 0.8287], [5091, 0.8287]..."
4,4,"['Big Shades', 'Wild Hair', 'Earring', 'Goat']",male,male 'big shades' 'wild hair' 'earring' 'goat',"[[7599, 0.8322], [8459, 0.8322], [3751, 0.8307..."
...,...,...,...,...,...
9995,9995,"['Purple Eye Shadow', 'Straight Hair Dark']",female,female 'purple eye shadow' 'straight hair dark',"[[5844, 1.0], [3615, 0.9578], [729, 0.9098], [..."
9996,9996,"['Cigarette', 'Earring', 'Crazy Hair', 'Smile']",male,male 'cigarette' 'earring' 'crazy hair' 'smile',"[[8393, 0.9534], [495, 0.8561], [1049, 0.8561]..."
9997,9997,"['Front Beard', 'Cap Forward']",zombie,zombie 'front beard' 'cap forward',"[[6491, 0.8235], [8553, 0.7275], [9909, 0.7008..."
9998,9998,"['Wild White Hair', 'Black Lipstick', 'Clown E...",female,female 'wild white hair' 'black lipstick' 'clo...,"[[4424, 0.9268], [4086, 0.8919], [7083, 0.8919..."


In [25]:
# check types
nearest_neighbors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   punk_id            10000 non-null  int64 
 1   accessories        10000 non-null  object
 2   type               10000 non-null  object
 3   clean_accessories  10000 non-null  object
 4   neighbors          10000 non-null  object
dtypes: int64(1), object(4)
memory usage: 468.8+ KB


In [26]:
nearest_neighbors['neighbors'][0]

'[[184, 0.964], [3582, 0.964], [9431, 0.964], [2496, 0.9379], [7196, 0.9], [8487, 0.9], [8539, 0.8948], [7520, 0.8687], [103, 0.8645], [2751, 0.8566]]'

The neighbors column is in string format, so we will need to convert it to list format:

In [27]:
# citation: https://stackoverflow.com/questions/1894269/how-to-convert-string-representation-of-list-to-a-list

# test if it works
json.loads(nearest_neighbors['neighbors'][0])

[[184, 0.964],
 [3582, 0.964],
 [9431, 0.964],
 [2496, 0.9379],
 [7196, 0.9],
 [8487, 0.9],
 [8539, 0.8948],
 [7520, 0.8687],
 [103, 0.8645],
 [2751, 0.8566]]

In [28]:
# function for converting string to nested list
def convert_to_list(x):
    return json.loads(x)

In [29]:
nearest_neighbors['neighbors_clean'] = nearest_neighbors['neighbors'].apply(convert_to_list)

In [30]:
# check that it worked
nearest_neighbors['neighbors_clean'][0]

[[184, 0.964],
 [3582, 0.964],
 [9431, 0.964],
 [2496, 0.9379],
 [7196, 0.9],
 [8487, 0.9],
 [8539, 0.8948],
 [7520, 0.8687],
 [103, 0.8645],
 [2751, 0.8566]]

In [52]:
def convert_to_percent(list):
    for i in len(range(list)):
        x[i][1] = x[i][1] * 100

### 7. Add Averages of Neighbors/Similar Types to Empty Values

As stated in the II-Cleaning-and-Preprocessing Notebook, there are missing values in the columns of **avg_usd_bid, max_usd_bid, avg_usd_offer, max_usd_offer, avg_usd_sale** for the individual dataset, so I will replace the NaN values with the average of their nearest neighbors with the exception of the avg_usd_sale column.

For the **average_usd_sale** column, I will replace the NaN values with the average of the recent sales for that particular type.

In [31]:
nearest_neighbors.head()

Unnamed: 0,punk_id,accessories,type,clean_accessories,neighbors,neighbors_clean
0,0,"['Green Eye Shadow', 'Earring', 'Blonde Bob']",female,female 'green eye shadow' 'earring' 'blonde bob',"[[184, 0.964], [3582, 0.964], [9431, 0.964], [...","[[184, 0.964], [3582, 0.964], [9431, 0.964], [..."
1,1,"['Smile', 'Mohawk']",male,male 'smile' 'mohawk',"[[7591, 0.923], [24, 0.8664], [3288, 0.8664], ...","[[7591, 0.923], [24, 0.8664], [3288, 0.8664], ..."
2,2,['Wild Hair'],female,female 'wild hair',"[[200, 1.0], [566, 1.0], [3412, 1.0], [97, 0.8...","[[200, 1.0], [566, 1.0], [3412, 1.0], [97, 0.8..."
3,3,"['Wild Hair', 'Nerd Glasses', 'Pipe']",male,male 'wild hair' 'nerd glasses' 'pipe',"[[723, 0.8287], [3937, 0.8287], [5091, 0.8287]...","[[723, 0.8287], [3937, 0.8287], [5091, 0.8287]..."
4,4,"['Big Shades', 'Wild Hair', 'Earring', 'Goat']",male,male 'big shades' 'wild hair' 'earring' 'goat',"[[7599, 0.8322], [8459, 0.8322], [3751, 0.8307...","[[7599, 0.8322], [8459, 0.8322], [3751, 0.8307..."


In [32]:
#indiv.loc[:,['type', 'accessories']].apply(lambda x: x[0]+x[1], axis = 1)

**Functions**

In [43]:
# function to get average values of nearest neighbors
def get_avg_of_neighbors(punk_id, column_name):
    if pd.isnull(indiv[column_name].iloc[punk_id]):
        neighbors__values_list = []
        recs = nearest_neighbors['neighbors_clean'].iloc[punk_id]
        punk_rec_indices = [recs[i][0] for i in range(len(recs))]
        
        for count, val in enumerate(punk_rec_indices):
            neighbor__value = indiv[column_name].iloc[val]
            if not pd.isnull(neighbor__value):
                neighbors__values_list.append(neighbor__value)
        #print(neighbors__values_list)
        if (sum(neighbors__values_list) == 0):
            return 0
        else:
            return(sum(neighbors__values_list)/len(neighbors__values_list))
    
    else:
        return(indiv[column_name].iloc[punk_id])
        

In [34]:
# test function
get_avg_of_neighbors(4, 'avg_usd_bid')

[217.404, 76.802, 44.1515, 0.558, 58.892, 174.704]


95.41858333333334

Create individual dataframes:

In [35]:
alien_df = indiv[indiv['type'] == 'alien']
avg_alien_sale = alien_df['avg_usd_sale'].mean()
print(avg_alien_sale)

ape_df = indiv[indiv['type'] == 'ape']
avg_ape_sale = ape_df['avg_usd_sale'].mean()
print(avg_ape_sale)

zombie_df = indiv[indiv['type'] == 'zombie']
avg_zombie_sale = zombie_df['avg_usd_sale'].mean()
print(avg_zombie_sale)

female_df = indiv[indiv['type'] == 'female']
avg_female_sale = female_df['avg_usd_sale'].mean()
print(avg_female_sale)

male_df = indiv[indiv['type'] == 'male']
avg_male_sale = male_df['avg_usd_sale'].mean()
print(avg_male_sale)

nan
4605.0
2315.9106428571426
478.0636207617703
226.47906066615582


We don't want an empty average value, so for aliens we will get the average sales from past NINE months instead of SIX months:

In [36]:
trans_alien = trans[trans['type']== 'alien']

In [37]:
trans_sold_alien = trans_alien[trans_alien['trans'] == 'Sold']

In [38]:
trans_sold_alien

Unnamed: 0,usd,date,trans,punk_id,accessories,type
46219,1.646,2017-07-10,Sold,7523,"['Earring', 'Knitted Cap', 'Medical Mask']",alien
51298,7570.0,2021-03-11,Sold,7804,"['Cap Forward', 'Pipe', 'Small Shades']",alien
51299,14.988,2018-01-10,Sold,7804,"['Cap Forward', 'Pipe', 'Small Shades']",alien
56461,2.61,2017-07-01,Sold,6089,"['Earring', 'Knitted Cap']",alien
87318,1.646,2017-07-10,Sold,5822,['Bandana'],alien
88669,2.69,2017-07-04,Sold,5905,"['Do-rag', 'Small Shades']",alien
108574,7580.0,2021-03-11,Sold,3100,['Headband'],alien
108575,2.127,2017-07-06,Sold,3100,['Headband'],alien
140261,761.889,2021-01-23,Sold,2890,['Cap'],alien
140262,2.127,2017-07-06,Sold,2890,['Cap'],alien


In [39]:
avg_alien_sale = (7570.000 + 7580.000) / 2

In [40]:
# function to get averages values of particular types
def get_avg_of_types(punk_id):
    if pd.isnull(indiv['avg_usd_sale'].iloc[punk_id]):
        if indiv['type'].iloc[punk_id] == 'alien':
            return(avg_alien_sale)
        elif indiv['type'].iloc[punk_id] == 'ape':
            return(avg_ape_sale)
        elif indiv['type'].iloc[punk_id] == 'zombie':
            return(avg_zombie_sale)
        elif indiv['type'].iloc[punk_id] == 'female':
            return(avg_female_sale)
        else:
            return(avg_male_sale)
    else:
        return(indiv['avg_usd_sale'].iloc[punk_id])

**Average USD Sale Column**

In [41]:
# add avg_usd_sale_type column to indiv dataframe
indiv['avg_usd_sale_type'] = 0

for i in range(len(indiv)):
    indiv['avg_usd_sale_type'].iloc[i] = get_avg_of_types(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


**Average USD Bid Column**

In [44]:
# add avg_usd_bid_neigh column to indiv dataframe
indiv['avg_usd_bid_neigh'] = 0

for i in range(len(indiv)):
    indiv['avg_usd_bid_neigh'].iloc[i] = get_avg_of_neighbors(i, 'avg_usd_bid')

**Max USD Bid Column**

In [45]:
# add max_usd_bid_neigh column to indiv dataframe
indiv['max_usd_bid_neigh'] = 0

for i in range(len(indiv)):
    indiv['max_usd_bid_neigh'].iloc[i] = get_avg_of_neighbors(i, 'max_usd_bid')

**Average USD Offer Column**

In [46]:
# add avg_usd_offer_neigh column to indiv dataframe
indiv['avg_usd_offer_neigh'] = 0

for i in range(len(indiv)):
    indiv['avg_usd_offer_neigh'].iloc[i] = get_avg_of_neighbors(i, 'avg_usd_offer')

**Max USD Offer Column**

In [47]:
# add max_usd_offer_neigh column to indiv dataframe
indiv['max_usd_offer_neigh'] = 0

for i in range(len(indiv)):
    indiv['max_usd_offer_neigh'].iloc[i] = get_avg_of_neighbors(i, 'max_usd_offer')

In [48]:
indiv[['type', 'avg_usd_sale','avg_usd_sale_type']]

Unnamed: 0,type,avg_usd_sale,avg_usd_sale_type
0,female,,478.063621
1,male,,226.479061
2,female,,478.063621
3,male,,226.479061
4,male,,226.479061
...,...,...,...
9995,female,,478.063621
9996,male,,226.479061
9997,zombie,,2315.910643
9998,female,266378.2875,266378.287500


### 8. Save Dataframes

In [49]:
nearest_neighbors.to_csv('../data/clean_nearest_neighbors.csv', index=False)

In [420]:
indiv.to_csv('../data/extra_clean_individual.csv', index=False)