In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import spacy 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator,TransformerMixin
import string
from nltk import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
import unicodedata
%matplotlib inline
from sklearn import preprocessing,model_selection,metrics,feature_selection
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
# Load the interim dataset: a semicolon-separated, latin-1 encoded CSV.
# index_col=False stops pandas from using the first column as the index.
data = pd.read_csv(
    '../data/interim/001-pe-initial-clean.csv',
    sep=';',
    encoding="latin-1",
    index_col=False,
)

### Create a text processing unit

In [25]:
class NLTKPreprocesor(BaseEstimator,TransformerMixin):
    """
    Scikit-learn style transformer that converts raw documents into lists of
    lemmatized tokens using NLTK.

    Every document is split into sentences, tokenized with
    ``wordpunct_tokenize`` and POS-tagged. Tokens are then optionally
    lower-cased and stripped, filtered against the stopword and punctuation
    collections, and lemmatized with ``WordNetLemmatizer``.

    Parameters
    ----------
    stopwords : collection of str, optional
        Tokens to discard. ``None`` (the default) selects NLTK's English
        stopword list. An explicitly empty collection disables stopword
        filtering.
    punct : collection of str, optional
        Characters treated as punctuation; a token made up only of these
        characters is discarded. ``None`` (the default) selects
        ``string.punctuation``.
    lower : bool, default True
        Lower-case each token before filtering.
    strip : bool, default True
        Strip surrounding whitespace and then leading/trailing ``_``, ``*``
        and ``#`` characters (in that order) from each token.
    """
    def __init__(self,stopwords = None,punct = None,lower = True,strip=True):
        self.lower = lower
        self.strip = strip
        # Compare against None rather than relying on truthiness: an empty
        # collection is a legitimate "filter nothing" request and must not be
        # silently replaced by the defaults.
        self.stopwords = set(sw.words('english')) if stopwords is None else stopwords
        self.punct = set(string.punctuation) if punct is None else punct
        self.lemmatizer = WordNetLemmatizer()

    def fit(self,X,y=None):
        """No-op fit; returns ``self`` so the transformer can be chained."""
        return self

    def inverse_transform(self,X):
        """Not implemented: the transformation is lossy. Returns ``None``."""
        return None

    def transform(self,X):
        """
        Tokenize every document in ``X``.

        Returns a list with one entry per document, each entry being the list
        of lemmas yielded by :meth:`tokenize` for that document.
        """
        return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self,document):
        """
        Yield the lemmatized tokens of ``document`` one at a time.

        Tokens that are empty after stripping, listed in ``self.stopwords``,
        or made up solely of characters from ``self.punct`` are skipped.
        """
        for sent in sent_tokenize(document):
            for token,tag in pos_tag(wordpunct_tokenize(sent)):
                if self.lower:
                    token = token.lower()
                if self.strip:
                    # Sequential strips, kept in the original order.
                    token = token.strip().strip('_').strip('*').strip('#')

                # Stripping can leave nothing behind (e.g. a lone "_").
                if not token:
                    continue

                if token in self.stopwords:
                    continue

                if all(char in self.punct for char in token):
                    continue

                yield self.lemmatize(token,tag)

    def lemmatize(self,token,tag):
        """
        Lemmatize ``token`` using the WordNet POS derived from ``tag``.

        Only the first letter of the tag is inspected (N, V, R, J); any other
        tag falls back to noun.
        """
        wordnet_pos = {
            'N' : wn.NOUN,
            'V' : wn.VERB,
            'R' : wn.ADV,
            'J' : wn.ADJ
        }.get(tag[0],wn.NOUN)

        return self.lemmatizer.lemmatize(token,wordnet_pos)
    
# Remove accent chars
def remove_accents(x):
    """
    Return ``x`` with accented characters reduced to their ASCII base.

    The string is NFKD-normalized so that accents become separate combining
    marks, then every character that cannot be encoded as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', x)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


# Removes special characters
def remove_special_chars(x):
    """
    Return ``x`` with every character removed except ASCII letters, digits
    and whitespace.
    """
    # Raw string literal: "\s" inside a plain string is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'[^a-zA-Z0-9\s]', '', x)


In [3]:
# Fold the "other works on paper" category into "painting", then drop the
# rows that are exact duplicates (the first occurrence is kept).
is_other_works_on_paper = data.category == "other works on paper"
data.loc[is_other_works_on_paper, "category"] = "painting"
data = data.drop_duplicates(keep="first")

In [4]:
# Rows that agree on all of these columns are treated as duplicates; only
# the first occurrence is kept.
dedup_columns = [
    'artist_name',
    'auction_date',
    'category',
    'hammer_price',
    'location',
    'materials',
]
data = data.drop_duplicates(subset=dedup_columns)

In [5]:
# Show only the dimensions; a bare `data` would render the entire frame.
data.shape

(83054, 12)

In [19]:
# Rows with a negative hammer price but a known high estimate get the mean of
# their (high, low) estimates as a replacement price.
has_negative_price = data.hammer_price < 0
has_high_estimate = ~data.estimate_high.isnull()
replacement_neg_hammer_price = data.loc[
    has_negative_price & has_high_estimate,
    ['estimate_high', 'estimate_low'],
].mean(axis=1)

data.loc[replacement_neg_hammer_price.index, 'hammer_price'] = replacement_neg_hammer_price

# Keep only strictly positive prices; the estimate columns are not used
# after this point in the cell and are removed.
data = data.loc[data.hammer_price > 0].drop(['estimate_high', 'estimate_low'], axis=1)

In [29]:
# For rows still marked "unclassified", the exact materials value decides
# the category.
category_by_material = {
    "oil on canvas": "painting",
    "works on paper": "painting",
    "oil and charcoal": "painting",
    "sculpture": "sculpture",
}

for material_label, resolved_category in category_by_material.items():
    is_match = (data.materials == material_label) & (data.category == "unclassified")
    data.loc[is_match, 'category'] = resolved_category

In [33]:
# Re-assigning categories can make previously distinct rows identical.
data = data.drop_duplicates(keep="first")

In [40]:
# Rows that are "unclassified" and have no materials value are removed.
is_unresolvable = (data.category == "unclassified") & data.materials.isnull()
data = data.drop(data.loc[is_unresolvable].index)

In [54]:
# The materials column is not needed beyond this point.
data = data.drop(labels='materials', axis='columns')

In [61]:
# Rows that differed only in the dropped materials column are now duplicates.
data = data.drop_duplicates(keep="first")

In [62]:
# The title column is not used as a feature.
data = data.drop(labels='title', axis='columns')

In [76]:
# Rows that differed only in the dropped title column are now duplicates.
data = data.drop_duplicates(keep="first")

In [87]:
def clean_location(x):
    """
    Normalize a location value to a lower-case label.

    The value is converted to ``str`` and lower-cased. Only the text after
    the last comma is kept (the whole string when there is no comma), with
    surrounding spaces removed.
    """
    text = str(x).lower()
    # Splitting a comma-free string yields the string itself as the only part.
    last_part = text.split(',')[-1]
    return last_part.strip(" ")

In [90]:
# Normalize every location with clean_location.
data['location'] = data['location'].apply(clean_location)

In [95]:
# Locations that occur more than 100 times are kept as categories of their own.
location_counts = data.location.value_counts()
valid_locs = list(location_counts[location_counts > 100].index)

In [99]:
# Every location outside valid_locs is collapsed into a single "other" bucket.
is_valid_location = data["location"].isin(valid_locs)
data["location"] = data["location"].where(is_valid_location, "other")

In [106]:
# clean_location stringifies its input, so a missing location shows up as the
# literal text "nan"; move those rows into the "other" bucket as well.
is_missing_location = data.location == "nan"
data.loc[is_missing_location, "location"] = "other"

In [114]:
# Parse the auction date once, derive calendar features from it, and drop the
# raw date column.
auction_dates = pd.to_datetime(data.auction_date)
data = data.assign(
    year=[stamp.year for stamp in auction_dates],
    month=[stamp.month for stamp in auction_dates],
    day=[stamp.day for stamp in auction_dates],
    week=[stamp.week for stamp in auction_dates],
).drop('auction_date', axis=1)

In [118]:
# Columns to be one-hot encoded.
text_features = [
    'artist_name',
    'category',
    'location',
]

In [119]:
# Replace each column named in text_features with one indicator column per
# distinct value.
data = pd.get_dummies(data=data, columns=text_features)

In [121]:
# Dimensions (rows, columns) of the frame after one-hot encoding.
data.shape

(79473, 167)

In [123]:
# Write the feature table as a semicolon-separated CSV without the index.
data.to_csv(
    "../data/interim/002-pe-features.csv",
    sep=";",
    index=False,
)