In [2]:

#loading the unprocessed data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from google.colab import drive
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
import re
import string
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn import model_selection, naive_bayes, svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import os
     

In [3]:
project_data = pd.read_excel('Appliances_Postings_for_AUD.xlsx')
project_data.head()
     

Unnamed: 0,Posting ID,link,Title,Description,Price,Condition,Make
0,0,https://dallas.craigslist.org/ndf/app/75540765...,Frigidaire Black Built-in Oven - $249 (Pilot P...,Frigidaire Black Built-in Oven Excellent hardl...,249.0,like new,
1,1,https://dallas.craigslist.org/ndf/app/75603287...,Whirlpool Top load Washer - $249 (Pilot Point),Whirlpool Top load WasherNot pretty but works ...,249.0,like new,
2,2,https://dallas.craigslist.org/ndf/app/75593750...,Maytag Top Loading Washer - $149 (Pilot Point),Maytag Top Loading WasherA little older and no...,149.0,like new,
3,3,https://dallas.craigslist.org/ndf/app/75533342...,Kenmore Stainless Steel Range - $400 (Pilot Po...,Kenmore Stainless Steel RangeGently Used Condi...,400.0,like new,
4,4,https://dallas.craigslist.org/ndf/app/75593744...,Whirlpool Black Side by Side Fridge - $449 (Pi...,Whirlpool Black Side by Side Fridge!Excellent ...,449.0,like new,


In [4]:
project_data['Description'] = project_data['Description'].fillna('Unknown')

In [5]:

project_data['Description_char'] = project_data['Description'].values

In [6]:

project_data['ex_flag'] = project_data['Description'].str.contains('excellent|Excellent').astype(int)
project_data['new_flag'] = project_data['Description'].str.contains('new|New').astype(int)
project_data['go_flag'] = project_data['Description'].str.contains('good|Good').astype(int)
     

In [7]:
filtered_data = project_data[~project_data['Condition'].isna()]
filtered_data = filtered_data[~project_data['Price'].isna()]
filtered_data['Price'] = filtered_data['Price'].replace('[\$,]', '', regex=True).astype(float)
filtered_data['summary_size']     = filtered_data['Description'].apply(lambda x: len(x))
filtered_data['words_in_summary'] = filtered_data['Description'].apply(lambda x: len(x.split()))
filtered_data['Condition'] = filtered_data['Condition'].map({'excellent': 1, 'new': 1, 'good':0, 'like new':0, 'fair':0 })
filtered_data = filtered_data[~filtered_data['Condition'].isna()]

filtered_data.shape
     

  filtered_data = filtered_data[~project_data['Price'].isna()]


(3400, 13)

In [8]:
filtered_data['Condition'].value_counts(normalize=True)

1.0    0.640588
0.0    0.359412
Name: Condition, dtype: float64

In [9]:
default_preprocessor = CountVectorizer().build_preprocessor()
def build_text_preprocessor(field):
  field_idx = list(filtered_data.columns).index(field)
  return lambda x: default_preprocessor(x[field_idx])

re_tok = re.compile(f'([{string.punctuation}''])')
def tokenize(s):
  return re_tok.sub(r' ', s).split(' ')

In [10]:

text_vectorizer = FeatureUnion([
                                ('Description', TfidfVectorizer(ngram_range=(1,3), tokenizer=tokenize, min_df=0.01, max_df=0.9,
                                                                strip_accents='unicode',use_idf=1, smooth_idf=1, sublinear_tf=1,
                                                                preprocessor = build_text_preprocessor('Description'))),
                                ('Description_char', TfidfVectorizer(ngram_range=(1,4), tokenizer=tokenize, analyzer='char', 
                                                                     stop_words='english', strip_accents='unicode',
                                                                     max_features=10000,
                                                                preprocessor = build_text_preprocessor('Description_char'))),
                                ('Title', TfidfVectorizer(ngram_range=(1,3), tokenizer=tokenize, min_df=0.01, max_df=0.9,
                                                                strip_accents='unicode',use_idf=1, smooth_idf=1, sublinear_tf=1,
                                                                preprocessor = build_text_preprocessor('Title')))
                              ])
train_tfidf = text_vectorizer.fit_transform(filtered_data.values)

In [11]:

numeric_features = csr_matrix(filtered_data[[ 'summary_size', 'words_in_summary', 'Price', 'ex_flag', 'new_flag', 'go_flag']].values)
train_idf = hstack((train_tfidf, numeric_features)).toarray()

In [12]:
filtered_data

Unnamed: 0,Posting ID,link,Title,Description,Price,Condition,Make,Description_char,ex_flag,new_flag,go_flag,summary_size,words_in_summary
0,0,https://dallas.craigslist.org/ndf/app/75540765...,Frigidaire Black Built-in Oven - $249 (Pilot P...,Frigidaire Black Built-in Oven Excellent hardl...,249.0,0.0,,Frigidaire Black Built-in Oven Excellent hardl...,1,0,0,266,45
1,1,https://dallas.craigslist.org/ndf/app/75603287...,Whirlpool Top load Washer - $249 (Pilot Point),Whirlpool Top load WasherNot pretty but works ...,249.0,0.0,,Whirlpool Top load WasherNot pretty but works ...,0,0,0,184,34
2,2,https://dallas.craigslist.org/ndf/app/75593750...,Maytag Top Loading Washer - $149 (Pilot Point),Maytag Top Loading WasherA little older and no...,149.0,0.0,,Maytag Top Loading WasherA little older and no...,0,0,1,335,62
3,3,https://dallas.craigslist.org/ndf/app/75533342...,Kenmore Stainless Steel Range - $400 (Pilot Po...,Kenmore Stainless Steel RangeGently Used Condi...,400.0,0.0,,Kenmore Stainless Steel RangeGently Used Condi...,0,0,0,257,39
4,4,https://dallas.craigslist.org/ndf/app/75593744...,Whirlpool Black Side by Side Fridge - $449 (Pi...,Whirlpool Black Side by Side Fridge!Excellent ...,449.0,0.0,,Whirlpool Black Side by Side Fridge!Excellent ...,1,0,0,252,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5394,5394,https://sfbay.craigslist.org/sby/app/756255039...,Cuisinart Brand 11 Cup Food Processor + Set of...,The price for everything mentioned below is $1...,100.0,1.0,Cuisinart,The price for everything mentioned below is $1...,1,1,0,1494,223
5395,5395,https://sfbay.craigslist.org/sfc/app/756056209...,1967 Elextrolux Model G - Teal - $100 (SOMA / ...,Vintage Canister Vacuum Cleaner - automatic co...,100.0,0.0,,Vintage Canister Vacuum Cleaner - automatic co...,0,0,0,182,31
5397,5397,https://sfbay.craigslist.org/sby/app/756254792...,steamer - $60 (San Jose),Brand new in the box,60.0,1.0,puresteam,Brand new in the box,0,1,0,20,5
5398,5398,https://sfbay.craigslist.org/sby/app/756254737...,Dishwasher - $140 (willow glen / cambrian),SPT Counter/ mobile dishwasher Open box excell...,140.0,0.0,SPT,SPT Counter/ mobile dishwasher Open box excell...,1,1,0,129,20


In [13]:
y = filtered_data['Condition']

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(train_idf, y, test_size=0.2, random_state=1, stratify = y )

In [15]:

classifier = RidgeClassifier(alpha=.0001)
classifier.fit(X_train, y_train)
pred_tfidf = classifier.decision_function(X_valid)
probs_tfidf = np.exp(pred_tfidf)
pred_tfidf = np.where(probs_tfidf>0.90, 1, 0)
accuracy_val = accuracy_score(y_valid, pred_tfidf)
accuracy_val
     

0.7794117647058824

In [16]:
filtered_data['Condition'].value_counts(normalize=True)

1.0    0.640588
0.0    0.359412
Name: Condition, dtype: float64

In [17]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_valid)
accuracy_gb = accuracy_score(y_valid, y_pred)
print("Gaussian Naive Bayes Accuracy Score -> ",accuracy_gb*100)

Gaussian Naive Bayes Accuracy Score ->  76.3235294117647


In [18]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train,y_train)
predictions_NB = Naive.predict(X_valid)
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, y_valid)*100)
     

Naive Bayes Accuracy Score ->  56.3235294117647


In [19]:

#Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=50)
y_pred_rf = random_forest.fit(X_train,y_train).predict(X_valid)
accuracy_rf = accuracy_score(y_pred_rf, y_valid)
print("Random Forest Accuracy Score -> ",accuracy_rf*100)

Random Forest Accuracy Score ->  85.1470588235294


In [55]:
import pandas as pd
from keras.preprocessing import text,sequence
import numpy as np
from keras.layers import GRU, Conv1D,CuDNNGRU
from keras.layers import Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.callbacks import Callback,EarlyStopping
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Embedding, Dropout, Conv1D
from keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score
import re,string
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy import sparse
from scipy.sparse import hstack, csr_matrix
from keras.models import model_from_json
import warnings
warnings.filterwarnings("ignore")

In [56]:
embed_size = 300
max_features = 6500
maxlen = 350
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(filtered_data['Description']))
from gensim.models import KeyedVectors
#EMBEDDING_FILE = KeyedVectors.load_word2vec_format('glove.6B.50dg.txt', binary=False)
embedding_dict={} #intitialize empty dictionary 
with open('/content/glove.6B.50d.txt','r',encoding="utf8") as f:
    for line in f:
        values=line.split()
        word = values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
glove=[]
for word, i in word_index.items():
    temp=[]
    if i > max_features: 
        continue
    embedding_vector = embedding_dict.get(word)
    if embedding_vector is not None: 
        temp.append(embedding_vector.flatten().tolist())
    glove.append(temp)


In [57]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
glove_matrix=pad_sequences(glove,padding="post",dtype='float64') #padding to obtain final 3D 
glove_matrix

array([[[ 0.26818001,  0.14346001, -0.27877   , ..., -0.63209999,
         -0.25027999, -0.38097   ]],

       [[ 0.15272   ,  0.36181   , -0.22168   , ...,  0.43382001,
         -0.084617  ,  0.1214    ]],

       [[ 0.41800001,  0.24968   , -0.41242   , ..., -0.18411   ,
         -0.11514   , -0.78580999]],

       ...,

       [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[ 0.16141   , -1.03999996,  0.63762999, ...,  0.53530997,
          0.83732998,  1.04569995]]])

In [78]:

embed_size = 50
max_features = 6500
maxlen = 350
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(filtered_data['Description']))
EMBEDDING_FILE = '/content/glove.6B.50d.txt'
print('loading embedding file')
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE,encoding="utf8"))
print('Done loading embedding file')
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
  if i >= max_features: continue
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None: embedding_matrix[i] = embedding_vector

loading embedding file
Done loading embedding file


In [79]:

def build_model(body_flag,body_flag_test, X_train,Y_train,X_valid,Y_valid,early_stop, lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0):
    body_text_flag = body_flag
    inp = Input(shape = (maxlen,))
    print(nb_words)
    x = Embedding(max_features , embed_size, weights = [embedding_matrix], trainable = False)(inp)

    x = SpatialDropout1D(dr)(x)

    x = Bidirectional(GRU(64, return_sequences = True))(x)
    x = Conv1D(64, kernel_size = 1, padding = "valid", kernel_initializer = "he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    
    # body_text_flag = Input(shape=[body_text_flag.shape[1]], name="body_text_flag")
    x = concatenate([avg_pool, max_pool])
    x = Dense(1, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    ra_val = RocAucEvaluation(validation_data=(X_valid, Y_valid), interval = 1)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    history = model.fit(X_train , Y_train, batch_size = 64, epochs = 15, validation_data = (X_valid, Y_valid), 
                        verbose = 1, callbacks = [ra_val, check_point, early_stop])
    #model = load_model(file_path)
    return model

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [80]:
def build_model(body_flag,body_flag_test, X_train,Y_train,X_valid,Y_valid,early_stop, lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0):
    body_text_flag = body_flag
    inp = Input(shape = (maxlen,))
    x = Embedding(max_features, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    x = SpatialDropout1D(dr)(x)

    x = Bidirectional(GRU(units, return_sequences = True))(x)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    body_text_flag = Input(shape=[body_text_flag.shape[1]], name="body_text_flag")
    x = concatenate([avg_pool, max_pool, body_text_flag])
    x = Dense(1, activation = "sigmoid")(x)
    model = Model(inputs = [inp, body_text_flag], outputs = x)
    ra_val = RocAucEvaluation(validation_data=([X_valid,body_flag_test], Y_valid), interval = 1)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    history = model.fit([X_train,body_flag] , Y_train, batch_size = 64, epochs = 15, validation_data = ([X_valid,body_flag_test], Y_valid), 
                        verbose = 1, callbacks = [ra_val, check_point, early_stop])
    #model = load_model(file_path)
    return model

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [81]:

X_train, X_valid, Y_train, Y_valid = train_test_split(filtered_data.drop(['Condition'], axis=1), y, test_size = 0.2, random_state =1,stratify=y)
raw_text_train = 'Title' + X_train["Title"].astype(str) + ' ' + X_train["Description"]
raw_text_valid = 'Title' + X_valid["Title"].astype(str) + ' ' + X_valid["Description"]

tk = text.Tokenizer(num_words = max_features, lower = True)
tk.fit_on_texts(raw_text_train)
X_train["text_seq"] = tk.texts_to_sequences(raw_text_train)
X_valid["text_seq"] = tk.texts_to_sequences(raw_text_valid)

print('padding')
raw_text_train = pad_sequences(X_train.text_seq, maxlen = maxlen)
raw_text_test = pad_sequences(X_valid.text_seq, maxlen = maxlen)


#file_path = "/content/best_model.hdf5"
#check_point = ModelCheckpoint(file_path, monitor = "val_acc", verbose = 1,
#                   save_best_only = True, mode = "max")
early_stop = EarlyStopping(monitor = "val_acc", mode = "max", patience = 5)
test_flag = X_train[['ex_flag', 'new_flag', 'go_flag']].values
body_flag_test = X_valid[['ex_flag', 'new_flag', 'go_flag']].values
model = build_model(test_flag,body_flag_test,raw_text_train,Y_train,raw_text_test,Y_valid,early_stop,lr = 1e-3, lr_d = 0, units = 64, dr = 0.2)

#model.load_weights(file_path)
predictions = model.predict([raw_text_test,body_flag_test]).reshape((raw_text_test.shape[0],))

# pred = loaded_model.predict(X_valid, batch_size = 1024, verbose = 1)
pred = np.where(predictions>=0.5,1,0)
score = pd.DataFrame(pred)
# print(Y_valid,score.astype(int))
accuracy_val = accuracy_score(Y_valid, score.values)
print('NN OOF accuracy: {}'.format(accuracy_val))

padding
Epoch 1/15




 ROC-AUC - epoch: 1 - score: 0.782195 

Epoch 2/15




 ROC-AUC - epoch: 2 - score: 0.816617 

Epoch 3/15




 ROC-AUC - epoch: 3 - score: 0.825049 

Epoch 4/15




 ROC-AUC - epoch: 4 - score: 0.850964 

Epoch 5/15




 ROC-AUC - epoch: 5 - score: 0.857479 

Epoch 6/15




 ROC-AUC - epoch: 6 - score: 0.869858 

Epoch 7/15




 ROC-AUC - epoch: 7 - score: 0.875367 

Epoch 8/15




 ROC-AUC - epoch: 8 - score: 0.877059 

Epoch 9/15




 ROC-AUC - epoch: 9 - score: 0.877453 

Epoch 10/15




 ROC-AUC - epoch: 10 - score: 0.882172 

Epoch 11/15




 ROC-AUC - epoch: 11 - score: 0.881345 

Epoch 12/15




 ROC-AUC - epoch: 12 - score: 0.877914 

Epoch 13/15




 ROC-AUC - epoch: 13 - score: 0.884184 

Epoch 14/15




 ROC-AUC - epoch: 14 - score: 0.884494 

Epoch 15/15




 ROC-AUC - epoch: 15 - score: 0.882059 

NN OOF accuracy: 0.788235294117647


In [85]:
#model = load_model(file_path)
#model.load_weights(file_path)
predictions = model.predict([raw_text_test,body_flag_test]).reshape((raw_text_test.shape[0],))

# pred = loaded_model.predict(X_valid, batch_size = 1024, verbose = 1)
pred = np.where(predictions>=0.5,1,0)
score = pd.DataFrame(pred)
# print(Y_valid,score.astype(int))
accuracy_val = accuracy_score(Y_valid, score.values)
print('NN OOF accuracy: {}'.format(accuracy_val))

NN OOF accuracy: 0.788235294117647
