# ADVENTURES WITH NLP

### Following along https://github.com/priya-dwivedi/Deep-Learning/blob/master/topic_modeling/LDA_Newsgroup.ipynb

In [117]:
%matplotlib notebook
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split

os.chdir('/Users/mleong/github/used-books')
books = pd.read_csv('used-book-data.csv')



In [118]:
# eliminate duplicate rows
books = books.drop_duplicates()

# id: convert to string
books.id = books.id.astype(str)

# posted: convert to datetime
books.posted = pd.to_datetime(books.posted)

# sold: combine buy now and add to cart categories to be 'for sale'
books = books.replace('Buy now', 'available')

# price: modify to eliminate '$' sign and convert to number
books['price'] = books['price'].str.replace(',', '')
books['price'] = books['price'].str.replace('$', '')
books['price'] = books['price'].astype(int)

# create new column "shipping_cost" that is continuous variable from $0 to whatever shipping fee is
def shipping_cost(shipping_string):
    shipping_after = shipping_string.split(" | ")
    if 'Free' in shipping_after[0]:
        shipping_cost = 0
    elif '$' in shipping_after[0]:
        shipping_cost = shipping_after[0]
        shipping_cost = shipping_cost.replace('$', '') 
    else:
        shipping_cost = None
    return shipping_cost
books['shipping_cost'] = books['shipping'].apply(shipping_cost)
books.shipping_cost = pd.to_numeric(books.shipping_cost, errors='coerce')

# create new simpler column "shipping_time" that is number of days to arrival
def shipping_time(shipping_string):
    if 'days' in shipping_string:
        shipping_after = shipping_string.split(" | ")
        if 'days' in shipping_after[0]:
            shipping_time = shipping_after[0]
            shipping_time = shipping_time.replace('+ days', '')
        elif 'days' in shipping_after[1]:
            shipping_time = shipping_after[1]
            shipping_time = shipping_time.replace('+ days', '')
        else:
            shipping_time = None
    else:
        shipping_time = None
    return shipping_time
books['shipping_time'] = books['shipping'].apply(shipping_time)

# create new simpler column "shipping_location" that is just the state product is coming from
def shipping_location(shipping_string):
    if 'from' in shipping_string:
        shipping_after = shipping_string.split(" | ")
        if 'from' in shipping_after[0]:
            shipping_location = shipping_after[0]
            shipping_location = shipping_location.replace('from ', '')
        elif 'from' in shipping_after[1]:
            shipping_location = shipping_after[1]
            shipping_location = shipping_location.replace('from ', '')
        elif 'from' in shipping_after[2]:
            shipping_location = shipping_after[2]
            shipping_location = shipping_location.replace('from ', '')
        else:
            shipping_location = None
    else:
        shipping_location = None
    return shipping_location
books['shipping_location'] = books['shipping'].apply(shipping_location)

# total price: new column that is sum of price and shipping fee
books['total_price'] = books['price']+books['shipping_cost']

# Making free_shipping column a yes/no
def free_shipping(shipping_cost):
    if shipping_cost == 0:
        free_shipping = 1
    else:
        free_shipping = 0
    return free_shipping
books['free_shipping'] = books.shipping_cost.apply(free_shipping)

# Making brand_included feature thats just a yes or no
books['brand_included'] = books.brand.notnull()
zeroANDones = lambda x: x*1
books.brand_included = books.brand_included.apply(zeroANDones)

# New column sold_true
def sold_true(status):
    if status == 'SOLD':
        sold_true = 1
    else:
        sold_true = 0
    return sold_true
books['sold_true'] = books.sold.apply(sold_true)

# New column summing of description length
def description_length(desc_string):
    description_length = len(desc_string)
    return description_length
books['description_length'] = books.description.apply(description_length)

# New column condition_ordinal from new = 1, like new = 2, good = 3, fair = 4, poor = 5
def condition_ordinal(condition):
    if condition == "New":
        condition_ordinal = 1
    elif condition == "Like new":
        condition_ordinal = 2
    elif condition == "Good":
        condition_ordinal = 3
    elif condition == "Fair":
        condition_ordinal = 4
    elif condition == "Poor":
        condition_ordinal = 5
    else: condition_ordinal = None
    return condition_ordinal
books['condition_ordinal'] = books.condition.apply(condition_ordinal)

# New column days_since_posted. Was scraped on 2019-06-10 so range from 0 and up.
from datetime import date

def days_since_posted(posted):
    posted_day = date(posted.year, posted.month, posted.day)
    scrape_day = date(2019, 6, 10)
    days_since_posted = scrape_day - posted_day
    return days_since_posted.days
books['days_since_posted'] = books.posted.apply(days_since_posted)



In [116]:
books['title']


0                                            Book
1                    Book- When Bad Things Happen
2                         Usborne Wipe-Clean- NEW
3                               Jingle Bells Book
4                           My Sister The Vampire
5                  Usborne Wipe Clean Pen control
6                        Are You My Mother?  Book
7                            STAR WARS Color Book
8                           My forever dress book
9                        Book Disney photomosaics
10        The Watson's Go to Birmingham Hardcover
11           The Pomegranate Witch Hardcover Book
12                                      Baby Book
13                                     Kids Books
14              Disney's princess collection book
15                     Kids Harry Potter Cookbook
16                        Paw Patrol My Busy Book
17                                  usborne books
18                 Courage & Defiance (Paperback)
19        Shadow Theater Entertainment Package Gi


# STEP 2: Data Processing

In [144]:
brand = books['brand'][30]
title = books['title'][30]
description = books['description'][30]
megastring = (str(brand) + " " + str(title) + " " + str(description)).lower()
print(megastring)


if any(word in megastring for word in ['loo', 'books', 'bundle']): 
    print(1)
else:
    print(0)


nan pets on the loose series book 1 and 2 pets on the loose series book 1 and 2
1


In [153]:
# THIS WORKS!

def megastring(row):
  return (str(row['brand']) + " " + str(row['title']) + " " + str(row['description'])).lower()

books['megastring'] = books.apply(megastring, axis = 1)
books.megastring



In [160]:
def disney(megastring):
    if any(word in megastring for word in ['disney', 'pixar', 'frozen', 'moana', 'toy story']): 
        return(1)
    else:
        return(0)
books['disney'] = books.megastring.apply(disney)

def americangirl(megastring):
    if any(word in megastring for word in ['americangirl', 'american girl']): 
        return(1)
    else:
        return(0)
books['americangirl'] = books.megastring.apply(americangirl)

def seuss(megastring):
    if any(word in megastring for word in ['dr seuss', 'dr. seuss', 'drseuss', 'dr.seuss']): 
        return(1)
    else:
        return(0)
books['seuss'] = books.megastring.apply(seuss)

def lego(megastring):
    if any(word in megastring for word in ['lego']): 
        return(1)
    else:
        return(0)
books['lego'] = books.megastring.apply(lego)

def sesame(megastring):
    if any(word in megastring for word in ['sesame street', 'sesamestreet', 'elmo', 'cookie monster', 'big bird']): 
        return(1)
    else:
        return(0)
books['sesame'] = books.megastring.apply(sesame)

def leapfrog(megastring):
    if any(word in megastring for word in ['leap frog', 'leapfrog']): 
        return(1)
    else:
        return(0)
books['leapfrog'] = books.megastring.apply(leapfrog)

def minecraft(megastring):
    if any(word in megastring for word in ['minecraft', 'mine craft']): 
        return(1)
    else:
        return(0)
books['minecraft'] = books.megastring.apply(minecraft)

def marvel(megastring):
    if any(word in megastring for word in ['spider-man', 'spiderman', 'hulk', 'iron man', 'thor', 'captain america',
                                           'wolverine', 'black panther', 'doctor strange', 'captain marvel', 'black widow',
                                           'scarlet witch', 'hawkeye', 'daredevil', 'silver surfer', 'avengers', 'xmen',
                                           'x-men', 'deadpool']):
        return(1)
    else:
        return(0)
books['marvel'] = books.megastring.apply(marvel)
           
def ericcarle(megastring):
    if any(word in megastring for word in ['eric carle', 'hungry caterpillar', 'ericcarle']):
        return(1)
    else:
        return(0)
books['ericcarle'] = books.megastring.apply(ericcarle)
           
def nickelodean(megastring):
    if any(word in megastring for word in ['nickelodean']):
        return(1)
    else:
        return(0)
books['nickelodean'] = books.megastring.apply(nickelodean)

def starwars(megastring):
    if any(word in megastring for word in ['starwars', 'star wars', 'jedi', 'yoda', 'darth vader']):
        return(1)
    else:
        return(0)
books['starwars'] = books.megastring.apply(starwars)

def winniethepooh(megastring):
    if any(word in megastring for word in ['winnie-the-pooh', 'winnie the pooh', 'tigger', 'eeyore']):
        return(1)
    else:
        return(0)
books['winniethepooh'] = books.megastring.apply(winniethepooh)

def harrypotter(megastring):
    if any(word in megastring for word in ['harrypotter', 'harry potter', 'hogwarts', 'dumbledore', 'fanstastic beasts']):
        return(1)
    else:
        return(0)
books['harrypotter'] = books.megastring.apply(harrypotter)

def wimpykid(megastring):
    if any(word in megastring for word in ['diary of a wimpy kid', 'wimpy kid']):
        return(1)
    else:
        return(0)
books['wimpykid'] = books.megastring.apply(wimpykid)

def wimpykid(megastring):
    if any(word in megastring for word in ['diary of a wimpy kid', 'wimpy kid']):
        return(1)
    else:
        return(0)
books['wimpykid'] = books.megastring.apply(wimpykid)

def fisherprice(megastring):
    if any(word in megastring for word in ['fisherprice', 'fisher-price', 'fisher price']):
        return(1)
    else:
        return(0)
books['fisherprice'] = books.megastring.apply(fisherprice)

def scholastic(megastring):
    if any(word in megastring for word in ['scholastic']):
        return(1)
    else:
        return(0)
books['scholastic'] = books.megastring.apply(scholastic)

def bundle(megastring):
    if any(word in megastring for word in ['bundle', 'set of']):
        return(1)
    else:
        return(0)
books['bundle'] = books.megastring.apply(bundle)

def vintage(megastring):
    if any(word in megastring for word in ['vintage']):
        return(1)
    else:
        return(0)
books['vintage'] = books.megastring.apply(vintage)

def smokefree(megastring):
    if any(word in megastring for word in ['smokefree', 'smoke free', 'smoke-free', 'pet free', 'pet-free', 'petfree']):
        return(1)
    else:
        return(0)
books['smokefree'] = books.megastring.apply(smokefree)

def hardcover(megastring):
    if any(word in megastring for word in ['hard cover', 'hardcover', 'hard-cover']):
        return(1)
    else:
        return(0)
books['hardcover'] = books.megastring.apply(hardcover)







In [161]:
books.head()

Unnamed: 0,brand,category,condition,description,id,posted,price,seller_name,shipping,sold,...,starwars,winniethepooh,harrypotter,wimpykid,fisherprice,scholastic,bundle,vintage,smokefree,hardcover
0,,"Other,Books,Children's books",New,Free with another book purchase,84914972647,2017-07-10,3,C Sells,$4.00 | from Texas,available,...,0,0,0,0,0,0,0,0,0,0
1,,"Other,Books,Children's books",New,Book-When Bad Thinds Happen in God's Wonderful...,28411714700,2017-07-15,3,Investinyou,$4.00 | 3+ days | from Texas,available,...,0,0,0,0,0,0,0,0,0,0
2,,"Other,Books,Children's books",New,This listing is for one book. Please let me kn...,21933816748,2019-05-26,11,HWalters55,Free | from Washington,available,...,0,0,0,0,0,0,0,0,0,0
3,,"Other,Books,Children's books",New,Brand new. Never used. Ask me about bundling.,89750511304,2017-11-11,3,Bargain hauls,$4.00 | 3+ days | from Minnesota,available,...,0,0,0,0,0,0,0,0,0,0
4,,"Other,Books,Children's books",New,My Sister The Vampire Vampalicious. New softco...,496308655,2016-08-19,4,Dustyattic,Free | from Connecticut,available,...,0,0,0,0,0,0,0,0,0,0


In [124]:

import pandas as pd
stemmer = SnowballStemmer("english")
"".join(".Seus".lower().replace(".", "").split(" "))
 


In [125]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

In [113]:
'''
Preview a document after preprocessing
'''
document_num = 50
doc_sample = 'Would make a great bday gift for younger child or Easter basket filler Brand new. \n*Dr Seuss hard cover "oh the places you go"\n*Play,learn,color. Pull back cars and let them go. \n\nComes from a smoke free home.'

print("Original document: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['Would', 'make', 'a', 'great', 'bday', 'gift', 'for', 'younger', 'child', 'or', 'Easter', 'basket', 'filler', 'Brand', 'new.', '\n*Dr', 'Seuss', 'hard', 'cover', '"oh', 'the', 'places', 'you', 'go"\n*Play,learn,color.', 'Pull', 'back', 'cars', 'and', 'let', 'them', 'go.', '\n\nComes', 'from', 'a', 'smoke', 'free', 'home.']


Tokenized and lemmatized document: 
['great', 'bday', 'gift', 'younger', 'child', 'easter', 'basket', 'filler', 'brand', 'seuss', 'hard', 'cover', 'place', 'play', 'learn', 'color', 'pull', 'car', 'come', 'smoke', 'free', 'home']


In [112]:
books['description'][90]

'Would make a great bday gift for younger child or Easter basket filler Brand new. \n*Dr Seuss hard cover "oh the places you go"\n*Play,learn,color. Pull back cars and let them go. \n\nComes from a smoke free home.'

In [110]:
processed_docs = []

for doc in books['title']:
    processed_docs.append(preprocess(doc))

TypeError: decoding to str: need a bytes-like object, int found

In [109]:

'''
Preview 'processed_docs'
'''
print(processed_docs[:2])

[[], []]


# STEP 3: Bag of words on dataset

In [37]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears 
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [38]:
'''
Checking dictionary created
'''
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 addit
1 bodi
2 bricklin
3 bring
4 bumper
5 call
6 colleg
7 door
8 earli
9 engin
10 enlighten


In [39]:

'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents
'''
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)

In [40]:
'''
Checking dictionary created
'''
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 addit
1 bodi
2 bring
3 bumper
4 call
5 colleg
6 door
7 earli
8 engin
9 enlighten
10 histori


In [41]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [44]:
'''
Preview BOW for our sample preprocessed document
'''
document_num = 20
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))

Word 18 ("rest") appears 1 time.
Word 166 ("clear") appears 1 time.
Word 336 ("refer") appears 1 time.
Word 350 ("true") appears 1 time.
Word 391 ("technolog") appears 1 time.
Word 437 ("christian") appears 1 time.
Word 453 ("exampl") appears 1 time.
Word 476 ("jew") appears 1 time.
Word 480 ("lead") appears 1 time.
Word 482 ("littl") appears 3 time.
Word 520 ("wors") appears 2 time.
Word 721 ("keith") appears 3 time.
Word 732 ("punish") appears 1 time.
Word 803 ("california") appears 1 time.
Word 859 ("institut") appears 1 time.
Word 917 ("similar") appears 1 time.
Word 990 ("allan") appears 1 time.
Word 991 ("anti") appears 1 time.
Word 992 ("arriv") appears 1 time.
Word 993 ("austria") appears 1 time.
Word 994 ("caltech") appears 2 time.
Word 995 ("distinguish") appears 1 time.
Word 996 ("german") appears 1 time.
Word 997 ("germani") appears 3 time.
Word 998 ("hitler") appears 1 time.
Word 999 ("livesey") appears 2 time.
Word 1000 ("motto") appears 2 time.
Word 1001 ("order") appear

# STEP 4: Running LDA using Bag of Words

In [45]:
# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus, 
#                                    num_topics = 10, 
#                                    id2word = dictionary,                                    
#                                    passes = 50)

# LDA multicore 
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 8, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [46]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")


Topic: 0 
Words: 0.007*"presid" + 0.005*"clinton" + 0.004*"homosexu" + 0.004*"netcom" + 0.004*"virginia" + 0.004*"bike" + 0.004*"run" + 0.003*"pitch" + 0.003*"talk" + 0.003*"consid"


Topic: 1 
Words: 0.009*"govern" + 0.007*"armenian" + 0.006*"israel" + 0.005*"kill" + 0.005*"isra" + 0.004*"american" + 0.004*"turkish" + 0.004*"weapon" + 0.004*"jew" + 0.004*"countri"


Topic: 2 
Words: 0.017*"game" + 0.015*"team" + 0.011*"play" + 0.009*"player" + 0.008*"hockey" + 0.006*"season" + 0.005*"leagu" + 0.005*"canada" + 0.005*"score" + 0.004*"andrew"


Topic: 3 
Words: 0.012*"window" + 0.011*"card" + 0.008*"drive" + 0.007*"driver" + 0.006*"sale" + 0.005*"control" + 0.005*"scsi" + 0.005*"disk" + 0.005*"speed" + 0.005*"price"


Topic: 4 
Words: 0.013*"file" + 0.009*"program" + 0.007*"window" + 0.006*"encrypt" + 0.006*"chip" + 0.006*"imag" + 0.006*"data" + 0.006*"avail" + 0.005*"code" + 0.004*"version"


Topic: 5 
Words: 0.012*"space" + 0.009*"nasa" + 0.006*"scienc" + 0.005*"orbit" + 0.004*"researc

### List of Brand Categories on Children's book site that we can see if appear in Brand, Title, or Description Categories.

Scholastic
Disney
American Girl
Vintage
Dr. Seuss
HarperCollins Publishers
Random House
Sesame Street
Leap Frog
ABRAMS
Osborne
Minecraft
Marvel
Lot
Eric Carle
Aladdin
Nickelodeon
Hallmark
LEGO
Custom Variety Pack
Star Wars
Winnie The Pooh
Fisher-Price
DK Publishing
National Geographic
Disney Princess
MacMillan Childrens
Harry Potter
Penguin
Disney Pixar

### My Ideas for two sets of categories


brand =

