In [1]:
import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Ridge

In [2]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

In [3]:
df = pd.read_csv('./data/sample-data.csv')

In [4]:
df.head(5)

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [5]:
tags = ['</li>','<li>', '</ul>', '<ul>', '</b>', '<br>', '</b>', '<b>']
def replace_tags(text, tags):
    for tag in tags:
        text = text.replace(tag, '')
    return text    

In [6]:
df['clean_description'] = df['description'].apply(lambda x: replace_tags(x, tags))

In [7]:
print(df.iloc[0,2])

Active classic boxers - There's a reason why our boxers are a cult favorite - they keep their cool, especially in sticky situations. The quick-drying, lightweight underwear takes up minimal space in a travel pack. An exposed, brushed waistband offers next-to-skin softness, five-panel construction with a traditional boxer back for a classic fit, and a functional fly. Made of 3.7-oz 100% recycled polyester with moisture-wicking performance. Inseam (size M) is 4 1/2". Recyclable through the Common Threads Recycling Program.Details: "Silky Capilene 1 fabric is ultralight, breathable and quick-to-dry" "Exposed, brushed elastic waistband for comfort" 5-panel construction with traditional boxer back "Inseam (size M) is 4 1/2"""Fabric: 3.7-oz 100% all-recycled polyester with Gladiodor natural odor control for the garment. Recyclable through the Common Threads Recycling ProgramWeight: 99 g (3.5 oz)Made in Mexico.


In [8]:
# Remove punctuation and lower case all doc 
# We use Regular Expression. More info here --> https://stackoverflow.com/questions/20731966/regex-remove-all-special-characters-except-numbers
df['clean_description'] = df['clean_description'].str.replace(r"[^A-Za-z ]+", " ") # 1
df['clean_description'] = df['clean_description'].fillna('').apply(lambda x: x.lower())
df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,description,clean_description
0,1,Active classic boxers - There's a reason why o...,active classic boxers there s a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...,active sport boxer briefs skinning up glory ...
2,3,Active sport briefs - These superbreathable no...,active sport briefs these superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc...",alpine guide pants skin in climb ice switc...
4,5,"Alpine wind jkt - On high ridges, steep ice an...",alpine wind jkt on high ridges steep ice an...


In [9]:
print(df.iloc[0,2])

active classic boxers   there s a reason why our boxers are a cult favorite   they keep their cool  especially in sticky situations  the quick drying  lightweight underwear takes up minimal space in a travel pack  an exposed  brushed waistband offers next to skin softness  five panel construction with a traditional boxer back for a classic fit  and a functional fly  made of  oz   recycled polyester with moisture wicking performance  inseam  size m  is     recyclable through the common threads recycling program details   silky capilene   fabric is ultralight  breathable and quick to dry   exposed  brushed elastic waistband for comfort   panel construction with traditional boxer back  inseam  size m  is    fabric   oz   all recycled polyester with gladiodor natural odor control for the garment  recyclable through the common threads recycling programweight    g   oz made in mexico 


In [10]:
## Import stop words from spacy 
from spacy.lang.en.stop_words import STOP_WORDS

## Tokenize the cleaned document
tokenized_doc = df['clean_description'].fillna('').apply(lambda x: nlp(x))

# remove stop-words
tokenized_doc = tokenized_doc.apply(lambda x: [token.lemma_ for token in x if token.text not in STOP_WORDS])
tokenized_doc

0      [active, classic, boxer,   , s, reason, boxer,...
1      [active, sport, boxer, brief,   , skin, glory,...
2      [active, sport, brief,   , superbreathable, fl...
3      [alpine, guide, pant,   , skin,  , climb, ice,...
4      [alpine, wind, jkt,   , high, ridge,  , steep,...
                             ...                        
495    [cap,   , bottom,   , cut, loose, madden, crow...
496    [cap,   , crew,   , crew, take, edge, fickle, ...
497    [time, shell,   , need, use, morning, time, um...
498    [wear, cargo, short,   , wear, cargo, short, b...
499    [wear, short,   , time, simplify,  , wear, sho...
Name: clean_description, Length: 500, dtype: object

In [11]:
df['tokenized_desc'] =tokenized_doc
df.head()

Unnamed: 0,id,description,clean_description,tokenized_desc
0,1,Active classic boxers - There's a reason why o...,active classic boxers there s a reason why o...,"[active, classic, boxer, , s, reason, boxer,..."
1,2,Active sport boxer briefs - Skinning up Glory ...,active sport boxer briefs skinning up glory ...,"[active, sport, boxer, brief, , skin, glory,..."
2,3,Active sport briefs - These superbreathable no...,active sport briefs these superbreathable no...,"[active, sport, brief, , superbreathable, fl..."
3,4,"Alpine guide pants - Skin in, climb ice, switc...",alpine guide pants skin in climb ice switc...,"[alpine, guide, pant, , skin, , climb, ice,..."
4,5,"Alpine wind jkt - On high ridges, steep ice an...",alpine wind jkt on high ridges steep ice an...,"[alpine, wind, jkt, , high, ridge, , steep,..."


In [12]:
list_token = ['active', 'classic', 'boxer', '']
' '.join(list_token)


'active classic boxer '

In [13]:
df["clean_token"] = df['tokenized_desc'].apply(lambda x: " ".join(x))
df.head()

Unnamed: 0,id,description,clean_description,tokenized_desc,clean_token
0,1,Active classic boxers - There's a reason why o...,active classic boxers there s a reason why o...,"[active, classic, boxer, , s, reason, boxer,...",active classic boxer s reason boxer cult fa...
1,2,Active sport boxer briefs - Skinning up Glory ...,active sport boxer briefs skinning up glory ...,"[active, sport, boxer, brief, , skin, glory,...",active sport boxer brief skin glory require...
2,3,Active sport briefs - These superbreathable no...,active sport briefs these superbreathable no...,"[active, sport, brief, , superbreathable, fl...",active sport brief superbreathable fly brie...
3,4,"Alpine guide pants - Skin in, climb ice, switc...",alpine guide pants skin in climb ice switc...,"[alpine, guide, pant, , skin, , climb, ice,...",alpine guide pant skin climb ice switch...
4,5,"Alpine wind jkt - On high ridges, steep ice an...",alpine wind jkt on high ridges steep ice an...,"[alpine, wind, jkt, , high, ridge, , steep,...",alpine wind jkt high ridge steep ice alpi...


In [14]:
# Tfidf transformer 
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vector
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['clean_token'])
dense = X.toarray()
dense

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# Create a tf-idf matrix for each token and each document
matrix = pd.DataFrame(dense, 
             columns=[x for x in vectorizer.get_feature_names()])
matrix.head(5)

Unnamed: 0,abandon,ability,able,abrasion,abrasive,abroad,absolute,absorb,absorption,abstract,...,zinger,zip,zipped,zipper,zippered,zippers,zippersfabric,zipping,zips,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.241148,0.068665,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.14115,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
def max_tf_idf(line):
    return np.argmax(line)#matrix.iloc[1, :]

In [18]:
matrix.shape[0]

500

In [19]:
list_max = list()
list_rec = list() # list of words with max tf_idf
for i in range(matrix.shape[0]):
    col_idx = np.argmax(matrix.iloc[i, :]) # index of column with max value
    list_max.append(matrix.iloc[i, col_idx])
    list_rec.append(matrix.columns[col_idx])
#    list_max.append(matrix.columns())
matrix['max_tf_idf'] = list_max
matrix['most_important'] = list_rec

In [20]:
matrix.head(10)

Unnamed: 0,abandon,ability,able,abrasion,abrasive,abroad,absolute,absorb,absorption,abstract,...,zipped,zipper,zippered,zippers,zippersfabric,zipping,zips,zone,max_tf_idf,most_important
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.535294,boxer
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.355194,boxer
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.298029,mesh
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.241148,0.068665,0.0,0.0,0.0,0.0,0.0,0.259225,resistant
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.14115,0.0,0.0,0.0,0.0,0.0,0.0,0.19994,panel
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.19455,0.0,0.0,0.0,0.0,0.0,0.0,0.222729,way
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.212984,0.050538,0.0,0.0,0.0,0.0,0.0,0.32346,atom
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.475452,betina
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320686,microdeni
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.288101,supplex


In [21]:
df.iloc[9,1]

'Baby sun bucket hat - This hat goes on when the sun rises above the horizon, and stays on when raindrops start falling. Its made from an ultra-durable 4-ply, 4.2-oz Supplex nylon fabric with a DWR (durable water repellent) finish, and reverses to either a contrasting solid color or print. A soft tuck-away chin strap with a hook-and-loop fastener holds securely without chafing young jawlines. Packs small; easy care.<br><br><b>Details:</b><ul> <li>"Lightweight Supplex nylon is soft, dries fast and packs small; easy care"</li> <li>Brim shields head and neck</li> <li>Chin strap with hook-and-loop fastener can be tucked away when not in use</li> <li>Reversible style with fun print/solid combinations</li> <li>One colorway has print on both sides</li></ul><br><br><b>Fabric: </b>"4-ply, 4.2-oz Supplex nylon with a DWR (durable water repellent) finish"<br><br><b>Weight: </b>(49 g 1.7 oz)<br><br>Made in China.'

In [22]:
# TruncatedSVD from sklearn
from sklearn.decomposition import TruncatedSVD

In [23]:
# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=15, algorithm='randomized', n_iter=100, random_state=122)
lsa = svd_model.fit_transform(X)

col_names = ['topic_{}'.format(i) for i in range(15)]

topic_encoded_df = pd.DataFrame(lsa, columns = col_names)
topic_encoded_df["documents"] = df['clean_token']
topic_encoded_df

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,documents
0,0.279251,-0.044662,0.192870,0.098602,-0.120932,-0.017531,-0.075669,0.000285,-0.142784,-0.074576,-0.015033,0.018718,-0.068663,0.020519,0.017502,active classic boxer s reason boxer cult fa...
1,0.301679,-0.064508,0.105576,0.113509,-0.034945,-0.133946,-0.002684,-0.020225,-0.089721,-0.117994,-0.038437,0.093139,-0.107680,0.042856,-0.065141,active sport boxer brief skin glory require...
2,0.288637,-0.079838,0.112713,0.028648,-0.042203,-0.092905,0.098687,0.032590,-0.066038,-0.048770,-0.028918,0.146078,-0.102439,0.057251,-0.076734,active sport brief superbreathable fly brie...
3,0.419469,-0.256505,-0.126017,-0.152662,0.043997,0.010301,-0.077000,-0.097040,0.015971,-0.144288,-0.012095,-0.041352,-0.154103,-0.098854,-0.043628,alpine guide pant skin climb ice switch...
4,0.406511,-0.263648,-0.027142,-0.248345,-0.033898,0.071667,0.020179,0.011237,0.101403,-0.156119,0.000604,-0.118697,-0.258172,0.221830,0.075223,alpine wind jkt high ridge steep ice alpi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.342277,-0.072815,0.393977,0.082310,-0.250618,0.012794,-0.151201,-0.058888,-0.304413,-0.111252,-0.050656,-0.029421,-0.037617,0.009217,-0.038008,cap bottom cut loose madden crowd search...
496,0.382089,-0.072973,0.493851,-0.000616,-0.298972,0.048773,-0.072183,0.018664,-0.256497,-0.003314,-0.041687,-0.140569,-0.005351,0.098169,-0.014710,cap crew crew take edge fickle weather ...
497,0.333652,-0.210800,-0.053722,-0.238009,0.000623,0.199455,-0.007720,-0.178304,0.050147,0.064159,0.032631,-0.000012,0.125492,0.002028,0.021796,time shell need use morning time umbrella ...
498,0.333107,0.078017,-0.134258,0.344820,0.039499,0.076239,-0.170294,0.080356,-0.048693,0.038876,-0.015405,-0.032500,0.115552,0.141984,-0.025911,wear cargo short wear cargo short bask glor...


In [24]:
def get_most_important(matrix):
    list_max = list()
    list_rec = list() # list of words with max tf_idf
    for i in range(matrix.shape[0]):
        col_idx = np.argmax(matrix.iloc[i, :]) # index of column with max value
        list_max.append(matrix.iloc[i, col_idx])
        list_rec.append(matrix.columns[col_idx])
    #    list_max.append(matrix.columns())
    return list_max, list_rec

In [25]:
list_max, list_rec = get_most_important(topic_encoded_df.iloc[:,:-2])
topic_encoded_df['max_tf_idf'] = list_max
topic_encoded_df['most_important'] = list_rec
topic_encoded_df

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,documents,max_tf_idf,most_important
0,0.279251,-0.044662,0.192870,0.098602,-0.120932,-0.017531,-0.075669,0.000285,-0.142784,-0.074576,-0.015033,0.018718,-0.068663,0.020519,0.017502,active classic boxer s reason boxer cult fa...,0.279251,topic_0
1,0.301679,-0.064508,0.105576,0.113509,-0.034945,-0.133946,-0.002684,-0.020225,-0.089721,-0.117994,-0.038437,0.093139,-0.107680,0.042856,-0.065141,active sport boxer brief skin glory require...,0.301679,topic_0
2,0.288637,-0.079838,0.112713,0.028648,-0.042203,-0.092905,0.098687,0.032590,-0.066038,-0.048770,-0.028918,0.146078,-0.102439,0.057251,-0.076734,active sport brief superbreathable fly brie...,0.288637,topic_0
3,0.419469,-0.256505,-0.126017,-0.152662,0.043997,0.010301,-0.077000,-0.097040,0.015971,-0.144288,-0.012095,-0.041352,-0.154103,-0.098854,-0.043628,alpine guide pant skin climb ice switch...,0.419469,topic_0
4,0.406511,-0.263648,-0.027142,-0.248345,-0.033898,0.071667,0.020179,0.011237,0.101403,-0.156119,0.000604,-0.118697,-0.258172,0.221830,0.075223,alpine wind jkt high ridge steep ice alpi...,0.406511,topic_0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.342277,-0.072815,0.393977,0.082310,-0.250618,0.012794,-0.151201,-0.058888,-0.304413,-0.111252,-0.050656,-0.029421,-0.037617,0.009217,-0.038008,cap bottom cut loose madden crowd search...,0.393977,topic_2
496,0.382089,-0.072973,0.493851,-0.000616,-0.298972,0.048773,-0.072183,0.018664,-0.256497,-0.003314,-0.041687,-0.140569,-0.005351,0.098169,-0.014710,cap crew crew take edge fickle weather ...,0.493851,topic_2
497,0.333652,-0.210800,-0.053722,-0.238009,0.000623,0.199455,-0.007720,-0.178304,0.050147,0.064159,0.032631,-0.000012,0.125492,0.002028,0.021796,time shell need use morning time umbrella ...,0.333652,topic_0
498,0.333107,0.078017,-0.134258,0.344820,0.039499,0.076239,-0.170294,0.080356,-0.048693,0.038876,-0.015405,-0.032500,0.115552,0.141984,-0.025911,wear cargo short wear cargo short bask glor...,0.344820,topic_3


In [26]:
topic_encoded_df['most_important'].value_counts()

topic_0     273
topic_1      61
topic_6      40
topic_2      36
topic_7      27
topic_3      17
topic_11     13
topic_10      8
topic_8       8
topic_4       8
topic_13      6
topic_5       2
topic_12      1
Name: most_important, dtype: int64