In [None]:
#### General Libraries

import pandas as pd
import numpy as np
import time

import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger("lda").setLevel(logging.WARNING)

#### Libraries for plotting graphs

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
#from bokeh.transform import factor_cmap

### Libraries for handling text

import string, re
import nltk
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

In [None]:
# Load the tab-separated train and test sets.
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

# Report the dimensions of each frame.
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print("Train has %d rows and %d columns." % (train_rows, train_cols))
print("Test has %d rows and %d columns." % (test_rows, test_cols))

### Relationship of price with Item Description

Item description has 4 null values. 82489 rows have "No description yet". 
The item description has a mean length of 13 words, with the longest description running to 130 words.
There isn't a lot of difference between price based on descriptions

Removing stop words from a sentence takes the emotion out of it. Since this is the description of an item, only the words themselves matter.

In [None]:
##### Cleaning and Standardizing text

# Characters that wordClean replaces with a space: ASCII punctuation, digits,
# carriage return, tab and newline. Built once here rather than on every call.
_WORDCLEAN_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def wordClean(text):
    """Lower-case `text` and blank out punctuation, digits and CR/tab/newline.

    Parameters
    ----------
    text : object
        The raw text. Anything that is not a `str` (for example a missing
        value stored as NaN or None) is treated as an empty text.

    Returns
    -------
    str
        The lower-cased text with every matched character replaced by a
        single space (so runs of spaces can remain), or "" for non-string
        input.
    """
    # An explicit type check replaces the former bare `except:`, which also
    # swallowed unrelated errors (even KeyboardInterrupt) and returned "".
    if not isinstance(text, str):
        return ""
    return _WORDCLEAN_RE.sub(" ", text.lower())
    
def wordCount(text):
    """Count the informative words in a cleaned text.

    The text is split on single spaces; a piece is counted when it is not in
    `stop_words.ENGLISH_STOP_WORDS` and is longer than three characters.

    Parameters
    ----------
    text : object
        The text to count words in. Non-string input counts as zero words.

    Returns
    -------
    int
        Number of pieces that pass both filters; 0 for non-string input.
    """
    # The original had a stray `J` after `except:` (a syntax error) and a bare
    # `except:`; an explicit type check covers the non-string case instead.
    if not isinstance(text, str):
        return 0
    return sum(
        1
        for w in text.split(" ")
        if w not in stop_words.ENGLISH_STOP_WORDS and len(w) > 3
    )

In [None]:
# Normalise each description, then count its informative words.
train['clean_text'] = train['item_description'].apply(wordClean)
train['description_length'] = train['clean_text'].apply(wordCount)

# Summary statistics of the word counts (shown as the cell output).
train['description_length'].describe()

In [None]:
# Mean price for every description length.
df = train.groupby('description_length')['price'].mean().reset_index()

# The curve is log(mean price + 1): the log is applied to the already-averaged
# price. The labels below say exactly that; they previously read
# "Average Log(Price)", which is a different quantity (the mean of the logs).
data = [
    go.Scatter(
        x=df['description_length'],
        y=np.log1p(df['price']),
        mode='lines+markers',
        name='lines+markers',
    )
]
layout = dict(
    title='Log(Average Price + 1) by Description Length',
    yaxis=dict(title='Log(Average Price + 1)'),
    xaxis=dict(title='Description Length'),
)
fig = dict(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Treat the placeholder text as an empty description.
is_placeholder = train['clean_text'] == "no description yet"
train.loc[is_placeholder, 'clean_text'] = ""

# Keep the words that are not English stop words and are longer than three
# characters, re-joined with single spaces.
train['clean_stop_words'] = train['clean_text'].apply(
    lambda x: " ".join(
        [w for w in x.split(" ") if w not in stop_words.ENGLISH_STOP_WORDS and len(w) > 3]
    )
)

In [None]:
def category_split(x):
    """Split a "/"-separated category path into its levels.

    Parameters
    ----------
    x : object
        A category string such as "A/B/C". Anything that is not a `str`
        (for example a missing value) is treated as having no category.

    Returns
    -------
    list of str or tuple of str
        `x.split("/")` for a string (one element per level, however many
        there are), or the placeholder ("NA", "NA", "NA") otherwise.
    """
    # An explicit type check replaces the former bare `except:`, which hid
    # every error rather than just the "not a string" case.
    if not isinstance(x, str):
        return ("NA", "NA", "NA")
    return x.split("/")
# Split every category path, then transpose the per-row levels into columns.
category_levels = train['category_name'].apply(category_split)
train['Cat_1'], train['Cat_2'], train['Cat_3'] = zip(*category_levels)

In [None]:


general_categories = train['Cat_1'].unique()

# Tokenised description words, keyed by top-level category.
cat_desc = {}
for cat in general_categories:
    in_category = train['Cat_1'] == cat
    text = " ".join(train.loc[in_category, 'clean_stop_words'].values)
    cat_desc[cat] = nltk.word_tokenize(text)

# Pool the tokens of every category into one list and count them.
flat_list = [item for sublist in cat_desc.values() for item in sublist]
allWordsCount = Counter(flat_list)

# NOTE: despite its name, `all_top10` holds the 20 most frequent words.
all_top10 = allWordsCount.most_common(20)
x = [word for word, _ in all_top10]
y = [count for _, count in all_top10]

layout = dict(
    title='Word Frequency',
    yaxis=dict(title='Count'),
    xaxis=dict(title='Word'),
)
data = [go.Bar(x=x, y=y)]
fig = dict(data=data, layout=layout)
py.iplot(fig)

In [None]:
# NLTK's English stop-word list, held as a set for fast membership tests.
stop = set(stopwords.words('english'))


def tokenize(text):
    """Turn a raw description into a list of filtered word tokens.

    The text is cleaned with `wordClean` and split with `nltk.word_tokenize`.
    A token is kept when it is not in `stop`, contains at least one ASCII
    letter, and is at least three characters long.

    Parameters
    ----------
    text : object
        The raw description.

    Returns
    -------
    list of str
        The surviving tokens. Always a list: if a TypeError is raised the
        input and error are printed and an empty list is returned.
    """
    try:
        text = wordClean(text)
        return [
            w
            for w in nltk.word_tokenize(text)
            if w not in stop and re.search('[a-zA-Z]', w) and len(w) >= 3
        ]
    except TypeError as e:
        # Report the problem but still hand back a list; the original fell
        # through and returned None, which breaks callers that iterate tokens.
        print(text, e)
        return []

In [None]:
# Tokenise every raw description; each cell of 'tokens' holds that row's result.
train['tokens'] = train['item_description'].apply(tokenize)

In [None]:
# Show the first few descriptions next to the tokens extracted from them.
preview = train.head()
for description, tokens in zip(preview['item_description'], preview['tokens']):
    print('description:', description)
    print('tokens:', tokens)
    print()

In [None]:
# Rebuild the per-category token lists, this time by running the joined
# 'clean_text' of each category through `tokenize`.
cat_desc = {}
for cat in general_categories:
    in_category = train['Cat_1'] == cat
    text = " ".join(train.loc[in_category, 'clean_text'].values)
    cat_desc[cat] = tokenize(text)

# The 100 most frequent tokens for four selected categories.
women100, beauty100, kids100, electronics100 = (
    Counter(cat_desc[name]).most_common(100)
    for name in ('Women', 'Beauty', 'Kids', 'Electronics')
)

In [None]:
def generate_wordcloud(tup):
    """Build a word cloud whose word sizes reflect the given counts.

    Parameters
    ----------
    tup : iterable of (word, count) pairs, mapping, or str
        Typically the output of `Counter.most_common`. A plain string is
        still accepted and is treated as free text.

    Returns
    -------
    WordCloud
        The generated cloud (white background, at most 50 words, font size
        capped at 40, fixed random_state of 42).
    """
    cloud = WordCloud(background_color='white', max_words=50, max_font_size=40,
                      random_state=42)
    if isinstance(tup, str):
        return cloud.generate(tup)
    # The original passed str(tup) to generate(), i.e. the printed form of the
    # list; the counts inside it were never used as weights. Passing the
    # word -> count mapping lets the counts drive the word sizes.
    return cloud.generate_from_frequencies(dict(tup))

fig, axes = plt.subplots(2, 2, figsize=(30, 15))

# (title, top-word list) for each panel, in row-major order of the grid.
panels = [
    ("Women Top 100", women100),
    ("Beauty Top 100", beauty100),
    ("Kids Top 100", kids100),
    ("Electronic Top 100", electronics100),
]

# One loop replaces four copy-pasted stanzas. Every panel now uses bilinear
# interpolation; previously only the first one did.
for ax, (title, top_words) in zip(axes.flat, panels):
    ax.imshow(generate_wordcloud(top_words), interpolation="bilinear")
    ax.axis('off')
    ax.set_title(title, fontsize=30)

### TF-IDF (Term Frequency-Inverse Document Frequency)

TfidfVectorizer takes in input df, tokenizer, analyser, stop_words, max_features, ngram_range

For the given tokenizer
1. ngram_range is selecting unigrams and bigrams
2. min_df makes the vectorizer ignore tokens that appear in fewer than 10 documents (a cut-off commonly used in the literature).
3. max_features is the number of tokens that is going to be used to create a vocabulary based on token frequency
4. tokenizer is a custom preprocessing function that overrides the default string tokenization step
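5. decode_error='ignore' makes the vectorizer skip characters that cannot be decoded when it is given byte input; it does not handle empty or missing descriptions, which are replaced with empty strings separately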

the output of the vectorizer is a matrix with each column as a feature(token) and its value for each row(item description)

Once the vectorization is done, we can map the tokens to their tfidf values. Terms with low tfidf values will be generic terms that cannot be used to distinguish items, whereas terms with high tfidf values should be able to distinguish items and categories.

In [None]:
# Tf-idf over unigrams and bigrams, using the notebook's own `tokenize`.
vectorizer = TfidfVectorizer(
    min_df=10,
    max_features=180000,
    tokenizer=tokenize,
    ngram_range=(1, 2),
    decode_error='ignore',
)

In [None]:
# Replace missing descriptions with "" so every document is a string.
# `test` is filled as well, because its descriptions are vectorized together
# with those of `train`; previously only `train` was handled.
train['item_description'] = train['item_description'].fillna("")
test['item_description'] = test['item_description'].fillna("")

In [None]:
# Fit the vectorizer on the train and test descriptions together.
all_desc = np.concatenate(
    [train['item_description'].values, test['item_description'].values]
)
vz = vectorizer.fit_transform(list(all_desc))

## mapping the tokens to idf values
# NOTE: the column is labelled 'tfidf' but holds each token's fitted IDF weight.
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame.from_dict(tfidf, orient='index')
tfidf.columns = ['tfidf']

# Ten tokens with the largest weight (shown as the cell output).
tfidf.sort_values(by=['tfidf'], ascending=False).head(10)

In [None]:
## mapping the tokens to idf values
# One row per token, indexed by the token, with its fitted IDF weight in the
# column labelled 'tfidf'.
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame.from_dict(tfidf, orient='index')
tfidf.columns = ['tfidf']

In [None]:
# Ten tokens with the largest weight in the 'tfidf' column.
tfidf.sort_values('tfidf', ascending=False).head(10)

### Dimensionality Reduction

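SVD (Singular Value Decomposition) is closely related to PCA; sklearn's TruncatedSVD does not center the data, which lets it work efficiently on sparse matrices such as a tf-idf matrix. Using it to reduce the features from 4232 to 30 means t-SNE can then be applied to the already reduced data.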

The t-SNE cost function is not convex, which means that different initializations lead to different results. It is also computationally expensive, so reducing high-dimensional matrices with SVD (for sparse data) or PCA (for dense data) is recommended before using t-SNE.

for t-SNE
1. n_components is the dimensionality of the embedded space
2. perplexity
3. learning_rate
4. n_iter - for optimization purposes
5. init - the initial matrix can be defined at the start



In [None]:
# Tag each row with its origin before the two sets are combined.
trn = train.copy()
tst = test.copy()
trn['is_train'] = 1
tst['is_train'] = 0

sample_sz = 15000

# Seeded so that the same rows are drawn on every run; the SVD and t-SNE steps
# below were already seeded, but the sample feeding them was not.
combined_sample = pd.concat([trn, tst]).sample(n=sample_sz, random_state=42)

# NOTE: this re-fits `vectorizer` on the sample only, replacing whatever
# vocabulary it held before. Missing descriptions are passed as "" so that
# every document is a string.
vz_sample = vectorizer.fit_transform(list(combined_sample['item_description'].fillna("")))
print(vz_sample.shape)

# Reduce the tf-idf matrix to 30 components before running t-SNE on it.
svd = TruncatedSVD(n_components=30, random_state=42)
svd_tfidf = svd.fit_transform(vz_sample)
print(svd_tfidf.shape)

# Two-dimensional embedding of the reduced matrix.
tsne_model = TSNE(n_components=2, verbose=1, random_state=42, n_iter=500)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

In [None]:
# Row i of `tsne_tfidf` was computed from row i (by position) of
# `combined_sample`, so the extra columns must be attached by position.
# `combined_sample` keeps the index labels of the rows that were sampled, while
# `tfidf_df` gets a fresh 0..n-1 index; assigning the Series directly would
# align on those labels and pair points with the wrong rows (or with NaN).
# `.values` drops the index and keeps the positional order.
tfidf_df = pd.DataFrame(tsne_tfidf, columns=['x', 'y'])
tfidf_df['description'] = combined_sample['item_description'].values
tfidf_df['tokens'] = combined_sample['tokens'].values
tfidf_df['category'] = combined_sample['Cat_1'].values

In [None]:
#output_notebook()
#plot_tfidf = bp.figure(plot_width=700, plot_height=600,
#                       title="tf-idf clustering of the item description",
#    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
#    x_axis_type=None, y_axis_type=None, min_border=1)

#plot_tfidf.scatter(x='x', y='y', source=tfidf_df, alpha=0.7)
#hover = plot_tfidf.select(dict(type=HoverTool))
#hover.tooltips={"description": "@description", "tokens": "@tokens", "category":"@category"}
#show(plot_tfidf)

# Keep only the points that have a category value.
tf1 = tfidf_df.dropna(subset=['category'])

In [None]:
import random

# One random hex colour per top-level category. A dedicated, seeded generator
# keeps the colours identical from run to run (they were unseeded before).
rng = random.Random(42)
color = [
    "#" + "".join(rng.choice("0123456789ABCDEF") for _ in range(6))
    for _ in range(len(general_categories))
]

# Look up each point's colour from its category.
category_colors = dict(zip(general_categories, color))
cols = tf1["category"].map(category_colors)

data = [
    go.Scatter(
        x=tf1['x'],
        y=tf1['y'],
        mode='markers',
        marker=dict(size=15, color=cols),
        name='markers',
    )
]
layout = dict(
    title='T-SNE Plot',
    yaxis=dict(title='Y-axis'),
    xaxis=dict(title='X-axis'),
)
fig = dict(data=data, layout=layout)
py.iplot(fig)