### Importing Libraries

In [None]:
#### General Libraries

import pandas as pd
import numpy as np
import time

import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger("lda").setLevel(logging.WARNING)

#### Libraries for plotting graphs

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline

#import bokeh.plotting as bp
#from bokeh.models import HoverTool, BoxSelectTool
#from bokeh.models import ColumnDataSource
#from bokeh.plotting import figure, show, output_notebook
#from bokeh.transform import factor_cmap

### Libraries for handling text

import string, re
import nltk
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from collections import Counter
#from wordcloud import WordCloud
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.decomposition import LatentDirichletAllocation
#from sklearn.decomposition import TruncatedSVD
#from sklearn.manifold import TSNE

In [None]:
# Load the tab-separated training and test sets.
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

# DataFrame.shape is a (rows, columns) tuple, so it fills both %d slots directly.
print("Train has %d rows and %d columns." % train.shape)
print("Test has %d rows and %d columns." % test.shape)

Independent Variables - 'name', 'item_condition_id', 'category_name', 'brand_name','shipping', 'item_description'
Dependent Variables - 'price'

##### Categorical Variables (number of unique values)

1. item_condition_id | 5
2. brand_name | 4809
3. category_name | 1287
4. shipping | 2

##### Descriptive Variables

1. name
2. item_description

##### Dependent Variable
1. Continuous variable
2. Mean ≈ 26.7
3. Range between 0 and 2009
4. The variable is right-skewed (long tail of high prices)

In [None]:
# Show the first two rows of the training set.
train.iloc[:2]

In [None]:
# Number of missing values in each column (isna is the canonical name for isnull).
train.isna().sum()

### Target Variable Distribution

In [None]:
# Summary statistics of the target column.
train.loc[:, 'price'].describe()

In [None]:
# Side-by-side histograms of the target: raw price (left) and log(price + 1) (right).
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(20, 10))

# Left panel: only prices inside [0, 250] are binned.
train['price'].plot.hist(bins=50, range=[0, 250], ax=ax_raw)
ax_raw.set_xlabel('price', fontsize=17)
ax_raw.set_ylabel('frequency', fontsize=17)
ax_raw.tick_params(labelsize=15)
ax_raw.set_title('Price Distribution - Training Set', fontsize=17)

# Right panel: the +1 keeps the logarithm finite for items priced at 0.
np.log(train['price'] + 1).plot.hist(bins=50, ax=ax_log)
ax_log.set_xlabel('log(price+1)', fontsize=17)
ax_log.set_ylabel('frequency', fontsize=17)
ax_log.tick_params(labelsize=15)
ax_log.set_title('Log(Price) Distribution - Training Set', fontsize=17)

plt.show()

### Target Variable relation with shipping

The `shipping` variable is 1 when the seller pays the shipping cost and 0 when the buyer pays it. The average price of products where the seller pays for shipping is 22.57, and the average price of products where the buyer pays for shipping is 30.11, which makes sense since the shipping cost is being compensated in the cost of those products.

In [None]:
# Percentage of all training rows for each shipping flag.
train['shipping'].value_counts().div(train.shape[0]).mul(100)

In [None]:
# Prices split by who pays for shipping (1 = seller, 0 = buyer).
ship1 = train.loc[train['shipping'] == 1, 'price']
ship0 = train.loc[train['shipping'] == 0, 'price']

print("Average price when seller pays for shipping is %f" % (ship1.mean(),))
print("Average price when buyer pays for shipping is %f" % (ship0.mean(),))

In [None]:
# Overlaid histograms of log(price + 1), one per shipping type.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(np.log(ship1 + 1), bins=50, alpha=1.0, label='Price when Seller pays Shipping')
ax.hist(np.log(ship0 + 1), bins=50, alpha=0.7, label='Price when Buyer pays Shipping')

ax.set_xlabel('log(price+1)', fontsize=17)
ax.set_ylabel('frequency', fontsize=17)
ax.set_title('Price Distribution by Shipping Type', fontsize=17)
ax.tick_params(labelsize=15)
# The label= arguments are only rendered once a legend is drawn.
ax.legend(fontsize=15)
plt.show()

In [None]:
# Interactive histogram of log(price + 1), one named trace per shipping type.
fig = go.Figure()
fig.add_trace(go.Histogram(x=np.log(ship1 + 1), name='Seller pays shipping'))
fig.add_trace(go.Histogram(x=np.log(ship0 + 1), name='Buyer pays shipping'))

# barmode='stack' stacks the bars of the two traces within each bin.
fig.update_layout(
    barmode='stack',
    title='Price Distribution by Shipping Type',
    xaxis=dict(title='log(price+1)'),
    yaxis=dict(title='frequency'),
)
fig.show()

### Target variable's relationship with item condition

The average price does not differ much across item conditions.

In [None]:
# Percentage of all training rows for each item condition.
train['item_condition_id'].value_counts().div(train.shape[0]).mul(100)

In [None]:
# Price series for each of the five item-condition levels (1 through 5).
cond1, cond2, cond3, cond4, cond5 = (
    train.loc[train['item_condition_id'] == cond_id, 'price']
    for cond_id in range(1, 6)
)

# One line per condition with its mean price.
for cond_id, cond_prices in enumerate((cond1, cond2, cond3, cond4, cond5), start=1):
    print("Average price when Condition is %d %f" % (cond_id, cond_prices.mean()))

# Overlaid histograms of log(price + 1) for conditions 1-3.
# Conditions 4 and 5 are not drawn on this figure.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(np.log(cond1 + 1), bins=50, label='Price when Condition is 1', alpha=1.0, color='black')
ax.hist(np.log(cond2 + 1), bins=50, label='Price when Condition is 2', alpha=0.7, color='darksalmon')
ax.hist(np.log(cond3 + 1), bins=50, label='Price when Condition is 3', alpha=0.4)

ax.set_xlabel('log(price+1)', fontsize=17)
ax.set_ylabel('frequency', fontsize=17)
ax.set_title('Price Distribution by Item Condition', fontsize=17)
# The label= arguments are only rendered once a legend is drawn.
ax.legend(fontsize=15)
plt.show()

### Relationship of price with Item category

There are 1287 unique categories in the list, and 6327 items are missing a category. Each category name consists of 3 or 4 subcategories separated by "/".
After splitting the category description into three levels, category 1 has 10 main categories, two of which account for more than 50% of the items. Category 2 has 113 values and category 3 has 870 values.

In [None]:
# Five most frequent full category names, as a percentage of all training rows.
train['category_name'].value_counts().head(5).div(train.shape[0]).mul(100)

In [None]:
def category_split(x):
    """Split a "/"-separated category string into its levels.

    Parameters
    ----------
    x : object
        A raw category value. Strings are split on "/"; anything else
        (for example a missing value such as NaN or None) is treated as
        having no category.

    Returns
    -------
    list of str
        ``x.split("/")`` when ``x`` is a string.
    tuple of str
        ``("NA", "NA", "NA")`` when ``x`` is not a string.
    """
    # Check the type explicitly instead of catching every exception, so that
    # unrelated errors (and KeyboardInterrupt) are never silently swallowed.
    if not isinstance(x, str):
        return ("NA", "NA", "NA")
    return x.split("/")
# Split every category string, then transpose the per-row results into one tuple per level.
# zip(*rows) stops at the shortest row, and the three-way unpacking needs exactly three tuples.
train['Cat_1'], train['Cat_2'], train['Cat_3'] = zip(
    *train['category_name'].apply(category_split)
)
# Percentage of all training rows in each main category.
train['Cat_1'].value_counts().div(train.shape[0]).mul(100)

In [None]:
# Five most frequent second-level categories, as a percentage of all training rows.
train['Cat_2'].value_counts().head(5).div(train.shape[0]).mul(100)

In [None]:
# Five most frequent third-level categories, as a percentage of all training rows.
train['Cat_3'].value_counts().head(5).div(train.shape[0]).mul(100)

In [None]:
# Item counts per main category; the index supplies the bar labels, the values the bar heights.
cat1_counts = train['Cat_1'].value_counts()
x = cat1_counts.index.values.astype('str')
y = cat1_counts.values
# Each category's share of all rows, formatted as a percentage string for the bar text.
pct = [("%.2f" % (share * 100)) + "%" for share in y / len(train)]

layout = dict(
    title='Number of Items by Main Category',
    yaxis=dict(title='Count'),
    xaxis=dict(title='Category'),
)
fig = dict(data=go.Bar(x=x, y=y, text=pct), layout=layout)
py.iplot(fig)

In [None]:
# One horizontal box per main category, showing the distribution of log(price + 1).
general_cats = train['Cat_1'].unique()
x = [train.loc[train['Cat_1'] == cat, 'price'] for cat in general_cats]
data = [go.Box(x=np.log(prices + 1), name=cat) for cat, prices in zip(general_cats, x)]
# The values are passed as x=, so the boxes lie horizontally:
# the x axis carries log(price + 1) and the y axis carries the category names.
layout = dict(title='Price Distribution by Main Category',
              xaxis=dict(title='log(price+1)'),
              yaxis=dict(title='Category'))
py.iplot(dict(data=data, layout=layout))