In [1]:
import os
import datetime
import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask import delayed, compute
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import lightgbm as lgb
import gc
from textblob import TextBlob
import time
import pickle
from tqdm import tqdm,tqdm_pandas
# Directory holding the competition CSV files. The AVITO_DATA_DIR environment
# variable overrides the original machine-specific location, so the notebook
# can run on another machine without editing this cell.
path = os.environ.get(
    "AVITO_DATA_DIR",
    "F:\\Pratik\\LearningZone\\Competitions\\Avito_Demand_Prediction_Challenge",
)
# The CSV files are read and written by bare file name, so work from that directory.
os.chdir(path)



In [2]:
# Load both competition files the same way, parsing 'activation_date' as a datetime column.
train_df, test_df = (
    pd.read_csv(csv_name, parse_dates=['activation_date'])
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Placeholder target for the test rows so both frames carry a 'deal_probability' column.
# NOTE(review): presumably test.csv ships without this column - confirm.
test_df = test_df.assign(deal_probability=0)

In [4]:
# Tag every row with its source so the combined frame can be split apart again later.
for split_frame, split_label in ((train_df, 'train'), (test_df, 'test')):
    split_frame['set'] = split_label

In [5]:
# Stack train rows on top of test rows; each frame keeps its own row labels,
# so index values repeat across the two parts.
total_df = pd.concat(objs=[train_df, test_df], axis=0)

In [None]:
# Histogram (raw counts, 100 bins) of the training target.
# plt.hist replaces the deprecated seaborn distplot(kde=False); alpha=0.4
# matches distplot's default bar transparency, and NaNs are dropped first
# because distplot removed them itself while plt.hist cannot bin them.
plt.figure(figsize=(12,8))
plt.hist(train_df["deal_probability"].dropna().values, bins=100, alpha=0.4)
plt.xlabel('Deal Probability', fontsize=12)
plt.title("Deal Probability Histogram", fontsize=14)
plt.show()

# plt.figure(figsize=(8,6))
# plt.scatter(range(train_df.shape[0]), np.sort(train_df['deal_probability'].values))
# plt.xlabel('index', fontsize=12)
# plt.ylabel('deal probability', fontsize=12)
# plt.title("Deal Probability Distribution", fontsize=14)
# plt.show()

In [6]:
# 'price_new' is 'price' with missing values replaced by the mean of the known
# prices, computed over train and test rows together.
# The filled Series is assigned back rather than calling fillna(inplace=True)
# on a column selection: that chained in-place form acts on a temporary object
# under pandas Copy-on-Write and would leave the NaNs in the frame.
mean_price = np.nanmean(total_df["price"].values)
total_df["price_new"] = total_df["price"].fillna(mean_price)

# plt.figure(figsize=(12,8))
# sns.distplot(np.log1p(train_df["price_new"].values), bins=100, kde=False)
# plt.xlabel('Log of price', fontsize=12)
# plt.title("Log of Price Histogram", fontsize=14)
# plt.show()

In [7]:
def translate(x):
    """Translate ``x`` to English with TextBlob on a best-effort basis.

    Parameters
    ----------
    x : object
        Value to translate, normally a string.

    Returns
    -------
    object
        The result of ``TextBlob(x).translate(to="en")`` when that call
        succeeds; otherwise ``x`` itself, unchanged.
    """
    try:
        return TextBlob(x).translate(to="en")
    except Exception:
        # Deliberate best-effort fallback: any ordinary failure (bad input
        # type, translation error) yields the original value. Catching
        # Exception rather than using a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate, so a long translation loop can be stopped.
        return x

In [8]:
def conv_string(x):
    """Translate ``x`` and return the result as plain text where possible.

    Calls ``translate(x)`` exactly once. If the result exposes a ``string``
    attribute, that attribute is returned; otherwise the result is returned
    as-is, which is the case when ``translate`` fell back to the untranslated
    input.

    Parameters
    ----------
    x : object
        Value to translate, normally a string.

    Returns
    -------
    object
        ``translate(x).string`` when available, else ``translate(x)``.
    """
    translated = translate(x)
    # getattr with a default replaces the former try/except, which re-ran the
    # whole translation a second time whenever ``.string`` was missing.
    return getattr(translated, "string", translated)

### Translating the city, region, category and parent category names:

In [17]:
# Translate each distinct region once, then map the result onto every row.
unique_reg = list(pd.unique(total_df['region']))
# conv_string is used instead of translate(...).string because translate
# returns the raw input when translation fails, and that input has no
# .string attribute (the loop would stop with AttributeError).
unique_reg_trans = [conv_string(reg) for reg in unique_reg]

dict_region = dict(zip(unique_reg, unique_reg_trans))  # original region -> translated region

total_df['en_region'] = total_df['region'].map(dict_region)  # translated region for every row

In [18]:
# Translate each distinct city once, then map the result onto every row.
unique_city = list(pd.unique(total_df['city']))
unique_city_trans = [conv_string(city_name) for city_name in unique_city]

dict_city = dict(zip(unique_city, unique_city_trans))  # original city -> translated city

total_df['en_city'] = total_df['city'].map(dict_city)  # translated city for every row

In [19]:
# Translate each distinct category name once, then map the result onto every row.
unique_cat_name = list(pd.unique(total_df['category_name']))
# conv_string is used instead of translate(...).string because translate
# returns the raw input when translation fails, and that input has no
# .string attribute (the loop would stop with AttributeError).
unique_cat_name_trans = [conv_string(cat_name) for cat_name in unique_cat_name]

dict_cat_name = dict(zip(unique_cat_name, unique_cat_name_trans))  # original category -> translated category

total_df['en_category_name'] = total_df['category_name'].map(dict_cat_name)  # translated category for every row

In [20]:
# Translate each distinct parent category name once, then map the result onto every row.
# NOTE: unique_cat_name, unique_cat_name_trans and dict_cat_name are the same
# names the category_name cell uses; after this cell they hold the
# parent-category values.
unique_cat_name = list(pd.unique(total_df['parent_category_name']))
# conv_string is used instead of translate(...).string because translate
# returns the raw input when translation fails, and that input has no
# .string attribute (the loop would stop with AttributeError).
unique_cat_name_trans = [conv_string(cat_name) for cat_name in unique_cat_name]

dict_cat_name = dict(zip(unique_cat_name, unique_cat_name_trans))  # original parent category -> translated parent category

total_df['en_parent_category_name'] = total_df['parent_category_name'].map(dict_cat_name)  # translated parent category for every row

In [21]:
# Translate each distinct param_1 value once, then map the result onto every row.
unique_param1 = list(pd.unique(total_df['param_1']))
unique_param1_trans = list(map(conv_string, unique_param1))

dict_param1 = dict(zip(unique_param1, unique_param1_trans))  # original param_1 -> translated param_1

total_df['en_param1'] = total_df['param_1'].map(dict_param1)  # translated param_1 for every row

In [22]:
# Translate each distinct param_2 value once, then map the result onto every row.
unique_param2 = list(pd.unique(total_df['param_2']))
unique_param2_trans = list(map(conv_string, unique_param2))

dict_param2 = dict(zip(unique_param2, unique_param2_trans))  # original param_2 -> translated param_2

total_df['en_param2'] = total_df['param_2'].map(dict_param2)  # translated param_2 for every row

In [23]:
# Translate each distinct param_3 value once, then map the result onto every row.
unique_param3 = list(pd.unique(total_df['param_3']))
unique_param3_trans = list(map(conv_string, unique_param3))

dict_param3 = dict(zip(unique_param3, unique_param3_trans))  # original param_3 -> translated param_3

total_df['en_param3'] = total_df['param_3'].map(dict_param3)  # translated param_3 for every row

In [24]:
# Split the combined frame back into its two sources by the 'set' tag and
# write each part to its own CSV file, without the row index.
for subset_name, out_file in (('train', 'train_translated.csv'), ('test', 'test_translated.csv')):
    total_df[total_df['set'] == subset_name].to_csv(out_file, index=False)

In [None]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [None]:
# Pie chart: percentage share of each translated region among the training ads.
# The translated columns exist only on total_df (train_df never receives them),
# so the training rows are selected from total_df through the 'set' tag.
train_regions = total_df.loc[total_df['set'] == 'train', 'en_region']
temp_series = train_regions.value_counts()
labels = (np.array(temp_series.index))
sizes = (np.array((temp_series / temp_series.sum())*100))

trace = go.Pie(labels=labels, values=sizes)
layout = go.Layout(
    title='Region distribution',
    width=900,
    height=900,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="region")

In [None]:
# Horizontal bar chart: the 20 most frequent translated cities among the training ads.
# The translated columns exist only on total_df (train_df never receives them),
# so the training rows are selected from total_df through the 'set' tag.
train_cities = total_df.loc[total_df['set'] == 'train', 'en_city']
cnt_srs = train_cities.value_counts().head(20)
trace = go.Bar(
    # Reversed so the most frequent city is drawn at the top of the chart.
    y=cnt_srs.index[::-1],
    x=cnt_srs.values[::-1],
    orientation = 'h',
    marker=dict(
        color=cnt_srs.values[::-1],
        colorscale = 'Blues',
        reversescale = True
    ),
)

layout = dict(
    title='City distribution of Ads',
    )
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="CityAds")

In [None]:
# Pie chart: percentage share of each translated parent category among the training ads.
# The translated columns exist only on total_df (train_df never receives them),
# so the training rows are selected from total_df through the 'set' tag.
train_parent_cats = total_df.loc[total_df['set'] == 'train', 'en_parent_category_name']
temp_series = train_parent_cats.value_counts()
labels = (np.array(temp_series.index))
sizes = (np.array((temp_series / temp_series.sum())*100))

trace = go.Pie(labels=labels, values=sizes)
layout = go.Layout(
    title='Parent Category distribution',
    width=900,
    height=900,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="parent_cat")

In [None]:
# Horizontal bar chart: number of training ads per translated category.
# The translated columns exist only on total_df (train_df never receives them),
# so the training rows are selected from total_df through the 'set' tag.
train_categories = total_df.loc[total_df['set'] == 'train', 'en_category_name']
cnt_srs = train_categories.value_counts()
trace = go.Bar(
    # Reversed so the most frequent category is drawn at the top of the chart.
    y=cnt_srs.index[::-1],
    x=cnt_srs.values[::-1],
    orientation = 'h',
    marker=dict(
        color=cnt_srs.values[::-1],
        colorscale = 'Blues',
        reversescale = True
    ),
)

layout = dict(
    title='Category distribution of Ads',
    height = 900
    )
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="Category")

In [None]:
# Pie chart: percentage share of each user type among the training ads.
temp_series = train_df['user_type'].value_counts()
labels = np.array(temp_series.index)
share_of_total = temp_series / temp_series.sum()
sizes = np.array(share_of_total * 100)

trace = go.Pie(labels=labels, values=sizes)
layout = go.Layout(title='User Type distribution', width=700, height=700)
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="usertype")

In [None]:
# Box plot of the target per translated parent category, training rows only.
# The translated columns exist only on total_df (train_df never receives them),
# so the training rows are selected from total_df through the 'set' tag.
train_translated = total_df[total_df['set'] == 'train']
plt.figure(figsize=(12,8))
sns.boxplot(x="en_parent_category_name", y="deal_probability", data=train_translated)
plt.ylabel('Deal probability', fontsize=12)
plt.xlabel('Parent Category', fontsize=12)
plt.title("Deal probability by parent category", fontsize=14)
plt.xticks(rotation=45)  # tilt the tick labels
plt.show()