#### Create new features for the test and train datasets to improve model efficiency

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from xgboost.sklearn import XGBRegressor
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn import preprocessing
import json
import math
import os
from geopy.exc import GeocoderTimedOut 
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import mean_squared_error

In [2]:
# Use the current working directory as the base folder for the input files
# read in later cells, then display it.
path=os.getcwd()
path

'C:\\Users\\jayam\\Downloads\\7879phd'

In [3]:
# Load the city dictionary and invert it so that each city name maps to its
# integer code (the JSON stores code -> name, with codes as strings).
# os.path.join keeps the path valid on non-Windows systems as well.
with open(os.path.join(path, "city_dict.json"), encoding="utf-8") as f:
    data = json.load(f)
city = {name: int(code) for code, name in data.items()}

In [4]:
# Tabulate the mapping: one row per city name with its integer code in "keys".
df = pd.DataFrame({"city": list(city), "keys": list(city.values())})

In [5]:
# Preview the first two rows of the city lookup table.
df.head(2)

Unnamed: 0,city,keys
0,Mumbai,4
1,Delhi,5


##### Creating latitude and longitude features

In [6]:
# Geocode every city name with geopy's Nominatim geocoder.
locator = Nominatim(user_agent="myGeocoder")
# 1 - wrap the geocoder so consecutive calls are at least one second apart
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
# 2 - create the location column (one lookup per row)
df['location'] = df['city'].apply(geocode)
# NOTE(review): the "Other" bucket is geocoded like a real place name, and the
# coordinates shown for it in the table below lie far from every other city --
# confirm whether that row should carry missing coordinates instead.
# 3 - extract (latitude, longitude, altitude) as a tuple; None when the lookup
#     returned nothing
df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# 4 - split the point column into latitude, longitude and altitude columns.
#     A failed lookup leaves None in 'point', which cannot be expanded into
#     three columns, so such rows are filled with NaN coordinates instead.
coord_cols = ['latitude', 'longitude', 'altitude']
missing_point = (np.nan, np.nan, np.nan)
coords = [pt if pt is not None else missing_point for pt in df['point']]
df[coord_cols] = pd.DataFrame(coords, index=df.index, columns=coord_cols)

In [7]:
# Discard the intermediate geocoding columns and the altitude column;
# latitude and longitude are kept.
df.drop(columns=["location", "point", "altitude"], inplace=True)

In [8]:
# Display the full city lookup table with its coordinates.
df

Unnamed: 0,city,keys,latitude,longitude
0,Mumbai,4,18.938771,72.835335
1,Delhi,5,28.651718,77.221939
2,Bangalore,7,12.97912,77.5913
3,Kolkata,1,22.545412,88.356775
4,Chennai,6,13.080172,80.283833
5,Hyderabad,8,17.388786,78.461065
6,Ahmedabad,9,23.021624,72.579707
7,Jaipur,3,26.916194,75.820349
8,Lucknow,10,26.8381,80.9346
9,Other,2,60.599136,-134.880251


In [10]:
# Read the test data; the listed tokens are parsed as missing values.
test = pd.read_csv(
    os.path.join(path, "test_final.csv"),
    sep=",",
    na_values=["?", ",", "#", "NaN", "unknown", ""],
)

In [11]:
# Read the train data; the listed tokens are parsed as missing values.
train = pd.read_csv(
    os.path.join(path, "train_final.csv"),
    sep=",",
    na_values=["?", ",", "#", "NaN", "unknown", ""],
)

In [12]:
# Convert the "date" column of both frames to pandas datetimes so the .dt
# accessor can be used on it later.
for frame in (test, train):
    frame["date"] = pd.to_datetime(frame["date"])

#### Merge latitude and longitude with the test and train datasets

In [13]:
# Re-key the lookup table by integer city code: drop the city-name column and
# rename "keys" to "city" so it lines up with the "city" column of the data.
df = df.drop(columns=["city"]).rename(columns={"keys": "city"})
# Left joins keep every train/test row, including rows without a match.
train = train.merge(df, how="left", on="city")
test = test.merge(df, how="left", on="city")

In [14]:
# Preview the first two rows of the train frame after the merge.
train.head(2)

Unnamed: 0,year,month,day,city,medicine,date,sales,discounted,year_month,city_medicine,week,footfall_perday_permed,old_new,latitude,longitude
0,2015,1,2,1,1,2015-01-02,24.0,0,2015_01,1_1,1,0.385804,old,22.545412,88.356775
1,2015,1,2,1,2,2015-01-02,144.0,0,2015_01,1_2,1,2.314824,old,22.545412,88.356775


In [15]:
# Calendar features derived from the date, built the same way for both frames.
# NOTE: despite its name, "weekday" is a weekend flag: 0 when dayofweek < 5
# (Monday-Friday) and 1 otherwise.
for frame in (train, test):
    day_index = frame["date"].dt.dayofweek
    frame["weekday"] = np.where(day_index < 5, 0, 1)
    frame["dayofweek"] = day_index  # day of week, Monday=0 ... Sunday=6

In [16]:
# Remove the columns that are not written to the exported datasets.
train.drop(columns=["date", "city"], inplace=True)
test.drop(columns=["id", "date", "footfall_perday_permed", "city"], inplace=True)

In [21]:
# Keep only the training rows from 2017 onwards.
train = train.loc[train["year"] >= 2017]

In [22]:
# Save the train data (without the index column) for running the model on Kaggle.
train.to_csv("kaggle_train.csv",index=False)

In [23]:
# Save the test data (without the index column) for running the model on Kaggle.
test.to_csv("kaggle_test.csv",index=False)

In [17]:
# Preview the first five rows of the test frame.
test.head()

Unnamed: 0,year,month,day,medicine,discounted,year_month,city_medicine,week,old_new,latitude,longitude,weekday,dayofweek
0,2018,7,1,1292,0,2018_07,1_1292,26,old,22.545412,88.356775,1,6
1,2018,7,1,1,0,2018_07,1_1,26,old,22.545412,88.356775,1,6
2,2018,7,1,2,1,2018_07,1_2,26,old,22.545412,88.356775,1,6
3,2018,7,1,3,0,2018_07,1_3,26,old,22.545412,88.356775,1,6
4,2018,7,1,4,0,2018_07,1_4,26,old,22.545412,88.356775,1,6


In [18]:
# Preview the first five rows of the train frame.
train.head()

Unnamed: 0,year,month,day,medicine,sales,discounted,year_month,city_medicine,week,footfall_perday_permed,old_new,latitude,longitude,weekday,dayofweek
0,2015,1,2,1,24.0,0,2015_01,1_1,1,0.385804,old,22.545412,88.356775,0,4
1,2015,1,2,2,144.0,0,2015_01,1_2,1,2.314824,old,22.545412,88.356775,0,4
2,2015,1,2,3,84.0,0,2015_01,1_3,1,1.350314,old,22.545412,88.356775,0,4
3,2015,1,2,4,24.0,0,2015_01,1_4,1,0.385804,old,22.545412,88.356775,0,4
4,2015,1,2,5,8.0,0,2015_01,1_5,1,0.128601,old,22.545412,88.356775,0,4
