In [None]:
# EDA based on https://www.kaggle.com/code/ayushikaushik/eda-regression-analysis#Preprocessing-the-data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read every CSV used in this notebook in one pass. The paths below line up
# positionally with the variable names on the left-hand side.
(
    train_data,
    busstops,
    grunnkrets_age,
    grunnkrets_households,
    grunnkrets_income,
    grunnkrets_stripped,
    plaace_hierarchy,
) = map(
    pd.read_csv,
    (
        'data/stores_train.csv',
        'data/busstops_norway.csv',
        'data/grunnkrets_age_distribution.csv',
        'data/grunnkrets_households_num_persons.csv',
        'data/grunnkrets_income_households.csv',
        'data/grunnkrets_norway_stripped.csv',
        'data/plaace_hierarchy.csv',
    ),
)

# Quick overview of the training set: (rows, columns) shape and the column index.
print(f"Shape of training data: {train_data.shape}\nFeatures available: {train_data.columns}")

In [None]:
# Render every column when a DataFrame is displayed instead of truncating wide frames.
# The analogous option for rows is 'display.max_rows'.
pd.set_option('display.max_columns', None)

# First rows of the raw training data.
train_data.head()

In [None]:
# Does any column contain at least one missing value? Evaluates to a single True/False.
# (Author's note from the original run: True, so there are missing values.)
train_data.isna().any().any()

In [None]:
# Replace NaN in the mall_name, address and chain_name columns with an explicit
# placeholder string, one placeholder per column.
train_data = train_data.fillna(
    value={
        'mall_name': 'No mall',
        'address': 'No address',
        'chain_name': 'No chain',
    }
)

In [None]:
# Preview the first five rows after the placeholder fill.
train_data.head(n=5)

In [None]:
# Re-run the missing-value check after the fill; a single True/False.
# (Author's note from the original run: False, so no missing values remain.)
train_data.isna().any().any()

In [None]:
# 'store_name', 'year', 'sales_channel_name' and 'address' are considered redundant
# for this analysis; remove all four in a single call.
train_data = train_data.drop(
    columns=['store_name', 'year', 'sales_channel_name', 'address']
)

In [None]:
# Preview the first five rows after dropping the redundant columns.
train_data.head(n=5)

In [None]:
#train_data['store_name'] = pd.factorize(train_data['store_name'])[0]

In [None]:
#train_data.head()

In [None]:
# Kernel density estimate of the raw target variable (revenue).
# sns.distplot is deprecated in seaborn; kdeplot is its replacement for the
# hist=False case and draws the same kind of density curve.
sns.kdeplot(train_data['revenue'])
plt.title('Distribution of Target variable')
sns.despine() # removes top and right border from the figure

In [None]:
# Kernel density estimate of log(revenue).
# np.log maps 0 to -inf and negative values to NaN, neither of which a density
# estimate can handle, so only strictly positive revenues are plotted.
# NOTE(review): confirm whether the data actually contains non-positive revenues.
revenue = train_data['revenue']
positive_revenue = revenue[revenue > 0]

# kdeplot replaces the deprecated sns.distplot(..., hist=False).
sns.kdeplot(np.log(positive_revenue))
# Distinct title so this figure is not confused with the raw-revenue plot.
plt.title('Distribution of log(Target variable)')
sns.despine() # removes top and right border from the figure

In [None]:
# Revenue looks approximately log-normal: the log-transformed values are roughly normally distributed

In [None]:
# Make a new column with a less specific plaace hierarchy level so that stores
# can be grouped together. The cells further down read train_data['hierarchy'],
# so this column has to be created here.
#
# Keep the first two dot-separated levels of plaace_hierarchy_id.
# Cutting a fixed number of trailing characters ([:-4]) leaves a trailing dot
# on some ids, which splits one group into e.g. '2.8' and '2.8.'.
# NOTE(review): assumes ids are dot-separated strings such as '2.8.11.2' —
# confirm against plaace_hierarchy.csv.
# Plain assignment (rather than DataFrame.insert) keeps the cell safe to re-run.
train_data['hierarchy'] = (
    train_data['plaace_hierarchy_id']
    .str.split('.')   # '2.8.11.2' -> ['2', '8', '11', '2']
    .str[:2]          # first two levels only
    .str.join('.')    # -> '2.8'
)



In [None]:
# Bar chart with the number of stores in each hierarchy group.
sns.countplot(data=train_data, x="hierarchy")

In [None]:
# Number of stores per hierarchy group.
train_data['hierarchy'].value_counts()
# Sanity check: every group should appear exactly once. If '2.8' shows up twice
# (once as '2.8.' and once as '2.8'), the hierarchy column was built by cutting a
# fixed number of characters off the id; build it by splitting on '.' instead.

In [None]:
# Mean revenue within each hierarchy group.
# Author's observation: hierarchy seems to affect the revenue.
train_data['revenue'].groupby(train_data['hierarchy']).mean()

In [None]:
# Number of stores per chain.
train_data['chain_name'].value_counts()
# 307 distinct chains in the original run: creating dummy variables would give too many columns

In [None]:
# Number of stores per grunnkrets_id.
train_data['grunnkrets_id'].value_counts()
# 3817 distinct ids in the original run. TODO: decide how to group them into larger units.

In [None]:
# Number of stores per mall name.
train_data['mall_name'].value_counts()
# 488 distinct values in the original run, but 10579 of the stores are not in a mall
# => make a dummy variable: one column with 1 for mall and 0 for no mall

In [None]:
# Dummy variable for mall or no mall: 1 if the store is in a mall, 0 otherwise.
# A single binary column is used instead of one-hot encoding every mall name,
# which would add one column per distinct mall.
#
# - Compare against the exact 'No mall' placeholder; a substring match would
#   also hit any real mall whose name happens to contain that text.
# - A missing mall name counts as "no mall" too, so the result does not depend
#   on the NaN placeholder fill having been run first.
# - The flag is stored as an integer 0/1 column rather than floats.
in_mall = train_data['mall_name'].notna() & (train_data['mall_name'] != 'No mall')
train_data['mall_dummy'] = in_mall.astype(int)

# The mall name itself is no longer needed once the flag exists.
train_data = train_data.drop(columns=['mall_name'])


In [None]:
# Preview the first five rows with the new mall_dummy column.
train_data.head(n=5)

In [None]:
# Now we work with the geographical features

In [None]:
# Goal: derive a zipcode (postal code) for each store from its lat/lon by reverse geocoding.
# Based on https://www.geeksforgeeks.org/get-the-city-state-and-country-names-from-latitude-and-longitude-using-python/
# and https://gis.stackexchange.com/questions/352961/convert-lat-lon-to-zip-postal-code-using-python
# Requires geopy to be installed first (pip install geopy in a terminal).

import geopy
# Only the geocoder client is created here; both lookup attempts below are disabled
# (commented out), so no 'zipcode' column is added to train_data yet.
# NOTE(review): "geoapiExercises" looks like a tutorial placeholder user agent —
# presumably the service expects an application-specific value; confirm before
# enabling the lookups.
geolocator = geopy.Nominatim(user_agent="geoapiExercises")

# FIRST ATTEMPT (disabled): explicit loop over all rows, one reverse lookup per row.
# NOTE(review): train_data.loc[i]['zipcode'] = ... is chained indexing, so the value
# is likely written to a temporary copy and not to train_data — verify before reviving.
# train_data.insert(len(train_data.columns),'zipcode', 0)
# # go through the whole columns of lat and lon pairs
# for i in range(train_data.shape[0]):
#     latitude = str(train_data.loc[i]['lat'])
#     longitude = str(train_data.loc[i]['lon'])
#     location = geolocator.reverse(latitude+","+longitude)
#     address = location.raw['address']
#     train_data.loc[i]['zipcode'] = address.get('postcode')
# # now we should have the dataframe with an extra column for zipcodes

# SECOND ATTEMPT (disabled): row-wise apply with a helper that returns the postcode.
# NOTE(review): the helper indexes ['postcode'] directly, so a result without a
# postcode would raise KeyError; it also still issues one lookup per row.
# def get_zipcode(df, geolocator, lat_field, lon_field):
#     location = geolocator.reverse((df[lat_field], df[lon_field]))
#     return location.raw['address']['postcode']

# zipcodes = train_data.apply(get_zipcode, axis=1, geolocator=geolocator, lat_field='lat', lon_field='lon')
# train_data.insert(len(train_data.columns),'zipcode', zipcodes)
# # Now we should have the dataframe train_data with a new column for zipcodes



In [None]:
# IDEAS/TO DO

# Convert categorical columns to numeric
# Split into train and test data set
# Make new column for distance to nearest busstop based on coordinates and busstops_norway.csv
# Make categories for income and age distribution for different grunnkretser, add as columns
# Try the sklearn ML models used in the EDA link from Kaggle

In [None]:
# Buffer