[Group 62] Machines of ML

Frimann Bergvik Garmann         - 527245
Brage Bergsmyr                  - 514881
Magnus Christian Kvist Jacobsen - 506626

In [None]:
!pip install catboost
!pip install geopandas
!pip install folium
!pip install matplotlib
!pip install mapclassify
!pip install shapely
!pip install numpy
!pip install sklearn
!pip install xgboost
!pip install catboost
!pip install scipy
!pip install lightgbm
!pip install h2o
!pip install pandas
!pip install dataprep
!pip install geopy
!pip install fiona
!pip install pyproj
!pip install packaging
!pip install seaborn

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import math
import geopy.distance
import matplotlib.pyplot as plt
import seaborn as sns
from dataprep import eda
import xgboost as xgb
from shapely.geometry import Point
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

plt.style.use("ggplot")

pd.set_option('display.max_columns', None)

In [3]:
busstops_norway                     = pd.read_csv('../data/busstops_norway.csv')
grunnkrets_age_distribution         = pd.read_csv('../data/grunnkrets_age_distribution.csv')
grunnkrets_households_num_persons   = pd.read_csv('../data/grunnkrets_households_num_persons.csv')
grunnkrets_income_households        = pd.read_csv('../data/grunnkrets_income_households.csv')
grunnkrets_norway_stripped          = pd.read_csv('../data/grunnkrets_norway_stripped.csv')
plaace_hierarchy                    = pd.read_csv('../data/plaace_hierarchy.csv')
sample_submission                   = pd.read_csv('../data/sample_submission.csv')
stores_extra                        = pd.read_csv('../data/stores_extra.csv')
stores_test                         = pd.read_csv('../data/stores_test.csv')
stores_train                        = pd.read_csv('../data/stores_train.csv')

#eda.create_report(stores_train)

In [4]:
# helper functions
def left_merge(X, Z, on):
    return pd.merge(X, Z, how='left', on=on)

def replace_missing(X, column, replacement):
    _X = X.copy()
    _X[column] = _X[column].replace(np.nan, replacement)
    return _X

def remove_nan_rows(X, column):
    _X = X.copy()
    return _X[_X[column].notna()]

def remove_column(X, column):
    _X = X.copy()
    return _X.drop(column, axis=1)

def remove_year(X):
    X_copy = X.copy()
    columns = X.columns
    X_copy = X_copy.loc[X_copy.groupby('grunnkrets_id')['year'].idxmax()]
    X_copy = X_copy.drop(columns=['year'])
    #X_copy = X_copy.groupby(['grunnkrets_id'], as_index=False).agg(dict)
    return X_copy

In [5]:
# Feature functions
def sum_people(X):
    y_grunnkrets = X.grunnkrets_id
    X_copy = X.drop(columns=['grunnkrets_id'])
    X_sum = X_copy.sum(axis=1)
    return pd.merge(y_grunnkrets.rename('grunnkrets_id'), X_sum.rename('sum_people'), left_index=True, right_index=True)

def sum_stores(X):
    X_sum = X['grunnkrets_id'].groupby('grunnkrets_id').sum()
    return X_sum

def jeg_er_lei_meg(X):
    # Adds a num_of_stores to the district

    districts=X.loc[:,"district_name"].unique()
    dict={"district_name":[],"num_of_stores":[]}
    for district in districts:
        num_of_stores=(X.loc[X["district_name"] == district])["store_id"].count()
        dict["district_name"]+=[district]
        dict["num_of_stores"]+=[num_of_stores]
    return pd.DataFrame.from_dict(dict)


def clostest_rival(df, shop_id):
    #hente ut samme hierarky
    #sammenligne distansen til nærmeste

    hierarchy = df.loc[df["store_id"] == shop_id]["plaace_hierarchy_id"].values
    coords_1 = df.loc[df["store_id"] == shop_id][["lon","lat"]].values
    entry_index= df.loc[df["store_id"] == shop_id].index
    
    check = df.loc[df["plaace_hierarchy_id"] == hierarchy[0]]
    #print(check)
    distance=math.inf
    best_row=None
    #finds=""
    for index, row in check.iterrows():
        coords_2 = row[["lon","lat"]].values
        current_distance= geopy.distance.geodesic(coords_1, coords_2).km

        if (current_distance < distance and index != entry_index):
            distance=current_distance
            best_row=row
     #       finds+=str(row["store_id"])
    #print(distance, "finds",finds)
    return distance, best_row


def add_avg_revenue_municipality(X): #ALL MY HOMIES HATE THIS FUNCTION
    districts=X.loc[:,"municipality_name"].unique()
    dict={"municipality_name":[],"mean":[]}
    for district in districts:
        mean = (X.loc[X["municipality_name"] == district])["revenue"].mean()
        dict["district_name"]+=[district]
        dict["mean"]+=[mean]
    df= pd.DataFrame.from_dict(dict)
    print(df)
    bass = pd.merge(X, df, how="left", on="municipality_name")
    X["avg_revenue_municipality"] = bass["mean"]

In [6]:
# remove rows with revenue outliers
print(stores_train.shape)
stores_train = stores_train[stores_train['revenue'] > 0.01]
print(stores_train.shape)
stores_train = stores_train[stores_train['revenue'] < 135.0] # 180 for å fjerne øverste 5
print(stores_train.shape)

stores_train['revenue'] = np.log1p(stores_train['revenue'])

#Saves mean revenue to fill in nans later
MEAN_REVENUE = stores_train["revenue"].mean()
print(MEAN_REVENUE)

(12859, 12)
(12610, 12)
(12592, 12)
1.6144287126764343


In [7]:
plaace_hierarchy=plaace_hierarchy.drop(columns=["sales_channel_name"])

stores_train=left_merge(stores_train, plaace_hierarchy, on="plaace_hierarchy_id")
stores_test=left_merge(stores_test, plaace_hierarchy, on="plaace_hierarchy_id")

# removing year
grunnkrets_age_distribution = remove_year(grunnkrets_age_distribution)
grunnkrets_households_num_persons = remove_year(grunnkrets_households_num_persons)
grunnkrets_income_households = remove_year(grunnkrets_income_households)
grunnkrets_norway_stripped = remove_year(grunnkrets_norway_stripped)

stores_train = left_merge(stores_train, grunnkrets_norway_stripped[['grunnkrets_id', 'district_name', "municipality_name", "area_km2"]], on='grunnkrets_id')
stores_test = left_merge(stores_test, grunnkrets_norway_stripped[['grunnkrets_id', 'district_name',"municipality_name", "area_km2"]], on='grunnkrets_id')

stores_test.head(3)


Unnamed: 0,store_id,year,store_name,plaace_hierarchy_id,sales_channel_name,grunnkrets_id,address,lat,lon,chain_name,mall_name,lv1,lv1_desc,lv2,lv2_desc,lv3,lv3_desc,lv4,lv4_desc,district_name,municipality_name,area_km2
0,914206820-914239427-717245,2016,VÅLERENGA HALAL BURGER AS,1.1.1.0,Hamburger restaurants,3012704,STRØMSVEIEN 25 A,59.908672,10.787031,,,1,Dining and Experiences,1.1,Restaurant,1.1.1,Hamburger restaurants,1.1.1.0,Hamburger restaurants,Vålerenga,Oslo,0.057027
1,916789157-916823770-824309,2016,BURGER KING MYREN,1.1.1.0,Hamburger restaurants,8061401,MYREN 1,59.201467,9.588243,BURGER KING,,1,Dining and Experiences,1.1,Restaurant,1.1.1,Hamburger restaurants,1.1.1.0,Hamburger restaurants,Gulset,Skien,0.165993
2,913341082-977479363-2948,2016,BURGER KING STOVNER,1.1.1.0,Hamburger restaurants,3013917,STOVNER SENTER 3,59.962146,10.924524,BURGER KING,Stovner Senter,1,Dining and Experiences,1.1,Restaurant,1.1.1,Hamburger restaurants,1.1.1.0,Hamburger restaurants,Fossum,Oslo,0.236628


In [8]:
# replaces missing values
stores_train = replace_missing(stores_train, "chain_name", "no chain")
stores_train = replace_missing(stores_train, "mall_name", "no mall")
stores_train = replace_missing(stores_train, 'district_name', 'No district')
stores_train = replace_missing(stores_train, 'municipality_name' , "No municipality_name")
stores_train = replace_missing(stores_train, 'area_km2' , 0.0)



stores_test = replace_missing(stores_test, "chain_name", "no chain")
stores_test = replace_missing(stores_test, "mall_name", "no mall")
stores_test = replace_missing(stores_test, 'district_name', 'No district')
stores_test = replace_missing(stores_test, 'municipality_name' , "No municipality_name")
stores_test = replace_missing(stores_test, 'area_km2' , 0.0)

In [9]:
# finds num stores grunnkrets
def find_num_stores_grunnkrets(X):
    grunnkretser=X.loc[:,"grunnkrets_id"].unique()
    dict={"grunnkrets_id":[],"num_of_stores_grunnkrets":[]}
    for grunnkrets in grunnkretser:
        num_of_stores = (X.loc[X["grunnkrets_id"] == grunnkrets])["store_id"].count()
        dict["grunnkrets_id"]+=[grunnkrets]
        dict["num_of_stores_grunnkrets"]+=[num_of_stores]
    return pd.DataFrame.from_dict(dict)

In [10]:
# merges num_stores_grunnkrets into dataframes
all_stores = stores_train.drop(columns=['revenue']).append(stores_test, ignore_index=True).append(stores_extra, ignore_index=True)
all_stores=pd.merge(all_stores,grunnkrets_norway_stripped, how="left", on="grunnkrets_id")

grunkr=find_num_stores_grunnkrets(all_stores)

stores_train= left_merge(stores_train, grunkr, on="grunnkrets_id")

stores_test= left_merge(stores_test, grunkr, on="grunnkrets_id")


In [11]:
# Generates info about malls
from scipy import spatial

def mall_df(X):
    #alls=X.loc[:,"mall_name"].unique()
    #mallss = malls[np.logical_not(pd.isnull(malls))]
    all_stores = np.array(X.values)
    df69 = X.groupby('mall_name', as_index=False)["lat","lon"].mean()
    df69_nump = np.array(X.groupby('mall_name')["lat","lon"].mean())


    lat_ind = X.columns.get_loc("lat")
    lon_ind = X.columns.get_loc("lon")
    store_id_ind = X.columns.get_loc("store_id")

    dict={"store_id":[], "mall_distance":[], "closest_mall_name":[]}

    i=0
    maxxx=len(all_stores)
    for store in all_stores:
        if i%5000==0:
            print(100*(i/maxxx),"%")
        i+=1

        #distance=math.inf
        #closest_mall=None
        lat = store[lat_ind]
        lon = store[lon_ind]
        store_id=store[store_id_ind]
        coords_1 = [lat,lon]

        #A = df69_nump[spatial.KDTree(df69_nump).query(coords_1)[1]] # <-- the nearest point 
        B = spatial.KDTree(df69_nump).query(coords_1)
        distance=B[0]
        mall =df69.iloc[[B[1]]]
        dict["store_id"]+=[store_id]
        dict["mall_distance"]+=[distance]
        dict["closest_mall_name"]+=[mall.values[0][0]]
        #print(distance)
        #print(mall)
        #print("A", A)


    return pd.DataFrame.from_dict(dict)


#df69 = all_stores.groupby('mall_name')["lat","lon"].mean()

In [12]:
# Merges mall info to the dataframes
mall_df = mall_df(all_stores)

stores_train = pd.merge(stores_train, mall_df, how="left", on="store_id")
stores_test = pd.merge(stores_test, mall_df, how="left", on="store_id")

  df69 = X.groupby('mall_name', as_index=False)["lat","lon"].mean()
  df69_nump = np.array(X.groupby('mall_name')["lat","lon"].mean())


0.0 %
10.029486690871162 %
20.058973381742323 %
30.088460072613483 %
40.117946763484646 %
50.147433454355806 %
60.176920145226966 %
70.20640683609813 %
80.23589352696929 %
90.26538021784046 %


In [13]:
# generates avg_rev for chains
def chain_info(stores_train):
    df69 = stores_train.groupby('chain_name', as_index=True)["store_id"].count()
    big_chain = df69.loc[df69 >= 1]
    big_chain = big_chain.dropna()

    dict = {"chain_name":[], "avg_rev_big_chain":[]}

    for i in range(len(big_chain)):
        avg_rev = stores_train.loc[stores_train["chain_name"] == big_chain.index[i]]["revenue"].mean()
        dict["chain_name"]+=[big_chain.index[i]]
        dict["avg_rev_big_chain"]+=[avg_rev]
        #print(big_chain.index[i], avg_rev)
    big_chain_df = pd.DataFrame.from_dict(dict)
    return big_chain_df

In [14]:
# merges avg_rev for chains into dataframes
avg_rev_big_chain= chain_info(stores_train)

stores_train = left_merge(stores_train, avg_rev_big_chain, on="chain_name")
stores_test = left_merge(stores_test, avg_rev_big_chain, on="chain_name")

stores_train = replace_missing(stores_train, 'avg_rev_big_chain' , MEAN_REVENUE)
stores_test = replace_missing(stores_test, 'avg_rev_big_chain' , MEAN_REVENUE)


In [15]:
# adds avg_rev for different hierarchy levels
def avg_rev_hierarchy_levels(X):
    lv_1_hierarkies = (X.loc[:,"lv1_desc"]).unique()
    lv_2_hierarkies = (X.loc[:,"lv2_desc"]).unique()
    lv_3_hierarkies = (X.loc[:,"lv3_desc"]).unique()
    lv_4_hierarkies = (X.loc[:,"lv4_desc"]).unique()

    dict = {"lv1_desc":[],"lv1_avg_rev":[]}
    for lv1 in lv_1_hierarkies:
        avg_rev = (X.loc[X["lv1_desc"] == lv1])["revenue"].mean()
        dict["lv1_desc"]+=[lv1]
        dict["lv1_avg_rev"]+=[avg_rev]
    lv1_df = pd.DataFrame.from_dict(dict)

    dict = {"lv2_desc":[],"lv2_avg_rev":[]}
    for lv2 in lv_2_hierarkies:
        avg_rev = (X.loc[X["lv2_desc"] == lv2])["revenue"].mean()
        dict["lv2_desc"]+=[lv2]
        dict["lv2_avg_rev"]+=[avg_rev]
    lv2_df = pd.DataFrame.from_dict(dict)

    dict = {"lv3_desc":[],"lv3_avg_rev":[]}
    for lv3 in lv_3_hierarkies:
        avg_rev = (X.loc[X["lv3_desc"] == lv3])["revenue"].mean()
        dict["lv3_desc"]+=[lv3]
        dict["lv3_avg_rev"]+=[avg_rev]
    lv3_df = pd.DataFrame.from_dict(dict)

    dict = {"lv4_desc":[],"lv4_avg_rev":[]}
    for lv4 in lv_4_hierarkies:
        avg_rev = (X.loc[X["lv4_desc"] == lv4])["revenue"].mean()
        dict["lv4_desc"]+=[lv4]
        dict["lv4_avg_rev"]+=[avg_rev]
    lv4_df = pd.DataFrame.from_dict(dict)

    return lv1_df, lv2_df, lv3_df, lv4_df


In [16]:
# Add useful features
_,_,lv3,lv4 = avg_rev_hierarchy_levels(stores_train)

stores_train = pd.merge(stores_train, lv3, how="left", on="lv3_desc")
stores_train = pd.merge(stores_train, lv4, how="left", on="lv4_desc")

stores_test = pd.merge(stores_test, lv3, how="left", on="lv3_desc")
stores_test = pd.merge(stores_test, lv4, how="left", on="lv4_desc")

lv4.describe()

Unnamed: 0,lv4_avg_rev
count,90.0
mean,1.639134
std,0.624417
min,0.45362
25%,1.255351
50%,1.509913
75%,1.89466
max,4.102165


In [17]:
# magnus cluster feature


from scipy.spatial.distance import pdist, squareform
import sklearn
from sklearn.cluster import DBSCAN
from math import radians, sin, cos, asin, sqrt

revenues = stores_train[['store_id', 'revenue']]
all_stores = stores_train.drop(columns=['revenue']).append(stores_test, ignore_index=True).append(stores_extra, ignore_index=True)

df=all_stores[['store_id','lat','lon']]
geometry = [Point(xy) for xy in zip(df.lat, df.lon)]
gdf = gpd.GeoDataFrame(df, geometry=geometry)
latlon = all_stores[['lat','lon']]
X = np.radians(np.array(latlon, dtype='float64'))

clustering = DBSCAN(eps=2.0/6371.0, min_samples=250, metric='haversine', algorithm='ball_tree').fit(X)

y_clusters = clustering.labels_

gdf['cluster'] = y_clusters

all_stores['cluster'] = y_clusters
gdf=gdf[gdf['cluster']!=-1]
gdf.crs = "EPSG:9672"
print(gdf.loc[gdf['cluster'].idxmax()])

cluster_center = all_stores.groupby('cluster')['lat','lon'].mean()

cluster_center.rename(columns = {'lat':'center_lat', 'lon':'center_lon'}, inplace = True)

all_stores = left_merge(all_stores, cluster_center, on='cluster')

all_stores
def distance(lat1, lon1, lat2, lon2, cluster):
    if cluster == -1:
        return 15
    else:
        return np.linalg.norm(np.array([lat1,lon1])-np.array([lat2, lon2]))

from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2, cluster):
    max_distance = 20
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    if cluster == -1:
        return max_distance
    else:
        # convert decimal degrees to radians 
        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
        # haversine formula 
        dlon = lon2 - lon1 
        dlat = lat2 - lat1 
        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
        c = 2 * asin(sqrt(a)) 
        km = 6371 * c
        if km >= max_distance:
            return max_distance
        else:
            return km
    
all_stores['center_distance'] = np.exp(all_stores[['lon', 'lat','center_lon','center_lat', 'cluster']].apply(lambda row: haversine(row['lon'], row['lat'], row['center_lon'], row['center_lat'], row['cluster']), axis=1))

stores_train = all_stores[all_stores['store_id'].isin(stores_train['store_id'])]
stores_train = left_merge(stores_train, revenues, on='store_id')
stores_test = all_stores[all_stores['store_id'].isin(stores_test['store_id'])]
stores_extra = all_stores[all_stores['store_id'].isin(stores_extra['store_id'])]
df=all_stores[['store_id','lat','lon', 'cluster', 'center_distance', 'store_name']]
geometry = [Point(xy) for xy in zip(df.lat, df.lon)]
gdf = gpd.GeoDataFrame(df, geometry=geometry)
gdf=gdf[gdf['cluster']!=-1]
sample=gdf.sample(frac=0.25)
sample.crs = "EPSG:9672"
#sample.explore("cluster", marker_type="circle_marker",marker_kwds={"radius":8}, legend=False)


store_id                   918023925-918087567-872230
lat                                         68.779439
lon                                         16.566267
geometry    POINT (68.77943861059249 16.566267033764)
cluster                                            33
Name: 15, dtype: object


  cluster_center = all_stores.groupby('cluster')['lat','lon'].mean()


In [18]:
#stores_train.loc[stores_train["cluster"] == -1].head(10)

#save id for later
stores_test_id = np.asarray(stores_test.store_id)

remove_columns = [
    'address',
    'store_name',
    'sales_channel_name',
    'lat',
    'lon',
    'center_lat',
    'center_lon',
    'lv1',
    'lv2',
    'lv3',
    'lv4',
    'year',
    'store_id',
    'plaace_hierarchy_id',
]

# data cleaning for train
for column in remove_columns:
    #print(column)
    stores_train = remove_column(stores_train, column)
    stores_test = remove_column(stores_test, column)
#stores_train = remove_column(stores_train, 'grunnkrets_id')


stores_train = replace_missing(stores_train, 'district_name', 'No district')
stores_train = replace_missing(stores_train, 'municipality_name' , "No municipality_name")
stores_train = replace_missing(stores_train, 'area_km2' , 0.0)

stores_test = replace_missing(stores_test, 'district_name', 'No district')
stores_test = replace_missing(stores_test, 'municipality_name' , "No municipality_name")
stores_test = replace_missing(stores_test, 'area_km2' , 0.0)

stores_test = replace_missing(stores_test, "lv3_avg_rev", MEAN_REVENUE)
stores_test = replace_missing(stores_test, "lv4_avg_rev", MEAN_REVENUE)

In [19]:
# Replace inf values
stores_train.replace([np.inf, -np.inf], 0, inplace=True)
stores_test.replace([np.inf, -np.inf], 0, inplace=True)

In [20]:
# separating training data

X_train = stores_train.drop(columns=['revenue'])
y_train = stores_train.revenue
X_test = stores_test

# encode categorical features


categorical_features = ['chain_name',
                        'grunnkrets_id',
                        'municipality_name',
                        "closest_mall_name",
                        'cluster',
                        'lv1_desc',
                        'lv2_desc',
                        'lv3_desc',
                        'lv4_desc',
                        'mall_name',
                        'district_name',
                       ]

for feature in categorical_features:
    X_train[feature] = X_train[feature].astype('category')
    X_test[feature] = X_test[feature].astype('category')

In [21]:
# LabelEncoding
le = LabelEncoder()
X_train_cat = X_train.copy()
X_test_cat = X_test.copy()
y_train_cat = y_train.copy()


X_train_shape = X_train.shape[0]
X_test_shape = X_test.shape[0]

X_train["train_test_split"] = [0 for i in range(X_train_shape)]
X_test["train_test_split"] = [1 for i in range(X_test_shape)]

X_train_test = X_train.append(X_test, ignore_index=True)

X_train_test[categorical_features] = X_train_test[categorical_features].apply(le.fit_transform)
X_train = X_train_test.loc[X_train_test["train_test_split"] == 0]
X_test = X_train_test.loc[X_train_test["train_test_split"] == 1]

X_train=X_train.drop(["train_test_split"], axis=1)
X_test=X_test.drop(["train_test_split"], axis=1)

In [22]:
class RMSLEMetric(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        # the larger metric value the better
        return False

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        preds = np.array(approxes[0])
        target = np.array(target)
        preds = np.exp(preds) - 1
        target = np.exp(target) - 1
        for i in range(len(preds)):
            if preds[i]<0:
                preds[i] = 0
        assert (target >= 0).all(), 'Received negative target values'
        assert (preds >= 0).all(), 'Received negative pred values'
        assert target.shape == preds.shape, 'target and pred have different shapes'
        target_log1p = np.log1p(target)  # log(1 + y_true)
        preds_log1p = np.log1p(preds)  # log(1 + y_pred)
        
        return np.sqrt(np.mean(np.square(preds_log1p - target_log1p))), 0


def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error 
    
    Args:
        y_true (np.array): n-dimensional vector of ground-truth values 
        y_pred (np.array): n-dimensional vecotr of predicted values 
    
    Returns:
        A scalar float with the rmsle value 
    
    Note: You can alternatively use sklearn and just do: 
        `sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5`
    """
    for i in range(len(y_pred)):
        if y_pred[i]<0:
            y_pred[i] = 0
    assert (y_true >= 0).all(), 'Received negative y_true values'
    assert (y_pred >= 0).all(), 'Received negative y_pred values'
    assert y_true.shape == y_pred.shape, 'y_true and y_pred have different shapes'
    y_true = np.exp(y_true) - 1
    y_pred = np.exp(y_pred) - 1
    y_true_log1p = np.log1p(y_true)  # log(1 + y_true)
    y_pred_log1p = np.log1p(y_pred)  # log(1 + y_pred)
    print(np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p))))
    return np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p)))


In [23]:
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

cat_params =     {
    "random_state":8,
    "learning_rate":0.01,
    "max_depth":8,
    "eval_metric":RMSLEMetric(),
    "cat_features":categorical_features,
    "n_estimators":3000,
    "od_type":"Iter",
    "silent":True
}

random_forest_params = {
    "n_estimators":2000,
    "max_depth":10,
    "random_state":8,
    
    "max_features":None,
}

light_gbm_params = {
    "num_leaves":8,
    "max_depth":8, 
    "random_state":8,
    "silent":True, 
    "metric":'rmsle',
    "n_jobs":-1,
    #"min_data_in_leaf": 20,
    "lambda_l1": 0.5,
    "n_estimators":1500,
    "colsample_bytree":0.95,
    "subsample":0.2,
    "learning_rate":0.008,
}

model_light_gbm = lgb.LGBMRegressor(**light_gbm_params)
# model_random_forest = RandomForestRegressor(**random_forest_params)
model_catboost= CatBoostRegressor(**cat_params)

model_catboost.fit(X_train, y_train)
#model_random_forest.fit(X_train, y_train)
model_light_gbm.fit(X_train, y_train)


Failed in nopython mode pipeline (step: nopython frontend)
[1m[1m[1mNo implementation of function Function(<built-in function array>) found for signature:
 
 >>> array(array(float64, 1d, C))
 
There are 4 candidate implementations:
[1m   - Of which 4 did not match due to:
   Overload in function '_OverloadWrapper._build.<locals>.ol_generated': File: numba/core/overload_glue.py: Line 131.
     With argument(s): '(array(float64, 1d, C))':[0m
[1m    Rejected as the implementation raised a specific error:
      TypingError: Failed in nopython mode pipeline (step: nopython frontend)
    [1m[1m[1mNo implementation of function Function(<intrinsic stub>) found for signature:
     
     >>> stub(array(float64, 1d, C))
     
    There are 2 candidate implementations:
    [1m  - Of which 2 did not match due to:
      Intrinsic in function 'stub': File: numba/core/overload_glue.py: Line 35.
        With argument(s): '(array(float64, 1d, C))':[0m
    [1m   Rejected as the implementation



LGBMRegressor(colsample_bytree=0.95, lambda_l1=0.5, learning_rate=0.008,
              max_depth=8, metric='rmsle', n_estimators=1500, num_leaves=8,
              random_state=8, silent=True, subsample=0.2)

In [24]:
features_for_model = [f for f in X_train]
feat_import = [t for t in zip(features_for_model,model_catboost.get_feature_importance())]
feat_import_df = pd.DataFrame(feat_import, columns = ['Features', 'CatBoost_importance'])
df_cat =feat_import_df.sort_values("CatBoost_importance", ascending= False)


features_for_model = [f for f in X_train]
feat_import = [t for t in zip(features_for_model,model_light_gbm.feature_importances_)]
feat_import_df = pd.DataFrame(feat_import, columns = ['Features', 'Light_gbm_importance'])
df_lgbm =feat_import_df.sort_values("Light_gbm_importance", ascending= False)

df = pd.merge(df_cat,df_lgbm, how="left", on="Features")
df



Unnamed: 0,Features,CatBoost_importance,Light_gbm_importance
0,avg_rev_big_chain,25.174228,1548
1,closest_mall_name,6.141035,460
2,lv4_desc,6.122676,302
3,lv4_avg_rev,5.963669,631
4,mall_distance,5.704027,1115
5,num_of_stores_grunnkrets,5.246388,932
6,lv3_desc,4.927102,510
7,cluster,4.671898,167
8,district_name,4.563575,295
9,municipality_name,4.498663,496


In [25]:
cv = KFold(n_splits = 4, random_state=42, shuffle=True)
score = cross_val_score(model_catboost, X_train, y_train, cv=cv, scoring=make_scorer(rmsle))
print("catboost")
print('Scores for all folds: ')
print(score)
print('\n')
print('Mean of scores:')
print(np.mean(score))
print('\n')
print('Standard deviation of scores: ')
print(np.std(score))

cv = KFold(n_splits = 4, random_state=42, shuffle=True)
score = cross_val_score(model_light_gbm, X_train, y_train, cv=cv, scoring=make_scorer(rmsle))

print("light gbm")
print('Scores for all folds: ')
print(score)
print('\n')
print('Mean of scores:')
print(np.mean(score))
print('\n')
print('Standard deviation of scores: ')
print(np.std(score))

Failed in nopython mode pipeline (step: nopython frontend)
[1m[1m[1mNo implementation of function Function(<built-in function array>) found for signature:
 
 >>> array(array(float64, 1d, C))
 
There are 4 candidate implementations:
[1m      - Of which 4 did not match due to:
      Overload in function '_OverloadWrapper._build.<locals>.ol_generated': File: numba/core/overload_glue.py: Line 131.
        With argument(s): '(array(float64, 1d, C))':[0m
[1m       Rejected as the implementation raised a specific error:
         TypingError: Failed in nopython mode pipeline (step: nopython frontend)
       [1m[1m[1mNo implementation of function Function(<intrinsic stub>) found for signature:
        
        >>> stub(array(float64, 1d, C))
        
       There are 2 candidate implementations:
       [1m  - Of which 2 did not match due to:
         Intrinsic in function 'stub': File: numba/core/overload_glue.py: Line 35.
           With argument(s): '(array(float64, 1d, C))':[0m
  

0.6616371686038339


Failed in nopython mode pipeline (step: nopython frontend)
[1m[1m[1mNo implementation of function Function(<built-in function array>) found for signature:
 
 >>> array(array(float64, 1d, C))
 
There are 4 candidate implementations:
[1m      - Of which 4 did not match due to:
      Overload in function '_OverloadWrapper._build.<locals>.ol_generated': File: numba/core/overload_glue.py: Line 131.
        With argument(s): '(array(float64, 1d, C))':[0m
[1m       Rejected as the implementation raised a specific error:
         TypingError: Failed in nopython mode pipeline (step: nopython frontend)
       [1m[1m[1mNo implementation of function Function(<intrinsic stub>) found for signature:
        
        >>> stub(array(float64, 1d, C))
        
       There are 2 candidate implementations:
       [1m  - Of which 2 did not match due to:
         Intrinsic in function 'stub': File: numba/core/overload_glue.py: Line 35.
           With argument(s): '(array(float64, 1d, C))':[0m
  

0.6657786946250365


Failed in nopython mode pipeline (step: nopython frontend)
[1m[1m[1mNo implementation of function Function(<built-in function array>) found for signature:
 
 >>> array(array(float64, 1d, C))
 
There are 4 candidate implementations:
[1m      - Of which 4 did not match due to:
      Overload in function '_OverloadWrapper._build.<locals>.ol_generated': File: numba/core/overload_glue.py: Line 131.
        With argument(s): '(array(float64, 1d, C))':[0m
[1m       Rejected as the implementation raised a specific error:
         TypingError: Failed in nopython mode pipeline (step: nopython frontend)
       [1m[1m[1mNo implementation of function Function(<intrinsic stub>) found for signature:
        
        >>> stub(array(float64, 1d, C))
        
       There are 2 candidate implementations:
       [1m  - Of which 2 did not match due to:
         Intrinsic in function 'stub': File: numba/core/overload_glue.py: Line 35.
           With argument(s): '(array(float64, 1d, C))':[0m
  

0.6706272673450171


Failed in nopython mode pipeline (step: nopython frontend)
[1m[1m[1mNo implementation of function Function(<built-in function array>) found for signature:
 
 >>> array(array(float64, 1d, C))
 
There are 4 candidate implementations:
[1m      - Of which 4 did not match due to:
      Overload in function '_OverloadWrapper._build.<locals>.ol_generated': File: numba/core/overload_glue.py: Line 131.
        With argument(s): '(array(float64, 1d, C))':[0m
[1m       Rejected as the implementation raised a specific error:
         TypingError: Failed in nopython mode pipeline (step: nopython frontend)
       [1m[1m[1mNo implementation of function Function(<intrinsic stub>) found for signature:
        
        >>> stub(array(float64, 1d, C))
        
       There are 2 candidate implementations:
       [1m  - Of which 2 did not match due to:
         Intrinsic in function 'stub': File: numba/core/overload_glue.py: Line 35.
           With argument(s): '(array(float64, 1d, C))':[0m
  

0.6694457213354614
catboost
Scores for all folds: 
[0.66163717 0.66577869 0.67062727 0.66944572]


Mean of scores:
0.6668722129773372


Standard deviation of scores: 
0.0035115826400974083




0.6650135025216873
0.6740535014115837
0.6743513789702983
0.6737243604951957
light gbm
Scores for all folds: 
[0.6650135  0.6740535  0.67435138 0.67372436]


Mean of scores:
0.6717856858496913


Standard deviation of scores: 
0.003916206551661562


In [26]:
X_test.head(10)

Unnamed: 0,grunnkrets_id,chain_name,mall_name,lv1_desc,lv2_desc,lv3_desc,lv4_desc,district_name,municipality_name,area_km2,num_of_stores_grunnkrets,mall_distance,closest_mall_name,avg_rev_big_chain,lv3_avg_rev,lv4_avg_rev,cluster,center_distance
12592,922,345,529,0,15,35,38,1263,249,0.057027,4.0,0.011638,191,1.290239,2.096344,2.096344,2,5.315833
12593,2189,52,529,0,15,35,38,329,299,0.165993,9.0,0.015869,237,1.830112,2.096344,2.096344,9,2.671965
12594,1033,52,451,0,15,35,38,244,249,0.236628,123.0,4.9e-05,484,1.830112,2.096344,2.096344,2,8626.714236
12595,3962,52,529,0,15,35,38,1022,356,0.983436,15.0,0.016599,444,1.830112,2.096344,2.096344,11,13.36437
12596,1078,334,529,0,15,35,38,361,249,0.449502,27.0,0.008053,228,3.397223,2.096344,2.096344,2,1.395687
12597,938,52,529,0,15,35,38,763,249,0.313006,10.0,0.001627,498,1.830112,2.096344,2.096344,2,1106.438239
12598,3915,207,496,0,15,35,38,671,356,0.143946,98.0,0.000242,531,3.188035,2.096344,2.096344,11,2.817447
12599,810,345,529,0,15,35,38,647,249,0.191613,90.0,0.003768,127,1.290239,2.096344,2.096344,2,41.561478
12600,1397,207,453,0,15,35,38,910,181,0.39577,80.0,0.005933,486,3.188035,2.096344,2.096344,7,2.003095
12601,3655,207,531,0,15,35,38,777,404,0.120695,33.0,0.001941,573,3.188035,2.096344,2.096344,28,1.18213


In [27]:
# mean of scores
# 0.6781866272821699 - 11-12-123108 - kaggle: 0.68349

# mean of scores 3- stack
# 6781866272821699 / 0.6839925788085071 / 0.679119033159766 - 11-12-132139 - kaggle: 0.68413

# mean of scores models:
# 0.6976428658736472 - random forest - kaggle: 0.70203
# 0.6857052809131627 - random forest - new hyper params

# Mean of scores:
# 0.6754065055519785 - catboost
# 0.6798965069269716 - light gbm

# Mean of scores: reverted big chain to >1
# 0.6701695186092096 - catboost
# 0.6752403812552107 - light gbm

# Mean of scores: reverted big chain to >=1
# 0.6668722129773372
# 0.6717856858496913


X_test = replace_missing(X_test, "lv3_avg_rev", MEAN_REVENUE)
X_test = replace_missing(X_test, "lv4_avg_rev", MEAN_REVENUE)


catboost_prediction = model_catboost.predict(X_test)
light_gbm_prediction= model_light_gbm.predict(X_test)
#catboost_prediction_3 = model_catboost_3.predict(X_test)

all_predictions= np.array([catboost_prediction, light_gbm_prediction])
for prd in all_predictions:
    print( np.exp(prd[:12]) -1 )
    print(pd.DataFrame(np.exp(prd) -1 ).describe())



final_prediction = np.mean( all_predictions , axis=0)
for i in range(len(final_prediction)):
    if final_prediction[i]<0:
        final_prediction[i] = 0
        
final_prediction = np.exp(final_prediction) - 1

[ 3.57860254  7.18207468  5.50706869  7.12047749 34.02700927  6.16900091
 21.0143839   6.42088054 21.86393185 22.20359843  4.89586579  5.0262228 ]
                 0
count  8577.000000
mean      6.380816
std      10.188587
min       0.452414
25%       2.031146
50%       3.252137
75%       5.367583
max      88.469473
[ 3.94541748  6.68174259  5.65180127  6.84852398 29.0164004   5.75816769
 26.1222001   6.9470074  25.12626229 26.00934688  5.65706121  5.34737836]
                 0
count  8577.000000
mean      6.306610
std       9.929117
min       0.328851
25%       2.053865
50%       3.261790
75%       5.302835
max      75.488876


In [28]:
# generate .csv-submission


submission = pd.DataFrame()
submission['id'] = stores_test_id
submission['predicted'] = np.asarray(final_prediction)

from datetime import datetime
now = datetime.now() # current date and time
timestamp = now.strftime("%Y-%m-%d_%H%M%S")

path = '../predictions'
prefix = 'submission'
suffix = '.csv'

filename = "".join(["-".join([prefix,timestamp]),suffix])

#eda.create_report(submission)
#submission.to_csv("/".join([path,filename]), index=False)

print(submission[:12])
submission.describe()



# same as submission-2022-11-13-175627

                            id  predicted
0   914206820-914239427-717245   3.758477
1   916789157-916823770-824309   6.927963
2     913341082-977479363-2948   5.579037
3    889682582-889697172-28720   6.983343
4   997991699-998006945-417222  31.425063
5   914931487-815162862-756427   5.960554
6     967062979-972338656-6209  23.435190
7   914631734-914748119-740036   6.679440
8    970976361-973961837-23171  23.440726
9    979425031-979584385-54031  24.034257
10  914852625-914864976-744489   5.264913
11  916756097-816761972-821991   5.184716


Unnamed: 0,predicted
count,8577.0
mean,6.332031
std,10.022437
min,0.557473
25%,2.047476
50%,3.246694
75%,5.289865
max,81.426084
