In [1]:
# Load required packages
import os
import oci
import pandas as pd
from io import StringIO, BytesIO
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline 
import matplotlib.pyplot as plt

In [2]:
# Connect to Oracle Cloud Infrastructure (OCI) Object Storage.
# The SDK configuration is read from the default config file location.
config = oci.config.from_file() 
# NOTE(review): `identity` is not referenced again in the visible notebook.
identity = oci.identity.IdentityClient(config)
object_storage = oci.object_storage.ObjectStorageClient(config)
# Object Storage namespace; passed to every get_object call that follows.
namespace = object_storage.get_namespace().data

In [3]:
# Bucket that holds the data files downloaded in this notebook.
bucket = "Hackathon_Data_Team1"
# Fetch X_train.csv and parse the downloaded bytes into a DataFrame.
obj = object_storage.get_object(namespace, bucket, 'X_train.csv')
X_train1 = pd.read_csv(BytesIO(obj.data.content))

In [4]:
# Fetch X_test.csv from the same bucket and parse it into a DataFrame.
# `obj` is reused for each download, so it only ever holds the latest response.
obj = object_storage.get_object(namespace, bucket, 'X_test.csv')
X_test1 = pd.read_csv(BytesIO(obj.data.content))

In [5]:
# Fetch y_train.csv (presumably the training target — confirm against the data
# description) and parse it into a DataFrame.
obj = object_storage.get_object(namespace, bucket, 'y_train.csv')
y_train1 = pd.read_csv(BytesIO(obj.data.content))

In [6]:
# Fetch the 2018 vehicle registration file; it is aggregated per ZIP_CODE and
# joined onto the sales rows in later cells.
obj = object_storage.get_object(namespace, bucket, 'vehicle_registration_data_2018.csv')
vehicle = pd.read_csv(BytesIO(obj.data.content))

In [7]:
# Dropping unwanted columns: the first two columns (selected by position) of
# each downloaded frame.
# `axis=1` is passed by keyword because the positional form `drop(labels, 1)`
# was deprecated in pandas 1.3 and removed in pandas 2.0.
X_train = X_train1.drop(X_train1.columns[[0, 1]], axis=1)
X_test = X_test1.drop(X_test1.columns[[0, 1]], axis=1)
y_train = y_train1.drop(y_train1.columns[[0, 1]], axis=1)

In [8]:
# Treat +/- infinity as missing data: both are swapped for NaN in the train
# and test feature frames.
X_train = X_train.replace(to_replace=[np.inf, -np.inf], value=np.nan)
X_test = X_test.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [9]:
# Concatenating train and test data to perform label encoding and feature scaling
# Each row is tagged with its origin first: type = 1 for train, 0 for test.
# Both assignments add the column to the existing frames in place.
X_train['type'] = 1
X_test['type'] = 0
# Train rows come first. No ignore_index is passed, so the original index
# labels of both frames are kept and may repeat in `total`.
total=pd.concat([X_train, X_test], axis=0, sort=False)

In [10]:
# Per-ZIP summary of the vehicle registration data, one row per ZIP_CODE:
#   count        -> number of registration rows for that ZIP code
#   mean_tot_veh -> mean of TOTAL_VEHICLE_COUNT over those rows
vehicle_group = (
    vehicle[['ZIP_CODE', 'TOTAL_VEHICLE_COUNT']]
    .groupby(['ZIP_CODE'])
    .agg({'ZIP_CODE': 'size', 'TOTAL_VEHICLE_COUNT': 'mean'})
    .rename(columns={'ZIP_CODE': 'count', 'TOTAL_VEHICLE_COUNT': 'mean_tot_veh'})
    .reset_index()
)

In [11]:
# Merging sales data with vehicle registration data: the per-ZIP statistics
# are matched on the top customer's ZIP code. A left join keeps every sales
# row, leaving NaN where no registration data exists for that ZIP.
mergerd = total.merge(
    vehicle_group,
    how='left',
    left_on='Monthly_Top_1_Customer_Zip',
    right_on='ZIP_CODE',
)

In [12]:
# Display the merged frame to check the columns added by the join
# (ZIP_CODE, count, mean_tot_veh).
mergerd

Unnamed: 0,DC_ZIPCODE,Invoice_Year,Invoice_Week,CATEGORY,TIER,SPEED_RATING_CODE,RIM_DIAMETER_SIZE_CODE,WIDTH,HEIGHT,AVG_UNIT_WEIGHT,...,Monthly_Top_5_Customer_Total_Sales,Monthly_Top_6_Customer_Total_Sales,Monthly_Top_7_Customer_Total_Sales,Monthly_Top_8_Customer_Total_Sales,Monthly_Top_9_Customer_Total_Sales,Monthly_Top_10_Customer_Total_Sales,type,ZIP_CODE,count,mean_tot_veh
0,11717,2016,44,Passenger Car / Mini-Van,Tier 3,S,13.0,6.89,22.68,14.90,...,690830.0,258324.0,235820.0,160756.0,183788.0,114146.0,1,11101,21,10.091429
1,11717,2017,10,Passenger Car / Mini-Van,Tier 3,S,13.0,6.89,22.68,14.90,...,501052.0,252610.0,107190.0,272140.0,132466.0,127240.0,1,11101,21,10.091429
2,11717,2017,13,Passenger Car / Mini-Van,Tier 3,S,13.0,6.89,22.68,14.90,...,501052.0,252610.0,107190.0,272140.0,132466.0,127240.0,1,11101,21,10.091429
3,11717,2017,19,Passenger Car / Mini-Van,Tier 3,H,13.0,6.89,22.64,13.19,...,217224.0,187662.0,117520.0,50656.0,55520.0,86772.0,1,11520,23,7.383565
4,11717,2017,25,Passenger Car / Mini-Van,Tier 3,S,13.0,6.89,22.68,14.90,...,378726.0,525926.0,246766.0,155972.0,83740.0,73248.0,1,11101,21,10.091429
5,11717,2016,29,Passenger Car / Mini-Van,Tier 2,T,13.0,6.97,22.68,12.80,...,214580.0,179086.0,135796.0,226266.0,128360.0,143966.0,1,11520,23,7.383565
6,11717,2016,30,Passenger Car / Mini-Van,Tier 2,T,13.0,6.97,22.68,12.80,...,214580.0,179086.0,135796.0,226266.0,128360.0,143966.0,1,11520,23,7.383565
7,11717,2016,36,Passenger Car / Mini-Van,Tier 2,T,13.0,6.97,22.68,12.80,...,562312.0,224766.0,199210.0,172620.0,48782.0,107648.0,1,11746,28,10.043143
8,11717,2016,41,Passenger Car / Mini-Van,Tier 2,T,13.0,6.97,22.68,12.80,...,690830.0,258324.0,235820.0,160756.0,183788.0,114146.0,1,11101,21,10.091429
9,11717,2017,8,Passenger Car / Mini-Van,Tier 2,T,13.0,6.97,22.68,12.80,...,307034.0,144974.0,187884.0,620762.0,100636.0,42290.0,1,11520,23,7.383565


In [13]:
# Identifying numeric columns to perform Standard scaler
# A column is selected when its dtype is a sub-dtype of np.number.
num_cols = mergerd.columns[mergerd.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
# Show the selected column names.
num_cols

Index(['DC_ZIPCODE', 'Invoice_Year', 'Invoice_Week', 'RIM_DIAMETER_SIZE_CODE',
       'WIDTH', 'HEIGHT', 'AVG_UNIT_WEIGHT', 'Invoice_Month', 'SELLING_PRICE',
       'Monthly_Top_1_Customer_Zip', 'Monthly_Top_2_Customer_Zip',
       'Monthly_Top_3_Customer_Zip', 'Monthly_Top_4_Customer_Zip',
       'Monthly_Top_5_Customer_Zip', 'Monthly_Top_6_Customer_Zip',
       'Monthly_Top_7_Customer_Zip', 'Monthly_Top_8_Customer_Zip',
       'Monthly_Top_9_Customer_Zip', 'Monthly_Top_10_Customer_Zip',
       'Monthly_Top_1_Customer_Total_Sales',
       'Monthly_Top_2_Customer_Total_Sales',
       'Monthly_Top_3_Customer_Total_Sales',
       'Monthly_Top_4_Customer_Total_Sales',
       'Monthly_Top_5_Customer_Total_Sales',
       'Monthly_Top_6_Customer_Total_Sales',
       'Monthly_Top_7_Customer_Total_Sales',
       'Monthly_Top_8_Customer_Total_Sales',
       'Monthly_Top_9_Customer_Total_Sales',
       'Monthly_Top_10_Customer_Total_Sales', 'type', 'ZIP_CODE', 'count',
       'mean_tot_veh'],
  

In [14]:
# Leave the identifier-like columns (DC_ZIPCODE, Invoice_Year, Invoice_Week)
# out of the scaling step. They are removed by name instead of with
# `num_cols[3:]`, so the result does not depend on column order and
# re-running the cell does not strip three more columns each time.
num_cols = num_cols.drop(['DC_ZIPCODE', 'Invoice_Year', 'Invoice_Week'], errors='ignore')

In [15]:
# Feature scaling 
# The selected numeric columns are standardised and overwritten in place, so
# re-running this cell scales values that have already been scaled.
# The scaler is fitted on the combined train + test rows held in `mergerd`.
# NOTE(review): `num_cols` still contains 'type', so the train/test flag is
# scaled as well and no longer holds 0/1 afterwards.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
mergerd[num_cols] = scaler.fit_transform(mergerd[num_cols])
# Display the frame after scaling.
mergerd

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,DC_ZIPCODE,Invoice_Year,Invoice_Week,CATEGORY,TIER,SPEED_RATING_CODE,RIM_DIAMETER_SIZE_CODE,WIDTH,HEIGHT,AVG_UNIT_WEIGHT,...,Monthly_Top_5_Customer_Total_Sales,Monthly_Top_6_Customer_Total_Sales,Monthly_Top_7_Customer_Total_Sales,Monthly_Top_8_Customer_Total_Sales,Monthly_Top_9_Customer_Total_Sales,Monthly_Top_10_Customer_Total_Sales,type,ZIP_CODE,count,mean_tot_veh
0,11717,2016,44,Passenger Car / Mini-Van,Tier 3,S,-0.659080,-2.283355,-2.066499,-1.242958,...,6.934614,2.034821,2.049241,0.718820,1.554059,0.398333,0.475477,-1.596326,-0.812226,-0.042215
1,11717,2017,10,Passenger Car / Mini-Van,Tier 3,S,-0.659080,-2.283355,-2.066499,-1.242958,...,4.551490,1.947271,-0.149183,2.323610,0.590029,0.657651,0.475477,-1.596326,-0.812226,-0.042215
2,11717,2017,13,Passenger Car / Mini-Van,Tier 3,S,-0.659080,-2.283355,-2.066499,-1.242958,...,4.551490,1.947271,-0.149183,2.323610,0.590029,0.657651,0.475477,-1.596326,-0.812226,-0.042215
3,11717,2017,19,Passenger Car / Mini-Van,Tier 3,H,-0.659080,-2.283355,-2.080409,-1.362126,...,0.987338,0.952137,0.027368,-0.867470,-0.855322,-0.143791,0.475477,-1.580920,-0.375882,-0.576771
4,11717,2017,25,Passenger Car / Mini-Van,Tier 3,S,-0.659080,-2.283355,-2.066499,-1.242958,...,3.015389,6.135020,2.236320,0.649894,-0.325239,-0.411625,0.475477,-1.596326,-0.812226,-0.042215
5,11717,2016,29,Passenger Car / Mini-Van,Tier 2,T,-0.659080,-2.216248,-2.066499,-1.389304,...,0.954136,0.820736,0.339724,1.662670,0.512902,0.988898,0.475477,-1.580920,-0.375882,-0.576771
6,11717,2016,30,Passenger Car / Mini-Van,Tier 2,T,-0.659080,-2.216248,-2.066499,-1.389304,...,0.954136,0.820736,0.339724,1.662670,0.512902,0.988898,0.475477,-1.580920,-0.375882,-0.576771
7,11717,2016,36,Passenger Car / Mini-Van,Tier 2,T,-0.659080,-2.216248,-2.066499,-1.389304,...,5.320758,1.520645,1.423537,0.889754,-0.981888,0.269644,0.475477,-1.572611,0.714977,-0.051747
8,11717,2016,41,Passenger Car / Mini-Van,Tier 2,T,-0.659080,-2.216248,-2.066499,-1.389304,...,6.934614,2.034821,2.049241,0.718820,1.554059,0.398333,0.475477,-1.596326,-0.812226,-0.042215
9,11717,2017,8,Passenger Car / Mini-Van,Tier 2,T,-0.659080,-2.216248,-2.066499,-1.389304,...,2.115121,0.298072,1.229964,7.346460,-0.007865,-1.024727,0.475477,-1.580920,-0.375882,-0.576771


In [16]:
# Label encoding categorical variables
from sklearn.preprocessing import LabelEncoder

# Columns replaced by integer labels: the string categoricals followed by the
# identifier-like ZIP / year columns, in the order they are encoded.
label_cols = (
    ['CATEGORY', 'TIER', 'SPEED_RATING_CODE', 'DC_ZIPCODE', 'Invoice_Year']
    + ['Monthly_Top_{}_Customer_Zip'.format(rank) for rank in range(1, 11)]
)

# A single encoder instance is re-fitted for every column, so once the loop
# finishes `lb` only remembers the classes of the last column.
lb = LabelEncoder()
for col in label_cols:
    mergerd[col] = lb.fit_transform(mergerd[col])

In [17]:
# Filling missing data with median value (computed per column).
# `numeric_only=True` limits the median to numeric columns: pandas >= 2.0
# raises a TypeError when asked for the median of a non-numeric column, while
# older versions silently skipped such columns.
mergerd = mergerd.fillna(mergerd.median(numeric_only=True))

In [18]:
# Dividing train and test data to train the model and test the model. X_test is going to be the same data as given.
# The train rows were concatenated first, and the left merge adds no rows
# because `vehicle_group` has one row per ZIP_CODE. The split point is therefore
# the row count of the original training frame, not a hard-coded 2019036.
n_train = len(X_train1)
assert len(mergerd) == n_train + len(X_test1), \
    "row count of the combined frame no longer matches train + test"
X_train = mergerd.iloc[:n_train, :]
X_test = mergerd.iloc[n_train:, :]

# Dropping unwanted columns: the train/test flag, the join key from the
# vehicle data, and Invoice_Week.
# `axis=1` is passed by keyword; the positional form `drop(labels, 1)` was
# removed in pandas 2.0.
unwanted_cols = ['type', 'ZIP_CODE', 'Invoice_Week']
X_train = X_train.drop(unwanted_cols, axis=1)
X_test = X_test.drop(unwanted_cols, axis=1)


In [19]:
# Installing required packages
!pip install lightgbm
import scipy
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb



In [21]:
# Model building
# Wrap the training features and target in a LightGBM Dataset.
d_train = lgb.Dataset(X_train, label=y_train)
# Training configuration: gradient-boosted trees with a regression objective,
# reporting both the l2 and l1 metrics.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 250,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Train for a fixed 1500 boosting rounds; no validation set or early stopping
# is passed here.
model = lgb.train(params,
                d_train,
                num_boost_round=1500)
#predict test data values using trained model
preds = model.predict(X_test)

In [22]:
# Sanity check: number of predictions produced for the test rows.
len(preds)

456460

In [23]:
# Predictions
# Display the array returned by model.predict for a quick look at the values.
preds

array([5.74605479, 5.52666264, 5.7259739 , ..., 3.68255377, 3.58444345,
       3.58444345])

In [24]:
import pandas as pd 
import numpy as np 
from sklearn.metrics import mean_squared_error
import os

In [25]:
def submit_score(predictions, team_key):
    """
    Submit your predictions for scoring

    Args:
        predictions (DataFrame): Pandas DataFrame containing the following required
            columns:
                1. idx (int) - The unique identifier for each observation
                2. predictions (float) - Your predicted value
        team_key (str): Your team's unique identifier

    Returns:
        The ``score`` field of the service's JSON response when the submission
        is accepted (HTTP 200), or ``None`` when the service answers with
        HTTP 404 (its ``error`` message is printed). Your best score will be
        reflected on the leaderboard.

    Raises:
        ValueError: If the service answers with any status other than 200 or 404.
    """

    import requests
    import json
    import numpy

    def default(o):
        # json cannot serialise NumPy scalars. Convert every NumPy integer and
        # floating width (not just int64) to the matching built-in type.
        if isinstance(o, numpy.integer):
            return int(o)
        if isinstance(o, numpy.floating):
            return float(o)
        raise TypeError(
            'Object of type {} is not JSON serializable'.format(type(o).__name__)
        )

    API_ENDPOINT = "http://coe-hackathon-dot-atd-fn-anacoe-dev.appspot.com/submitscore"
    # Only the two required columns are sent, one JSON record per row.
    payload = {
        "team_key": team_key,
        "data": predictions.loc[:, ["idx", "predictions"]].to_dict(orient="records")
    }
    resp = requests.post(
        API_ENDPOINT,
        data=json.dumps(payload, default=default),
        headers={'Content-Type': 'application/json'}
    )

    if resp.status_code == 404:
        # The service reports a 404 with a JSON body carrying an 'error' field.
        print(resp.json()['error'])
        return None

    if resp.status_code != 200:
        raise ValueError('There was an error processing your request: '
                         '\n{}'.format(resp.text))

    score = resp.json()['score']
    print('Submission successful! Your score was \n{}'.format(score))
    return score

In [26]:
# Team identifier sent with every submission. It is read from the TEAM_KEY
# environment variable when that is set, so the real key does not have to be
# saved in the notebook; otherwise the placeholder value is used.
teamkey = os.environ.get('TEAM_KEY', 'teamkey')

In [27]:
# Build the submission frame: one row per prediction, with `idx` numbering the
# rows from 0. The size is taken from `preds` itself rather than a hard-coded
# 456460, so the cell stays correct if the test set changes.
import pandas as pd
testdatasize = len(preds)
testX = pd.DataFrame({'idx': range(testdatasize), 'predictions': preds})

In [28]:
import time
# Time the round trip to the scoring service; `a` keeps the returned score.
s = time.time()
a = submit_score(testX.loc[:, ['idx', 'predictions']], teamkey)
print('submission time elapsed: {}'.format(time.time() - s))

Submission successful! Your score was 
6.4942207800557386
submission time elapsed: 9.601505279541016


In [29]:
# Show the value returned by submit_score for the latest submission.
a

'6.4942207800557386'

In [30]:
# Downloading test results
# Writes the idx / predictions frame to best.csv in the working directory,
# without the DataFrame index column.
testX.to_csv("best.csv",index=False)