In [481]:
# data handling
import numpy as np
import pandas as pd

# data visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# feature scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# feature selection
from sklearn.feature_selection import RFE

# machine learning algorithms
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn import naive_bayes 
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

# dimensionality reduction with PCA
from sklearn.decomposition import PCA

# accuracy metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [482]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found beneath the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-storm-4/Data-dictionary.xlsx
/kaggle/input/data-storm-4/DataStorm4.0_KaggleProblemStatement.pdf
/kaggle/input/data-storm-4/Store-info.csv
/kaggle/input/data-storm-4/Testing-data.csv
/kaggle/input/data-storm-4/Historical-transaction-data.csv


In [483]:
# Load the three competition tables: transactions, store attributes and the shops to predict
transaction_db = pd.read_csv('/kaggle/input/data-storm-4/Historical-transaction-data.csv')
store_db = pd.read_csv('/kaggle/input/data-storm-4/Store-info.csv')
test_db = pd.read_csv('/kaggle/input/data-storm-4/Testing-data.csv')

## Creating the Master Table of Data based on Transactions and the corresponding Store 

In [484]:
# Attach the store attributes to every transaction. No `on=` is given, so pandas
# joins on the column names the two frames have in common (inner join by default).
table = transaction_db.merge(store_db)

### Deriving additional columns from the Master Data

In [485]:
# Amount of money spent in rupees on a particular item within a transaction
table['amount'] = table['item_price'] * table['quantity_sold']

# Keep only the text before the first "T" of the timestamp, then parse it as a date
date_text = table['transaction_date'].str.split("T", n=1, expand=True)[0]
table['transaction_date'] = pd.to_datetime(date_text, format="%Y-%m-%d")

# Break the parsed date into its components
# (pandas numbers weekdays Monday=0 ... Sunday=6)
txn_date = table['transaction_date'].dt
table['year'] = txn_date.year
table['month'] = txn_date.month
table['date'] = txn_date.day
table['weekday'] = txn_date.weekday

In [486]:
# Most common item description for each distinct price
price_item_modes = table.groupby(['item_price'])['item_description'].agg(pd.Series.mode)

# When a price has no single most-common description the aggregate holds an
# ndarray instead of one string; such prices are ambiguous, so blank them out.
# (`np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.)
price_item_modes = price_item_modes.map(
    lambda mode: np.nan if isinstance(mode, np.ndarray) else mode
)

# Fill missing descriptions with the unambiguous description seen at the same price
table['item_description'] = table['item_description'].fillna(table['item_price'].map(price_item_modes))

In [487]:
# Discard the rows whose item description is still missing
table = table.dropna(subset=['item_description'])

In [488]:
# Split each item description at its last space: the left part is the beverage
# name, the right part is the pack-size token
item_desc_split = table['item_description'].str.rsplit(" ", n=1, expand=True)
table['beverage_description'] = item_desc_split[0]
table['volume_type'] = item_desc_split[1]

# Obtaining the volume of the beverage bottles as millilitres
# regex obtained from https://stackoverflow.com/a/61041274
# Column 0 of `vol_data` is the numeric part, column 1 the unit (ML or L);
# rows whose token does not match stay NaN in both columns.
vol_data = table.volume_type.str.extract(r'(?i)\b(\d+(?:\.\d+)?)\s*(ML|L)\b', expand=True)
vol_data[0] = vol_data[0].astype(float)

# The regex is case-insensitive, so normalise the unit's case before looking up
# how many millilitres one unit represents
ml_per_unit = vol_data[1].str.upper().map({'ML': 1.0, 'L': 1000.0})

table['volume_ml'] = vol_data[0] * ml_per_unit

In [489]:
# Keep only the columns used from here on, in this order
reordered_cols = [
    'month', 'date', 'weekday',
    'invoice_id', 'customer_id',
    'beverage_description', 'volume_ml',
    'item_price', 'quantity_sold', 'amount',
    'shop_id', 'shop_area_sq_ft', 'shop_profile',
]
table = table[reordered_cols]
table

Unnamed: 0,month,date,weekday,invoice_id,customer_id,beverage_description,volume_ml,item_price,quantity_sold,amount,shop_id,shop_area_sq_ft,shop_profile
0,12,11,5,147.0,BGXA,ORANGE BARLEY,1500.0,220,2,440,SHOP008,678,Moderate
1,12,13,0,484.0,VN7V,TONIC PET,500.0,160,2,320,SHOP008,678,Moderate
2,12,13,0,484.0,VN7V,CREAM SODA,1000.0,150,2,300,SHOP008,678,Moderate
3,12,10,4,1000053.0,VT9C,GINGER BEER,1500.0,220,1,220,SHOP008,678,Moderate
4,12,10,4,1000057.0,8QLS,GINGER BEER,1500.0,440,1,440,SHOP008,678,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...
473969,12,9,3,7030409.0,QVA7,SODA,500.0,70,1,70,SHOP072,617,High
473970,12,11,5,7030809.0,HXMG,FIT O ORANGE,200.0,140,2,280,SHOP072,617,High
473971,12,11,5,7030820.0,OUH2,BOTTLED DRINKING WATER,500.0,35,1,35,SHOP072,617,High
473972,10,21,3,,5IJM,GINGER BEER,1500.0,220,1,220,SHOP072,617,High


## Handling Missing Values

There are two features in the original dataset that have missing values. 
1. Invoice Number
2. Item Description (now split as beverage_description & volume_ml)

To impute the missing invoice numbers, we assume that all transactions made with the same customer_id on the same date belong to a single invoice. Since we only need to identify whether transactions belong to the same invoice, it is not necessary to recover the original invoice id. Instead, we create new invoice numbers that continue from the last invoice recorded in the table (i.e. the highest number in the invoice_id column).

In [490]:
# Rows that lack an invoice number. Rows with a missing grouping key are left
# out (they stay without an invoice id), and an explicit copy is taken so the
# assignment below never writes into a slice of `table`.
invoice_keys = ['customer_id', 'month', 'date']
no_inv_db = table.loc[table.invoice_id.isna()].dropna(subset=invoice_keys).copy()

# New invoice numbers start right after the largest recorded invoice id
invoice_i = table.invoice_id.max() + 1

# One synthetic invoice per (customer, month, day); `ngroup` numbers the groups
# 0, 1, 2, ... in sorted key order
no_inv_db['invoice_id'] = invoice_i + no_inv_db.groupby(invoice_keys).ngroup()

# Write the synthetic ids back onto the matching rows of the master table
table.loc[no_inv_db.index, 'invoice_id'] = no_inv_db['invoice_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


It was identified that the shop with id 'SHOP008' has only been operating in December, and hence it will be dropped from the training data.

table = table.loc[table.shop_id != 'SHOP008']
store_db = store_db.loc[store_db.shop_id != 'SHOP008']

In [491]:
# Inspect the transactions recorded for one example shop
table.loc[table['shop_id'].eq('SHOP047')]

Unnamed: 0,month,date,weekday,invoice_id,customer_id,beverage_description,volume_ml,item_price,quantity_sold,amount,shop_id,shop_area_sq_ft,shop_profile
21177,10,17,6,30956.0,4OFQ,SODA,1000.0,110,1,110,SHOP047,528,Moderate
21178,10,20,2,31418.0,KQYP,ORANGE BARLEY,1500.0,220,1,220,SHOP047,528,Moderate
21179,11,1,0,33467.0,WCZT,GINGER BEER,1500.0,220,2,440,SHOP047,528,Moderate
21180,11,6,5,34265.0,KD8T,FIT O MIXED FRUIT,200.0,60,1,60,SHOP047,528,Moderate
21181,11,6,5,34265.0,KD8T,GINGER BEER,1500.0,220,2,440,SHOP047,528,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22864,11,18,3,2026696.0,ULVU,NECTO,500.0,100,1,100,SHOP047,528,Moderate
22865,11,28,6,2027937.0,5C5L,SODA,1000.0,110,2,220,SHOP047,528,Moderate
22866,10,23,5,3011507.0,IN28,LEMONADE,1500.0,220,2,440,SHOP047,528,Moderate
22867,10,23,5,3011514.0,4QJ5,GINGER BEER,400.0,600,7,4200,SHOP047,528,Moderate


## Forming Aggregate Data Regarding each shop

In [492]:
# Work on an independent copy so the per-shop features do not alter `store_db`
store_table = store_db.copy(deep=True)

In [493]:
def add_new_feature(database, col_name, new_series):
    """Add a per-shop feature column to ``database``, matched on ``shop_id``.

    Parameters
    ----------
    database : pandas.DataFrame
        Frame with a ``shop_id`` column. It is modified in place.
    col_name : str
        Name of the column to create (or overwrite).
    new_series : pandas.DataFrame or pandas.Series
        Feature values indexed by shop id. For a DataFrame the first column
        is used.

    Returns
    -------
    pandas.DataFrame
        The same ``database`` object, now carrying ``col_name``.

    Notes
    -----
    Values are looked up by shop id rather than by row position, so the
    result is correct whatever index ``database`` has (for example after rows
    were filtered out). Shops absent from ``new_series`` receive NaN.
    """
    if isinstance(new_series, pd.DataFrame):
        # Only the first column carries the feature values
        values = new_series.iloc[:, 0]
    else:
        values = new_series
    database[col_name] = database['shop_id'].map(values)
    return database

In [494]:
# Shop ids of every store; used below to align the per-shop aggregates to the full shop list
all_shop_index = store_table['shop_id']

In [495]:
# Total sales amount per shop over all transactions
shop_revenue = table.groupby('shop_id', sort=False)['amount'].sum()
tot_sales = shop_revenue.to_frame()
store_table = add_new_feature(store_table, 'tot_sales', tot_sales)

In [496]:
# Sales amount per shop in month 10, aligned to the full shop list
oct_rows = table.loc[table.month == 10]
oct_sales = pd.DataFrame(oct_rows.groupby('shop_id', sort=False)['amount'].sum(), index=all_shop_index)
store_table = add_new_feature(store_table, 'oct_sales', oct_sales)

In [497]:
# Sales amount per shop in month 11, aligned to the full shop list
nov_rows = table.loc[table.month == 11]
nov_sales = pd.DataFrame(nov_rows.groupby('shop_id', sort=False)['amount'].sum(), index=all_shop_index)
store_table = add_new_feature(store_table, 'nov_sales', nov_sales)

In [498]:
# Sales amount per shop in month 12, aligned to the full shop list
dec_rows = table.loc[table.month == 12]
dec_sales = pd.DataFrame(dec_rows.groupby('shop_id', sort=False)['amount'].sum(), index=all_shop_index)
store_table = add_new_feature(store_table, 'dec_sales', dec_sales)

In [499]:
# Number of distinct customers per shop over all transactions
distinct_customers = table.groupby('shop_id', sort=False)['customer_id'].nunique()
tot_customers = pd.DataFrame(distinct_customers, index=all_shop_index).astype(int)
store_table = add_new_feature(store_table, 'tot_customers', tot_customers)

In [500]:
# Distinct customers per shop in month 10, aligned to the full shop list
oct_rows = table.loc[table.month == 10]
oct_customers = pd.DataFrame(oct_rows.groupby('shop_id', sort=False)['customer_id'].nunique(), index=all_shop_index)
store_table = add_new_feature(store_table, 'oct_customers', oct_customers)

In [501]:
# Distinct customers per shop in month 11, aligned to the full shop list
nov_rows = table.loc[table.month == 11]
nov_customers = pd.DataFrame(nov_rows.groupby('shop_id', sort=False)['customer_id'].nunique(), index=all_shop_index)
store_table = add_new_feature(store_table, 'nov_customers', nov_customers)

In [502]:
# Distinct customers per shop in month 12, aligned to the full shop list
dec_rows = table.loc[table.month == 12]
dec_customers = pd.DataFrame(dec_rows.groupby('shop_id', sort=False)['customer_id'].nunique(), index=all_shop_index)
store_table = add_new_feature(store_table, 'dec_customers', dec_customers)

In [503]:
# Number of rows of `table` recorded for each (shop, customer) pair.
# The counts are explicitly named 'customer_id' because the default name of a
# `value_counts` result differs between pandas versions ('count' since 2.0),
# and the filters on this frame read the counts through `.customer_id`.
visit_per_customer = (
    table.groupby('shop_id')['customer_id']
    .value_counts()
    .rename('customer_id')
    .to_frame()
)

# Customers appearing on more than one row at a shop, counted per shop
repeat_customers = visit_per_customer.loc[visit_per_customer.customer_id > 1].groupby('shop_id').count()
store_table = add_new_feature(store_table, 'repeat_customers', repeat_customers)

In [504]:
# Customers appearing on more than 12 rows at a shop, counted per shop and
# aligned to the full shop list (shops with no such customer get NaN)
is_frequent = visit_per_customer['customer_id'] > 12
frequent_customers = pd.DataFrame(
    visit_per_customer.loc[is_frequent].groupby('shop_id').count(),
    index=all_shop_index,
)
store_table = add_new_feature(store_table, 'frequent_customers', frequent_customers)

In [505]:
# Distinct customers per shop who bought a 180 ml item
rows_180 = table.loc[table.volume_ml == 180]
customers_180_bottles = pd.DataFrame(rows_180.groupby('shop_id', sort=False)['customer_id'].nunique(), index=all_shop_index)
store_table = add_new_feature(store_table, 'customers_180_bottles', customers_180_bottles)

In [506]:
# Distinct customers per shop who bought a 200 ml item
rows_200 = table.loc[table.volume_ml == 200]
customers_200_bottles = pd.DataFrame(rows_200.groupby('shop_id', sort=False)['customer_id'].nunique(), index=all_shop_index)
store_table = add_new_feature(store_table, 'customers_200_bottles', customers_200_bottles)

In [507]:
# Distinct customers per shop who bought a 400 ml item
rows_400 = table.loc[table.volume_ml == 400]
customers_400_bottles = pd.DataFrame(rows_400.groupby('shop_id', sort=False)['customer_id'].nunique(), index=all_shop_index)
store_table = add_new_feature(store_table, 'customers_400_bottles', customers_400_bottles)

In [508]:
# Distinct customers per shop who bought a 500 ml item
rows_500 = table.loc[table.volume_ml == 500]
customers_500_bottles = pd.DataFrame(rows_500.groupby('shop_id', sort=False)['customer_id'].nunique(), index=all_shop_index)
store_table = add_new_feature(store_table, 'customers_500_bottles', customers_500_bottles)

In [509]:
# Distinct customers per shop who bought a 1000 ml item
rows_1000 = table.loc[table.volume_ml == 1000]
customers_1000_bottles = pd.DataFrame(rows_1000.groupby('shop_id', sort=False)['customer_id'].nunique(), index=all_shop_index)
store_table = add_new_feature(store_table, 'customers_1000_bottles', customers_1000_bottles)

In [510]:
# Distinct customers per shop who bought a 1500 ml item
rows_1500 = table.loc[table.volume_ml == 1500]
customers_1500_bottles = pd.DataFrame(rows_1500.groupby('shop_id', sort=False)['customer_id'].nunique(), index=all_shop_index)
store_table = add_new_feature(store_table, 'customers_1500_bottles', customers_1500_bottles)

In [511]:
# List the distinct beverage names so they can be grouped into categories
table['beverage_description'].unique()

array(['ORANGE BARLEY', 'TONIC PET', 'CREAM SODA', 'GINGER BEER',
       'CREAM SODA APPLE POP', 'SODA', 'BOTTLED DRINKING WATER',
       'LEMONADE', 'KIK COLA', 'STRAWBERRY MILK', 'SODA PET',
       'FIT O MIXED FRUIT', 'NECTO', 'FIT O ORANGE', 'FIT O MANGO',
       'GINGER BEER SUGAR FREE', 'ORANGE CRUSH', 'CHOCOLATE MILK',
       'TWISTEE APPLE', 'DRY GINGER ALE PET', 'LIME CRUSH JUICE'],
      dtype=object)

In [512]:
# Hand-made grouping of the beverage names into four categories
soft_drinks = [
    'ORANGE BARLEY', 'TONIC PET', 'CREAM SODA', 'GINGER BEER',
    'CREAM SODA APPLE POP', 'SODA', 'LEMONADE', 'KIK COLA', 'SODA PET',
    'NECTO', 'GINGER BEER SUGAR FREE', 'ORANGE CRUSH', 'DRY GINGER ALE PET',
]
water = ['BOTTLED DRINKING WATER']
fruit_drinks = [
    'FIT O MIXED FRUIT', 'FIT O ORANGE', 'FIT O MANGO',
    'TWISTEE APPLE', 'LIME CRUSH JUICE',
]
milk_drinks = ['CHOCOLATE MILK', 'STRAWBERRY MILK']

In [513]:
# Sales amount per shop from milk drinks, aligned to the full shop list
milk_sale_data = table['beverage_description'].isin(milk_drinks)
milk_rows = table.loc[milk_sale_data]
milk_sales = pd.DataFrame(milk_rows.groupby('shop_id', sort=False)['amount'].sum(), index=all_shop_index)
store_table = add_new_feature(store_table, 'milk_sales', milk_sales)

In [514]:
# Sales amount per shop from soft drinks, aligned to the full shop list
soft_d_sale_data = table['beverage_description'].isin(soft_drinks)
soft_d_rows = table.loc[soft_d_sale_data]
soft_d_sales = pd.DataFrame(soft_d_rows.groupby('shop_id', sort=False)['amount'].sum(), index=all_shop_index)
store_table = add_new_feature(store_table, 'soft_d_sales', soft_d_sales)

In [515]:
# Sales amount per shop from bottled water, aligned to the full shop list
water_sale_data = table['beverage_description'].isin(water)
water_rows = table.loc[water_sale_data]
water_sales = pd.DataFrame(water_rows.groupby('shop_id', sort=False)['amount'].sum(), index=all_shop_index)
store_table = add_new_feature(store_table, 'water_sales', water_sales)

In [516]:
# Sales amount per shop from fruit drinks, aligned to the full shop list
fruit_d_sale_data = table['beverage_description'].isin(fruit_drinks)
fruit_d_rows = table.loc[fruit_d_sale_data]
fruit_d_sales = pd.DataFrame(fruit_d_rows.groupby('shop_id', sort=False)['amount'].sum(), index=all_shop_index)
store_table = add_new_feature(store_table, 'fruit_d_sales', fruit_d_sales)

In [517]:
# Total amount of each invoice within each shop
invoice_data = table.groupby(['shop_id', 'invoice_id'])['amount'].sum().to_frame()

# Mean invoice total per shop, rounded to two decimals
avg_bill_value = invoice_data.groupby('shop_id')['amount'].mean().round(2).to_frame()
store_table = add_new_feature(store_table, 'avg_bill_value', avg_bill_value)

In [518]:
# Total sales amount for every (weekday, shop) pair
weekday_avg_sales = table.groupby(['weekday', 'shop_id'])['amount'].sum()

# weekday 5 and 6 are Saturday and Sunday in pandas' Monday=0 numbering
saturday_avg_sales = weekday_avg_sales[5].to_frame()
sunday_avg_sales = weekday_avg_sales[6].to_frame()

# Combined weekend total divided by 18
# NOTE(review): 18 is presumably the number of weekend days in the data period - confirm
weekend_total = saturday_avg_sales['amount'] + sunday_avg_sales['amount']
weekend_avg_sales = (weekend_total / 18).round(2).to_frame()
store_table = add_new_feature(store_table, 'weekend_avg_sales', weekend_avg_sales)

In [519]:
# Distinct customers seen by each shop on each (month, day)
daily_customers = table.groupby(['shop_id', 'month', 'date'])['customer_id'].nunique()

# Mean of those daily counts per shop, rounded to a whole number
customer_per_day = (
    daily_customers.groupby(level='shop_id')
    .mean()
    .round(0)
    .to_frame()
    .astype(int)
)
store_table = add_new_feature(store_table, 'customer_per_day', customer_per_day)

In [520]:
# Show the last 24 shops of the feature table
store_table.iloc[-24:]

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile,tot_sales,oct_sales,nov_sales,dec_sales,tot_customers,oct_customers,nov_customers,...,customers_500_bottles,customers_1000_bottles,customers_1500_bottles,milk_sales,soft_d_sales,water_sales,fruit_d_sales,avg_bill_value,weekend_avg_sales,customer_per_day
100,SHOP046,545,,2607865,556485.0,1170390.0,880990,1736,679.0,855.0,...,608,389,935,34640,1743320,578515,251390,944.54,32302.22,44
101,SHOP024,676,,1308795,335055.0,601275.0,372465,1641,575.0,850.0,...,559,337,884,22880,1075760,63525,146630,565.35,22019.72,37
102,SHOP023,617,,2553330,916485.0,990420.0,646425,2227,799.0,1108.0,...,645,475,1243,65750,1759360,185710,542510,802.68,50219.17,51
103,SHOP097,310,,2375735,610460.0,1057700.0,707575,1667,600.0,809.0,...,683,365,785,221930,1660760,160895,332150,962.23,38967.78,40
104,SHOP044,715,,1251090,348525.0,484555.0,418010,1550,546.0,735.0,...,545,409,713,37190,1087740,9870,116290,598.89,28303.06,34
105,SHOP030,530,,2754185,1003750.0,883125.0,867310,1636,696.0,805.0,...,534,397,887,75365,1948790,347690,382340,989.65,41864.72,44
106,SHOP038,411,,1444025,407920.0,502210.0,533895,1781,681.0,855.0,...,544,339,980,27950,1104080,98035,213960,553.48,30876.94,42
107,SHOP029,597,,1962105,489030.0,857800.0,615275,2052,601.0,945.0,...,631,529,1059,55100,1363780,203245,339980,674.96,46140.28,47
108,SHOP096,676,,2600915,681745.0,1026630.0,892540,2497,911.0,1164.0,...,772,457,1480,29175,2190780,105350,275610,692.84,48126.39,60
109,SHOP092,617,,1961240,504345.0,968280.0,488615,2247,786.0,1057.0,...,748,499,1108,51300,1375610,51870,482460,661.24,35130.83,48


In [521]:
# Exclude SHOP008 from the per-shop feature table
store_table = store_table[store_table['shop_id'].ne("SHOP008")]

In [522]:
# Labelled shops are the ones with a known profile. Selecting on the label
# itself, rather than on a hard-coded row-index boundary, keeps the split
# correct if rows are added, removed or reordered.
store_labelled = store_table.loc[store_table.shop_profile.notna()]
store_labelled

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile,tot_sales,oct_sales,nov_sales,dec_sales,tot_customers,oct_customers,nov_customers,...,customers_500_bottles,customers_1000_bottles,customers_1500_bottles,milk_sales,soft_d_sales,water_sales,fruit_d_sales,avg_bill_value,weekend_avg_sales,customer_per_day
0,SHOP047,528,Moderate,838280,307505.0,339620.0,191155,928,370.0,443.0,...,301,238,422,34040,638470,45010,120760,641.38,17221.94,21
1,SHOP009,676,High,1966710,586745.0,706010.0,673955,2498,898.0,1169.0,...,1000,490,1196,41055,1505660,64575,355420,555.57,36620.28,57
2,SHOP083,676,Low,1691985,556160.0,658520.0,477305,1900,697.0,929.0,...,596,503,956,40720,1347880,74305,229080,617.51,34137.50,44
3,SHOP117,676,Low,2325980,614355.0,933530.0,778095,2037,739.0,1029.0,...,696,635,1017,28705,1892330,54775,350170,730.75,50193.33,51
4,SHOP042,676,Low,1340215,390960.0,528585.0,420670,1841,641.0,862.0,...,636,453,881,38145,1069130,48020,184920,530.99,29543.61,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,SHOP124,606,High,2967190,706165.0,1366575.0,894450,2281,817.0,1130.0,...,840,608,1078,96275,2148980,287945,433990,898.33,48450.28,53
96,SHOP012,336,Low,1102670,231270.0,420940.0,450460,1262,387.0,619.0,...,442,213,619,36540,782780,160300,123050,586.84,12960.56,30
97,SHOP093,705,High,1726515,820290.0,531640.0,374585,1493,493.0,650.0,...,536,363,605,27680,779370,541695,377770,922.28,33358.61,30
98,SHOP004,516,Low,1571700,408305.0,587295.0,576100,1744,634.0,861.0,...,498,436,914,44645,1282880,31185,212990,590.64,33964.44,43


In [523]:
# Shops to predict are the ones without a profile. Selecting on the missing
# label, rather than on a hard-coded row-index boundary, keeps the split
# correct if rows are added, removed or reordered.
store_test = store_table.loc[store_table.shop_profile.isna()]
store_test

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile,tot_sales,oct_sales,nov_sales,dec_sales,tot_customers,oct_customers,nov_customers,...,customers_500_bottles,customers_1000_bottles,customers_1500_bottles,milk_sales,soft_d_sales,water_sales,fruit_d_sales,avg_bill_value,weekend_avg_sales,customer_per_day
100,SHOP046,545,,2607865,556485.0,1170390.0,880990,1736,679.0,855.0,...,608,389,935,34640,1743320,578515,251390,944.54,32302.22,44
101,SHOP024,676,,1308795,335055.0,601275.0,372465,1641,575.0,850.0,...,559,337,884,22880,1075760,63525,146630,565.35,22019.72,37
102,SHOP023,617,,2553330,916485.0,990420.0,646425,2227,799.0,1108.0,...,645,475,1243,65750,1759360,185710,542510,802.68,50219.17,51
103,SHOP097,310,,2375735,610460.0,1057700.0,707575,1667,600.0,809.0,...,683,365,785,221930,1660760,160895,332150,962.23,38967.78,40
104,SHOP044,715,,1251090,348525.0,484555.0,418010,1550,546.0,735.0,...,545,409,713,37190,1087740,9870,116290,598.89,28303.06,34
105,SHOP030,530,,2754185,1003750.0,883125.0,867310,1636,696.0,805.0,...,534,397,887,75365,1948790,347690,382340,989.65,41864.72,44
106,SHOP038,411,,1444025,407920.0,502210.0,533895,1781,681.0,855.0,...,544,339,980,27950,1104080,98035,213960,553.48,30876.94,42
107,SHOP029,597,,1962105,489030.0,857800.0,615275,2052,601.0,945.0,...,631,529,1059,55100,1363780,203245,339980,674.96,46140.28,47
108,SHOP096,676,,2600915,681745.0,1026630.0,892540,2497,911.0,1164.0,...,772,457,1480,29175,2190780,105350,275610,692.84,48126.39,60
109,SHOP092,617,,1961240,504345.0,968280.0,488615,2247,786.0,1057.0,...,748,499,1108,51300,1375610,51870,482460,661.24,35130.83,48


## Split to train and validation

In [524]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the labelled shops for validation; the fixed seed makes the split repeatable
store_train, store_val = train_test_split(
    store_labelled,
    test_size=0.25,
    random_state=1,
)

In [525]:
# Separate the target column (shop_profile) from the predictors
store_train_y = store_train['shop_profile']
store_train_x = store_train.drop(columns='shop_profile')

store_val_y = store_val['shop_profile']
store_val_x = store_val.drop(columns='shop_profile')

# The shops to predict have no usable target, so only the predictors are kept
store_test_x = store_test.drop(columns='shop_profile')

In [526]:
# Move shop_id out of the columns and into the index so that only numeric predictors remain
store_train_x = store_train_x.set_index('shop_id')
store_val_x = store_val_x.set_index('shop_id')
store_test_x = store_test_x.set_index('shop_id')

## Feature Scaling

In [527]:
# Remember the column names: the scaler returns plain arrays
data_cols = store_train_x.columns

# Learn the scaling parameters from the training shops only, then apply them to all three sets
std_scaler = StandardScaler().fit(store_train_x)
train_x = std_scaler.transform(store_train_x)
val_x = std_scaler.transform(store_val_x)
test_x = std_scaler.transform(store_test_x)

In [528]:
# Wrap the scaled arrays back into DataFrames carrying the original column names
# (rows get a fresh 0..n-1 index; the shop_id index is not carried over)
train_x, val_x, test_x = (
    pd.DataFrame(scaled, columns=data_cols) for scaled in (train_x, val_x, test_x)
)

In [529]:
# Encode the profile labels as integers: Low -> 0, Moderate -> 1, High -> 2
store_train_y = (
    store_train_y
    .replace('Low', 0)
    .replace('Moderate', 1)
    .replace('High', 2)
)

store_val_y = (
    store_val_y
    .replace('Low', 0)
    .replace('Moderate', 1)
    .replace('High', 2)
)

## Feature Selection

In [530]:
# Linear-kernel SVM used only as the estimator that ranks features during recursive feature elimination
feature_sel_model = svm.SVC(kernel='linear')

In [531]:
# Recursive feature elimination keeping a 0.6 fraction of the features
recur_fe = RFE(
    estimator=feature_sel_model,
    n_features_to_select=0.6,
)
features = recur_fe.fit(train_x, store_train_y)

# Boolean mask of the kept features and the rank given to every feature
rank_features_sel = features.ranking_
bool_features_sel = features.support_
print("Number of features selected = " + str(features.n_features_))

Number of features selected = 14


In [532]:
# Names of the columns that recursive feature elimination kept
list_features_sel = [
    name for name, keep in zip(train_x.columns, bool_features_sel) if keep
]
list_features_sel

['shop_area_sq_ft',
 'oct_sales',
 'nov_sales',
 'dec_sales',
 'oct_customers',
 'dec_customers',
 'frequent_customers',
 'customers_180_bottles',
 'customers_500_bottles',
 'customers_1000_bottles',
 'customers_1500_bottles',
 'water_sales',
 'fruit_d_sales',
 'avg_bill_value']

In [533]:
# Restrict all three feature sets to the selected columns
train_x_sel = train_x[list_features_sel]
val_x_sel = val_x[list_features_sel]
test_x_sel = test_x[list_features_sel]

## Model Training

In [534]:
# RBF-kernel support vector classifier used as the final shop-profile model
model = svm.SVC(kernel='rbf')

In [535]:
# Fit on the training shops, then predict on both the training and the validation shops
model.fit(train_x_sel, store_train_y)
train_pred_y_for = model.predict(train_x_sel).round()
train_pred_y_val = model.predict(val_x_sel).round()

# Micro-averaged F1: first line is the training score, second line the validation score
train_f1 = f1_score(store_train_y, train_pred_y_for, average='micro')
val_f1 = f1_score(store_val_y, train_pred_y_val, average='micro')
print(train_f1)
print(val_f1)

0.8648648648648649
0.64


## Predicting

In [536]:
# Predict the shops to be submitted and turn the integer codes back into
# profile names (0 -> Low, 1 -> Moderate, 2 -> High)
predicted_codes = model.predict(test_x_sel).round()
test_pred_y = (
    pd.DataFrame({'shop_profile': predicted_codes})
    .replace(0, 'Low')
    .replace(1, 'Moderate')
    .replace(2, 'High')
)
test_pred_y

Unnamed: 0,shop_profile
0,High
1,Low
2,High
3,Moderate
4,Low
5,Moderate
6,Low
7,Low
8,High
9,Low


In [537]:
# Remove the shop_profile column of the test file, leaving the shop ids
test_db = test_db.drop('shop_profile', axis=1)
test_db

Unnamed: 0,shop_id
0,SHOP046
1,SHOP024
2,SHOP023
3,SHOP097
4,SHOP044
5,SHOP030
6,SHOP038
7,SHOP029
8,SHOP096
9,SHOP092


In [538]:
# The predictions are in the row order of `store_test_x`, whose index holds
# the shop ids. Pair each prediction with its shop id and join on that key, so
# the result stays correct even if `test_db` lists the shops in another order.
predicted_profiles = pd.DataFrame({
    'shop_id': store_test_x.index,
    'shop_profile': test_pred_y['shop_profile'].to_numpy(),
})
submission = test_db.merge(predicted_profiles, on='shop_id', how='left')
submission

Unnamed: 0,shop_id,shop_profile
0,SHOP046,High
1,SHOP024,Low
2,SHOP023,High
3,SHOP097,Moderate
4,SHOP044,Low
5,SHOP030,Moderate
6,SHOP038,Low
7,SHOP029,Low
8,SHOP096,High
9,SHOP092,Low


In [539]:
# How many shops were assigned to each predicted profile
submission['shop_profile'].value_counts()

High        10
Low         10
Moderate     4
Name: shop_profile, dtype: int64

In [540]:
# Write the predictions to a CSV file without the row index
output_path = 'submission2.csv'
submission.to_csv(output_path, index=False)