# Importing packages

In [5]:
import numpy as np
import pandas as pd

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Importing dataset

In [7]:
read_dataset_path = "data/data.csv"
save_dataset_path = "data/preprocessed.csv"

In [8]:
dataset = pd.read_csv(read_dataset_path)

# Visualizing dataset

In [9]:
dataset.columns

Index(['Town', 'Country', 'Store ID', 'Manager name', 'Staff', 'Floor Space',
       'Window', 'Car park', 'Demographic score', 'Location',
       '40min population', '30 min population', '20 min population',
       '10 min population', 'Store age', 'Clearance space',
       'Competition number', 'Competition score', 'Performance'],
      dtype='object')

In [10]:
dataset.head()

Unnamed: 0,Town,Country,Store ID,Manager name,Staff,Floor Space,Window,Car park,Demographic score,Location,40min population,30 min population,20 min population,10 min population,Store age,Clearance space,Competition number,Competition score,Performance
0,Swinton (Greater Manchester),UK,1437,Roger,9,18526,121,Yes,10,Retail Park,1288374,1138224,1006986,1002340,3,238,16,16,Good
1,Stamford,UK,1544,Ella,8,18569,121,Yes,11,Shopping Centre,1086225,1015321,1012182,1008436,4,384,15,19,Good
2,Skipton,UK,2039,Valentina,-2,12288,105,No,12,Retail Park,1595638,1281661,1104490,1011395,11,219,13,18,Bad
3,Stratton,UK,2055,Antonia,7,17092,117,No,14,High Street,1179395,1022959,1009496,1002169,5,261,15,12,Bad
4,Shepperton,UK,2141,Gabriel,7,11307,103,No,18,Retail Park,1398558,1085170,1003137,1002513,7,200,19,13,Bad


In [11]:
dataset.describe()

Unnamed: 0,Store ID,Staff,Floor Space,Window,Demographic score,40min population,30 min population,20 min population,10 min population,Store age,Clearance space,Competition number,Competition score
count,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0,136.0
mean,52172.580882,13.352941,15024.095588,112.044118,14.463235,1487374.0,1249870.0,1134533.0,1066118.0,6.529412,262.426471,14.463235,14.808824
std,28725.990237,56.592439,2761.801921,6.884085,2.874944,280498.9,216087.5,149028.0,88845.85,2.815147,51.724288,2.895483,2.955258
min,1437.0,-2.0,10072.0,100.0,10.0,1030405.0,1000653.0,1000214.0,1000002.0,2.0,200.0,10.0,10.0
25%,24707.25,6.0,12793.0,106.75,12.0,1254949.0,1057284.0,1012573.0,1005593.0,4.0,220.75,12.0,13.0
50%,56011.5,7.0,14596.5,111.0,14.5,1516338.0,1214459.0,1077542.0,1033228.0,6.0,249.0,15.0,15.0
75%,76870.5,8.0,17543.75,118.0,17.0,1696352.0,1396967.0,1203077.0,1090349.0,9.0,292.25,17.0,18.0
max,99854.0,600.0,19869.0,124.0,19.0,1997044.0,1929089.0,1723395.0,1426533.0,11.0,408.0,19.0,19.0


# Removing columns insignificant for the model

Town column is text and has no significant value for the model. Dropping it!

In [12]:
dataset["Town"].value_counts()

 Swinton (Greater Manchester)    1
 Shoreham-by-Sea                 1
 South Cave                      1
 Settle                          1
 Sleaford                        1
                                ..
 Stainforth                      1
 Swindon                         1
 Stanford-le-Hope                1
 Shefford                        1
 Stapleford                      1
Name: Town, Length: 136, dtype: int64

In [13]:
dataset = dataset.drop(columns = ["Town"])

## Error in dataset

Country column has an error as France is not a part of this assignment. 2 rows are regarding France. Removing these rows from dataset and dropping the column

In [14]:
dataset["Country"].value_counts()

UK        134
France      2
Name: Country, dtype: int64

In [15]:
dataset.loc[dataset["Country"] == "France"]

Unnamed: 0,Country,Store ID,Manager name,Staff,Floor Space,Window,Car park,Demographic score,Location,40min population,30 min population,20 min population,10 min population,Store age,Clearance space,Competition number,Competition score,Performance
39,France,35207,Hannah,5,13127,107,No,16,High Street,1171849,1084983,1006564,1003013,7,235,19,10,Bad
95,France,69540,Crist�bal,9,13766,109,Yes,15,Retail Park,1951800,1750297,1195799,1054550,4,278,10,19,Bad


In [16]:
dataset = dataset.drop([dataset.index[39], dataset.index[95]])

In [17]:
dataset["Country"].value_counts()

UK    134
Name: Country, dtype: int64

In [18]:
dataset = dataset.drop(columns="Country")

Manager names are not required and dropped from dataset

In [19]:
dataset["Manager name"].value_counts()

Madison    3
Sarah      3
Emma       3
Olivia     2
Sof�a      2
          ..
Larissa    1
Jeremy     1
Sydney     1
Logan      1
Lea        1
Name: Manager name, Length: 117, dtype: int64

In [20]:
dataset = dataset.drop(columns = ["Manager name"])

store ids are not required and dropped from dataset

In [21]:
dataset = dataset.drop(columns = ["Store ID"])

In [22]:
dataset.columns

Index(['Staff', 'Floor Space', 'Window', 'Car park', 'Demographic score',
       'Location', '40min population', '30 min population',
       '20 min population', '10 min population', 'Store age',
       'Clearance space', 'Competition number', 'Competition score',
       'Performance'],
      dtype='object')

In [23]:
dataset.head()

Unnamed: 0,Staff,Floor Space,Window,Car park,Demographic score,Location,40min population,30 min population,20 min population,10 min population,Store age,Clearance space,Competition number,Competition score,Performance
0,9,18526,121,Yes,10,Retail Park,1288374,1138224,1006986,1002340,3,238,16,16,Good
1,8,18569,121,Yes,11,Shopping Centre,1086225,1015321,1012182,1008436,4,384,15,19,Good
2,-2,12288,105,No,12,Retail Park,1595638,1281661,1104490,1011395,11,219,13,18,Bad
3,7,17092,117,No,14,High Street,1179395,1022959,1009496,1002169,5,261,15,12,Bad
4,7,11307,103,No,18,Retail Park,1398558,1085170,1003137,1002513,7,200,19,13,Bad


# Label encoding all binary columns

In [24]:
dataset["Car park"].value_counts()

Yes    93
No     34
Y       4
N       3
Name: Car park, dtype: int64

In [25]:
dataset["Car park"] = dataset["Car park"].replace("Y", "Yes").replace("N", "No").reset_index()["Car park"]

In [26]:
dataset["Car park"].value_counts()

Yes    95
No     37
Name: Car park, dtype: int64

In [27]:
car_park_LE = LabelEncoder()

In [28]:
dataset["Car park"] = car_park_LE.fit_transform(dataset["Car park"])

In [29]:
dataset["Performance"].value_counts()

Good    69
Bad     65
Name: Performance, dtype: int64

In [30]:
performance_LE = LabelEncoder()

In [31]:
dataset["Performance"] = performance_LE.fit_transform(dataset["Performance"])

# One hot encoding all categorical columns

In [32]:
data = dataset.iloc[:, :].values

In [33]:
data.shape

(134, 15)

In [34]:
data[0][5]

'Retail Park'

In [35]:
location_OHE = OneHotEncoder(sparse_output=False)

In [36]:
locations = location_OHE.fit_transform(np.array(dataset["Location"]).reshape(-1, 1))

In [37]:
locations = locations[:, :-1]

In [38]:
locations.shape

(134, 3)

In [39]:
locations = locations.reshape(3, 134)

In [40]:
dataset = dataset.drop(columns = ["Location"])

In [41]:
dataset["location0"] = locations[0]
dataset["location1"] = locations[1]
dataset["location2"] = locations[2]

In [42]:
dataset.head()

Unnamed: 0,Staff,Floor Space,Window,Car park,Demographic score,40min population,30 min population,20 min population,10 min population,Store age,Clearance space,Competition number,Competition score,Performance,location0,location1,location2
0,9,18526,121,1,10,1288374,1138224,1006986,1002340,3,238,16,16,1,0.0,0.0,0.0
1,8,18569,121,1,11,1086225,1015321,1012182,1008436,4,384,15,19,1,1.0,1.0,1.0
2,-2,12288,105,0,12,1595638,1281661,1104490,1011395,11,219,13,18,0,0.0,0.0,0.0
3,7,17092,117,0,14,1179395,1022959,1009496,1002169,5,261,15,12,0,0.0,0.0,0.0
4,7,11307,103,0,18,1398558,1085170,1003137,1002513,7,200,19,13,0,0.0,0.0,1.0


# Scaling all data

In [43]:
columns_to_scale = ["Staff", 
                    "Floor Space",
                    "Window", 
                    "Demographic score", 
                    "40min population", 
                    "30 min population", 
                    "20 min population", 
                    "10 min population",
                    "Store age",
                    "Clearance space",
                    "Competition number",
                    "Competition score"]

In [44]:
for column in columns_to_scale:
    scaler = MinMaxScaler()
    dataset[column] = scaler.fit_transform(np.array(dataset[column]).reshape(134, 1))

In [45]:
dataset.head()

Unnamed: 0,Staff,Floor Space,Window,Car park,Demographic score,40min population,30 min population,20 min population,10 min population,Store age,Clearance space,Competition number,Competition score,Performance,location0,location1,location2
0,0.018272,0.862917,0.875,1,0.0,0.266872,0.148175,0.009364,0.005481,0.111111,0.182692,0.666667,0.666667,1,0.0,0.0,0.0
1,0.016611,0.867306,0.875,1,0.111111,0.057746,0.015799,0.016549,0.019773,0.222222,0.884615,0.555556,1.0,1,1.0,1.0,1.0
2,0.0,0.226192,0.208333,0,0.222222,0.584741,0.302668,0.144191,0.026711,1.0,0.091346,0.333333,0.888889,0,0.0,0.0,0.0
3,0.01495,0.716546,0.708333,0,0.444444,0.154132,0.024025,0.012835,0.005081,0.333333,0.293269,0.555556,0.222222,0,0.0,0.0,0.0
4,0.01495,0.126059,0.125,0,0.888889,0.380859,0.091032,0.004042,0.005887,0.555556,0.0,1.0,0.333333,0,0.0,0.0,1.0


In [46]:
dataset.describe()

Unnamed: 0,Staff,Floor Space,Window,Car park,Demographic score,40min population,30 min population,20 min population,10 min population,Store age,Clearance space,Competition number,Competition score,Performance,location0,location1,location2
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,0.025661,0.507874,0.504353,0.738806,0.494196,0.471591,0.26573,0.186422,0.156315,0.504975,0.300553,0.495854,0.534826,0.514925,0.328358,0.328358,0.335821
std,0.094702,0.283289,0.288189,0.473812,0.321444,0.28798,0.229249,0.206917,0.209449,0.314156,0.250192,0.318278,0.325063,0.501653,0.471378,0.471378,0.474049
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.013289,0.264877,0.260417,0.0,0.222222,0.236217,0.059966,0.017673,0.013241,0.222222,0.097356,0.222222,0.333333,0.0,0.0,0.0,0.0
50%,0.01495,0.474329,0.479167,1.0,0.444444,0.502703,0.230286,0.106928,0.077899,0.444444,0.235577,0.555556,0.555556,1.0,0.0,0.0,0.0
75%,0.016611,0.76526,0.75,1.0,0.777778,0.685002,0.425342,0.287679,0.212329,0.777778,0.445913,0.777778,0.861111,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Saving preprocessed dataset to disk

In [49]:
performance = dataset["Performance"]
dataset = dataset.drop(columns = ["Performance"])
dataset["performance"] = performance

In [51]:
dataset.to_csv(save_dataset_path, index=False)