In [1]:
import pandas as pd 
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this is a blanket filter, so library warnings (deprecations,
# data-conversion notices) are hidden as well.
warnings.filterwarnings('ignore') 

In [2]:
# Load the dataset from a CSV file in the current working directory.
df = pd.read_csv('supershops.csv') 

In [3]:
# Preview the first rows of the loaded frame.
df.head() 

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [4]:
# Feature frame: every column except the Profit target.
# NOTE(review): x is a separate frame created before the Transport imputation
# further down, which is applied to df only, so x keeps the missing value.
x = df.drop(columns=['Profit'])

In [5]:
# Preview the feature frame (df without the Profit column).
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,114523.61,136897.8,471784.1,Dhaka
1,162597.7,151377.59,443898.53,Ctg
2,153441.51,101145.55,407934.54,Rangpur
3,144372.41,118671.85,383199.62,Dhaka
4,142107.34,91391.77,366168.42,Rangpur


# Measure of Central Tendency

In [6]:
# Count missing values per column.
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [7]:
# Show every row that contains a missing value instead of hard-coding position 19,
# so the cell stays correct if the data (or the location of the gap) changes.
df[df.isna().any(axis=1)]

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
19,86419.7,153514.11,,Dhaka,122776.86


In [8]:
# Mean of Transport. The mean is pulled by extreme values, so it is a poor
# summary when the column contains outliers.
df.Transport.mean() #not for outlier

215331.73244897963

In [9]:
# Median of Transport. The median is robust to extreme values, so it is the
# safer central-tendency measure when outliers are present.
df.Transport.median() #for outliers

214634.81

In [10]:
# Largest Transport value, to check the upper end of the range.
df.Transport.max()

471784.1

In [11]:
# Smallest Transport value, to check the lower end of the range.
# NOTE(review): the recorded result is 0.0 — confirm whether zero is a genuine
# value or a placeholder for missing data.
df.Transport.min()

0.0

In [12]:
# Impute the missing Transport entry with the column mean.
df['Transport'] = df['Transport'].fillna(df['Transport'].mean())

In [13]:
# Re-count missing values per column after the imputation.
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [14]:
# Re-inspect the row at position 19 now that Transport has been imputed.
df.iloc[19:20]

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
19,86419.7,153514.11,215331.732449,Dhaka,122776.86


# Label Encoder

In [15]:
def label_encoder(df, col):
    """Encode the values of ``df[col]`` as integers in order of first appearance.

    The first distinct value seen gets code 0, the next new value gets 1, and
    so on; repeated values reuse the code they were given.

    Parameters
    ----------
    df : indexable container (e.g. a DataFrame) such that ``df[col]`` is iterable.
    col : key selecting the column to encode.

    Returns
    -------
    tuple
        ``(codes, mapping)`` where ``codes`` is a list with one integer per
        value of ``df[col]`` and ``mapping`` is the ``{value: code}`` dict.
    """
    code_of = {}
    # setdefault stores the next free code (the current dict size) only when
    # the value is unseen, and returns the stored code either way.
    codes = [code_of.setdefault(value, len(code_of)) for value in df[col]]
    return codes, code_of

# Encode x's Area column with the hand-written encoder, keeping the
# value -> code mapping, then overwrite the column with the codes.
encoded_value, label_map = label_encoder(df=x, col='Area')
x['Area'] = encoded_value

In [16]:
# Preview x: Area now holds the integer codes assigned by label_encoder.
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,114523.61,136897.8,471784.1,0
1,162597.7,151377.59,443898.53,1
2,153441.51,101145.55,407934.54,2
3,144372.41,118671.85,383199.62,0
4,142107.34,91391.77,366168.42,2


In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
# Unfitted scikit-learn encoder; it is fitted on the Area column in a later cell.
le = LabelEncoder()

In [19]:
# Work on a copy so the label-encoded Area column does not overwrite df.
df1 = df.copy()

In [20]:
# Preview df1 before encoding (Area is still text).
df1.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [21]:
# LabelEncoder expects a 1-D array of labels. The original passed the 2-D frame
# df[['Area']], which only works through an implicit column-vector conversion
# (its warning is silenced by the notebook's blanket warnings filter).
# Encode df1's own column so the cell does not depend on df.
df1['Area'] = le.fit_transform(df1['Area'])

In [22]:
# Preview df1: Area now holds the codes produced by LabelEncoder.
df1.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


# Using For Loop

In [23]:
#assignment

In [24]:
# Reload the raw CSV. This rebinds df, so the Transport imputation applied
# earlier in the notebook is discarded from this point on.
df = pd.read_csv('supershops.csv') 
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


# One Hot Encoder

In [25]:
# Copy of the freshly reloaded df, used by the one-hot and ordinal encoding cells.
# NOTE(review): because df was re-read from disk, the missing Transport value
# is present again in this copy.
df2 = df.copy()

In [26]:
# One-hot encode only the Area column (named explicitly so the single prefix is
# never applied to another text column), dropping the first level to avoid a
# redundant dummy. dtype=int keeps the 0/1 display on pandas versions whose
# default dummy dtype is bool.
pd.get_dummies(df2, columns=['Area'], prefix='area', drop_first=True, dtype=int).head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,area_Dhaka,area_Rangpur
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1


In [27]:
#For Loop

In [28]:
# Reload the raw CSV again. df has not been modified since the previous reload
# (the encoders above operate on df2), so this only resets the name.
df = pd.read_csv('supershops.csv') 
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


# Ordinal Encoder

In [29]:
# List the distinct Area labels in order of first appearance.
df2['Area'].unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [30]:
from sklearn.preprocessing import OrdinalEncoder

In [31]:
# Fix the category order explicitly: Dhaka -> 0, Ctg -> 1, Rangpur -> 2.
ordinal = OrdinalEncoder(
    categories=[['Dhaka', 'Ctg', 'Rangpur']],
)

In [32]:
# Replace the text labels in Area with their ordinal codes (stored as floats).
df2['Area'] = ordinal.fit_transform(df2[['Area']])

In [33]:
# Preview df2: Area now holds the ordinal codes.
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,0.0,192261.83
1,162597.7,151377.59,443898.53,1.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,0.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94


In [34]:
#For Loop

In [35]:
# Reload the raw CSV once more. df was not changed by the ordinal-encoding
# cells (they operate on df2), so this only resets the name.
df = pd.read_csv('supershops.csv') 
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94
