In [None]:
"""
How to scale Pandas DataFrame columns with the scikit-learn 
MinMaxScaler in Python./
Min-max scaling is a common feature pre-processing 
technique which results in scaled data values 
that fall in the range [0,1]. When applied to a Python sequence,
such as a Pandas Series, scaling results in 
a new sequence such that 0 is the minimum value 
and 1 is the maximum value of the prior unscaled sequence. 
If the sequence is [1, 2, 3], then the scaled sequence is [0, 0.5, 1].

In machine learning, feature scaling can improve 
the convergence speed of various algorithms.

"""


In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

df = pd.DataFrame({
    "A":[0,1,2,3,4,5,89,78],
    "B":[25,50,75,100,125,150,67,55]})

min_max_scaler = preprocessing.MinMaxScaler()

print(df)


    A    B
0   0   25
1   1   50
2   2   75
3   3  100
4   4  125
5   5  150
6  89   67
7  78   55


In [5]:
df[["A","B"]] = min_max_scaler.fit_transform(df[["A", "B"]])

In [6]:
df

Unnamed: 0,A,B
0,0.0,0.0
1,0.011236,0.2
2,0.022472,0.4
3,0.033708,0.6
4,0.044944,0.8
5,0.05618,1.0
6,1.0,0.336
7,0.876404,0.24


In [7]:
pd.__version__

'1.5.3'

In [8]:
df=pd.DataFrame({"country":["Russia","Germany",
                            "Australia","Korea","Germany"]})
df

Unnamed: 0,country
0,Russia
1,Germany
2,Australia
3,Korea
4,Germany


In [10]:
df=pd.DataFrame({"country":["Russia","Germany","Australia","Korea","Germany"]})
df

Unnamed: 0,country
0,Russia
1,Germany
2,Australia
3,Korea
4,Germany


In [11]:
df=pd.DataFrame({"country":["Russia","Germany",
                            "Australia","Korea","Germany"]})
pd.get_dummies(df,prefix=["country"])

Unnamed: 0,country_Australia,country_Germany,country_Korea,country_Russia
0,0,0,0,1
1,0,1,0,0
2,1,0,0,0
3,0,0,1,0
4,0,1,0,0


In [12]:
df=pd.DataFrame({"country":["Russia","Germany","Australia","Korea","Germany"]})
pd.get_dummies(df["country"], prefix="country", drop_first=True)

Unnamed: 0,country_Germany,country_Korea,country_Russia
0,0,0,1
1,1,0,0
2,0,0,0
3,0,1,0
4,1,0,0


In [13]:
import pandas as pd
from pandas.api.types import CategoricalDtype

In [14]:
pd.__version__

'1.5.3'

In [18]:
#Add Columns for categories that only appear in the test set
import pandas as pd
from pandas.api.types import CategoricalDtype

df = pd.DataFrame({"country":["Russia","Germany","Australia","Korea","Germany"]})
df["country"] = df["country"].astype(CategoricalDtype(["Australia","Germany","Korea","Russia","Japan"]))
df

Unnamed: 0,country
0,Russia
1,Germany
2,Australia
3,Korea
4,Germany


In [17]:
pd.get_dummies(df["country"],prefix="country")

Unnamed: 0,country_Australia,country_Germany,country_Korea,country_Russia,country_Japan
0,0,0,0,1,0
1,0,1,0,0,0
2,1,0,0,0,0
3,0,0,1,0,0
4,0,1,0,0,0


In [21]:
#Add dummy columns to dataframe
import pandas as pd
from pandas.api.types import CategoricalDtype
import pandas as pd

# df noew has two columns: name and country
df = pd.DataFrame({
    "name":["Joseph", "Micheal","John", "Bawool", "Klaus"],
    "country": ["Russia","Germany","Australia","Korea","Germany"]})
df

Unnamed: 0,name,country
0,Joseph,Russia
1,Micheal,Germany
2,John,Australia
3,Bawool,Korea
4,Klaus,Germany


In [22]:
# use pd.concat to join the new columns with your original datafram
df = pd.concat([df,pd.get_dummies(df["country"], prefix="country")],axis=1)

#now drop the original "country " column(you dont need it anymore)
df.drop(["country"], axis=1, inplace=True)
df

Unnamed: 0,name,country_Australia,country_Germany,country_Korea,country_Russia
0,Joseph,0,0,0,1
1,Micheal,0,1,0,0
2,John,1,0,0,0
3,Bawool,0,0,1,0
4,Klaus,0,1,0,0


In [24]:
#Treat Nulls/NaNs as a separate category
import numpy as np
df = pd.DataFrame({
    'country': ['germany',np.nan,'germany','united kingdom','america','united kingdom']
})
df

Unnamed: 0,country
0,germany
1,
2,germany
3,united kingdom
4,america
5,united kingdom


In [25]:
pd.get_dummies(df)

Unnamed: 0,country_america,country_germany,country_united kingdom
0,0,1,0
1,0,0,0
2,0,1,0
3,0,0,1
4,1,0,0
5,0,0,1


In [26]:
df = pd.DataFrame({'country': ['russia', 'germany', 'australia','korea','germany']})

pd.get_dummies(df,prefix=['country'], drop_first=True)

Unnamed: 0,country_germany,country_korea,country_russia
0,0,0,1
1,1,0,0
2,0,0,0
3,0,1,0
4,1,0,0
