<a href="https://colab.research.google.com/github/niedzwiedz-source/ML-bootcamp-2025/blob/main/basics_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Podstawy na temat przygotowania danych do nauki modelu

Korzystamy z biblioteki https://scikit-learn.org/stable/

Instalacja biblioteki - kod poniżej:


In [None]:
!pip install scikit-learn

### Import bibliotek


In [None]:
import numpy as np
import pandas as pd
import sklearn

sklearn.__version__

### Wygenerowanie danych

In [None]:
data = {
    'size':['M','L','XL','L'],
    'gender':['famale','male','female','male'],
    'color': ['red','blue','green','red'],
    'price': [199.0, 89.0, 99.0, 129.0],
    'weight': [500, 450, 300, 380],
    'bought': ['yes','no','yes','no']

}

df_raw = pd.DataFrame(data=data)
df_raw

Unnamed: 0,size,gender,color,price,weight,bought
0,M,famale,red,199.0,500,yes
1,L,male,blue,89.0,450,no
2,XL,female,green,99.0,300,yes
3,L,male,red,129.0,380,no


###Utworzenie kopii danych

In [None]:
df = df_raw.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    4 non-null      object 
 1   gender  4 non-null      object 
 2   color   4 non-null      object 
 3   price   4 non-null      float64
 4   weight  4 non-null      int64  
 5   bought  4 non-null      object 
dtypes: float64(1), int64(1), object(4)
memory usage: 324.0+ bytes


###Zmiana typu danych i wstępna eksploracja

In [None]:
for col in ['size','gender','color','bought']:
  df[col] = df[col].astype('category')

df['weight'] = df['weight'].astype('float')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   size    4 non-null      category
 1   gender  4 non-null      category
 2   color   4 non-null      category
 3   price   4 non-null      float64 
 4   weight  4 non-null      float64 
 5   bought  4 non-null      category
dtypes: category(4), float64(2)
memory usage: 732.0 bytes


###LabelEncoder
Wykorzystywany do przygotowania "celu" (target) w postaci numerycznej

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['bought'])
le.transform(df['bought'])

array([1, 0, 1, 0])

In [None]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,4.0,129.0,49.665548,89.0,96.5,114.0,146.5,199.0
weight,4.0,407.5,86.938676,300.0,360.0,415.0,462.5,500.0


In [None]:
df.describe(include=['category']).T

Unnamed: 0,count,unique,top,freq
size,4,3,L,2
gender,4,3,male,2
color,4,3,red,2
bought,4,2,no,2


In [None]:
df

Unnamed: 0,size,gender,color,price,weight,bought
0,M,famale,red,199.0,500.0,yes
1,L,male,blue,89.0,450.0,no
2,XL,female,green,99.0,300.0,yes
3,L,male,red,129.0,380.0,no


In [None]:
df['bought'] = le.fit_transform(df['bought'])
df

Unnamed: 0,size,gender,color,price,weight,bought
0,M,famale,red,199.0,500.0,1
1,L,male,blue,89.0,450.0,0
2,XL,female,green,99.0,300.0,1
3,L,male,red,129.0,380.0,0


###Funkcja get.dummies()
Funkcja ma na celu ustrukturyzowanie wartości tekstowych do postaci numerycznej.

In [None]:
pd.get_dummies(data=df)

Unnamed: 0,price,weight,bought,size_L,size_M,size_XL,gender_famale,gender_female,gender_male,color_blue,color_green,color_red
0,199.0,500.0,1,False,True,False,True,False,False,False,False,True
1,89.0,450.0,0,True,False,False,False,False,True,True,False,False
2,99.0,300.0,1,False,False,True,False,True,False,False,True,False
3,129.0,380.0,0,True,False,False,False,False,True,False,False,True


In [None]:
pd.get_dummies(data=df,drop_first=True)

Unnamed: 0,price,weight,bought,size_M,size_XL,gender_female,gender_male,color_green,color_red
0,199.0,500.0,1,True,False,False,False,False,True
1,89.0,450.0,0,False,False,False,True,False,False
2,99.0,300.0,1,False,True,True,False,True,False
3,129.0,380.0,0,False,False,False,True,False,True


In [None]:
pd.get_dummies(data=df,columns=['size'])

Unnamed: 0,gender,color,price,weight,bought,size_L,size_M,size_XL
0,famale,red,199.0,500.0,1,False,True,False
1,male,blue,89.0,450.0,0,True,False,False
2,female,green,99.0,300.0,1,False,False,True
3,male,red,129.0,380.0,0,True,False,False


###Standaryzacja
std() - pandas nieobciążony

std() - numpy obciążony

In [None]:
print(f"{df['price']}\n")
print(f"Średnia: {df['price'].mean()}")
print(f"Odchylenie standardowe: {df['price'].std():.4f}")

0    199.0
1     89.0
2     99.0
3    129.0
Name: price, dtype: float64

Średnia: 129.0
Odchylenie standardowe: 49.6655


In [None]:
(df['price'] - df['price'].mean()) / df['price'].std()

Unnamed: 0,price
0,1.409428
1,-0.805387
2,-0.60404
3,0.0


In [None]:
def standardize(x):
    return (x - x.mean()) / x.std()

standardize(df['price'])

Unnamed: 0,price
0,1.409428
1,-0.805387
2,-0.60404
3,0.0


In [None]:
from sklearn.preprocessing import scale

scale(df['price'])

array([ 1.62746694, -0.92998111, -0.69748583,  0.        ])

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(df[['price']])
scaler.transform(df[['price']])

array([[ 1.62746694],
       [-0.92998111],
       [-0.69748583],
       [ 0.        ]])

In [None]:
scaler = StandardScaler()
df[['price', 'weight']] = scaler.fit_transform(df[['price', 'weight']])
df

Unnamed: 0,size,gender,color,price,weight,bought
0,M,famale,red,1.627467,1.228565,1
1,L,male,blue,-0.929981,0.564476,0
2,XL,female,green,-0.697486,-1.427792,1
3,L,male,red,0.0,-0.365249,0


###Przygotowanie danych do modelu - podsumowanie

In [None]:
df = df_raw.copy()

In [None]:
df

Unnamed: 0,size,gender,color,price,weight,bought
0,M,famale,red,199.0,500,yes
1,L,male,blue,89.0,450,no
2,XL,female,green,99.0,300,yes
3,L,male,red,129.0,380,no


In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['bought'] = le.fit_transform(df['bought'])
df

Unnamed: 0,size,gender,color,price,weight,bought
0,M,famale,red,199.0,500,1
1,L,male,blue,89.0,450,0
2,XL,female,green,99.0,300,1
3,L,male,red,129.0,380,0


In [None]:
scaler = StandardScaler()
df[['price','weight']]= scaler.fit_transform(df[['price','weight']])

In [None]:
df=pd.get_dummies(data=df,drop_first=True)
df

Unnamed: 0,price,weight,bought,size_M,size_XL,gender_female,gender_male,color_green,color_red
0,1.627467,1.228565,1,True,False,False,False,False,True
1,-0.929981,0.564476,0,False,False,False,True,False,False
2,-0.697486,-1.427792,1,False,True,True,False,True,False
3,0.0,-0.365249,0,False,False,False,True,False,True
