<a href="https://colab.research.google.com/github/lukaszparadylo/machine-learning/blob/main/03_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import pandas as pd
import sklearn

# Display the installed scikit-learn version string as the cell output.
sklearn.__version__

'1.2.2'

In [2]:
def fetch_finantial_data(company='AMZN'):
  """
  Download stock market quotations for a single ticker.

  The request is delegated to ``pandas_datareader`` with the ``'stooq'``
  data source.

  Parameters
  ----------
  company : str, default 'AMZN'
      Ticker symbol, forwarded to the reader as ``name``.

  Returns
  -------
  The object produced by ``DataReader``; in this notebook it is displayed
  as a date-indexed frame with Open/High/Low/Close/Volume columns.
  """

  # Imported inside the function, so the package is only needed when the
  # function is actually called.
  from pandas_datareader.data import DataReader

  quotes = DataReader(name=company, data_source='stooq')
  return quotes

# Download the quotations once with the default ticker; later cells work on
# copies of `df_raw`.
df_raw = fetch_finantial_data()
# The whole frame is displayed here; df_raw.head() would show only the first 5 rows.
df_raw

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-01-12,155.3900,156.2000,154.010,154.6200,40484155
2024-01-11,155.0400,157.1700,153.120,155.1800,49072691
2024-01-10,152.0600,154.4200,151.881,153.7300,44421830
2024-01-09,148.3300,151.7100,148.210,151.3700,43812567
2024-01-08,146.7400,149.4000,146.150,149.1000,46757053
...,...,...,...,...,...
2019-01-24,82.0535,82.8630,81.589,82.7465,81798860
2019-01-23,82.8000,82.8715,80.600,82.0010,104504240
2019-01-22,84.0500,84.0935,80.510,81.6085,128335920
2019-01-18,85.6000,85.8100,84.577,84.8100,120410060


## Kopia danych

In [4]:
# Work on an independent 5-row sample so that `df_raw` is never modified.
# Slice first, then copy: only the five kept rows are duplicated, and `df`
# owns its data instead of being a view onto a hidden full-size copy.
df = df_raw.iloc[:5].copy()
# Print a summary of the sample: index range, column dtypes, null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2024-01-12 to 2024-01-08
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5 non-null      float64
 1   High    5 non-null      float64
 2   Low     5 non-null      float64
 3   Close   5 non-null      float64
 4   Volume  5 non-null      int64  
dtypes: float64(4), int64(1)
memory usage: 240.0 bytes


## Generowanie nowych zmiennych

In [5]:
# Derive calendar features from the datetime index and append them as
# three new columns (day of month, month number, year).
df = df.assign(
    day=df.index.day,
    month=df.index.month,
    year=df.index.year,
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-01-12,155.39,156.2,154.01,154.62,40484155,12,1,2024
2024-01-11,155.04,157.17,153.12,155.18,49072691,11,1,2024
2024-01-10,152.06,154.42,151.881,153.73,44421830,10,1,2024
2024-01-09,148.33,151.71,148.21,151.37,43812567,9,1,2024
2024-01-08,146.74,149.4,146.15,149.1,46757053,8,1,2024


In [6]:
# Day-of-month component of every date in the index (the values stored in the `day` column).
df.index.day

Int64Index([12, 11, 10, 9, 8], dtype='int64', name='Date')

In [9]:
# Small toy frame with one numeric column, used below to demonstrate binning.
df = pd.DataFrame({
    'height': [175.0, 178.0, 185.0, 191.0, 184.5, 183.0, 168.0],
})
df

Unnamed: 0,height
0,175.0
1,178.0
2,185.0
3,191.0
4,184.5
5,183.0
6,168.0


In [10]:
# Discretize the heights into 3 equal-width intervals; pandas derives the
# interval edges from the observed range of the column.
df['height_cat'] = pd.cut(df['height'], bins=3)
df

Unnamed: 0,height,height_cat
0,175.0,"(167.977, 175.667]"
1,178.0,"(175.667, 183.333]"
2,185.0,"(183.333, 191.0]"
3,191.0,"(183.333, 191.0]"
4,184.5,"(183.333, 191.0]"
5,183.0,"(175.667, 183.333]"
6,168.0,"(167.977, 175.667]"


In [11]:
# Discretize with hand-picked edges. Intervals are right-closed:
# (160, 175], (175, 180], (180, 195].
df['height_cat'] = pd.cut(
    df['height'],
    bins=(160, 175, 180, 195),
)
df

Unnamed: 0,height,height_cat
0,175.0,"(160, 175]"
1,178.0,"(175, 180]"
2,185.0,"(180, 195]"
3,191.0,"(180, 195]"
4,184.5,"(180, 195]"
5,183.0,"(180, 195]"
6,168.0,"(160, 175]"


In [12]:
# Discretize with explicit edges and give each interval a readable label
# instead of showing its numeric bounds.
df['height_cat'] = pd.cut(
    df['height'],
    bins=(160, 175, 180, 195),
    labels=['small', 'medium', 'high'],
)
df

Unnamed: 0,height,height_cat
0,175.0,small
1,178.0,medium
2,185.0,high
3,191.0,high
4,184.5,high
5,183.0,high
6,168.0,small


In [13]:
# One-hot encode the categorical column. drop_first omits the first level
# ('small'), which is then implied by zeros in both remaining indicator columns.
pd.get_dummies(data=df, prefix='height', drop_first=True)

Unnamed: 0,height,height_medium,height_high
0,175.0,0,0
1,178.0,1,0
2,185.0,0,1
3,191.0,0,1
4,184.5,0,1
5,183.0,0,1
6,168.0,0,0


In [15]:
# Toy frame whose single column holds a list of language codes per row.
df = pd.DataFrame({
    'lang': [
        ['PL', 'ENG'],
        ['GER', 'ENG', 'PL', 'FRA'],
        ['RUS'],
    ],
})
df

Unnamed: 0,lang
0,"[PL, ENG]"
1,"[GER, ENG, PL, FRA]"
2,[RUS]


In [16]:
# Binary indicator: 1 when 'PL' appears in the row's language list, otherwise 0.
df['PL_flag'] = df['lang'].apply(lambda langs: int('PL' in langs))
df

Unnamed: 0,lang,PL_flag
0,"[PL, ENG]",1
1,"[GER, ENG, PL, FRA]",1
2,[RUS],0


In [17]:
# Toy frame of website addresses, used below to demonstrate string splitting.
df = pd.DataFrame({
    'website': ['wp.pl', 'onet.pl', 'google.pl'],
})
df

Unnamed: 0,website
0,wp.pl
1,onet.pl
2,google.pl


In [19]:
# Split each address on the dot; expand=True spreads the parts over
# separate columns (0 and 1) instead of returning lists.
df['website'].str.split(pat='.', expand=True)

Unnamed: 0,0,1
0,wp,pl
1,onet,pl
2,google,pl


In [20]:
# Split each address on the dot and store the two parts as new columns:
# the part before the dot as `portal`, the part after it as `extension`.
new = df['website'].str.split(pat='.', expand=True)
df['portal'], df['extension'] = new[0], new[1]
df

Unnamed: 0,website,portal,extension
0,wp.pl,wp,pl
1,onet.pl,onet,pl
2,google.pl,google,pl
