<a href="https://colab.research.google.com/github/miragasko/ml/blob/main/supervised/01_basics/03_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import sklearn

# Display the installed scikit-learn version so the run environment is on record.
sklearn.__version__

'1.6.1'

# Loading data

In [2]:
def fetch_financial_data(company='AMZN'):
    """Download stock quotes for one ticker from the Stooq data source.

    Args:
        company: Ticker symbol handed to the reader as ``name``;
            defaults to ``'AMZN'``.

    Returns:
        Whatever ``pandas_datareader.data.DataReader`` returns for the
        request (the rest of the notebook treats it as a DataFrame).
    """
    # Imported inside the function, so the package is only required when
    # quotes are actually fetched.
    from pandas_datareader import data as web

    quotes = web.DataReader(name=company, data_source='stooq')
    return quotes

# Fetch quotes for the default ticker and keep the untouched result as df_raw;
# later cells work on copies of it.
df_raw = fetch_financial_data()
df_raw.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-02-07,232.5,234.81,228.06,229.15,77539276
2025-02-06,238.01,239.6599,236.01,238.83,60897095
2025-02-05,237.02,238.32,235.2,236.17,38832042
2025-02-04,239.01,242.52,238.03,242.06,29713812
2025-02-03,234.06,239.25,232.9,237.42,37285868


# Creating a copy of data

In [9]:
# Work on the first five rows only, taken as an independent copy. Later cells
# add columns to `df`; writing into a bare slice of a larger frame is the
# view-versus-copy situation pandas may flag with SettingWithCopyWarning.
# Slicing before copying also avoids duplicating and casting rows that are
# discarded anyway. df_raw itself is left unchanged.
df = df_raw.iloc[:5].copy()
# Cast Volume to float64 so that every column shares the same dtype.
df['Volume'] = df['Volume'].astype('float64')
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2025-02-07 to 2025-02-03
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5 non-null      float64
 1   High    5 non-null      float64
 2   Low     5 non-null      float64
 3   Close   5 non-null      float64
 4   Volume  5 non-null      float64
dtypes: float64(5)
memory usage: 240.0 bytes


In [11]:
# Preview the working frame; Volume is now shown as a float.
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-02-07,232.5,234.81,228.06,229.15,77539276.0
2025-02-06,238.01,239.6599,236.01,238.83,60897095.0
2025-02-05,237.02,238.32,235.2,236.17,38832042.0
2025-02-04,239.01,242.52,238.03,242.06,29713812.0
2025-02-03,234.06,239.25,232.9,237.42,37285868.0


In [14]:
# Inspect the row index: it is a DatetimeIndex, so date components can be read
# from it directly.
df.index

DatetimeIndex(['2025-02-07', '2025-02-06', '2025-02-05', '2025-02-04',
               '2025-02-03'],
              dtype='datetime64[ns]', name='Date', freq=None)

In [15]:
# Day-of-month component of every index entry.
df.index.day

Index([7, 6, 5, 4, 3], dtype='int32', name='Date')

In [18]:
# Month component of every index entry.
df.index.month

Index([2, 2, 2, 2, 2], dtype='int32', name='Date')

In [19]:
# Derive calendar features from the DatetimeIndex. assign() builds a new frame
# instead of writing into the existing one, so the cell can be re-run safely
# and never modifies a frame that was obtained by slicing.
df = df.assign(
    day=df.index.day,
    month=df.index.month,
    year=df.index.year,
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-02-07,232.5,234.81,228.06,229.15,77539276.0,7,2,2025
2025-02-06,238.01,239.6599,236.01,238.83,60897095.0,6,2,2025
2025-02-05,237.02,238.32,235.2,236.17,38832042.0,5,2,2025
2025-02-04,239.01,242.52,238.03,242.06,29713812.0,4,2,2025
2025-02-03,234.06,239.25,232.9,237.42,37285868.0,3,2,2025


In [20]:
# Small single-column sample used for the binning examples that follow.
df = pd.DataFrame({
    'height': [175.0, 178.5, 185.0, 191, 184.5, 183.0, 168.0],
})
df

Unnamed: 0,height
0,175.0
1,178.5
2,185.0
3,191.0
4,184.5
5,183.0
6,168.0


In [23]:
# Discretise height into three equal-width intervals spanning the observed range.
df['height_cat'] = pd.cut(df['height'], bins=3)
df

Unnamed: 0,height,height_cat
0,175.0,"(167.977, 175.667]"
1,178.5,"(175.667, 183.333]"
2,185.0,"(183.333, 191.0]"
3,191.0,"(183.333, 191.0]"
4,184.5,"(183.333, 191.0]"
5,183.0,"(175.667, 183.333]"
6,168.0,"(167.977, 175.667]"


In [24]:
# Bin with hand-picked edges. Intervals are closed on the right,
# so 175.0 falls into (160, 175].
df['height_cat'] = pd.cut(df['height'], bins=(160, 175, 180, 195))
df

Unnamed: 0,height,height_cat
0,175.0,"(160, 175]"
1,178.5,"(175, 180]"
2,185.0,"(180, 195]"
3,191.0,"(180, 195]"
4,184.5,"(180, 195]"
5,183.0,"(180, 195]"
6,168.0,"(160, 175]"


In [25]:
# Bin with explicit edges and give each interval a readable label.
df['height_cat'] = pd.cut(
    df['height'],
    bins=(160, 175, 180, 195),
    labels=['small', 'medium', 'high'],
)
df

Unnamed: 0,height,height_cat
0,175.0,small
1,178.5,medium
2,185.0,high
3,191.0,high
4,184.5,high
5,183.0,high
6,168.0,small


In [26]:
# One-hot encode the categorical column. drop_first omits the first category,
# which then acts as the implicit baseline.
pd.get_dummies(data=df, columns=['height_cat'], prefix='height', drop_first=True)

Unnamed: 0,height,height_medium,height_high
0,175.0,False,False
1,178.5,True,False
2,185.0,False,True
3,191.0,False,True
4,184.5,False,True
5,183.0,False,True
6,168.0,False,False


In [27]:
# Same encoding, but emit 0/1 integers instead of booleans.
pd.get_dummies(
    data=df,
    columns=['height_cat'],
    prefix='height',
    drop_first=True,
    dtype=int,
)

Unnamed: 0,height,height_medium,height_high
0,175.0,0,0
1,178.5,1,0
2,185.0,0,1
3,191.0,0,1
4,184.5,0,1
5,183.0,0,1
6,168.0,0,0


# Feature extraction

In [28]:
# Each row stores a list of language codes.
df = pd.DataFrame({
    'lang': [
        ['PL', 'ENG'],
        ['GER', 'ENG', 'PL', 'FRA'],
        ['RUS'],
    ],
})
df

Unnamed: 0,lang
0,"[PL, ENG]"
1,"[GER, ENG, PL, FRA]"
2,[RUS]


In [31]:
# Number of languages listed in each row.
df['lang_num'] = df['lang'].map(len)
df

Unnamed: 0,lang,lang_num
0,"[PL, ENG]",2
1,"[GER, ENG, PL, FRA]",4
2,[RUS],1


In [32]:
# Binary indicator: 1 when 'PL' appears in the row's language list, else 0.
df['PL_flag'] = df['lang'].apply(lambda langs: int('PL' in langs))
df

Unnamed: 0,lang,lang_num,PL_flag
0,"[PL, ENG]",2,1
1,"[GER, ENG, PL, FRA]",4,1
2,[RUS],1,0


In [33]:
# Sample of website addresses used for the string-splitting example.
df = pd.DataFrame({
    'website': ['wp.pl', 'onet.pl', 'google.com'],
})
df

Unnamed: 0,website
0,wp.pl
1,onet.pl
2,google.com


In [34]:
# Split every address on '.'; expand=True spreads the parts over columns.
df['website'].str.split('.', expand=True)

Unnamed: 0,0,1
0,wp,pl
1,onet,pl
2,google,com


In [35]:
# Split the address once, then store the part before the dot as the portal
# name and the part after it as the extension.
new = df['website'].str.split('.', expand=True)
df['portal'], df['extension'] = new[0], new[1]
df

Unnamed: 0,website,portal,extension
0,wp.pl,wp,pl
1,onet.pl,onet,pl
2,google.com,google,com
