<a href="https://colab.research.google.com/github/neoleszcz/ai-py/blob/main/classification/feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import sklearn
sklearn.__version__

def fetch_financial_data(company='JMIA', start=None, end=None):
    """
    Fetch stock market quotations for one ticker from the Stooq data source.

    Parameters
    ----------
    company : str, default 'JMIA'
        Ticker symbol passed to pandas-datareader as ``name``.
    start, end : optional
        Bounds of the requested date range, forwarded unchanged to
        ``DataReader``. ``None`` (the default) leaves the choice of range
        to pandas-datareader, which is what the original call did.

    Returns
    -------
    Whatever ``pandas_datareader.data.DataReader`` returns for the request;
    in this notebook that is a DataFrame indexed by ``Date`` with
    ``Open``, ``High``, ``Low``, ``Close`` and ``Volume`` columns.

    Notes
    -----
    Performs a network request, so results depend on when it is run.
    """
    # Local import keeps the pandas-datareader dependency scoped to this helper.
    import pandas_datareader.data as web
    return web.DataReader(name=company, data_source='stooq', start=start, end=end)

# Download quotations for the default ticker; df_raw stays untouched so that
# later cells can always start again from the original download.
df_raw = fetch_financial_data()
df_raw.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-02-17,3.69,3.69,3.49,3.6,3351029
2023-02-16,3.97,4.02,3.6044,3.69,6393199
2023-02-15,3.71,4.21,3.71,4.18,4598678
2023-02-14,3.76,3.88,3.59,3.82,2327667
2023-02-13,3.73,3.915,3.66,3.77,2000753


In [4]:
# Number of leading rows kept for the examples below. The index printed by
# df.info() runs from the newest date to the oldest, so these are the most
# recent sessions.
N_ROWS = 50

# Slice first, then copy: only the rows we keep are duplicated, and df is an
# independent frame, so adding columns to it later never touches df_raw.
df = df_raw.head(N_ROWS).copy()
df.info()
     

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 50 entries, 2023-02-17 to 2022-12-07
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    50 non-null     float64
 1   High    50 non-null     float64
 2   Low     50 non-null     float64
 3   Close   50 non-null     float64
 4   Volume  50 non-null     int64  
dtypes: float64(4), int64(1)
memory usage: 2.3 KB


In [5]:
# Derive calendar features from the DatetimeIndex: one integer column per
# date component, added in the order day, month, year.
for component in ('day', 'month', 'year'):
    df[component] = getattr(df.index, component)
df


Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-02-17,3.69,3.69,3.49,3.6,3351029,17,2,2023
2023-02-16,3.97,4.02,3.6044,3.69,6393199,16,2,2023
2023-02-15,3.71,4.21,3.71,4.18,4598678,15,2,2023
2023-02-14,3.76,3.88,3.59,3.82,2327667,14,2,2023
2023-02-13,3.73,3.915,3.66,3.77,2000753,13,2,2023
2023-02-10,3.84,3.8401,3.66,3.72,2657630,10,2,2023
2023-02-09,4.29,4.31,3.87,3.91,2037913,9,2,2023
2023-02-08,4.19,4.325,4.1013,4.12,1529118,8,2,2023
2023-02-07,4.42,4.42,4.07,4.22,3449407,7,2,2023
2023-02-06,4.46,4.57,4.3311,4.42,1791808,6,2,2023


In [6]:
# Toy example: discretise a continuous feature into categories.
df = pd.DataFrame(data={'height': [175., 178.5, 185., 191., 184.5, 183., 168.]})

# bins=3 asks pandas for three equal-width intervals covering the observed
# range of heights; the interval edges are chosen by pandas, not by us.
df['height_cat'] = pd.cut(x=df.height, bins=3)
df
     

Unnamed: 0,height,height_cat
0,175.0,"(167.977, 175.667]"
1,178.5,"(175.667, 183.333]"
2,185.0,"(183.333, 191.0]"
3,191.0,"(183.333, 191.0]"
4,184.5,"(183.333, 191.0]"
5,183.0,"(175.667, 183.333]"
6,168.0,"(167.977, 175.667]"


In [7]:
# Re-bin the heights with hand-picked edges instead of equal-width bins;
# the resulting intervals are (160, 170], (170, 180] and (180, 200].
height_edges = (160, 170, 180, 200)
df['height_cat'] = pd.cut(df['height'], bins=height_edges)
df

Unnamed: 0,height,height_cat
0,175.0,"(170, 180]"
1,178.5,"(170, 180]"
2,185.0,"(180, 200]"
3,191.0,"(180, 200]"
4,184.5,"(180, 200]"
5,183.0,"(180, 200]"
6,168.0,"(160, 170]"


In [8]:
# Same hand-picked edges as before, but each interval now gets a readable
# label; there must be exactly one label per interval (3 intervals here).
height_edges = (160, 170, 180, 200)
height_names = ['small', 'medium', 'high']
df['height_cat'] = pd.cut(df['height'], bins=height_edges, labels=height_names)
df


Unnamed: 0,height,height_cat
0,175.0,medium
1,178.5,medium
2,185.0,high
3,191.0,high
4,184.5,high
5,183.0,high
6,168.0,small


In [9]:
# One-hot encode the height bucket. The column is named explicitly so the
# 'height' prefix can only ever be applied to height_cat, not to any other
# object/category column the frame might gain. drop_first=True omits the
# first category ('small'), which is then represented by all-zero dummies.
pd.get_dummies(df, columns=['height_cat'], prefix='height', drop_first=True)

Unnamed: 0,height,height_medium,height_high
0,175.0,1,0
1,178.5,1,0
2,185.0,0,1
3,191.0,0,1
4,184.5,0,1
5,183.0,0,1
6,168.0,0,0


In [11]:
# Toy example: each row holds a list of language codes.
df = pd.DataFrame(data={'lang': [['PL', 'ENG'], ['GER', 'ENG', 'PL', 'FRA'], ['RUS']]})

# Feature: how many languages each row lists. The .str accessor measures the
# length of list elements too, and yields NaN for a missing entry instead of
# raising the way a plain len() call would.
df['lang_number'] = df['lang'].str.len()
df

Unnamed: 0,lang,lang_number
0,"[PL, ENG]",2
1,"[GER, ENG, PL, FRA]",4
2,[RUS],1


In [12]:
# Binary indicator feature: 1 when 'PL' is among the row's languages, else 0.
df['PL-flag'] = df['lang'].map(lambda languages: int('PL' in languages))
df

Unnamed: 0,lang,lang_number,PL-flag
0,"[PL, ENG]",2,1
1,"[GER, ENG, PL, FRA]",4,1
2,[RUS],1,0


In [15]:
# Toy example: split a domain name into its parts.
df = pd.DataFrame(data={'website': ['wp.pl', 'onet.pl', 'google.com']})

# expand=True returns a DataFrame with one column per piece (0, 1, ...);
# the split is computed once and reused for both new columns.
new = df.website.str.split('.', expand=True)
df['portal'] = new[0]      # text before the dot
df['extension'] = new[1]   # text after the dot
df

Unnamed: 0,website,portal,extension
0,wp.pl,wp,pl
1,onet.pl,onet,pl
2,google.com,google,com
