## Использование pandas для анализа данных (срезы, группировки, общие сведения о датафрейме, стат. характеристики).

In [1]:
import pandas as pd
import sys
import warnings
if not sys.warnoptions:
       warnings.simplefilter("ignore")

In [2]:
data = pd.read_csv('Online Retail.csv', sep = ';', parse_dates = ['InvoiceDate'])  

In [3]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Unnamed: 8
0,581483,23843,PAPER CRAFT . LITTLE BIRDIE,80995,2011-09-12 09:15:00,2.08,16446.0,United Kingdom,
1,541431,23166,MEDIUM CERAMIC TOP STORAGE JAR,74215,2011-01-18 10:01:00,1.04,12346.0,United Kingdom,
2,578841,84826,ASSTD DESIGN 3D PAPER STICKERS,12540,2011-11-25 15:57:00,0.0,13256.0,United Kingdom,
3,542504,37413,,5568,2011-01-28 12:03:00,0.0,,United Kingdom,
4,573008,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,4800,2011-10-27 12:26:00,0.21,12901.0,United Kingdom,


In [4]:
df = data.drop(columns = ['Unnamed: 8'])

In [5]:
df.tail()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
541904,C536757,84347,ROTATING SILVER ANGELS T-LIGHT HLDR,-9360,2010-02-12 14:23:00,0.03,15838.0,United Kingdom
541905,556690,23005,printing smudges/thrown away,-9600,2011-06-14 10:37:00,0.0,,United Kingdom
541906,556691,23005,printing smudges/thrown away,-9600,2011-06-14 10:37:00,0.0,,United Kingdom
541907,C541433,23166,MEDIUM CERAMIC TOP STORAGE JAR,-74215,2011-01-18 10:17:00,1.04,12346.0,United Kingdom
541908,C581484,23843,PAPER CRAFT . LITTLE BIRDIE,-80995,2011-09-12 09:27:00,2.08,16446.0,United Kingdom


In [6]:
df.sample()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
112243,572913,21326,AGED GLASS SILVER T-LIGHT HOLDER,12,2011-10-26 16:21:00,0.65,15993.0,United Kingdom


### Общая информация о датафрейме

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [8]:
df['Quantity'] = df['Quantity'].astype('int')
df['CustomerID'] = df['CustomerID'].astype('str')
df['UnitPrice'] = df['UnitPrice'].astype('float')
df['Country'] = df['Country'].astype('category')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int32         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   541909 non-null  object        
 7   Country      541909 non-null  category      
dtypes: category(1), datetime64[ns](1), float64(1), int32(1), object(4)
memory usage: 27.4+ MB


In [10]:
df.describe()

Unnamed: 0,Quantity,UnitPrice
count,541909.0,541909.0
mean,9.55225,4.611114
std,218.081158,96.759853
min,-80995.0,-11062.06
25%,1.0,1.25
50%,3.0,2.08
75%,10.0,4.13
max,80995.0,38970.0


In [11]:
df.duplicated(subset=['InvoiceNo'])

0         False
1         False
2         False
3         False
4         False
          ...  
541904    False
541905    False
541906    False
541907    False
541908    False
Length: 541909, dtype: bool

### Срезы в данных

In [12]:
df_UK = df[df['Country'] == 'United Kingdom']

In [13]:
df[df['Quantity'] <= 1000]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
116,537841,16014,SMALL CHINESE STYLE SCISSOR,1000,2010-08-12 15:10:00,0.32,13848.0,United Kingdom
117,566028,16014,SMALL CHINESE STYLE SCISSOR,1000,2011-08-09 12:58:00,0.32,13848.0,United Kingdom
118,569815,85099B,JUMBO BAG RED RETROSPOT,1000,2011-06-10 11:53:00,1.79,15838.0,United Kingdom
119,547519,16014,SMALL CHINESE STYLE SCISSOR,1000,2011-03-23 14:46:00,0.32,16308.0,United Kingdom
120,575582,85099B,JUMBO BAG RED RETROSPOT,1000,2011-10-11 11:55:00,1.79,16986.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,C536757,84347,ROTATING SILVER ANGELS T-LIGHT HLDR,-9360,2010-02-12 14:23:00,0.03,15838.0,United Kingdom
541905,556690,23005,printing smudges/thrown away,-9600,2011-06-14 10:37:00,0.00,,United Kingdom
541906,556691,23005,printing smudges/thrown away,-9600,2011-06-14 10:37:00,0.00,,United Kingdom
541907,C541433,23166,MEDIUM CERAMIC TOP STORAGE JAR,-74215,2011-01-18 10:17:00,1.04,12346.0,United Kingdom


### Группировка данных

In [14]:
dff = df.groupby(by=["Country"]).sum()
dff

Unnamed: 0_level_0,Quantity,UnitPrice
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Australia,83653,4054.75
Austria,4827,1701.52
Bahrain,260,86.57
Belgium,23152,7540.13
Brazil,356,142.6
Canada,2763,910.58
Channel Islands,9479,3738.55
Cyprus,6317,3920.07
Czech Republic,592,88.15
Denmark,8188,1266.95


In [15]:
dff.index

CategoricalIndex(['Australia', 'Austria', 'Bahrain', 'Belgium', 'Brazil',
                  'Canada', 'Channel Islands', 'Cyprus', 'Czech Republic',
                  'Denmark', 'EIRE', 'European Community', 'Finland', 'France',
                  'Germany', 'Greece', 'Hong Kong', 'Iceland', 'Israel',
                  'Italy', 'Japan', 'Lebanon', 'Lithuania', 'Malta',
                  'Netherlands', 'Norway', 'Poland', 'Portugal', 'RSA',
                  'Saudi Arabia', 'Singapore', 'Spain', 'Sweden',
                  'Switzerland', 'USA', 'United Arab Emirates',
                  'United Kingdom', 'Unspecified'],
                 categories=['Australia', 'Austria', 'Bahrain', 'Belgium', 'Brazil', 'Canada', 'Channel Islands', 'Cyprus', ...], ordered=False, name='Country', dtype='category')