# PYTHON-12. Продвинутые методы работы с данными в Pandas 

## 1. Введение (импорт данных)

In [1]:
import pandas as pd
# Read the dataset from the local CSV file into melb_df.
melb_df = pd.read_csv('data/melb_data_fe.csv')
# Preview the first rows, then print column dtypes and non-null counts.
display(melb_df.head())
melb_df.info()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom,...,Longtitude,Regionname,Propertycount,MeanRoomsSquare,AreaRatio,MonthSale,AgeBuilding,WeekdaySale,StreetType,Weekend
0,Abbotsford,2,house,1480000.0,S,Biggin,2016-03-12,2.5,3067,2,...,144.9984,Northern Metropolitan,4019,25.2,-0.231707,3,46,5,St,1
1,Abbotsford,2,house,1035000.0,S,Biggin,2016-04-02,2.5,3067,2,...,144.9934,Northern Metropolitan,4019,15.8,-0.32766,4,116,5,St,1
2,Abbotsford,3,house,1465000.0,SP,Biggin,2017-04-03,2.5,3067,3,...,144.9944,Northern Metropolitan,4019,18.75,0.056338,4,117,0,St,0
3,Abbotsford,3,house,850000.0,PI,Biggin,2017-04-03,2.5,3067,3,...,144.9969,Northern Metropolitan,4019,15.75,0.145455,4,47,0,other,0
4,Abbotsford,4,house,1600000.0,VB,Nelson,2016-04-06,2.5,3067,3,...,144.9941,Northern Metropolitan,4019,17.75,0.083969,4,2,2,St,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Suburb           13580 non-null  object 
 1   Rooms            13580 non-null  int64  
 2   Type             13580 non-null  object 
 3   Price            13580 non-null  float64
 4   Method           13580 non-null  object 
 5   SellerG          13580 non-null  object 
 6   Date             13580 non-null  object 
 7   Distance         13580 non-null  float64
 8   Postcode         13580 non-null  int64  
 9   Bedroom          13580 non-null  int64  
 10  Bathroom         13580 non-null  int64  
 11  Car              13580 non-null  int64  
 12  Landsize         13580 non-null  float64
 13  BuildingArea     13580 non-null  float64
 14  CouncilArea      12211 non-null  object 
 15  Lattitude        13580 non-null  float64
 16  Longtitude       13580 non-null  float64
 17  Regionname  

In [2]:
# Convert the Date column from strings to datetime64 so the .dt accessor works;
# yearfirst=True tells the parser to read ambiguous dates with the year first.
melb_df['Date'] = pd.to_datetime(melb_df['Date'], yearfirst=True)

In [3]:
# Quarter number of every sale date.
quarters = melb_df['Date'].dt.quarter
# value_counts() orders by descending frequency, so position 1 is the
# second-largest number of sales in a single quarter.
quarter_counts = quarters.value_counts()
print(quarter_counts.iloc[1])

4359


In [4]:
# Columns that must keep their original dtype even if they have few distinct values.
col_exclude_list = ['Date', 'Rooms', 'Bedroom', 'Bathroom', 'Car']
# A column is converted to 'category' only when it has fewer distinct values than this.
max_uniq_count = 150
for col in melb_df.columns:
    if col in col_exclude_list:
        continue
    if melb_df[col].nunique() >= max_uniq_count:
        continue
    melb_df[col] = melb_df[col].astype('category')

In [5]:
# Re-inspect the dtypes after the datetime and category conversions.
melb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Suburb           13580 non-null  category      
 1   Rooms            13580 non-null  int64         
 2   Type             13580 non-null  category      
 3   Price            13580 non-null  float64       
 4   Method           13580 non-null  category      
 5   SellerG          13580 non-null  category      
 6   Date             13580 non-null  datetime64[ns]
 7   Distance         13580 non-null  float64       
 8   Postcode         13580 non-null  int64         
 9   Bedroom          13580 non-null  int64         
 10  Bathroom         13580 non-null  int64         
 11  Car              13580 non-null  int64         
 12  Landsize         13580 non-null  float64       
 13  BuildingArea     13580 non-null  float64       
 14  CouncilArea      12211 non-null  categ

## 2. Сортировка данных в DataFrame

### МЕТОД SORT_VALUES()

In [6]:
# Alternative sorts kept for reference (not executed):
#   melb_df.sort_values(by='Price').head(10)
#   melb_df.sort_values(by='Date', ascending=False)   # descending order

# Sort by Distance, then by Price, and show every 10th row of the sorted frame.
sorted_by_distance_price = melb_df.sort_values(by=['Distance', 'Price'])
sorted_by_distance_price.loc[::10, ['Distance', 'Price']]


Unnamed: 0,Distance,Price
11428,0.0,387000.0
10512,0.7,600000.0
5727,1.2,485000.0
8671,1.2,595000.0
5736,1.2,740000.0
...,...,...
12011,38.0,680000.0
10673,38.0,810000.0
13429,38.0,1155000.0
11102,41.0,650000.0


In [7]:
# Row filters: AreaRatio below -0.8, townhouses only, seller McGrath only.
mask1 = melb_df['AreaRatio'] < -0.8
mask2 = melb_df['Type'] == 'townhouse'
mask3 = melb_df['SellerG'] == 'McGrath'
selected_rows = melb_df[mask1 & mask2 & mask3]
# Oldest sales first; within one date, the largest AreaRatio first.
ordered_rows = selected_rows.sort_values(
    by=['Date', 'AreaRatio'],
    ascending=[True, False],
    ignore_index=True,
)
ordered_rows[['Date', 'AreaRatio']]

Unnamed: 0,Date,AreaRatio
0,2016-07-26,-0.974922
1,2016-09-24,-0.971831
2,2016-11-27,-0.953608
3,2016-12-11,-0.945946
4,2017-08-04,-0.947368
5,2017-08-04,-0.970874


In [8]:
# ignore_index=True renumbers rows 0..n-1 after sorting, so label 1558
# is also the row at position 1558 of the descending AreaRatio order.
by_area_ratio = melb_df.sort_values(by='AreaRatio', ascending=False, ignore_index=True)
by_area_ratio.loc[1558, 'BuildingArea']

126.0

In [9]:
# Townhouses with more than two rooms.
big_townhouses = melb_df[(melb_df['Rooms'] > 2) & (melb_df['Type'] == 'townhouse')]
# Fewest rooms first; within the same room count, largest MeanRoomsSquare first.
ranked_townhouses = big_townhouses.sort_values(
    by=['Rooms', 'MeanRoomsSquare'],
    ascending=[True, False],
    ignore_index=True,
)
# Price of the row at position 18 of that ordering.
ranked_townhouses.loc[18, 'Price']

1300000.0

## 3. Группировка данных в DataFrame

### МЕТОД GROUPBY()

##### ГРУППИРОВКА ДАННЫХ ПО ОДНОМУ КРИТЕРИЮ С ОДНОЙ АГРЕГАЦИЕЙ

In [10]:
# Mean of every numeric column per property type.
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer silently
# drops non-numeric columns (category / datetime) and raises TypeError instead.
# The redundant axis=0 argument (the default, deprecated in pandas 2.1) is omitted.
display(melb_df.groupby(by='Type', as_index=False).mean(numeric_only=True))
# Mean sale price per property type.
display(melb_df.groupby(by='Type')['Price'].mean())
# Smallest Distance value within each region, largest first.
melb_df.groupby(by='Regionname')['Distance'].min().sort_values(ascending=False)

Unnamed: 0,Type,Rooms,Price,Distance,Postcode,Bedroom,Bathroom,Car,Landsize,BuildingArea,Lattitude,Longtitude,Propertycount,MeanRoomsSquare,AreaRatio,AgeBuilding
0,house,3.260874,1242665.0,10.979479,3104.080643,3.229336,1.613822,1.772674,617.181924,152.162553,-37.803795,144.9947,7259.025505,18.996731,-0.490031,55.6697
1,townhouse,2.837522,933735.1,9.851346,3100.777379,2.814183,1.809695,1.555655,279.606822,134.64971,-37.815782,144.996489,7094.459605,18.569847,-0.094916,26.690305
2,unit,1.963871,605127.5,7.607391,3110.797481,1.966523,1.183295,1.128936,477.314219,102.235863,-37.82371,144.996363,8199.28008,21.068242,0.319883,39.703016


Type
house        1.242665e+06
townhouse    9.337351e+05
unit         6.051275e+05
Name: Price, dtype: float64

Regionname
Western Victoria              29.8
Eastern Victoria              25.2
Northern Victoria             21.8
South-Eastern Metropolitan    14.7
Eastern Metropolitan           7.8
Western Metropolitan           4.3
Southern Metropolitan          0.7
Northern Metropolitan          0.0
Name: Distance, dtype: float64

##### ГРУППИРОВКА ДАННЫХ ПО ОДНОМУ КРИТЕРИЮ С НЕСКОЛЬКИМИ АГРЕГАЦИЯМИ

In [11]:
# Number of sales, mean price and maximum price for each sale month,
# ordered from the busiest month to the quietest.
monthly_price_stats = melb_df.groupby(by='MonthSale')['Price'].agg(['count', 'mean', 'max'])
monthly_price_stats.sort_values(by='count', ascending=False)

# Alternative with the full set of descriptive statistics (not executed):
#   melb_df.groupby('MonthSale')['Price'].agg('describe').sort_values(by='count', ascending=False)

Unnamed: 0_level_0,count,mean,max
MonthSale,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,1850,1056371.0,6500000.0
7,1835,931469.8,9000000.0
5,1644,1097807.0,8000000.0
6,1469,1068981.0,7650000.0
3,1408,1146762.0,5600000.0
4,1246,1050479.0,5500000.0
9,1188,1126349.0,6400000.0
10,854,1135970.0,6250000.0
11,750,1142503.0,5050000.0
12,725,1144737.0,5700000.0


In [12]:
# For each region: how many distinct sellers operate there and which ones
# (the built-in set is used as an aggregation function).
sellers_by_region = melb_df.groupby(by='Regionname')['SellerG']
seller_summary = sellers_by_region.agg(['nunique', set])
seller_summary.sort_values(by='nunique', ascending=False)

Unnamed: 0_level_0,nunique,set
Regionname,Unnamed: 1_level_1,Unnamed: 2_level_1
Northern Metropolitan,40,"{Jellis, HAR, RT, Alexkarbon, RW, LITTLE, Ray,..."
Southern Metropolitan,38,"{Jellis, HAR, RT, Chisholm, Thomson, RW, LITTL..."
Western Metropolitan,34,"{Jellis, HAR, Moonee, RT, Chisholm, Alexkarbon..."
Eastern Metropolitan,26,"{Jellis, HAR, RT, RW, Ray, Miles, Philip, Gary..."
South-Eastern Metropolitan,25,"{Jellis, HAR, Chisholm, Thomson, RW, Ray, Greg..."
Eastern Victoria,11,"{Ray, Fletchers, Barry, HAR, Harcourts, Eview,..."
Northern Victoria,11,"{Ray, Barry, HAR, McDonald, other, YPA, LITTLE..."
Western Victoria,6,"{Ray, HAR, other, YPA, Raine, hockingstuart}"


In [13]:
# Mean price per room count, cheapest first. Only the last expression of a cell
# is rendered automatically, so this intermediate result is shown explicitly
# instead of being computed and thrown away.
display(melb_df.groupby(by='Rooms')['Price'].mean().sort_values())

# Standard deviation of latitude per region, most spread-out region first.
melb_df.groupby(by='Regionname')['Lattitude'].std().sort_values(ascending=False)

Regionname
Eastern Victoria              0.147067
Northern Victoria             0.084455
South-Eastern Metropolitan    0.073411
Western Metropolitan          0.051251
Northern Metropolitan         0.049639
Eastern Metropolitan          0.047890
Southern Metropolitan         0.043080
Western Victoria              0.011579
Name: Lattitude, dtype: float64

In [14]:
# Sales dated from 1 May 2017 to 1 September 2017, both bounds inclusive.
period_mask = (melb_df['Date'] >= '2017-05-01') & (melb_df['Date'] <= '2017-09-01')
# Sum the numeric columns per seller and keep the seller with the smallest total Price.
# numeric_only=True is needed on pandas >= 2.0, where sum() raises TypeError on
# category / datetime columns instead of silently dropping them.
(
    melb_df[period_mask]
    .groupby(by='SellerG')
    .sum(numeric_only=True)
    .sort_values(by='Price')
    .head(1)
)

Unnamed: 0_level_0,Rooms,Price,Distance,Postcode,Bedroom,Bathroom,Car,Landsize,BuildingArea,Lattitude,Longtitude,Propertycount,MeanRoomsSquare,AreaRatio,AgeBuilding
SellerG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
LITTLE,10,2742000.0,45.5,13042,10,5,5,1021.0,448.0,-151.03278,580.15921,24072,77.0,-0.139808,188


## 4. Сводные таблицы

### МЕТОД GROUPBY КАК СПОСОБ ПОСТРОЕНИЯ СВОДНЫХ ТАБЛИЦ

In [15]:
# Only the last expression of a cell is rendered automatically, so the two
# intermediate steps are shown explicitly instead of being computed and discarded.

# Median price and building area per room count.
display(melb_df.groupby('Rooms')[['Price', 'BuildingArea']].median())
# Mean price per (Rooms, Type) pair as a Series with a two-level index.
display(melb_df.groupby(['Rooms', 'Type'])['Price'].mean())
# Same values with the inner index level (Type) moved to columns.
melb_df.groupby(['Rooms', 'Type'])['Price'].mean().unstack()

Type,house,townhouse,unit
Rooms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,866865.5,592704.5,389928.9
2,1017238.0,710158.5,610490.5
3,1109233.0,984708.7,850596.3
4,1462283.0,1217092.0,1037476.0
5,1877327.0,1035000.0,
6,1869508.0,,520000.0
7,1920700.0,,
8,1510286.0,,2250000.0
10,900000.0,,


### МЕТОД PIVOT_TABLE ДЛЯ ПОСТРОЕНИЯ СВОДНЫХ ТАБЛИЦ

In [16]:
# pivot_table aggregates with the mean by default: mean Price for every
# Rooms x Type combination, with missing combinations filled with 0.
mean_price_by_rooms_type = melb_df.pivot_table(
    index='Rooms',
    columns='Type',
    values='Price',
    fill_value=0,
)
# Round to whole units for readability.
mean_price_by_rooms_type.round()

Type,house,townhouse,unit
Rooms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,866866.0,592705.0,389929.0
2,1017238.0,710158.0,610491.0
3,1109233.0,984709.0,850596.0
4,1462283.0,1217092.0,1037476.0
5,1877327.0,1035000.0,0.0
6,1869508.0,0.0,520000.0
7,1920700.0,0.0,0.0
8,1510286.0,0.0,2250000.0
10,900000.0,0.0,0.0


In [17]:
# Number of non-null Price values per region, split by the Weekend flag.
sales_by_region_weekend = melb_df.pivot_table(
    index='Regionname',
    columns='Weekend',
    values='Price',
    aggfunc='count',
)
sales_by_region_weekend

Weekend,0,1
Regionname,Unnamed: 1_level_1,Unnamed: 2_level_1
Eastern Metropolitan,447,1024
Eastern Victoria,13,40
Northern Metropolitan,1258,2632
Northern Victoria,11,30
South-Eastern Metropolitan,123,327
Southern Metropolitan,1534,3161
Western Metropolitan,960,1988
Western Victoria,8,24


In [18]:
# Median and mean Landsize per region and property type; passing a list of
# aggregations produces two-level column labels (aggregation, Type).
landsize_by_region_type = melb_df.pivot_table(
    index='Regionname',
    columns='Type',
    values='Landsize',
    aggfunc=['median', 'mean'],
    fill_value=0,
)
landsize_by_region_type

Unnamed: 0_level_0,median,median,median,mean,mean,mean
Type,house,townhouse,unit,house,townhouse,unit
Regionname,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Eastern Metropolitan,674.0,233.5,203,717.422847,269.440678,330.444444
Eastern Victoria,843.0,0.0,230,3108.96,0.0,295.333333
Northern Metropolitan,459.5,134.0,0,619.249092,317.325733,495.026538
Northern Victoria,724.0,0.0,0,3355.463415,0.0,0.0
South-Eastern Metropolitan,630.5,240.0,199,664.306701,212.16,357.864865
Southern Metropolitan,586.0,246.0,0,569.643881,278.858824,466.380245
Western Metropolitan,531.0,198.0,62,507.883406,244.560669,557.637232
Western Victoria,599.5,0.0,0,655.5,0.0,0.0


### МНОГОМЕРНЫЕ СВОДНЫЕ ТАБЛИЦЫ

In [19]:
# Median price for every (Method, Type) row pair and every region column.
# Both index and columns accept either a single column name or a list of names.
row_keys = ['Method', 'Type']
median_price_by_method_type = melb_df.pivot_table(
    values='Price',
    index=row_keys,
    columns='Regionname',
    aggfunc='median',
    fill_value=0,
)
median_price_by_method_type

Unnamed: 0_level_0,Regionname,Eastern Metropolitan,Eastern Victoria,Northern Metropolitan,Northern Victoria,South-Eastern Metropolitan,Southern Metropolitan,Western Metropolitan,Western Victoria
Method,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
PI,house,1244000,780000,900000,500000,865000,1725000,870000,630000
PI,townhouse,760000,0,632500,0,1190000,1055000,670000,0
PI,unit,650000,0,410000,0,525000,571250,360000,0
S,house,1127000,675000,920000,555000,883300,1611000,870000,397500
S,townhouse,828000,0,750000,0,875000,1135000,729000,0
S,unit,645750,492000,525500,0,606000,655000,489000,0
SA,house,932500,950000,817500,540000,880000,1390000,772500,0
SA,townhouse,807500,0,425000,0,0,1141000,467500,0
SA,unit,0,0,616000,0,0,580000,571000,0
SP,house,1050000,672500,900000,521000,770000,1521750,865000,360000


### ДОСТУП К ДАННЫМ В СВОДНОЙ ТАБЛИЦЕ

In [20]:
# Keep the Landsize pivot (median and mean per region and type) in a variable
# so that the following cells can index into it.
landsize_aggs = ['median', 'mean']
pivot = melb_df.pivot_table(
    index='Regionname',
    columns='Type',
    values='Landsize',
    aggfunc=landsize_aggs,
    fill_value=0,
)

In [21]:
# Show the column labels of the pivot: a two-level MultiIndex (aggregation, Type).
pivot.columns

MultiIndex([('median',     'house'),
            ('median', 'townhouse'),
            ('median',      'unit'),
            (  'mean',     'house'),
            (  'mean', 'townhouse'),
            (  'mean',      'unit')],
           names=[None, 'Type'])

In [22]:
# First pick the 'mean' block of the two-level columns, then the 'unit' column in it.
mean_landsize = pivot['mean']
display(mean_landsize.loc[:, 'unit'])

Regionname
Eastern Metropolitan          330.444444
Eastern Victoria              295.333333
Northern Metropolitan         495.026538
Northern Victoria               0.000000
South-Eastern Metropolitan    357.864865
Southern Metropolitan         466.380245
Western Metropolitan          557.637232
Western Victoria                0.000000
Name: unit, dtype: float64

In [23]:
# Regions where the mean house Landsize is smaller than the median one.
house_mean = pivot['mean']['house']
house_median = pivot['median']['house']
mask = house_mean < house_median
filtered_pivot = pivot[mask]
display(filtered_pivot)

# Names of the regions that satisfy the condition.
region_names = list(filtered_pivot.index)
print(region_names)

Unnamed: 0_level_0,median,median,median,mean,mean,mean
Type,house,townhouse,unit,house,townhouse,unit
Regionname,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Southern Metropolitan,586.0,246.0,0,569.643881,278.858824,466.380245
Western Metropolitan,531.0,198.0,62,507.883406,244.560669,557.637232


['Southern Metropolitan', 'Western Metropolitan']


In [24]:
import numpy as np

# Seeded generator so the demo values are identical on every run of the notebook.
rng = np.random.default_rng(42)
# A Series whose index has two levels: passing a list of two equally long lists
# as the index builds a MultiIndex (outer level = colour, inner level = direction).
mser = pd.Series(
    rng.random(8),
    index=[
        ['white', 'white', 'white', 'blue', 'blue', 'red', 'red', 'red'],
        ['up', 'down', 'right', 'up', 'down', 'up', 'down', 'left'],
    ],
)
display(mser)
print(mser.index)

white  up       0.388535
       down     0.218799
       right    0.434535
blue   up       0.330765
       down     0.116385
red    up       0.636302
       down     0.365249
       left     0.319900
dtype: float64

MultiIndex([('white',    'up'),
            ('white',  'down'),
            ('white', 'right'),
            ( 'blue',    'up'),
            ( 'blue',  'down'),
            (  'red',    'up'),
            (  'red',  'down'),
            (  'red',  'left')],
           )


In [25]:
# Seeded generator so the demo values are identical on every run of the notebook.
rng = np.random.default_rng(42)
# A 4x4 frame of standard-normal values with two-level labels on both axes.
mframe = pd.DataFrame(
    rng.standard_normal((4, 4)),
    index=[['white', 'white', 'red', 'red'], ['up', 'down', 'up', 'down']],
    columns=[['pen', 'pen', 'paper', 'paper'], [1, 2, 1, 2]],
)

display(mframe)

Unnamed: 0_level_0,Unnamed: 1_level_0,pen,pen,paper,paper
Unnamed: 0_level_1,Unnamed: 1_level_1,1,2,1,2
white,up,0.461972,1.069537,0.848956,-1.240124
white,down,-1.544622,-2.039105,0.508477,0.091677
red,up,-1.30505,-0.082363,-1.201743,-1.596622
red,down,0.52513,-1.443186,-1.157412,0.53343


In [26]:
# Median BuildingArea for every Type (rows) and room count (columns).
median_area_by_type_rooms = melb_df.pivot_table(
    values='BuildingArea',
    index='Type',
    columns='Rooms',
    aggfunc='median',
    fill_value=0,
)
# For each room count take the largest of the per-type medians, smallest first.
largest_median_per_rooms = median_area_by_type_rooms.max()
largest_median_per_rooms.sort_values()

Rooms
1     126.0
2     126.0
3     126.0
8     126.0
10    126.0
4     159.5
6     171.0
5     177.0
7     216.5
dtype: float64

In [27]:
# Median price per seller and property type, ordered by the 'unit' column
# from the most expensive to the cheapest.
median_price_by_seller = melb_df.pivot_table(
    values='Price',
    index='SellerG',
    columns='Type',
    aggfunc='median',
    fill_value=0,
)
median_price_by_seller.sort_values(by='unit', ascending=False)

Type,house,townhouse,unit
SellerG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Nick,2025000,780000,900000
Marshall,1975000,1408500,715000
Cayzer,1505000,1450000,707500
Kay,2220000,1365000,695000
Noel,1400500,990000,693250
Buxton,1323750,1030000,670000
Fletchers,1390000,1238000,653000
Chisholm,1520000,950000,640000
Philip,1035000,701000,636000
RT,1640000,1400000,630000


## 5. Объединение DataFrame: знакомимся с новыми данными

In [28]:
# Load the three movie tables for a first look: rating timestamps,
# the first part of the ratings, and the movie catalogue.
dates_1 = pd.read_csv('data/movies_data/dates.csv')
ratings_1 = pd.read_csv('data/movies_data/ratings1.csv')
movies_1 = pd.read_csv('data/movies_data/movies.csv')

In [29]:
# Convert the 'date' column to datetime so the .dt accessor can be used on it;
# yearfirst=True tells the parser to read ambiguous dates with the year first.
dates_1['date'] = pd.to_datetime(dates_1['date'], yearfirst=True)

In [30]:
# Number of rows per calendar year of the 'date' column, most frequent year first.
rating_years = dates_1['date'].dt.year
rating_years.value_counts()

2000    10061
2017     8198
2007     7114
2016     6703
2015     6616
2018     6418
1996     6040
2005     5813
2012     4656
2008     4351
2009     4158
2006     4059
2003     4014
2001     3922
2002     3478
2004     3279
1999     2439
2010     2301
1997     1916
2011     1690
2013     1664
2014     1439
1998      507
Name: date, dtype: int64

## 6. Объединение DataFrame: concat

In [31]:
# The ratings are stored in two files; the movie catalogue is a separate table.
movies = pd.read_csv('data/movies_data/movies.csv')
ratings1 = pd.read_csv('data/movies_data/ratings1.csv')
ratings2 = pd.read_csv('data/movies_data/ratings2.csv')

In [32]:
# Stack the two rating tables vertically; ignore_index=True renumbers the rows
# so that index labels from the second table do not repeat those of the first.
rating_parts = [ratings1, ratings2]
ratings = pd.concat(rating_parts, ignore_index=True)
display(ratings)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100832,610,166534,4.0
100833,610,168248,5.0
100834,610,168250,5.0
100835,610,168252,5.0


In [33]:
# The concatenated ratings should have exactly one row per timestamp row.
n_ratings = ratings.shape[0]
n_dates = dates_1.shape[0]
print('Число строк в таблице ratings: ', n_ratings)
print('Число строк в таблице dates: ', n_dates)
print(n_ratings == n_dates)

Число строк в таблице ratings:  100837
Число строк в таблице dates:  100836
False


In [34]:
# Compare the last row of the first file with the first row of the second file
# to see whether the boundary row is duplicated.
display(ratings1.tail(1))
display(ratings2.head(1))

Unnamed: 0,userId,movieId,rating
40000,274,5621,2.0


Unnamed: 0,userId,movieId,rating
0,274,5621,2.0


In [35]:
# Remove fully identical rows and renumber the index from 0.
deduplicated = ratings.drop_duplicates(ignore_index=True)
ratings = deduplicated
print('Число строк в таблице ratings: ', ratings.shape[0])

Число строк в таблице ratings:  100836


In [36]:
# Glue the timestamp column to the ratings side by side; rows are paired by
# index label, which works because both frames are numbered 0..n-1.
ratings_dates = pd.concat([ratings, dates_1], axis='columns')
display(ratings_dates.tail(7))

Unnamed: 0,userId,movieId,rating,date
100829,610,164179,5.0,2017-05-03 21:07:11
100830,610,166528,4.0,2017-05-04 06:29:25
100831,610,166534,4.0,2017-05-03 21:53:22
100832,610,168248,5.0,2017-05-03 22:21:31
100833,610,168250,5.0,2017-05-08 19:50:47
100834,610,168252,5.0,2017-05-03 21:19:12
100835,610,170875,3.0,2017-05-03 21:20:15


In [37]:
# Build a two-row frame from a dict that maps column names to value lists.
pd.DataFrame(
    {
        "Name": ["Pankaj", "Lisa"],
        "Surname": ["Sobolev", "Krasnova"],
    }
)

Unnamed: 0,Name,Surname
0,Pankaj,Sobolev
1,Lisa,Krasnova


## 7. Объединение DataFrame: join, merge

### МЕТОД ОБЪЕДИНЕНИЯ JOIN

In [38]:
# Incorrect join #1: without `on`, join() pairs rows by index label, so rating
# row i is matched with movie row i regardless of movieId. The overlapping
# movieId column coming from `movies` receives the '_right' suffix.
joined_false = ratings_dates.join(movies, how='left', rsuffix='_right')
display(joined_false)

# Incorrect join #2: the same index-based pairing, but keeping every row of
# `movies`; here the left table's movieId column receives the '_left' suffix.
joined_false_right = ratings_dates.join(movies, how='right', lsuffix='_left')
display(joined_false_right)

# Correct join: make movieId the index of `movies` and match it against the
# movieId column of the ratings.
movies_by_id = movies.set_index('movieId')
joined = ratings_dates.join(movies_by_id, on='movieId', how='left')
display(joined)

Unnamed: 0,userId,movieId,rating,date,movieId_right,title,genres
0,1,1,4.0,2000-07-30 18:45:03,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,2000-07-30 18:20:47,2.0,Jumanji (1995),Adventure|Children|Fantasy
2,1,6,4.0,2000-07-30 18:37:04,3.0,Grumpier Old Men (1995),Comedy|Romance
3,1,47,5.0,2000-07-30 19:03:35,4.0,Waiting to Exhale (1995),Comedy|Drama|Romance
4,1,50,5.0,2000-07-30 18:48:51,5.0,Father of the Bride Part II (1995),Comedy
...,...,...,...,...,...,...,...
100831,610,166534,4.0,2017-05-03 21:53:22,,,
100832,610,168248,5.0,2017-05-03 22:21:31,,,
100833,610,168250,5.0,2017-05-08 19:50:47,,,
100834,610,168252,5.0,2017-05-03 21:19:12,,,


Unnamed: 0,userId,movieId_left,rating,date,movieId,title,genres
0,1,1,4.0,2000-07-30 18:45:03,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,2000-07-30 18:20:47,2,Jumanji (1995),Adventure|Children|Fantasy
2,1,6,4.0,2000-07-30 18:37:04,3,Grumpier Old Men (1995),Comedy|Romance
3,1,47,5.0,2000-07-30 19:03:35,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,1,50,5.0,2000-07-30 18:48:51,5,Father of the Bride Part II (1995),Comedy
...,...,...,...,...,...,...,...
9737,64,3481,4.0,2006-10-22 12:37:45,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,64,3489,3.0,2006-10-22 23:28:09,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,64,3499,4.5,2006-10-22 23:26:41,193585,Flint (2017),Drama
9740,64,3510,3.0,2006-10-22 23:27:26,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


Unnamed: 0,userId,movieId,rating,date,title,genres
0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,2000-07-30 18:37:04,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,2000-07-30 19:03:35,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,2000-07-30 18:48:51,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,2017-05-03 21:53:22,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,2017-05-03 22:21:31,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,2017-05-08 19:50:47,Get Out (2017),Horror
100834,610,168252,5.0,2017-05-03 21:19:12,Logan (2017),Action|Sci-Fi


### МЕТОД ОБЪЕДИНЕНИЯ MERGE

In [39]:
# merge() matches on a key column directly, so no set_index() is needed:
# every rating row is kept and enriched with the title and genres of its movie.
merged = pd.merge(ratings_dates, movies, how='left', on='movieId')
display(merged)

Unnamed: 0,userId,movieId,rating,date,title,genres
0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,2000-07-30 18:37:04,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,2000-07-30 19:03:35,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,2000-07-30 18:48:51,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,2017-05-03 21:53:22,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,2017-05-03 22:21:31,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,2017-05-08 19:50:47,Get Out (2017),Horror
100834,610,168252,5.0,2017-05-03 21:19:12,Logan (2017),Action|Sci-Fi


In [40]:
# A left merge on a key that is unique in `movies` must not change the row count.
n_before = ratings_dates.shape[0]
n_after = merged.shape[0]
print('Число строк в таблице ratings_dates: ', n_before)
print('Число строк в таблице merged: ', n_after)
print(n_before == n_after)

Число строк в таблице ratings_dates:  100836
Число строк в таблице merged:  100836
True


#### ОСОБЕННОСТИ ИСПОЛЬЗОВАНИЯ MERGE()

In [41]:
# An outer merge also keeps rows of `movies` whose movieId has no match in the
# ratings; their rating columns are filled with NaN / NaT.
merged2 = pd.merge(ratings_dates, movies, how='outer', on='movieId')
print('Число строк в таблице merged2: ', merged2.shape[0])
display(merged2.tail())

Число строк в таблице merged2:  100854


Unnamed: 0,userId,movieId,rating,date,title,genres
100849,,30892,,NaT,In the Realms of the Unreal (2004),Animation|Documentary
100850,,32160,,NaT,Twentieth Century (1934),Comedy
100851,,32371,,NaT,Call Northside 777 (1948),Crime|Drama|Film-Noir
100852,,34482,,NaT,"Browning Version, The (1951)",Drama
100853,,85565,,NaT,Chalet Girl (2011),Comedy|Romance


In [42]:
# With no `on` argument merge() uses all columns the two frames share as the key,
# so an outer merge of the two parts acts as a union of their rows.
merge_ratings = pd.merge(ratings1, ratings2, how='outer')
print('Число строк в таблице merge_ratings: ', merge_ratings.shape[0])
display(merge_ratings)

Число строк в таблице merge_ratings:  100836


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [43]:
# Two small frames that share only the key column 'C'.
a = pd.DataFrame(
    {
        'A': ['a', 'b', 'c'],
        'B': [103, 214, 124],
        'C': [1, 4, 2],
    }
)
b = pd.DataFrame(
    {
        'V': ['d', 'b', 'c'],
        'U': [1393.7, 9382.2, 1904.5],
        'C': [1, 3, 2],
    }
)
display(a)
display(b)
# A right merge keeps every row of `b`; keys missing from `a` produce NaN on the left side.
pd.merge(a, b, how='right', on='C')

Unnamed: 0,A,B,C
0,a,103,1
1,b,214,4
2,c,124,2


Unnamed: 0,V,U,C
0,d,1393.7,1
1,b,9382.2,3
2,c,1904.5,2


Unnamed: 0,A,B,C,V,U
0,a,103.0,1,d,1393.7
1,,,3,b,9382.2
2,c,124.0,2,c,1904.5


In [44]:
# Stock table: one row per item with its vendor and the number of units in stock.
items_df = pd.DataFrame(
    {
        'item_id': [417283, 849734, 132223, 573943, 19475, 3294095, 382043, 302948, 100132, 312394],
        'vendor': ['Samsung', 'LG', 'Apple', 'Apple', 'LG', 'Apple', 'Samsung', 'Samsung', 'LG', 'ZTE'],
        'stock_count': [54, 33, 122, 18, 102, 43, 77, 143, 60, 19],
    }
)

# Purchase table: one row per purchased item with its price.
purchase_df = pd.DataFrame(
    {
        'purchase_id': [101, 101, 101, 112, 121, 145, 145, 145, 145, 221],
        'item_id': [417283, 849734, 132223, 573943, 19475, 3294095, 382043, 302948, 103845, 100132],
        'price': [13900, 5330, 38200, 49990, 9890, 33000, 67500, 34500, 89900, 11400],
    }
)

display(items_df, purchase_df)

# An inner merge keeps only the items that appear in both tables.
merged_items = pd.merge(items_df, purchase_df, how='inner', on='item_id')

# Units in stock multiplied by the price, per item.
stock_value = merged_items['stock_count'] * merged_items['price']
merged_items['total'] = stock_value

display(merged_items)

# Sum of the per-item totals.
merged_items['total'].sum()

Unnamed: 0,item_id,vendor,stock_count
0,417283,Samsung,54
1,849734,LG,33
2,132223,Apple,122
3,573943,Apple,18
4,19475,LG,102
5,3294095,Apple,43
6,382043,Samsung,77
7,302948,Samsung,143
8,100132,LG,60
9,312394,ZTE,19


Unnamed: 0,purchase_id,item_id,price
0,101,417283,13900
1,101,849734,5330
2,101,132223,38200
3,112,573943,49990
4,121,19475,9890
5,145,3294095,33000
6,145,382043,67500
7,145,302948,34500
8,145,103845,89900
9,221,100132,11400


Unnamed: 0,item_id,vendor,stock_count,purchase_id,price,total
0,417283,Samsung,54,101,13900,750600
1,849734,LG,33,101,5330,175890
2,132223,Apple,122,101,38200,4660400
3,573943,Apple,18,112,49990,899820
4,19475,LG,102,121,9890,1008780
5,3294095,Apple,43,145,33000,1419000
6,382043,Samsung,77,145,67500,5197500
7,302948,Samsung,143,145,34500,4933500
8,100132,LG,60,221,11400,684000


19729490

## 8. Закрепление знаний 
***Перезагрузка таблицы и импорт библиотек***

In [1]:
import pandas as pd
import re
# Read the ratings_movies.csv table into movies_new.
movies_new = pd.read_csv('data/movies_data/ratings_movies.csv')

def get_year_release(arg):
    """Extract the release year from a movie title such as 'Toy Story (1995)'.

    The year is taken from the first group of exactly four digits enclosed
    in parentheses.

    Args:
        arg: Movie title. Non-string values (for example the float NaN that
            pandas uses for a missing title) are treated as having no year.

    Returns:
        The year as an int, or None when no "(DDDD)" group is present.
    """
    # A missing title is not a string; the regex call would raise TypeError on it.
    if not isinstance(arg, str):
        return None
    # Look for the first "(DDDD)" occurrence.
    found = re.search(r'\(\d{4}\)', arg)
    if found is None:
        # No year in the title.
        return None
    # Drop the surrounding "(" and ")" and convert the digits to a number.
    return int(found.group()[1:-1])

In [2]:

# Derive the release year from every title.
release_years = movies_new['title'].apply(get_year_release)
movies_new['year_release'] = release_years
# Number of titles for which no year could be extracted.
movies_new['year_release'].isna().sum()

18

In [42]:
# Equivalent pivot_table formulation kept for reference (not executed):
#   movies_new[movies_new['year_release'] == 1999].pivot_table(
#       index='title',
#       columns='year_release',
#       values='rating',
#       aggfunc='mean',
#       fill_value=0
#   ).sort_values(by=1999.0)

# Mean rating of every movie released in 1999, lowest rated first.
mask = movies_new['year_release'] == 1999
released_1999 = movies_new[mask]
mean_rating_by_title = released_1999.groupby('title')['rating'].mean()
mean_rating_by_title.sort_values()

title
Bloodsport: The Dark Kumite (1999)            0.5
Simon Sez (1999)                              1.0
Chill Factor (1999)                           1.0
Source, The (1999)                            1.0
Trippin' (1999)                               1.0
                                             ... 
Trailer Park Boys (1999)                      5.0
Larry David: Curb Your Enthusiasm (1999)      5.0
Sun Alley (Sonnenallee) (1999)                5.0
George Carlin: You Are All Diseased (1999)    5.0
Five Senses, The (1999)                       5.0
Name: rating, Length: 261, dtype: float64

In [4]:
# Mean rating per genre combination for movies released in 2010, lowest first.
# year_release is a float column, hence the 2010.0 column label.
released_2010 = movies_new[movies_new['year_release'] == 2010]
genre_rating_2010 = released_2010.pivot_table(
    values='rating',
    index='genres',
    columns='year_release',
    aggfunc='mean',
    fill_value=0,
)
genre_rating_2010.sort_values(by=2010.0)

year_release,2010.0
genres,Unnamed: 1_level_1
Action|Sci-Fi,1.000000
Action|Adventure|Horror,1.500000
Action|Drama|Fantasy,1.500000
Crime|Romance,1.500000
Adventure|Comedy|Fantasy,1.833333
...,...
Crime,4.750000
Comedy|Musical,5.000000
Animation|Drama|Fantasy|Mystery,5.000000
Adventure|Children|Comedy|Mystery,5.000000


In [5]:

# Per user: number of distinct rating values given and the mean rating.
user_rating_stats = movies_new.pivot_table(
    index='userId',
    values=['rating'],
    aggfunc=['nunique', 'mean'],
    fill_value=0,
)
# Fewest distinct rating values first; ties are broken by the highest mean rating.
sort_keys = [('nunique', 'rating'), ('mean', 'rating')]
user_rating_stats.sort_values(by=sort_keys, ascending=[True, False])

Unnamed: 0_level_0,nunique,mean
Unnamed: 0_level_1,rating,rating
userId,Unnamed: 1_level_2,Unnamed: 2_level_2
53,1,5.000000
49,2,4.261905
404,2,3.400000
609,2,3.270270
251,3,4.869565
...,...,...
308,10,2.426087
517,10,2.386250
298,10,2.363685
567,10,2.245455


In [6]:
# Mean rating and number of ratings per genre combination for movies released in 2018.
released_2018 = movies_new[movies_new['year_release'] == 2018]
movs = released_2018.pivot_table(
    index='genres',
    values='rating',
    aggfunc=['mean', 'count'],
    fill_value=0,
)
# Keep genre combinations with more than 9 ratings; best mean first,
# ties broken by the smaller number of ratings.
frequently_rated = movs[movs['count', 'rating'] > 9]
frequently_rated.sort_values(
    by=[('mean', 'rating'), ('count', 'rating')],
    ascending=[False, True],
)

Unnamed: 0_level_0,mean,count
Unnamed: 0_level_1,rating,rating
genres,Unnamed: 1_level_2,Unnamed: 2_level_2
Action|Adventure|Sci-Fi,3.928571,14
Action|Comedy|Sci-Fi,3.875,12


In [7]:
# Parse the 'date' column into datetimes and store the calendar year
# in a separate 'year_rating' column.
parsed_dates = pd.to_datetime(movies_new['date'])
movies_new['date'] = parsed_dates
movies_new['year_rating'] = movies_new['date'].dt.year

In [8]:
# Mean rating per genre combination (rows) and year_rating (columns).
genre_rating_by_year = movies_new.pivot_table(
    index='genres',
    columns='year_rating',
    values='rating',
    aggfunc='mean',
    fill_value=0,
)
# Take one genre combination and order its years by whether the mean equals 5,
# so that the matching years end up at the bottom of the output.
animation_children_mystery = genre_rating_by_year.loc['Animation|Children|Mystery']
animation_children_mystery.sort_values(key=lambda ratings: ratings == 5)

year_rating
1996    0.0
2016    0.0
2015    0.0
2014    0.0
2013    0.0
2012    0.0
2011    0.0
2010    0.0
2009    0.0
2008    0.0
2017    0.0
2007    0.0
2005    0.0
2004    0.0
2003    0.0
2002    0.0
2001    0.0
2000    0.0
1999    0.0
1998    0.0
1997    0.0
2006    0.0
2018    5.0
Name: Animation|Children|Mystery, dtype: float64

In [35]:
# Both files use ';' as the field separator instead of the default comma.
products = pd.read_csv('data/orders_and_products/products.csv', sep=';')
orders = pd.read_csv('data/orders_and_products/orders.csv', sep=';')


In [36]:
# Render both tables one after the other.
display(orders, products)


Unnamed: 0,Дата создания,Order ID,ID Покупателя,Статус,Оплачен,Отменен,Отгружен,ID товара,Количество
0,09.11.2019 21:55:51,9,10,"Принят, ожидается оплата",Нет,Нет,Нет,103,5
1,09.11.2019 15:05:57,8,9,"Принят, ожидается оплата",Нет,Нет,Нет,86,100
2,09.11.2019 15:05:57,8,9,"Принят, ожидается оплата",Нет,Нет,Нет,104,10
3,09.11.2019 12:50:07,7,8,"Принят, ожидается оплата",Нет,Нет,Нет,104,7
4,09.11.2019 12:00:00,6,1,"Принят, ожидается оплата",Нет,Нет,Нет,104,5
5,09.11.2019 12:00:00,6,1,"Принят, ожидается оплата",Нет,Нет,Нет,103,5
6,08.11.2019 08:36:22,5,5,Отменён,Нет,Да,Нет,124,1
7,08.11.2019 08:36:22,4,9,"Принят, ожидается оплата",Нет,Нет,Да,91,1
8,08.11.2019 08:36:22,3,8,"Оплачен, формируется к отправке",Да,Нет,Нет,103,3
9,08.11.2019 08:36:22,3,8,"Оплачен, формируется к отправке",Да,Нет,Нет,104,3


Unnamed: 0,Product_ID,Name,Price,CURRENCY
0,47,Шатны Полосатый рейс,2999,RUR
1,51,Платье Аленький цветочек,4999,RUR
2,53,Штаны Цветочная Поляна,4999,RUR
3,71,Платье Ночная Жизнь,7999,RUR
4,74,Платье Ночная Жизнь XXXL,8999,RUR
5,86,"Носки Простые, муж",45,RUR
6,91,"Носки Честные, муж",50,RUR
7,103,"Носки Подарочные, муж",199,RUR
8,104,"Носки Подарочные, жен",249,RUR
9,124,Носки беговые Camino,999,RUR


In [38]:
# A plain on='Product_ID' does not apply here (not executed):
#   orders_products = orders.merge(products, how='left', on='Product_ID')
# The key column is called 'ID товара' in orders and 'Product_ID' in products,
# so the two names are given separately.
orders_products = pd.merge(
    orders,
    products,
    how='left',
    left_on='ID товара',
    right_on='Product_ID',
)
# Order ID stored in the last row of the merged table.
last_row = orders_products.tail(1)
last_row['Order ID']


17    0
Name: Order ID, dtype: int64

In [40]:

# Product names of the rows whose 'Отменен' (cancelled) column equals 'Да' (yes).
is_cancelled = orders_products['Отменен'] == 'Да'
orders_products.loc[is_cancelled, 'Name']


6    Носки беговые Camino
Name: Name, dtype: object

In [27]:
# Value of each order line: unit price multiplied by the ordered quantity.
line_value = orders_products['Price'] * orders_products['Количество']
orders_products['total'] = line_value

In [33]:
# Pivot-table variant kept for reference (not executed):
#   orders_products.pivot_table(
#       index='ID Покупателя',
#       columns='Оплачен',
#       values='total',
#       aggfunc='sum'
#   )

# Sum of 'total' per customer over the rows marked as paid ('Оплачен' == 'Да'),
# largest sum first.
paid_orders = orders_products[orders_products['Оплачен'] == 'Да']
paid_per_customer = paid_orders.groupby('ID Покупателя')['total'].sum()
paid_per_customer.sort_values(ascending=False)


ID Покупателя
7    17096.0
5    13043.0
8     1344.0
1        0.0
Name: total, dtype: float64