In [124]:
import os

import matplotlib
import numpy as np
import pandas as pd

In [316]:
def combine_df(x, data_dir='/Users/peter/Desktop/DF/', months=range(1, 5)):
    """Stack the monthly lead files of one product into a single DataFrame.

    For every month in ``months`` the tab-separated file
    ``<data_dir>/<x>_<MM>.csv`` is read and tagged with a ``month_for_filter``
    column holding that month number. The tag records which calendar month is
    supposed to be taken from that file (e.g. ``month_for_filter == 1`` means
    only January leads are wanted from it); the filtering itself is done later.

    Parameters
    ----------
    x : str
        Product name used as the file-name prefix.
    data_dir : str, optional
        Directory containing the monthly files.
    months : iterable of int, optional
        Month numbers to load; defaults to January..April.

    Returns
    -------
    pandas.DataFrame
        All files stacked with the last month first (the row order the
        original loop produced). The per-file index is kept as is.

    Raises
    ------
    ValueError
        If ``months`` is empty (nothing to concatenate).
    """
    frames = []
    for month in months:
        path = os.path.join(data_dir, '%s_%02d.csv' % (x, month))
        frame = pd.read_csv(path, delimiter='\t', low_memory=False)
        frame['month_for_filter'] = month
        frames.append(frame)
    # One concat instead of re-copying the accumulated frame on every iteration;
    # reversed so that later months come first.
    return pd.concat(frames[::-1])

In [317]:
def preprocessing_1(df):
    """Filter raw leads and keep only the columns needed downstream.

    A row is kept when all of the following hold:
      * ``LEAD_MONTH`` equals ``month_for_filter`` (the lead belongs to the
        month that is taken from its source file),
      * ``CAMPAIGN_MEDIUM`` is ``"PKW"``,
      * ``UNIQ`` equals 1,
      * ``CAMPAIGN`` is neither ``"TM"`` nor ``"VK"``.

    The input frame is not modified (no helper column is written to it).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns LEAD_MONTH, month_for_filter,
        CAMPAIGN_MEDIUM, UNIQ and CAMPAIGN.

    Returns
    -------
    pandas.DataFrame
        The matching rows, restricted to those of the listed working columns
        that are present in ``df``.
    """
    keep = (
        (df['LEAD_MONTH'] == df['month_for_filter'])
        & (df['CAMPAIGN_MEDIUM'] == 'PKW')
        & (df['UNIQ'] == 1)
        # Missing CAMPAIGN values are kept, same as the `!=` comparisons did.
        & ~df['CAMPAIGN'].isin(['TM', 'VK'])
    )
    # Columns required by the later processing steps.
    filtered = ['CAMPAIGN', 'CAMPAIGN_TERM', 'CAMPAIGN_CONTENT', 'applied', 'issued',
                'CAMPAIGN_MEDIUM', 'CAMPAIGN_NUMBER', 'month_for_filter', 'UNIQ',
                'LEAD_DATE', 'COOKIE', 'contacted', 'URL']
    # `.values` makes the mask positional, so a duplicated index is harmless.
    return df.loc[keep.values].filter(items=filtered)

In [318]:
def preprocessing_2(df):
    """Label every lead and aggregate the lead counters per month and label.

    Three labels are derived by a case-insensitive substring search in the
    concatenated text of several columns (missing values take part as their
    ``str()`` form):

      * ``utm_type_ad``     - 'search' if "search" occurs in
        COOKIE + CAMPAIGN_CONTENT + URL, else 'network';
      * ``utm_source``      - 'yandex' if "yandex" occurs in
        CAMPAIGN + COOKIE + URL, else 'google';
      * ``utm_brand_or_no`` - 'brand' if "brand" occurs in
        CAMPAIGN_CONTENT + COOKIE + URL, else 'nobrand'.

    The input frame is not modified; labels are computed column-wise instead
    of with one Python call per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns month_for_filter, CAMPAIGN, CAMPAIGN_CONTENT,
        COOKIE, URL, issued, applied and contacted.

    Returns
    -------
    pandas.DataFrame
        One row per (month_for_filter, utm_source, utm_type_ad,
        utm_brand_or_no) with the sums of issued, applied and contacted.
    """
    def _joined_text(columns):
        # Lower-case each field, then concatenate in the given order.
        parts = [df[name].map(str).str.lower() for name in columns]
        text = parts[0]
        for part in parts[1:]:
            text = text + part
        return text

    def _label(columns, keyword, hit, miss):
        found = _joined_text(columns).str.contains(keyword, regex=False)
        return np.where(found, hit, miss)

    # NOTE(review): "nobrand" contains "brand", so any text mentioning
    # "nobrand" is labelled 'brand' - confirm this matches the campaign naming.
    labelled = df.assign(
        utm_type_ad=_label(['COOKIE', 'CAMPAIGN_CONTENT', 'URL'], 'search', 'search', 'network'),
        utm_source=_label(['CAMPAIGN', 'COOKIE', 'URL'], 'yandex', 'yandex', 'google'),
        utm_brand_or_no=_label(['CAMPAIGN_CONTENT', 'COOKIE', 'URL'], 'brand', 'brand', 'nobrand'),
    )

    group_keys = ['month_for_filter', 'utm_source', 'utm_type_ad', 'utm_brand_or_no']
    return labelled.groupby(group_keys, as_index=False).agg(
        {'issued': 'sum', 'applied': 'sum', 'contacted': 'sum'})

In [319]:
def combine_costs(x, data_dir='/Users/peter/Desktop/DF/'):
    """Load the Google and Yandex cost exports of a product and aggregate them.

    Reads ``<data_dir>/<x>_google_costs.csv`` and ``<x>_yandex_costs.csv``
    (semicolon-separated, six columns that are renamed positionally), converts
    the textual month label to a month number, tags each row with its source
    and derives ``utm_type_ad`` / ``utm_brand_or_no`` from the campaign name
    (case-insensitive substring search for "search" / "brand").

    Parameters
    ----------
    x : str
        Product name used as the file-name prefix.
    data_dir : str, optional
        Directory containing the cost exports.

    Returns
    -------
    pandas.DataFrame
        One row per (month_for_filter, utm_source, utm_type_ad,
        utm_brand_or_no) with the float sums of shows, clicks and costs.

    Raises
    ------
    ValueError
        If a file does not have exactly six columns, contains a month label
        that is not in the month map, or holds values that cannot be
        converted to numbers.
    """
    col = ['month_for_filter', 'campaign', 'utm_campaign_id', 'shows', 'clicks', 'costs']
    month_map = {'апр.19': 4, 'март.19': 3, 'февр.19': 2, 'янв.19': 1, 'май.19': 5,
                 'июнь.19': 6, 'июль.19': 7, 'авг.19': 8}

    frames = []
    for source in ['google', 'yandex']:
        path = os.path.join(data_dir, x + '_' + source + '_costs.csv')
        temp = pd.read_csv(path, delimiter=';', low_memory=False)
        temp.columns = col
        months = temp['month_for_filter'].map(month_map)
        missing = months.isna().values
        if missing.any():
            # Fail with the offending labels instead of a bare NaN-to-int cast error.
            unknown = sorted(set(temp.loc[missing, 'month_for_filter'].map(str)))
            raise ValueError('Unknown month label(s) in %s: %s' % (path, ', '.join(unknown)))
        temp['month_for_filter'] = months.astype('int')
        temp['utm_source'] = source
        frames.append(temp)

    # Single concat of equally shaped frames (no empty seed frame, so no column
    # re-sorting); reversed to keep the stacking order of the original loop.
    df = pd.concat(frames[::-1], ignore_index=True)

    # Strip every whitespace run from string cells (e.g. thousands separators).
    df = df.replace(r'\s+', '', regex=True)
    # Go through str() first: a file without decimal commas is parsed as numbers,
    # and the .str accessor would silently turn those cells into NaN.
    df['costs'] = df['costs'].map(str).str.replace(',', '.', regex=False).astype('float')
    # The id is not part of the result; the cast is kept as a sanity check on the input.
    df['utm_campaign_id'] = df['utm_campaign_id'].astype('int')

    # NOTE(review): "nobrand" contains "brand", so a campaign named "...nobrand..."
    # is labelled 'brand' - confirm this matches the campaign naming.
    campaign = df['campaign'].map(str).str.lower()
    df['utm_type_ad'] = np.where(campaign.str.contains('search', regex=False), 'search', 'network')
    df['utm_brand_or_no'] = np.where(campaign.str.contains('brand', regex=False), 'brand', 'nobrand')

    for column in ('shows', 'clicks'):
        df[column] = df[column].astype(float)

    group_keys = ['month_for_filter', 'utm_source', 'utm_type_ad', 'utm_brand_or_no']
    return df.groupby(group_keys, as_index=False).agg(
        {'shows': 'sum', 'clicks': 'sum', 'costs': 'sum'})

In [320]:
def aggregatefunc(x, y, how='left'):
    """Join two aggregated tables on month, source, ad type and brand label.

    Parameters
    ----------
    x, y : pandas.DataFrame
        Tables that both contain the columns month_for_filter, utm_source,
        utm_type_ad and utm_brand_or_no.
    how : str, optional
        Join type passed to ``DataFrame.merge``. The default ``'left'`` keeps
        every row of ``x`` and drops the groups of ``y`` that have no match in
        ``x``; pass ``'outer'`` to keep unmatched groups from both sides.

    Returns
    -------
    pandas.DataFrame
        The merged table.
    """
    keys = ['month_for_filter', 'utm_source', 'utm_type_ad', 'utm_brand_or_no']
    return x.merge(y, on=keys, how=how)

_________________
_________________


In [344]:
# Product slug: file-name prefix for the input CSVs and part of the export file name.
product_name = 'halva'

In [345]:
# Load and stack the monthly lead files of the product.
df1 = combine_df(product_name)

In [346]:
# Filter the leads and keep only the working columns.
df1 = preprocessing_1(df1)

In [347]:
# Label the leads (source / ad type / brand) and sum issued, applied, contacted per group.
df1 = preprocessing_2(df1)

In [348]:
# Load the Google and Yandex cost files and sum shows, clicks, costs per group.
df1_cost = combine_costs(product_name)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  del sys.path[0]


In [349]:
# Display the aggregated cost table.
df1_cost

Unnamed: 0,month_for_filter,utm_source,utm_type_ad,utm_brand_or_no,shows,clicks,costs
0,1,google,network,nobrand,311477.0,2750.0,63918.588
1,1,google,search,brand,313202.0,86087.0,3716214.228
2,1,google,search,nobrand,83259.0,21573.0,940156.728
3,1,yandex,network,nobrand,864788.0,4821.0,54762.92
4,1,yandex,search,brand,415524.0,115140.0,2720114.58
5,2,google,network,nobrand,1568340.0,8225.0,108687.432
6,2,google,search,brand,353732.0,97617.0,2748475.872
7,2,yandex,network,nobrand,3685687.0,12832.0,209104.65
8,2,yandex,search,brand,394006.0,107319.0,2898549.09
9,3,google,network,nobrand,15054955.0,75270.0,686694.816


In [350]:
# Join the lead aggregates onto the cost aggregates (cost table on the left).
final = aggregatefunc(df1_cost,df1)

______________
______________

### Проверки

In [354]:
# Check costs and issued counts by month and source
aggregatefunc(df1_cost,df1).groupby(['month_for_filter','utm_source']).\
    agg({'issued':'sum', 'costs':'sum','clicks':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,issued,costs,clicks
month_for_filter,utm_source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,google,1605.0,4720289.544,110410.0
1,yandex,1631.0,2774877.5,119961.0
2,google,1437.0,2857163.304,105842.0
2,yandex,1729.0,3107653.74,120151.0
3,google,1307.0,3294922.224,165318.0
3,yandex,1504.0,2968816.18,91342.0
4,google,1241.0,2569884.564,93237.0
4,yandex,1185.0,1082897.92,69139.0


In [352]:
# Issued totals by month and source taken directly from the lead aggregates (before the merge).
df1.groupby(['month_for_filter','utm_source']).agg({'issued':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,issued
month_for_filter,utm_source,Unnamed: 2_level_1
1,google,1605.0
1,yandex,1633.0
2,google,1438.0
2,yandex,1731.0
3,google,1307.0
3,yandex,1504.0
4,google,1241.0
4,yandex,1185.0


In [332]:
# Preview the first 10 rows of the lead aggregates.
df1.head(10)

Unnamed: 0,month_for_filter,utm_source,utm_type_ad,utm_brand_or_no,issued,applied,contacted
0,1,google,search,brand,0.0,31.0,53
1,1,google,search,nobrand,5.0,54.0,99
2,1,yandex,search,brand,0.0,42.0,76
3,1,yandex,search,nobrand,2.0,61.0,153
4,2,google,network,nobrand,1.0,14.0,19
5,2,google,search,brand,4.0,55.0,105
6,2,google,search,nobrand,9.0,104.0,258
7,2,yandex,network,nobrand,0.0,1.0,2
8,2,yandex,search,brand,4.0,87.0,162
9,2,yandex,search,nobrand,4.0,75.0,184


______________
______________

### Результат

In [355]:
# Display the merged cost and lead table.
final

Unnamed: 0,month_for_filter,utm_source,utm_type_ad,utm_brand_or_no,shows,clicks,costs,issued,applied,contacted
0,1,google,network,nobrand,311477.0,2750.0,63918.588,252.0,1979.0,3811
1,1,google,search,brand,313202.0,86087.0,3716214.228,1181.0,8175.0,7258
2,1,google,search,nobrand,83259.0,21573.0,940156.728,172.0,1195.0,1108
3,1,yandex,network,nobrand,864788.0,4821.0,54762.92,10.0,63.0,68
4,1,yandex,search,brand,415524.0,115140.0,2720114.58,1621.0,10211.0,8991
5,2,google,network,nobrand,1568340.0,8225.0,108687.432,188.0,1197.0,2008
6,2,google,search,brand,353732.0,97617.0,2748475.872,1249.0,9267.0,7786
7,2,yandex,network,nobrand,3685687.0,12832.0,209104.65,28.0,204.0,210
8,2,yandex,search,brand,394006.0,107319.0,2898549.09,1701.0,10877.0,8933
9,3,google,network,nobrand,15054955.0,75270.0,686694.816,108.0,1330.0,1108


In [356]:
# Export the merged table to an Excel workbook.
# The context manager writes and closes the workbook on exit;
# ExcelWriter.save() is no longer available in pandas 2.x.
with pd.ExcelWriter('EXPORT_'+product_name+'.xlsx', engine='xlsxwriter') as writer:
    final.to_excel(writer, sheet_name='Sheet1')