In [None]:
import pandas as pd

try:
    # scikit-learn removed its vendored copy of joblib in 0.23; prefer the
    # standalone package and fall back only on old environments.
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
def standarize_df(df, src):
    """Map one raw source frame onto the common event schema.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw events. Must contain the columns ``place_name``, ``street``,
        ``start_time``, ``name`` and ``description``; ``facebook_id`` and
        ``owner_name`` are used when present.
    src : str
        Short tag of the source. It is stored in the ``src`` column and is
        also used as the name of an extra column holding the row id as text.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with the columns ``id``, optionally ``facebook_id``,
        ``place_name``, ``street``, ``start_time``, ``name``,
        ``description``, ``src`` and ``<src>``. For each
        (name, place_name, start_time) combination only the row with the
        highest ``id`` is kept.

    Notes
    -----
    Columns are copied from ``df`` with pandas label alignment, so ``df`` is
    expected to carry the default 0..n-1 index (as ``pd.read_csv`` produces
    when no ``index_col`` is given).
    """
    std_df = pd.DataFrame()
    std_df['id'] = df.index

    if 'facebook_id' in df.columns:
        std_df['facebook_id'] = df['facebook_id'].astype(str)

    # A missing place name falls back to the owner name when the source has one.
    place_name = df['place_name']
    if 'owner_name' in df.columns:
        place_name = place_name.fillna(df['owner_name'])
    std_df['place_name'] = place_name.fillna('').apply(remove_special_chars)

    # Series.replace() only swaps cells whose *entire* value equals the token,
    # so it never stripped the "ul." prefix or the city name from an address.
    # .str.replace(..., regex=False) removes the literal substrings instead.
    std_df['street'] = (
        df['street']
        .fillna('')
        .astype(str)
        .str.replace('ul.', '', regex=False)
        .str.replace('Katowice', '', regex=False)
        .apply(remove_special_chars)
    )
    std_df['start_time'] = pd.to_datetime(df['start_time'])
    std_df['name'] = df['name'].fillna('').apply(remove_special_chars)
    std_df['description'] = df['description'].fillna('').apply(remove_special_chars)
    std_df['src'] = src
    std_df[src] = std_df['id'].astype(str)

    # De-duplicate: after sorting by id descending, head(1) keeps the row with
    # the highest id within each (name, place_name, start_time) group.
    std_df = (
        std_df
        .sort_values('id', ascending=False)
        .groupby(['name', 'place_name', 'start_time'])
        .head(1)
    )
    return std_df

In [None]:
def remove_special_chars(text):
    """Return ``text`` without commas, single quotes and double quotes,
    and with leading/trailing whitespace trimmed."""
    cleaned = text
    for unwanted in (',', '\'', '\"'):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.strip()

In [None]:
def fill_street(df):
    """Impute blank streets from the most common street of the same place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``place_name`` and ``street``; an empty
        string marks an unknown value.

    Returns
    -------
    pandas.Series
        Street column aligned with ``df``. Non-empty streets are kept as they
        are. An empty street is replaced by the modal street recorded for the
        same ``place_name`` (ties resolved by taking the first value in sorted
        order); when no street is known for that place the result is ``''``.
    """
    has_street = df['street'].notna() & (df['street'] != '')
    # A blank place name means "place unknown", not a shared venue, so such
    # rows must neither donate nor receive an imputed street.
    has_place = df['place_name'].notna() & (df['place_name'] != '')
    known = df.loc[has_street & has_place, ['place_name', 'street']]

    if known.empty:
        # Nothing to learn from: avoid grouping an empty frame.
        return df['street'].fillna('')

    # Series.mode() returns the modes sorted, so .iloc[0] is deterministic.
    street_by_place = known.groupby('place_name')['street'].agg(
        lambda streets: streets.mode().iloc[0]
    )
    imputed = df['place_name'].map(street_by_place)
    return df['street'].where(df['street'] != '', imputed).fillna('')

##### Ładowanie danych

In [None]:
# Read the five source exports, in a fixed order, into their raw frames.
fb_raw, ss_raw, cjg_raw, sk_raw, um_raw = map(
    pd.read_csv,
    (
        '../data/fb.csv',
        '../data/silesiaspace.csv',
        '../data/cojestgrane.csv',
        '../data/silesiakultura.csv',
        '../data/ultramaryna.csv',
    ),
)

##### Standaryzacja danych

In [None]:
# Bring every raw frame onto the common schema, tagging it with its source code.
fb, ss, cjg, sk, um = (
    standarize_df(raw_frame, source_tag)
    for raw_frame, source_tag in (
        (fb_raw, 'fb'),
        (ss_raw, 'ss'),
        (cjg_raw, 'cjg'),
        (sk_raw, 'sk'),
        (um_raw, 'um'),
    )
)

##### Konkatenacja danych i imputacja adresów

In [None]:
# Stack all standardized sources, then overwrite `street` with the imputed
# values computed by fill_street on the stacked frame.
concatenation = (
    pd.concat([fb, ss, cjg, sk, um], ignore_index=True, sort=False)
    .assign(street=fill_street)
)

##### Serializacja danych

In [None]:
# Persist each standardized source frame ...
joblib.dump(value=fb, filename='../pickles/fb.pkl')
joblib.dump(value=ss, filename='../pickles/ss.pkl')
joblib.dump(value=cjg, filename='../pickles/cjg.pkl')
joblib.dump(value=sk, filename='../pickles/sk.pkl')
joblib.dump(value=um, filename='../pickles/um.pkl')
# ... and the combined frame with imputed streets.
joblib.dump(value=concatenation, filename='../pickles/concatenation.pkl')