In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from itertools import product
from sklearn.preprocessing import LabelEncoder
from datetime import timedelta
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = (14, 10)
plt.rcParams['savefig.dpi']=400

In [2]:
dau=pd.read_csv("practice_dau_and.csv",sep="|",header=None,engine="c",names=["id","country","number_of_pgr","actdate"],
                dtype={"number_of_pgr":"int16"},
                parse_dates=[3],na_filter=False).sort_values(by=["id","actdate","number_of_pgr"]).drop_duplicates(
    subset=["id","actdate"],keep="last")
pact=pd.read_csv("practice_pact_and.csv",sep="|",header=None,engine="c",names=
                ["id","actdate","regdate","clusters_ml_ended","revenue","transactions","quests_end","m_quests_end",
                "crystals_value_spend","store_enters"],usecols=["id","actdate","regdate","clusters_ml_ended",
                "revenue","transactions","quests_end","m_quests_end"],dtype={"clusters_ml_ended":"int8",
                "revenue":"float32","transactions":"int8","quests_end":"int16","m_quests_end":"int16"},
                 parse_dates=[1,2])
pact=pact.drop("regdate",axis=1).join(pact.groupby("id").regdate.min().to_frame(),on="id")
pact=pd.merge(dau,pact,on=["id","actdate"],copy=False)
lenc_cntr=LabelEncoder()
lenc_id=LabelEncoder()
pact.country=lenc_cntr.fit_transform(pact.country.values).astype("uint8")
pact.id=lenc_cntr.fit_transform(pact.id.values).astype("uint32")
del dau

In [17]:
def split_data(pact,days_obs,days_pred,testweek=False):
    y=pact[(pact.actdate-pact.regdate)<=timedelta(days=days_pred-1)].groupby("id").revenue.sum().to_frame()
    y.loc[y.revenue>1000,"revenue"]=1000
    pact_train=pact[(pact.actdate-pact.regdate)<=timedelta(days=days_obs-1)].copy()
    pact_train["day"]=((pact_train["actdate"]-pact_train["regdate"])/np.timedelta64(1, 'D')).astype("int8")
    pact_train=pact_train.set_index(["id","day"]).reindex(pd.MultiIndex.from_product([pact_train["id"].unique(), range(days_obs)], names=["id", "day"]))
    pact_train.regdate.fillna(method="ffill",inplace=True)
    pact_train.country.fillna(method="ffill",inplace=True)
    pact_train.actdate=pd.to_timedelta(pact_train.index.get_level_values(level=1),"d")+pact_train.regdate
    pact_train.fillna(0,inplace=True)
    pact_train["regmonth"]=pact_train.regdate.dt.month
    pact_train["regday"]=pact_train.regdate.dt.day
    pact_train["actmonth"]=pact_train.actdate.dt.month
    pact_train["actday"]=pact_train.actdate.dt.day
    pact_train=pact_train.drop(["regdate","actdate"],axis=1).unstack().drop(
        [col for col in product(["regmonth","regday","country"],range(1,days_obs))],axis=1)
    pact_train[[("country",0)]+[col for col in product(["transactions","actmonth","actday"],range(days_obs))]]=pact_train[[
    ("country",0)]+[col for col in product(["transactions","actmonth","actday"],range(days_obs))]].astype("uint8")
    pact_train[[col for col in product(["clusters_ml_ended"],range(days_obs))]]=pact_train[
        [col for col in product(["clusters_ml_ended"],range(days_obs))]].astype("int8")
    pact_train[[col for col in product(["revenue"],range(days_obs))]]=pact_train[
        [col for col in product(["revenue"],range(days_obs))]].astype("float32")
    pact_train[[col for col in product(["number_of_pgr","quests_end","m_quests_end"],range(days_obs))]]=pact_train[[
        col for col in product(["number_of_pgr","quests_end","m_quests_end"],range(days_obs))]].astype("int16")
    pact_train.columns=["".join(map(str,col)) for col in pact_train.columns]
    pact_train.rename(columns={"regmonth0":"regmonth","regday0":"regday","country0":"country"},inplace=True)
    pact_train.index.name=None
    cleaned=pd.merge(pact_train,y,left_index=True,right_index=True)
    if testweek:
        train=cleaned[(cleaned.regmonth==1)|((cleaned.regmonth==2)&cleaned.regday<13)]
        test=[cleaned[(cleaned.regmonth==2)&(cleaned.regday==day)] for day in range(13,20)]
        return train,test
    else:
        train=cleaned[(cleaned.regmonth<3)|((cleaned.regmonth==3)&(cleaned.regday<17))]
        test=cleaned[~((cleaned.regmonth<3)|((cleaned.regmonth==3)&(cleaned.regday<17)))]
        print("Finished processing data")
        return train,test

In [4]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
import lightgbm
import sklearn
from multiprocessing import Pool, cpu_count
from functools import partial

In [5]:
# train,test=split_data(pact,7,30)
# regr1=LGBMRegressor(n_estimators=58)
# regr1.fit(train.drop("revenue",axis=1),train.revenue)
# print(r2_score(test.revenue,regr1.predict(test.drop("revenue",axis=1))),
#      mean_squared_error(test.revenue,regr1.predict(test.drop("revenue",axis=1))),
#      np.sqrt(mean_squared_error(test.revenue,regr1.predict(test.drop("revenue",axis=1)))),
#      mean_absolute_error(test.revenue,regr1.predict(test.drop("revenue",axis=1))))
# plot_importance(regr1)

In [6]:

# print(true_sum,regr1.predict(oneday.drop("revenue",axis=1)).sum())

In [20]:
def fit(regr,X,y):
    return regr.fit(X,y)

def get_models_scores(pact,days_obs,days_pred):
    print(str(days_obs)+"-"+str(days_pred))
    train,test=split_data(pact,days_obs,days_pred)
    regr1=LGBMRegressor(n_estimators=100)
    regr2=ElasticNet()
    regr3=RandomForestRegressor()
    regr4=ExtraTreesRegressor()
    regr5=GradientBoostingRegressor(subsample=0.85)
    x_train,y_train=train.drop("revenue",axis=1),train.revenue
    x_test,y_test=test.drop("revenue",axis=1),test.revenue
    regr1.fit(x_train,y_train,eval_set=[(x_test,y_test)],callbacks=[early_stopping(10,verbose=False)],verbose=False)
    with Pool(processes=cpu_count()) as p:
        regr2,regr3,regr4,regr5=p.map(partial(fit,X=x_train,y=y_train),[regr2,regr3,regr4,regr5])
    oneday=test[(test.regmonth==5)&(test.regday==1)]
    true_sum=oneday.revenue.sum()
    mse_results=[mean_squared_error(y_test,regr.predict(x_test)) for regr in [regr1,regr2,regr3,regr4,regr5]]
    best=np.argmin(mse_results)
    if best==0:
        lightgbm.plot_importance(regr1,max_num_features=10)
    elif best==2:
        sklearn.ensemble.plot_importance(regr2)
    elif best==3:
        sklearn.ensemble.plot_importance(regr3)
    elif best==4:
        sklearn.ensemble.plot_importance(regr4)
    plt.show()
    mae_results=[mean_absolute_error(y_test,regr.predict(x_test)) for regr in [regr1,regr2,regr3,regr4,regr5]]
    oneday_results=[regr.predict(oneday.drop("revenue",axis=1)).sum() for regr in [regr1,regr2,regr3,regr4,regr5]]
    return [pd.DataFrame(np.sqrt(np.array(mse_results+[np.mean(np.square(test.revenue))])).reshape(1,6),
            index=[str(days_obs)+"-"+str(days_pred)],columns=["LightGBM","ElasticNet","RandomForestRegressor",
            "ExtraTreesRegressor","GradientBoostingRegressor","RMS of revenues"]),
            pd.DataFrame(np.array(mae_results+[np.mean(test.revenue)]).reshape(1,6),index=[
            str(days_obs)+"-"+str(days_pred)],columns=["LightGBM","ElasticNet","RandomForestRegressor",
            "ExtraTreesRegressor","GradientBoostingRegressor","Mean of revenues"]),
            pd.DataFrame(np.array(oneday_results+[true_sum]).reshape(1,6),index=[
            str(days_obs)+"-"+str(days_pred)],columns=["LightGBM","ElasticNet","RandomForestRegressor",
            "ExtraTreesRegressor","GradientBoostingRegressor","True sum"])
           ]

def get_week_results(pact,days_obs,days_pred):
    print(str(days_obs)+"-"+str(days_pred))
    train,test=split_data(pact,days_obs,days_pred,testweek=True)
    regr1=LGBMRegressor(n_estimators=100)
    regr2=ElasticNet()
    regr3=RandomForestRegressor()
    regr4=ExtraTreesRegressor()
    regr5=GradientBoostingRegressor(subsample=0.85)
    x_train,y_train=train.drop("revenue",axis=1),train.revenue
    regr1.fit(x_train,y_train)
    with Pool(processes=cpu_count()) as p:
        regr2,regr3,regr4,regr5=p.map(partial(fit,X=x_train,y=y_train),[regr2,regr3,regr4,regr5])
    return [pd.DataFrame(np.array([regr.predict(test[day].drop("revenue",axis=1)).sum() for regr in [regr1,regr2,regr3,regr4,
            regr5]]+[test[day].revenue.sum()]).reshape(1,6),index=[str(days_obs)+"-"+str(days_pred)],columns=["LightGBM",
            "ElasticNet","RandomForestRegressor","ExtraTreesRegressor","GradientBoostingRegressor","True sum"]
                        ) for day in range(7)]

In [None]:
results_list=[get_models_scores(pact,obs,pred) for obs,pred in product([3,7,15],[15,30,60,100])]

In [None]:
pd.concat([item[0] for item in results_list])

In [None]:
pd.concat([item[1] for item in results_list])

In [None]:
pd.concat([item[2] for item in results_list])

In [None]:
pact.columns[0]

In [None]:
test.loc[test.actmonth2==5,"actday2"].sort_values()

In [None]:
test

In [21]:
week_result_list=[get_week_results(pact,obs,pred) for obs,pred in product([3,7,15],[15,30,60,100])]

3-15
3-30
3-60
3-100
7-15
7-30
7-60
7-100
15-15
15-30
15-60
15-100


In [22]:
pd.concat([item[0] for item in week_result_list])

Unnamed: 0,LightGBM,ElasticNet,RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,True sum
3-15,100885.385618,100316.465612,109656.145898,109101.069946,98614.354339,109101.078125
3-30,110156.010805,108078.133541,124010.530563,120058.369995,109530.289055,120058.375
3-60,116752.721432,112343.863574,131786.067535,127922.209961,115179.451342,127922.203125
3-100,118538.819913,113660.835558,135094.663193,130922.209961,116942.497814,130922.203125
7-15,106370.606682,103899.297066,108328.346924,109101.069946,103199.202245,109101.078125
7-30,117685.732339,112986.553948,121599.312651,120058.369995,112582.359756,120058.375
7-60,124856.715073,117737.274404,127939.438947,127922.209961,117615.695872,127922.203125
7-100,127039.030301,119244.950776,131663.684973,130922.209961,119784.991668,130922.203125
15-15,109017.254602,103333.231926,109209.221918,109101.069946,104354.915179,109101.078125
15-30,119149.946061,112783.803612,121543.72395,120058.369995,112058.166599,120058.375


In [23]:
pd.concat([item[1] for item in week_result_list])

Unnamed: 0,LightGBM,ElasticNet,RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,True sum
3-15,102573.319731,109262.076884,111470.589518,108122.569824,101534.1228,108122.570312
3-30,112036.894985,117246.357202,121283.949481,118131.079834,112069.313285,118131.078125
3-60,118141.211017,121551.924159,126135.207802,120288.589844,117695.144284,120288.59375
3-100,120236.161924,122881.770908,130244.976233,123705.579834,119560.784297,123705.578125
7-15,107786.865811,113336.154658,107808.998846,108122.569824,107577.423636,108122.570312
7-30,119163.396649,122773.179837,124003.089673,118131.079834,117505.35102,118131.078125
7-60,124920.10486,127627.521338,125896.551839,120288.589844,123093.299516,120288.59375
7-100,127566.729191,129175.672308,130658.195678,123705.579834,124523.765445,123705.578125
15-15,108207.018044,111180.076571,108309.80484,108122.569824,104229.766547,108122.570312
15-30,118228.177197,120999.445398,119942.961503,118131.079834,113953.151149,118131.078125


In [24]:
pd.concat([item[2] for item in week_result_list])

Unnamed: 0,LightGBM,ElasticNet,RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,True sum
3-15,133649.300304,138828.113592,140302.658862,137331.139893,132205.069617,137331.140625
3-30,145028.913148,148649.75866,158554.29187,154339.649902,145157.914757,154339.65625
3-60,152836.965272,153876.807622,164157.912885,161339.649902,151825.546307,161339.65625
3-100,154816.640177,155495.908104,167614.608152,164339.649902,153549.978406,164339.65625
7-15,137417.568835,142196.398237,136884.289899,137331.139893,137422.934687,137331.140625
7-30,151294.074875,153795.624799,155310.287528,154339.649902,148863.613953,154339.65625
7-60,158488.823203,159699.938139,158596.017546,161339.649902,154984.19919,161339.65625
7-100,161056.081092,161602.171095,163398.89822,164339.649902,156708.795103,164339.65625
15-15,137246.24717,139127.835176,137179.989893,137331.139893,135108.245127,137331.140625
15-30,150466.597414,151154.345302,154276.212897,154339.649902,144739.388299,154339.65625


In [25]:
pd.concat([item[3] for item in week_result_list])

Unnamed: 0,LightGBM,ElasticNet,RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,True sum
3-15,144193.749696,145435.539986,155556.955124,154152.279846,142533.163832,154152.28125
3-30,156398.098327,156103.935351,172843.567744,167747.979797,156462.765162,167747.984375
3-60,164815.033797,161733.922177,179298.785582,177879.809814,163533.06666,177879.8125
3-100,166872.100879,163487.437532,183827.207772,185981.779785,165639.90139,185981.78125
7-15,148699.277602,153378.813662,155311.445837,154152.279846,149524.97401,154152.28125
7-30,162562.100362,166119.770027,168859.594818,167747.979797,162440.945233,167747.984375
7-60,169951.281356,172492.560646,177634.905038,177879.809814,169304.392702,177879.8125
7-100,174810.573305,174573.360726,181355.575187,185981.779785,171345.081022,185981.78125
15-15,154314.409814,154193.208764,154143.709839,154152.279846,153225.578575,154152.28125
15-30,167174.317889,166985.378412,167632.213831,167747.979797,163764.201661,167747.984375


In [26]:
pd.concat([item[4] for item in week_result_list])

Unnamed: 0,LightGBM,ElasticNet,RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,True sum
3-15,157746.947884,157822.785134,168950.052734,167802.919739,156174.199014,167802.90625
3-30,170364.399003,169002.139081,185344.559233,183624.209656,169894.682342,183624.21875
3-60,177730.051877,174828.809075,196174.622375,195215.729675,177224.926956,195215.734375
3-100,180373.287702,176646.656065,198318.930221,197215.729675,179176.505042,197215.734375
7-15,157204.725957,157204.788468,166354.740729,167802.919739,154931.869873,167802.90625
7-30,170472.096355,169893.612616,183926.762695,183624.209656,166628.418634,183624.21875
7-60,177756.769747,176130.721002,193329.763733,195215.729675,173169.817176,195215.734375
7-100,181769.217491,178197.090347,194656.517719,197215.729675,175359.340372,197215.734375
15-15,167519.308838,160520.807512,167603.307721,167802.919739,158910.185189,167802.90625
15-30,179391.600548,173646.297518,183683.286664,183624.209656,169582.852263,183624.21875


In [27]:
pd.concat([item[5] for item in week_result_list])

Unnamed: 0,LightGBM,ElasticNet,RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,True sum
3-15,181834.74004,168495.940982,190158.011792,186675.569824,179834.512615,186675.5625
3-30,196523.668497,180839.616133,203204.338794,200003.129822,195556.554575,200003.125
3-60,204355.683178,187216.589901,212898.138137,215237.229858,203518.119483,215237.21875
3-100,207645.574876,189197.747326,215458.60882,219079.719849,205916.504168,219079.71875
7-15,182514.919533,167502.860621,187420.224817,186675.569824,180843.445139,186675.5625
7-30,196917.399506,181376.194853,200650.382825,200003.129822,193062.628354,200003.125
7-60,205311.302843,188043.134734,218517.536847,215237.229858,199131.540634,215237.21875
7-100,208339.54829,190254.7823,220962.609821,219079.719849,201747.72942,219079.71875
15-15,186112.365294,169014.794155,186389.125806,186675.569824,179695.137453,186675.5625
15-30,201335.463122,185093.273293,199937.109802,200003.129822,193733.245637,200003.125


In [28]:
pd.concat([item[6] for item in week_result_list])

Unnamed: 0,LightGBM,ElasticNet,RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,True sum
3-15,216690.45395,181274.518119,224477.486049,220202.289734,214928.280487,220202.296875
3-30,231100.121609,194280.190578,241443.143677,232793.809692,230081.759282,232793.8125
3-60,239713.312162,200926.49451,249704.710438,243568.219727,238256.954553,243568.21875
3-100,242511.357631,202982.595993,254294.552479,248636.299744,240437.881311,248636.3125
7-15,221153.491124,190574.901274,224409.037708,220202.289734,219527.075631,220202.296875
7-30,235271.301385,205787.709309,237929.855369,232793.809692,231736.618305,232793.8125
7-60,242849.374206,213020.941056,247821.385411,243568.219727,238346.515762,243568.21875
7-100,246803.208188,215403.003507,250284.916404,248636.299744,240679.509845,248636.3125
15-15,220392.417971,194574.714974,220332.10473,220202.289734,220746.666286,220202.296875
15-30,235589.029551,209875.924717,234835.568726,232793.809692,233665.895412,232793.8125
