# 라이브러리 로드

In [29]:
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

# Stage 1

## 데이터 로드

In [8]:
# Load 'train_1.csv' into a DataFrame and print its column/dtype/memory summary.
train_df1 = pd.read_csv('train_1.csv')
train_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145063 entries, 0 to 145062
Columns: 551 entries, Page to 2016-12-31
dtypes: float64(550), object(1)
memory usage: 609.8+ MB


In [9]:
# Preview the first rows of the wide training table (one column per date).
train_df1.head()

Unnamed: 0,Page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,...,2016-12-22,2016-12-23,2016-12-24,2016-12-25,2016-12-26,2016-12-27,2016-12-28,2016-12-29,2016-12-30,2016-12-31
0,2NE1_zh.wikipedia.org_all-access_spider,18.0,11.0,5.0,13.0,14.0,9.0,9.0,22.0,26.0,...,32.0,63.0,15.0,26.0,14.0,20.0,22.0,19.0,18.0,20.0
1,2PM_zh.wikipedia.org_all-access_spider,11.0,14.0,15.0,18.0,11.0,13.0,22.0,11.0,10.0,...,17.0,42.0,28.0,15.0,9.0,30.0,52.0,45.0,26.0,20.0
2,3C_zh.wikipedia.org_all-access_spider,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,4.0,...,3.0,1.0,1.0,7.0,4.0,4.0,6.0,3.0,4.0,17.0
3,4minute_zh.wikipedia.org_all-access_spider,35.0,13.0,10.0,94.0,4.0,26.0,14.0,9.0,11.0,...,32.0,10.0,26.0,27.0,16.0,11.0,17.0,19.0,10.0,11.0
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,,,,,,,,,,...,48.0,9.0,25.0,13.0,3.0,11.0,27.0,13.0,36.0,10.0


In [10]:
# Load 'key_1.csv' into a DataFrame and print its column/dtype/memory summary.
pred_df1 = pd.read_csv('key_1.csv')
pred_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8703780 entries, 0 to 8703779
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Page    object
 1   Id      object
dtypes: object(2)
memory usage: 132.8+ MB


In [11]:
# Preview the first rows of the key table.
pred_df1.head()

Unnamed: 0,Page,Id
0,!vote_en.wikipedia.org_all-access_all-agents_2...,bf4edcf969af
1,!vote_en.wikipedia.org_all-access_all-agents_2...,929ed2bf52b9
2,!vote_en.wikipedia.org_all-access_all-agents_2...,ff29d0f51d5c
3,!vote_en.wikipedia.org_all-access_all-agents_2...,e98873359be6
4,!vote_en.wikipedia.org_all-access_all-agents_2...,fa012434263a


## 훈련데이터 전처리 & EDA

In [13]:
# Count distinct values in the 'Page' column (nunique() excludes NaN by default).
train_df1['Page'].nunique()

145063

In [16]:
# Number of missing values in each date column, largest count first.
(
    train_df1
    .drop(columns='Page')
    .isna()
    .sum()
    .sort_values(ascending=False)
)

2015-07-02    20816
2015-07-01    20740
2015-07-07    20664
2015-07-05    20659
2015-07-04    20654
              ...  
2016-12-12     3538
2016-12-31     3465
2016-12-20     3268
2016-12-21     3236
2016-12-24     3189
Length: 550, dtype: int64

In [19]:
# Reshape the wide table (one column per date) into long format:
# one row per (Page, Date) pair carrying that day's Visits value.
train_df1_pivot = train_df1.melt(
    id_vars='Page',
    var_name='Date',
    value_name='Visits',
)
train_df1_pivot.head()

Unnamed: 0,Page,Date,Visits
0,2NE1_zh.wikipedia.org_all-access_spider,2015-07-01,18.0
1,2PM_zh.wikipedia.org_all-access_spider,2015-07-01,11.0
2,3C_zh.wikipedia.org_all-access_spider,2015-07-01,1.0
3,4minute_zh.wikipedia.org_all-access_spider,2015-07-01,35.0
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,2015-07-01,


In [21]:
# Convert the 'Date' column (former column labels, i.e. strings) to datetime64,
# then show the resulting dtype to confirm the conversion.
train_df1_pivot['Date'] = pd.to_datetime(train_df1_pivot['Date'])
train_df1_pivot['Date'].dtype

dtype('<M8[ns]')

In [27]:
# Average visits per day across all pages.
# Only 'Visits' is aggregated: 'Page' is a string column, and since pandas 2.0
# GroupBy.mean() no longer silently drops non-numeric columns (it raises
# TypeError). The result keeps the same two columns: 'Date' and 'Visits'.
train_df1_pivot_groupby_date = (
    train_df1_pivot.groupby('Date')['Visits'].mean().reset_index()
)

# Line chart of the daily mean.
fig = px.line(train_df1_pivot_groupby_date, x='Date', y='Visits', title='Average Visits per day')
fig.show()

In [35]:
# Inspect the index of the per-date aggregate.
train_df1_pivot_groupby_date.index

RangeIndex(start=0, stop=550, step=1)

In [50]:
# Chronological 80/20 cut over the per-date aggregate: the first 80% of the
# rows form the training window; show the dates (column 0) that fall inside it.
train_size = int(0.8 * len(train_df1_pivot_groupby_date))
train_df1_pivot_groupby_date.iloc[:train_size, 0]

0     2015-07-01
1     2015-07-02
2     2015-07-03
3     2015-07-04
4     2015-07-05
         ...    
435   2016-09-08
436   2016-09-09
437   2016-09-10
438   2016-09-11
439   2016-09-12
Name: Date, Length: 440, dtype: datetime64[ns]

In [79]:
# Chronological train/test split of the wide table, each half melted to long format.
# `train_due_date` is the last day (inclusive) of the training window; the test
# window starts at the column right after it. The boundary position is looked up
# from the label so the two slices cannot drift apart if the date is changed.
train_due_date = '2016-09-12'
test_start_col = train_df1.columns.get_loc(train_due_date) + 1

# Label slices are end-inclusive, so this keeps 'Page' through `train_due_date`.
train_df1_trainset = pd.melt(train_df1.loc[:, :train_due_date], id_vars='Page', var_name='Date', value_name='Visits')
# Put 'Page' back in front of the remaining date columns before melting.
train_df1_testset = pd.melt(train_df1.loc[:, train_df1.columns[test_start_col:].insert(0, 'Page')], id_vars='Page', var_name='Date', value_name='Visits')

# Row counts of the two long-format splits.
len(train_df1_trainset), len(train_df1_testset)

(63827720, 15956930)

In [80]:
# Display the long-format training split.
train_df1_trainset

Unnamed: 0,Page,Date,Visits
0,2NE1_zh.wikipedia.org_all-access_spider,2015-07-01,18.0
1,2PM_zh.wikipedia.org_all-access_spider,2015-07-01,11.0
2,3C_zh.wikipedia.org_all-access_spider,2015-07-01,1.0
3,4minute_zh.wikipedia.org_all-access_spider,2015-07-01,35.0
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,2015-07-01,
...,...,...,...
63827715,Underworld_(serie_de_películas)_es.wikipedia.o...,2016-09-12,
63827716,Resident_Evil:_Capítulo_Final_es.wikipedia.org...,2016-09-12,
63827717,Enamorándome_de_Ramón_es.wikipedia.org_all-acc...,2016-09-12,
63827718,Hasta_el_último_hombre_es.wikipedia.org_all-ac...,2016-09-12,


In [83]:
# Display the long-format test split.
train_df1_testset

Unnamed: 0,Page,Date,Visits
0,2NE1_zh.wikipedia.org_all-access_spider,2016-09-13,11.0
1,2PM_zh.wikipedia.org_all-access_spider,2016-09-13,37.0
2,3C_zh.wikipedia.org_all-access_spider,2016-09-13,4.0
3,4minute_zh.wikipedia.org_all-access_spider,2016-09-13,11.0
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,2016-09-13,13.0
...,...,...,...
15956925,Underworld_(serie_de_películas)_es.wikipedia.o...,2016-12-31,10.0
15956926,Resident_Evil:_Capítulo_Final_es.wikipedia.org...,2016-12-31,
15956927,Enamorándome_de_Ramón_es.wikipedia.org_all-acc...,2016-12-31,
15956928,Hasta_el_último_hombre_es.wikipedia.org_all-ac...,2016-12-31,
