# Train-Test Split

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split

In [2]:
# Load the merged content-views table that the rest of the notebook works on
merged_views_csv = 'content_views_merged.csv'
data = pd.read_csv(merged_views_csv)

In [3]:
# Only the join key and the processed text are used, so read just those two
# columns instead of loading the whole file and discarding the rest.
content_columns = ['id', 'processed_content']
formatted = pd.read_csv('formatted_content.csv', usecols = content_columns)
# usecols keeps the file's own column order; re-select to pin the order
formatted = formatted[content_columns]

The merge-and-`dropna` step below drops ~400 articles that have no content (e.g. opinion art).

In [4]:
# Left-join the processed text onto the articles by id, then discard every row
# that has a missing value in ANY column (this includes articles whose id had
# no match in `formatted`, i.e. no processed content).
data = data.merge(formatted, how = 'left', on = 'id').dropna()

In [5]:
# Parse the publication timestamps in one vectorized call. The explicit format
# keeps parsing strict (a malformed value raises), and unlike a row-wise
# strptime this does not fail if the cell is re-run on already-parsed values.
data['published_date'] = pd.to_datetime(data['published_date'], format = '%Y-%m-%d %H:%M:%S')
data = data.sort_values('published_date', ascending = True)
# Year-month key ('YYYY-MM'), used to stratify the train-test split
data['year_month'] = data['published_date'].dt.strftime('%Y-%m')
# Remove the 2012-01 article. .copy() makes the filtered frame independent of
# its parent so later column assignments operate on a standalone DataFrame.
data = data.loc[data['year_month'] != '2012-01', :].copy()

In [6]:
# Number of articles per publication month, largest months first
articles_per_month = data.groupby('year_month')['year_month'].count()
articles_per_month.sort_values(ascending = False)

year_month
2013-03    460
2013-09    428
2016-11    420
2014-03    397
2017-02    396
          ... 
2018-06     40
2019-06     39
2012-08     37
2018-07     27
2012-06     10
Name: year_month, Length: 98, dtype: int64

## Add the Percentile Feature

Percentile rank of each article's pageviews relative to the other articles published in the same **month**. With this derived variable, we can choose any classification threshold later (e.g. top 10% or top 25%).

In [7]:
# Within-month percentile rank of pageviews (rank divided by the month's article count)
data['percentile'] = data.groupby('year_month')['pageviews'].rank(pct = True)

In [8]:
# 80/20 train-test split, stratified on the year-month key; the fixed
# random_state makes the partition reproducible.
month_strata = data['year_month']
train, test = train_test_split(
    data,
    test_size = 0.2,
    random_state = 42,
    stratify = month_strata,
)

- Train: 16772
- Test: 4194

In [9]:
# Order both splits newest-first by publication date
train, test = (
    split.sort_values('published_date', ascending = False)
    for split in (train, test)
)

In [17]:
# Binary label: 1 when the article's percentile is at or above 0.75, else 0
for split in (train, test):
    split['top25pct'] = split['percentile'].ge(0.75).astype(int)

In [88]:
# Persist both splits. index = False is the documented way to omit the row
# index from the CSV (the parameter is a bool; None only worked by being falsy).
train.to_csv('train.csv', index = False)
test.to_csv('test.csv', index = False)