In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

In [2]:
# Load the campaign data into a DataFrame.
# A forward-slash relative path is used: the previous '..\camp_data.csv'
# literal relied on the unrecognised escape sequence '\c' (a warning on
# modern Python) and only resolved to the parent directory on Windows.
camp = pd.read_csv('../camp_data.csv')
# The 'date' column is parsed with an explicit day/month/year format so that
# ambiguous dates are never interpreted month-first.
camp['date'] = pd.to_datetime(camp['date'], format='%d/%m/%Y')
print("date range is: ", camp.date.min(), camp.date.max())

date range is:  2018-09-17 00:00:00 2018-10-04 00:00:00


# Fill in missing values

In [3]:
# Count missing entries per column and show the five columns with the most.
missing_per_column = camp.isna().sum()
missing_per_column.sort_values(ascending=False).head()

goal_type          32143
conversions         1551
clicks              1522
impressions         1502
total_spend_cpm        0
dtype: int64

In [4]:
# Replace missing goal_type entries with 0.
camp['goal_type'] = camp['goal_type'].fillna(0)

In [5]:
# Impute missing conversions with the column median
# (original author's note: the median is generally 0).
conversions_median = camp['conversions'].median()
camp['conversions'] = camp['conversions'].fillna(conversions_median)

In [6]:
# Impute missing clicks with the column median
# (original author's note: the median is generally 0).
clicks_median = camp['clicks'].median()
camp['clicks'] = camp['clicks'].fillna(clicks_median)

In [7]:
# Impute missing impressions with the column median.
impressions_median = camp['impressions'].median()
camp['impressions'] = camp['impressions'].fillna(impressions_median)

In [8]:
# Impute missing total_spend_cpm with the column median. The null-count
# preview earlier reported 0 missing values for this column, so on the
# current data this acts only as a safeguard.
spend_median = camp['total_spend_cpm'].median()
camp['total_spend_cpm'] = camp['total_spend_cpm'].fillna(spend_median)

In [9]:
# Verify that no column has missing entries left after imputation.
camp.isna().sum()

date                 0
business_vertical    0
country              0
region               0
city_code            0
strategy_id          0
channel_name         0
goal_type            0
total_spend_cpm      0
impressions          0
clicks               0
conversions          0
dtype: int64

In [10]:
# Show each column's dtype; the later feature-selection loops branch on
# 'float64', 'int64' and 'object'.
camp.dtypes

date                 datetime64[ns]
business_vertical             int64
country                       int64
region                       object
city_code                    object
strategy_id                   int64
channel_name                  int64
goal_type                   float64
total_spend_cpm             float64
impressions                 float64
clicks                      float64
conversions                 float64
dtype: object

# Remove outliers

In [11]:
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import std

In [12]:
# Standard-deviation method: summarise total_spend_cpm by its mean and its
# standard deviation (NumPy's std uses ddof=0 unless told otherwise).
spend = camp['total_spend_cpm']
cpm_mean = np.mean(spend)
cpm_std = np.std(spend)
print("mean of total spend cpm:", cpm_mean, "\nstd dev of total spend cpm", cpm_std)

mean of total spend cpm: 424.82676979982716 
std dev of total spend cpm 5275.148960136036


In [13]:
# Accept values within three standard deviations either side of the mean.
cut_off = 3 * cpm_std
lower = cpm_mean - cut_off
upper = cpm_mean + cut_off

In [14]:
# Spend values lying strictly beyond either bound.
outliers = [
    value
    for value in camp['total_spend_cpm']
    if value > upper or value < lower
]

In [15]:
# Spend values lying strictly between the two bounds.
outliers_removed = [
    value
    for value in camp['total_spend_cpm']
    if lower < value < upper
]

In [16]:
# Report how many spend values fall outside vs. inside the bounds.
n_outliers = len(outliers)
n_kept = len(outliers_removed)
print("number of outliers:", n_outliers, "\nnumber of kept values:", n_kept)

number of outliers: 353 
number of kept values: 74852


In [17]:
# Keep only rows whose spend lies strictly between `lower` and `upper`.
# .copy() makes the result an independent frame: later cells add and overwrite
# columns on `camp`, and doing that on an un-copied selection can raise
# SettingWithCopyWarning.
in_band = (camp['total_spend_cpm'] > lower) & (camp['total_spend_cpm'] < upper)
camp = camp.loc[in_band].copy()

In [18]:
# Confirm the row count after the outlier filter (should match the number of kept values).
camp.shape

(74852, 12)

# Numerical input

In [19]:
# Print how many distinct values each float64 column holds.
for col, col_dtype in camp.dtypes.items():
    if col_dtype != 'float64':
        continue
    uni_count = camp[col].nunique(dropna=False)
    print("Number of unique features for '{col}' is {uni_count}".format(col=col, uni_count=uni_count))

Number of unique features for 'goal_type' is 3
Number of unique features for 'total_spend_cpm' is 48132
Number of unique features for 'impressions' is 2309
Number of unique features for 'clicks' is 52
Number of unique features for 'conversions' is 1


In [20]:
# Print how many distinct values each int64 column holds.
for col, col_dtype in camp.dtypes.items():
    if col_dtype != 'int64':
        continue
    uni_count = camp[col].nunique(dropna=False)
    print("Number of unique features for '{col}' is {uni_count}".format(col=col, uni_count=uni_count))

Number of unique features for 'business_vertical' is 2
Number of unique features for 'country' is 8
Number of unique features for 'strategy_id' is 75
Number of unique features for 'channel_name' is 3


In [21]:
# Select the numeric columns judged to carry enough distinct or meaningful
# values for prediction, and extract them as a NumPy array.
numeric_columns = ['strategy_id', 'channel_name', 'total_spend_cpm', 'impressions', 'clicks']
numerical_features = camp.loc[:, numeric_columns].values

# Categorical input

In [22]:
# Peek at the first three parsed dates.
camp['date'].iloc[:3]

0   2018-09-17
1   2018-09-17
2   2018-09-17
Name: date, dtype: datetime64[ns]

In [23]:
# Split the date into zero-padded string columns (e.g. '17', '09', '2018').
# They stay as strings, so they show up as 'object' columns afterwards.
for part_name, strftime_code in (('day', '%d'), ('month', '%m'), ('year', '%Y')):
    camp[part_name] = camp['date'].dt.strftime(strftime_code)

In [24]:
# Print how many distinct values each object (string) column holds.
for col, col_dtype in camp.dtypes.items():
    if col_dtype != 'object':
        continue
    uni_count = camp[col].nunique(dropna=False)
    print("Number of unique features for '{col}' is {uni_count}".format(col=col, uni_count=uni_count))

Number of unique features for 'region' is 127
Number of unique features for 'city_code' is 3894
Number of unique features for 'day' is 18
Number of unique features for 'month' is 2
Number of unique features for 'year' is 1


In [25]:
from sklearn.preprocessing import LabelEncoder

# One encoder per column, kept under its own name so the integer codes can be
# mapped back to the original labels later.
region_le, city_code_le = LabelEncoder(), LabelEncoder()
for column, encoder in (('region', region_le), ('city_code', city_code_le)):
    # Overwrite the string column with its integer codes.
    camp[column] = encoder.fit_transform(camp[column])
camp

Unnamed: 0,date,business_vertical,country,region,city_code,strategy_id,channel_name,goal_type,total_spend_cpm,impressions,clicks,conversions,day,month,year
0,2018-09-17,1,2,0,2709,3718754,1,1.0,108.258821,88.0,0.0,0.0,17,09,2018
1,2018-09-17,1,2,0,2709,3714868,1,1.0,149.565425,48.0,0.0,0.0,17,09,2018
2,2018-09-17,1,2,0,2709,3718750,1,1.0,255.451460,157.0,0.0,0.0,17,09,2018
3,2018-09-17,1,2,0,2709,3715603,1,1.0,291.516169,124.0,1.0,0.0,17,09,2018
4,2018-09-17,1,2,0,2709,3716560,1,2.0,509.019147,415.0,0.0,0.0,17,09,2018
5,2018-09-17,1,2,0,2709,3718702,2,1.0,3.761886,1.0,0.0,0.0,17,09,2018
6,2018-09-17,1,2,0,2709,3726250,2,1.0,72.744913,24.0,0.0,0.0,17,09,2018
7,2018-09-17,1,2,0,2709,3718728,2,1.0,210.836410,129.0,0.0,0.0,17,09,2018
8,2018-09-17,1,2,0,2709,3718757,2,1.0,226.906143,217.0,1.0,0.0,17,09,2018
9,2018-09-17,1,2,0,2709,3718749,2,1.0,260.405991,201.0,0.0,0.0,17,09,2018


In [26]:
# Extract the label-encoded categorical columns as a NumPy array.
categorical_columns = ['region', 'city_code']
categorical_features = camp.loc[:, categorical_columns].values

In [27]:
#wrong trial: `columns` expects a list of column names (e.g. ['region']), not the column's values
#pd.get_dummies(camp,columns=camp['region'])

# Split the data

In [28]:
# Place the numeric and categorical feature matrices side by side (column-wise).
features_array = np.hstack((numerical_features, categorical_features))

In [29]:
# Expect 7 columns: five numeric features plus the two encoded categorical ones.
features_array.shape

(74852, 7)

In [30]:
# Targets: the month and day columns (a two-column DataFrame, despite the name).
target_columns = ['month', 'day']
target_array = camp.loc[:, target_columns]

In [31]:
# Expect two target columns (month, day) and the same row count as the features.
target_array.shape

(74852, 2)

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
split_parts = train_test_split(
    features_array,
    target_array,
    test_size=0.20,
    random_state=0,
)
features_train, features_test, target_train, target_test = split_parts

# Feature Scaling

In [37]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Standardise the training features using statistics fitted on the training
# split only; `scaler` is kept so the same transform can be reused later.
scaler = StandardScaler()
scaler.fit(features_train)
features_train_scaled = scaler.transform(features_train)

In [38]:
# Inspect the standardised training features (NumPy abbreviates large arrays in the repr).
features_train_scaled

array([[ 0.22314163, -1.02334797, -0.16680429, ..., -0.08573281,
         1.17714094, -0.16018005],
       [ 0.22305329,  0.20464088, -0.18550706, ..., -0.08573281,
         0.65839127, -0.67586455],
       [ 0.22305329,  0.20464088, -0.1854626 , ..., -0.08573281,
         1.29919968,  0.39775793],
       ...,
       [-0.8782452 ,  1.43262974, -0.18748358, ..., -0.08573281,
         1.42125843,  0.84045915],
       [ 0.22314163, -1.02334797, -0.05605452, ..., -0.08573281,
        -0.10447589, -1.8051848 ],
       [ 0.86465043, -1.02334797, -0.17506987, ..., -0.08573281,
         1.54331718,  0.45057478]])