In [1]:
%matplotlib inline
from pandas import read_csv, DataFrame
from numpy import nanmean
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from random import sample
from sklearn.metrics import accuracy_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, VarianceThreshold

In [None]:
# Ignore every warning category from this point on.
# NOTE(review): this also hides pandas/sklearn deprecation and copy warnings
# raised by later cells — consider narrowing the filter to specific categories.
from warnings import simplefilter
simplefilter("ignore")

In [None]:
# Load the training clicks (capped at the first 50M rows) and parse both
# timestamp columns as datetimes.
train = read_csv(
    'train.csv',
    parse_dates=['click_time', 'attributed_time'],
    nrows=50_000_000,
)
print(train.head())

### Exploration

In [None]:
# Class imbalance: number of rows for each value of the target column
train['is_attributed'].value_counts()

In [None]:
# Cast the identifier-like columns to pandas categoricals and report how many
# distinct values each one holds.
cols = ['ip', 'app', 'device', 'os', 'channel']
print('Number of unique values in each column')
for col in cols:
    train[col] = train[col].astype('category', copy=False)
    n_unique = len(train[col].unique())
    print(col, n_unique)

In [None]:
# Split the click timestamp into integer day / hour / minute / second columns
click_dt = train['click_time'].dt
for part in ('day', 'hour', 'minute', 'second'):
    train[part] = getattr(click_dt, part).astype(int)
print(train.head())

In [None]:
# Time elapsed between the click and the attributed download, in seconds.
# `.dt.total_seconds()` is used on purpose: `.dt.seconds` only returns the
# seconds *component* of each Timedelta (0..86399) and silently discards whole
# days, which under-reports any download that happened more than 24h later.
# Rows whose attributed_time is missing yield NaN here.
train['download_time'] = (train['attributed_time'] - train['click_time']).dt.total_seconds()
print(train[train['is_attributed'] == 1].head())

### Frequency Columns

In [None]:
FREQUENCY_COLUMNS = ['ip', 'app', 'device', 'os', 'channel']

# Values observed this many times or fewer are treated as too rare to trust
MIN_RELIABLE_COUNT = 3

# For every distinct value of each column, compute the fraction of its rows
# that have is_attributed set, and attach that rate to `train` as `<col>_freq`.
# NOTE(review): the rates are derived from the target over all of `train`,
# before any train/test split, so held-out rows contribute to their own
# feature values — consider fitting this mapping on the training split only.
freqs = {}
for col in FREQUENCY_COLUMNS:
    print(f">> Calculating frequency for: {col}")

    # A single groupby pass yields both the attributed total and the row count
    df = (
        train.groupby(col)['is_attributed']
        .agg(['sum', 'count'])
        .rename(columns={'sum': 'sums', 'count': 'counts'})
    )
    df['freq'] = df['sums'] / df['counts']

    # With MIN_RELIABLE_COUNT or fewer observations (e.g. for an IP), assume a rate of 0
    df.loc[df['counts'] <= MIN_RELIABLE_COUNT, 'freq'] = 0

    # Keep the lookup table so the same mapping can be applied to other data
    freqs[col] = df['freq']

    # Attach the rate to every row; the explicit float cast keeps the new
    # column numeric regardless of what dtype mapping a categorical returns.
    train[col + '_freq'] = train[col].map(df['freq']).astype(float)

train.head()

In [None]:
# Target vector
y = train['is_attributed']

# Feature frame: everything except the target and the raw timestamp columns.
# `drop` returns a new frame, so the assignment below writes to X itself.
X = train.drop(columns=['is_attributed', 'click_time', 'attributed_time'])

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column pulled out of a derived frame: that chained form triggers
# SettingWithCopyWarning and does nothing under pandas copy-on-write.
# NOTE(review): download_time is derived from attributed_time, which presumably
# is only populated for attributed clicks — if so this feature leaks the
# target and should be dropped from X. Confirm before trusting any scores.
X['download_time'] = X['download_time'].fillna(0)

In [None]:
# Hold out a stratified 1% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.01,
    stratify=y,
    random_state=12,
)

In [None]:
nb = GaussianNB()
nb.fit(X_train, y_train)

In [None]:
y_pred = nb.predict(X_test)
print(y_pred[:4])

In [None]:
print('AUC:', roc_auc_score(y_test, y_pred))
print('MCC:', matthews_corrcoef(y_test, y_pred))
print('Acc:', accuracy_score(y_test, y_pred))
print('Confusion matrix\n', confusion_matrix(y_test, y_pred))

In [None]:
# Normalization of Train and Test via sklearn's `normalize`; the result is
# wrapped back into a DataFrame that carries the original column names.
cols = list(X.columns.values)

# Train
X = DataFrame(normalize(X), columns=cols)
X.head(2)

# Test
# NOTE(review): `test_xgb_org` is not assigned in any cell visible above this
# one — confirm it is loaded (with the same columns as X) before this runs.
test_xgb_org = DataFrame(normalize(test_xgb_org), columns=cols)
test_xgb_org.head(2)

In [None]:
# Splitting Train test
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.9, random_state=12)

In [None]:
y_test.head(2)

# Data Exploration and preprocessing

In [None]:
# Check for duplicate rows

In [None]:
# Missing value check: number of null entries in each training column
X_train.isnull().sum()

In [None]:
# Outliers: creates an empty 15x15 figure; nothing is drawn on it because the
# box plot call below is disabled.
fig, ax = plt.subplots(figsize=(15,  15))
# Disabled — it groups by a 'target' column: X_train.boxplot(by='target', ax=ax)

In [None]:
# Histograms of the first four feature columns
X_train.iloc[:, :4].hist()

In [None]:
# Finding best distribution for each feature: every candidate scipy.stats
# distribution is fitted to the column, and the one with the highest
# Kolmogorov-Smirnov p-value is recorded in `trans`.

cdfs = [
    "norm",      # Normal (Gaussian)
    "alpha",     # Alpha
    "beta",      # Beta
    "expon",     # Exponential
    "gamma",     # Gamma
    "laplace",   # Laplace
    "rayleigh",  # Rayleigh
    "uniform",   # Uniform
]

col_name = list(X_train.columns.values)
# Rebind rather than fill in place: X_train is the output of train_test_split,
# and in-place edits on such a frame can trigger pandas copy warnings.
X_train = X_train.fillna(0)

trans = {}
for name in col_name:
    sample_values = X_train[name].values
    # Any real p-value (0..1) beats -inf; a NaN p-value never does
    p_max = float('-inf')
    dist = ''
    # Fit the column against every candidate distribution
    for cdf in cdfs:
        # Look the distribution up by attribute instead of eval()-ing a string
        parameters = getattr(stats, cdf).fit(sample_values)
        # Kolmogorov-Smirnov goodness-of-fit test against the fitted
        # distribution (scipy's default alternative is two-sided)
        D, p = stats.kstest(sample_values, cdf, args=parameters)
        if p > p_max:
            p_max = p
            dist = cdf
    trans[name] = dist
    print(name, ":", dist, "distribution")

# Feature Engineering / Selection

In [None]:
from numpy import var, count_nonzero

def agg_feat(data):
    """Append row-wise aggregate features to ``data``.

    Three columns are added, each computed across the columns that ``data``
    holds when the function is called:

    * ``sum``     - sum of the row's values
    * ``var``     - population variance of the row's values (numpy default, ddof=0)
    * ``nonzero`` - number of non-zero values in the row

    All three are derived from one snapshot of the original columns, so
    ``var`` does not include ``sum`` and ``nonzero`` includes neither.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the three columns appended.
    """
    # Snapshot the values before any aggregate column is added; the array keeps
    # its shape when new columns are assigned to the frame.
    values = data.values
    # Vectorised over axis=1 instead of a Python-level apply per row
    data['sum'] = values.sum(axis=1)
    data['var'] = var(values, axis=1)
    data['nonzero'] = count_nonzero(values, axis=1)
    return data

In [None]:
# Add the row-wise aggregate columns produced by agg_feat to each frame
X_train = agg_feat(X_train)
X_test = agg_feat(X_test)
# NOTE(review): test_xgb_org is not assigned in any cell visible above — confirm where it is loaded
test_xgb_org = agg_feat(test_xgb_org)

In [None]:
test_xgb_org.head(2)

In [None]:
# Refresh the column list so it includes the columns added by agg_feat
cols = list(X_train.columns.values)

In [None]:
# Checking collinearity (using correlation)
correl = X_train.corr()
# train["feat_1"].corr(train["feat_2"])

In [None]:
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        curr_cor = correl.loc[cols[i], cols[j]]
        if (curr_cor >= 0.7) and (curr_cor < 0.8):
            print(cols[i], cols[j], curr_cor)

### Variance Threshold Check

In [None]:
vt = VarianceThreshold()
vt_train = vt.fit(X_train)

In [None]:
# vt.variances_
vt_df = DataFrame({'feature': list(X_train.columns.values), 'variance': vt.variances_}).sort_values(by='variance', ascending=True)
print(vt_df.tail(10))

### Normalization

In [None]:
# Normalizing data: run the training features through sklearn's `normalize`
# and restore the column names on the resulting frame.
norm_train = DataFrame(
    normalize(X_train),
    columns=list(X_train.columns.values),
)
norm_train.head(2)

### PCA

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=len(norm_train.columns.values))

In [None]:
pca_train = DataFrame(pca.fit_transform(norm_train))

In [None]:
sum(pca.explained_variance_[:40])

### Feature Selection

In [None]:
# Select From Model
feats = list(X_train.columns.values)

rf = RandomForestClassifier(n_estimators=100, verbose=2, random_state=1, max_depth=20)

# define Boruta feature selection method
feat_selector = SelectFromModel(rf)

# find all relevant features - 20 features should be selected
feat_selector.fit(X_train, y_train)

In [None]:
sfmodel_feats = [feats[i] for i in feat_selector.get_support(indices=True)]
print(sfmodel_feats)

# Model Training

### Random Forest

In [None]:
# Small grid over forest size, selected by cross-validated ROC AUC.
# The forest is seeded so that a re-run reproduces the same scores.
rf = RandomForestClassifier(random_state=1)
params = {'n_estimators': [10, 20],
          'max_depth': [5]}
grid = GridSearchCV(estimator=rf, param_grid=params, scoring='roc_auc')
grid.fit(X_train, y_train)

# Hard labels for the label-based metrics
y_pred = grid.predict(X_test)
# ROC AUC is a ranking metric: score it with the positive-class probability,
# not with thresholded labels (which reduce the curve to a single point).
y_score = grid.predict_proba(X_test)[:, 1]
print('AUC:', roc_auc_score(y_test, y_score))
print('MCC:', matthews_corrcoef(y_test, y_pred))
print('Acc:', accuracy_score(y_test, y_pred))
print('Confusion Matrix\n', confusion_matrix(y_test, y_pred))

### XGBoost

In [None]:
import xgboost as xgb

# Grid search over the XGBoost hyper-parameters.
# The target (is_attributed) is handled as a two-class label everywhere else in
# this notebook (1-D ROC AUC, `== 1` filter), so a binary objective and metric
# are used; a multiclass objective without num_class fails on two-class labels.
clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

params = {
    'learning_rate': [0.02],    # step-size shrinkage applied to each new tree
    'max_depth': [10],          # maximum depth of each tree
    'gamma': [1],               # minimum loss reduction required to make a split
    'min_child_weight': [6],    # minimum sum of instance weight needed in a child
    'subsample': [0.9],         # fraction of rows sampled for each tree
    'colsample_bytree': [0.4],  # fraction of columns sampled for each tree
    'n_estimators': [1000],     # number of trees to build
    }

# Score by ROC AUC, matching the random-forest search in this notebook; the
# default (accuracy) says little on a heavily imbalanced target.
grid = GridSearchCV(clf, params, scoring='roc_auc', cv=5, verbose=20, n_jobs=-1, refit=True)

grid.fit(X_train, y_train)

# CV results: keep the frame in `cv_result` and write it to disk separately
# (to_csv returns None, so chaining it into the assignment stored nothing).
cv_result = DataFrame(grid.cv_results_)
cv_result.to_csv('cv_results.csv', index=False)

In [None]:
print(grid.best_score_)
print(grid.best_estimator_)

In [None]:
# Testing on X_test
pred = grid.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred))
print('MCC:', matthews_corrcoef(y_test, pred))

In [None]:
# Using best params to find optimum number of iterations
grid_output = grid.best_params_
params = {
    'objective': 'multi:softprob', 
    'eval_metric': 'mlogloss', 
    'num_class': 9
    }

best_params = {**grid_output, **params}
#best_params['learning_rate'] = 0.02
#print(best_params)

In [None]:
train_xgb = xgb.DMatrix(X_train, y_train)

cv_results = xgb.cv(best_params, train_xgb, num_boost_round=10000, nfold=5, stratified=True, as_pandas=True, 
                    seed=1, shuffle=True, early_stopping_rounds=20, verbose_eval=True)

In [None]:
nround = cv_results.shape[0]  # Where the best iteration happened
print('Best Iteration:', nround)
xgb_clf = xgb.train(best_params, train_xgb, num_boost_round=nround, verbose_eval=True)

# Predicting on the test set
test_xgb  = xgb.DMatrix(test_xgb_org)
test_pred = xgb_clf.predict(test_xgb)
Class_1, Class_2, Class_3, Class_4, Class_5, Class_6, Class_7, Class_8, Class_9 = map(list, zip(*test_pred))
output = DataFrame({'id': test['id'],
                    'Class_1': Class_1, 
                    'Class_2': Class_2, 
                    'Class_3': Class_3, 
                    'Class_4': Class_4, 
                    'Class_5': Class_5, 
                    'Class_6': Class_6, 
                    'Class_7': Class_7, 
                    'Class_8': Class_8, 
                    'Class_9': Class_9})
output = output[['id', 'Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9']]

output.to_csv('output.csv', index=False)
output.head(2)