Procedure
1. Prepare
2. N Fold Split
3. Rebalance (Up-sampling)
4. Feature Selection
5. Training
6. Testing

In [1]:
# Import settings
import config
from config import load_ml_data
# Imports
import pandas as pd
from pandas import *
import numpy as np
# Import scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import *
from sklearn.metrics import *
import random
import pickle

# Parameters
# Value forwarded to prepare_data(), which hands it on to load_ml_data()
interval = 30
# Number of splits requested from StratifiedKFold
n_folds = 2

# 1. Prepare
def prepare_data(interval):
    """Load the ML data set and split it into feature and label frames.

    Prints the per-class row counts and draws a bar chart of the label
    distribution grouped by the ``tCurrent`` column.

    Args:
        interval: Passed unchanged to ``load_ml_data``.

    Returns:
        A tuple ``(df, X, y)``: ``df`` is the full frame cast to float with a
        fresh positional index, ``X`` holds every column except ``label`` and
        ``y`` is a single-column DataFrame containing ``label``.
    """
    df = load_ml_data(interval)
    # Cast every column to float to prepare for feature selection
    df = df.astype('float')
    # Reset the index; drop=True avoids keeping the old index as a new column
    df = df.reset_index(drop=True)

    # Report the class distribution. The counts are computed once, and
    # .get(..., 0) reports a class that is absent from the data as 0 instead
    # of raising KeyError.
    label_counts = df['label'].value_counts()
    print('[Original] data counts, with uninfected (0): {}, infected (1): {}'.format(
        label_counts.get(0, 0),
        label_counts.get(1, 0),
    ))
    df.groupby(['tCurrent','label']).size().unstack(fill_value=0).plot.bar(title='Original Data Distribution')

    # Every column except the label is a feature candidate
    columns = [column for column in df.columns if column != 'label']

    X = df[columns]
    y = df[['label']]
    return df, X, y


# 1. Prepare
df, X, y = prepare_data(interval)

# 2. N Fold Split

# Stratified K-Folds cross-validator
# Provides train/test indices to split data in train/test sets.
# This cross-validation object is a variation of KFold that returns stratified folds.
# The folds are made by preserving the percentage of samples for each class.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True)

# Number of features (ranked by absolute correlation with the label) kept per fold
n_top_features = 10

mse = []
fold_count = 0

print("Start cross validation")
# Stratify on the 1-D label column rather than the single-column DataFrame
for train, test in skf.split(X, y['label']):
    print("===Processing fold %s===" % fold_count)
    # split() yields positional indices, so select rows by position
    train_fold = df.iloc[train]
    test_fold = df.iloc[test]

    # 3. Rebalance (Up-sampling)
    # NOTE(review): no rebalancing is performed in this cell

    # 4. Feature Selection
    # Rank features by absolute correlation with the label, using the training
    # fold only. The correlation matrix is computed once. The label is removed
    # by name: filtering on "corr < 1" would let it through as a feature if its
    # self-correlation rounds just below 1.0. Undefined (NaN) correlations are
    # dropped.
    label_corr = train_fold.corr()['label']
    corr = label_corr.drop('label').dropna().abs()
    corr = corr.sort_values(ascending=False)
    print(corr)
    # Slicing tolerates fewer than n_top_features candidates
    features = corr.index[:n_top_features].values

    # 5. Training
    # Get training examples
    train_fold_input = train_fold[features].values
    train_fold_output = train_fold['label']
    # Fit logistic regression
    logreg = LogisticRegression()
    logreg.fit(train_fold_input, train_fold_output)

    # Persist the model; the file is overwritten on every fold, so it ends up
    # holding the model fitted on the last fold
    with open("rubbish_predictor.pickle", "wb") as pickle_out:
        pickle.dump(logreg, pickle_out)

    # 6. Testing
    # Check MSE on test set (plain arrays, matching the input used for fit)
    pred = logreg.predict(test_fold[features].values)
    mse.append(mean_squared_error(test_fold['label'], pred))

    cm = confusion_matrix(test_fold['label'], pred)
    print(cm)

    # Done with the fold
    fold_count += 1
print("Finished cross validation")
print(DataFrame(mse).mean())

{'calculate': {'analysis': True,
               'friends': False,
               'network': True,
               'uniquetweets': True,
               'uniqueusers': True},
 'data': {'dates': ['2018-03-11', '2018-03-12', '2018-03-13'],
          'eventname': "Givenchy's Death",
          'phrases': ['givenchy%20death', 'givenchy%20passed%20away'],
          'starttime': 'Mar 12 08:20:00 -0500 2018'},
 'path': {'cwd': '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy',
          'ml': '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle',
          'networkx': {'all': '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/networkx_all.dat',
                       'friends': '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/networkx_friends.dat',
                       'potential': '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/networkx_potential.dat'},
          'newcrawl': '/Users/lzhou/git/github/uclresearchanalysis/ot

References
- http://www.alfredo.motta.name/cross-validation-done-wrong/
- https://www.marcoaltini.com/blog/dealing-with-imbalanced-data-undersampling-oversampling-and-proper-cross-validation