In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn import preprocessing
from sklearn.ensemble import BaggingRegressor
import math, os, random 
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, make_scorer, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor


In [2]:
# Plot the learning curve
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
  
    if axes is None:
        _, axes = plt.subplots(1, 1, figsize=(20, 5), squeeze=False)

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")
    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True, scoring=make_scorer(r2_score))
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)
    print("the shape of train_scores_mean", train_scores_mean.shape)
    print(train_scores_mean)
    print("the shape of test_scores_mean", test_scores_mean.shape)
    print(test_scores_mean)
    
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training r2_score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Testing r2_score")
    axes[0].legend(loc="best")

    return plt

In [3]:
def mixup_data(x, y, alpha=0.2, use_cuda=False):

    # pandas DataFrame to numpy array
    x = x.values
    y = y.values

    '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda'''
    if alpha > 0.:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1.

    train_size = x.shape[0]
    if use_cuda:
        # index = torch.randperm(batch_size).cuda()
        pass
    else:
        index = np.random.permutation(train_size)

    mixed_x = lam * x + (1 - lam) * x[index,:]
    mixed_y = lam * y + (1 - lam) * y[index]

    # numpy array to pandas DataFrame
    mixed_x = pd.DataFrame(mixed_x)
    mixed_y = pd.Series(mixed_y)

    return mixed_x, mixed_y


In [4]:
# Load the train data and test data

def load_data():
    files = os.listdir("Data")
    train_file = "blogData_train.csv"
    
    train_data = pd.read_csv("./Data/{}".format(train_file),header=None)
    train_output = train_data[len(train_data.columns)-1]
    train_num = train_data.shape[0]
    del train_data[len(train_data.columns)-1]

    files.remove(train_file)
    file_list = files
    test_data = pd.DataFrame()
    for filename in file_list:
        df = pd.read_csv("./Data/{}".format(filename),header=None)
        test_data = pd.concat([test_data, df], axis=0)
    test_output = test_data[len(test_data.columns)-1]
    del test_data[len(test_data.columns)-1]

    data_X = pd.concat([train_data, test_data], axis=0)
    data_Y = pd.concat([train_output, test_output], axis=0)

    return data_X, data_Y, train_num


def load_data_mixup():
    files = os.listdir("Data")
    train_file = "blogData_train.csv"
    
    train_data = pd.read_csv("./Data/{}".format(train_file),header=None)
    train_output = train_data[len(train_data.columns)-1]
    train_num = train_data.shape[0]
    del train_data[len(train_data.columns)-1]
    # mix up
    train_data , train_output = mixup_data(train_data, train_output)

    files.remove(train_file)
    file_list = files
    test_data = pd.DataFrame()
    for filename in file_list:
        df = pd.read_csv("./Data/{}".format(filename),header=None)
        test_data = pd.concat([test_data, df], axis=0)
    test_output = test_data[len(test_data.columns)-1]
    del test_data[len(test_data.columns)-1]

    
    data_X = pd.concat([train_data, test_data], axis=0)
    data_Y = pd.concat([train_output, test_output], axis=0)

    return data_X, data_Y, train_num


In [5]:
def train():
    fig, axes = plt.subplots(1, 1, figsize=(20, 15), squeeze=False)
    
    data_X, data_Y, train_num = load_data()
    print(data_Y.shape, data_Y.shape)
    print(train_num, data_X.shape[0])
    
    train_indices = [list(range(0, train_num))]
    test_indices =  [list(range(train_num, data_X.shape[0]))]
    custom_cv = zip(train_indices, test_indices)
    
    # rf = RandomForestRegressor(n_estimators=100, max_features=100)
    linear_reg = LinearRegression()
    plot_learning_curve(estimator=linear_reg, title="LinearRegression", X=data_X, y=data_Y, axes=axes[:,0], cv=custom_cv, train_sizes=np.linspace(.1, 0.87, 70))
    # plot_learning_curve(estimator=linear_reg, title="Random Forest", X=data_X, y=data_Y, axes=axes[:,0], cv=custom_cv, train_sizes=np.linspace(.1, 0.87, 100))
    # plt.show()

    plt.ylim(-1, 1)
    plt.savefig("./learning_curve(linear_reg).png")
    plt.close()

def train_mixup():
    print("Mix Up")
    fig, axes = plt.subplots(1, 1, figsize=(20, 15), squeeze=False)
    
    data_X, data_Y, train_num = load_data_mixup()
    print(data_X.shape, data_Y.shape)
    print(train_num, data_X.shape[0])
    train_indices = [list(range(0, train_num))]
    test_indices =  [list(range(train_num, data_X.shape[0]))]
    custom_cv = zip(train_indices, test_indices)
    
    # rf = RandomForestRegressor(n_estimators=100, max_features=100)
    linear_reg = LinearRegression()
    plot_learning_curve(estimator=linear_reg, title="LinearRegression", X=data_X, y=data_Y, axes=axes[:,0], cv=custom_cv, train_sizes=np.linspace(.1, 0.87, 70))
    # plot_learning_curve(estimator=linear_reg, title="Random Forest", X=data_X, y=data_Y, axes=axes[:,0], cv=custom_cv, train_sizes=np.linspace(.1, 0.87, 100))
    # plt.show()
    plt.ylim(-1, 1)
    plt.savefig("./learning_curve(linear_reg mix_up).png")
    plt.close()


In [6]:
def main():
    train()
    train_mixup()

In [7]:

if __name__ == "__main__":
    main()

(60021,) (60021,)
52397 60021
the shape of train_scores_mean (70,)
[0.28253212 0.25854511 0.26203898 0.26460459 0.26453074 0.2656034
 0.2668788  0.26340848 0.25411724 0.25468729 0.25566672 0.25364506
 0.41012977 0.44538327 0.44532323 0.44146491 0.43534761 0.43234129
 0.43125308 0.43037798 0.43047549 0.43017606 0.4263807  0.41989899
 0.41562898 0.4141435  0.41286634 0.40962457 0.40894157 0.40306377
 0.40225797 0.4027358  0.40310386 0.40353178 0.40385902 0.4036278
 0.40369544 0.4022944  0.39798033 0.39380374 0.3932907  0.39275142
 0.39204499 0.39194002 0.39077872 0.39059211 0.39022085 0.38923408
 0.38817063 0.38739447 0.38746336 0.38750093 0.38640102 0.38617843
 0.38621092 0.38609455 0.38601755 0.37629046 0.37650656 0.37606418
 0.37453612 0.37438022 0.37430948 0.37445302 0.37359368 0.37111754
 0.37054354 0.37006901 0.36926447 0.36853416]
the shape of test_scores_mean (70,)
[-1.66766019e+19 -1.25548796e+20 -1.92377582e+18 -2.07597809e+21
 -5.57300312e+18 -2.03837847e+20 -2.00869526e+20 -4