In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import datasets
import warnings
import pickle

In [8]:
def size(data):
    """Print the dimensions of a train/test dataset.

    Args:
        data: Iterable of exactly four items
            ``(X_train, y_train, X_test, y_test)``. ``X_train`` must expose a
            two-element ``.shape`` and ``y_test`` a ``.shape`` whose first
            entry is the number of test points.

    Returns:
        None. The training count, test count and feature count are printed
        on a single line, in that order.
    """
    X_train, _, _, y_test = data
    n_train, n_features = X_train.shape

    print(n_train, y_test.shape[0], n_features)

In [9]:
def parse_UCI(path):
    """Parse a whitespace-separated ``label idx:value idx:value ...`` file.

    The file is looked up inside the ``Datasets`` directory. On every
    non-blank line the first token is the target and each remaining token is
    split on ``":"``; only the part after the colon is kept. The index before
    the colon is ignored, so values are taken positionally and every row must
    list the same number of features (``np.stack`` requires equal lengths).

    Args:
        path: File name relative to the ``Datasets`` directory.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a 2-D float array with one row per
        data line and ``y`` is a 1-D float array of targets.

    Raises:
        ValueError: If a token cannot be converted to ``float``, if rows have
            differing lengths, or if the file contains no data lines.
        IndexError: If a feature token contains no ``":"``.
    """
    path = os.path.join("Datasets", path)
    X = []
    y = []
    # The context manager guarantees the handle is closed even when a line
    # fails to parse.
    with open(path, "r") as file:
        for line in file:
            # split() with no argument tolerates trailing spaces, tabs and
            # "\r\n" endings, none of which should produce empty tokens.
            tokens = line.split()
            if not tokens:
                # Blank line (commonly the last line of the file).
                continue
            y.append(float(tokens[0]))
            X.append(np.array([float(tok.split(":")[1]) for tok in tokens[1:]]))

    return np.stack(X, axis=0), np.array(y)

In [10]:
def load_wine():
    """Load the wine-quality data, training on white wines and testing on red.

    Reads the semicolon-separated files ``Datasets/winequality-red.csv`` and
    ``Datasets/winequality-white.csv``. In both files the last column is used
    as the target and all preceding columns as features. Features are
    standardised with a ``StandardScaler`` fitted on the white (training)
    features only, then applied to the red (test) features.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``.
    """
    def _features_and_target(filename):
        # Everything except the final column is a feature; the final column
        # is the target.
        table = pd.read_csv(os.path.join("Datasets", filename), sep=";").values
        return table[:, :-1], table[:, -1]

    red_features, red_target = _features_and_target('winequality-red.csv')
    white_features, white_target = _features_and_target('winequality-white.csv')

    # Scaling statistics come from the training set only.
    scaler = StandardScaler().fit(white_features)
    X_train = scaler.transform(white_features)
    X_test = scaler.transform(red_features)

    return X_train, white_target, X_test, red_target

In [11]:
def load_parkinson():
    """Load ``Datasets/parkinson.csv`` and split it by the ``age`` column.

    Rows with ``age < 60`` form the training set and rows with ``age >= 60``
    form the test set. Positionally, column 3 is used as the target and the
    features are column 1 followed by every column from index 4 onward
    (columns 0 and 2 are dropped). Features are standardised with a
    ``StandardScaler`` fitted on the training features only.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``.
    """
    frame = pd.read_csv(os.path.join("Datasets", 'parkinson.csv'))

    # Two explicit comparisons (rather than a negation) so that a row whose
    # age compares False both ways ends up in neither partition.
    younger = frame["age"] < 60
    older = frame["age"] >= 60

    def _features_and_target(rows):
        features = np.hstack((rows[:, 1:2], rows[:, 4:]))
        return features, rows[:, 3]

    X_train, y_train = _features_and_target(frame.values[younger, :])
    X_test, y_test = _features_and_target(frame.values[older, :])

    # Scaling statistics come from the training set only.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, y_train, X_test, y_test

In [12]:
def load_triazines(test_size=.25, random_state=100):
    """Load the ``triazines`` file and split it without shuffling.

    The data is parsed by ``parse_UCI("triazines")`` and handed to
    ``train_test_split`` with ``shuffle=False``. Features are standardised
    with a ``StandardScaler`` fitted on the training features only.

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Accepted for interface compatibility; it is not
            referenced anywhere in this function.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``.
    """
    features, targets = parse_UCI("triazines")
    parts = train_test_split(features, targets, test_size=test_size, shuffle=False)
    X_train, X_test, y_train, y_test = parts

    # Scaling statistics come from the training set only.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, y_train, X_test, y_test

In [13]:
def exclude(lst, i):
    """Return a copy of ``lst`` with the element at position ``i`` removed.

    Negative indices count from the end, as in normal Python indexing. The
    input sequence is never modified.

    Args:
        lst: A sequence supporting slicing and ``+`` concatenation (e.g. a
            list).
        i: Position of the element to drop.

    Returns:
        A new sequence without element ``i``. If ``i`` lies outside the
        sequence (in either direction) an unmodified copy is returned.
    """
    if i < 0:
        # Translate to the equivalent non-negative position; without this,
        # lst[:i] + lst[i + 1:] duplicates elements (e.g. i == -1 yields
        # lst[:-1] + lst[0:]).
        i += len(lst)
        if i < 0:
            # Still out of range: nothing to remove.
            return lst[:]

    # For i == 0 the left slice is empty, so no special case is needed.
    return lst[:i] + lst[i + 1:]

In [14]:
def load_fertility(feature_split=0):
    """Load ``Datasets/fertility_Diagnosis.txt`` and split it on one feature.

    Each non-blank line is comma-separated: every field but the last is
    parsed as a float feature and the last field is the class label. Rows
    whose ``feature_split`` feature equals 1 go to the test set; all other
    rows go to the training set. The splitting feature itself is removed from
    both sets. Labels are encoded as ``"N" -> 1`` and ``"O" -> 0``. Features
    are standardised with a ``StandardScaler`` fitted on the training
    features only.

    Args:
        feature_split: Position of the feature used to partition the rows.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``, all float arrays.

    Raises:
        ValueError: If a label is neither ``"N"`` nor ``"O"``, if a feature
            cannot be parsed as ``float``, or if either partition is empty
            (``np.stack`` cannot stack an empty list).
    """
    path = os.path.join("Datasets", "fertility_Diagnosis.txt")

    def conv(label):
        # Map the textual class label to its numeric encoding.
        if label == "N":
            return 1
        elif label == "O":
            return 0
        else:
            # ValueError is still an Exception, so existing handlers keep
            # working, but the offending label is now reported.
            raise ValueError("BAD label %r (expected 'N' or 'O')" % (label,))

    X = []
    y = []
    # The context manager guarantees the handle is closed even if parsing
    # fails part-way through.
    with open(path, "r") as file:
        for line in file:
            # strip() also removes "\r" from CRLF files, which would
            # otherwise end up glued to the label.
            line = line.strip()
            if not line:
                # Blank line (commonly the last line of the file).
                continue
            fields = line.split(",")
            y.append(fields[-1].strip())
            X.append([float(value) for value in fields[:-1]])

    X_train = []
    y_train = []
    X_test = []
    y_test = []

    split = feature_split
    for row, label in zip(X, y):
        # The splitting feature is dropped from the row in both partitions.
        reduced = exclude(row, split)
        if row[split] == 1:
            X_test.append(reduced)
            y_test.append(conv(label))
        else:
            X_train.append(reduced)
            y_train.append(conv(label))

    X_train = np.stack(X_train, axis=0)
    X_test = np.stack(X_test, axis=0)

    # Scaling statistics come from the training set only.
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    return X_train.astype(float), np.array(y_train).astype(float), X_test.astype(float), np.array(y_test).astype(float)

In [15]:
def load_forest_fires(split_month="sep"):
    """Load ``Datasets/forestfires.csv`` and split it by calendar month.

    Rows whose ``month`` value comes strictly before ``split_month`` form the
    training set; rows in ``split_month`` or later form the test set.
    Positionally, the last column is the target and the features are columns
    0-1 followed by columns 4 up to (not including) the last; columns 2 and 3
    are dropped. Features are standardised with a ``StandardScaler`` fitted
    on the training features only.

    Args:
        split_month: Lower-case three-letter month abbreviation (``"jan"``
            to ``"dec"``) marking the first month of the test set.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``, all float arrays.

    Raises:
        KeyError: If ``split_month`` or any value in the ``month`` column is
            not one of the twelve recognised abbreviations.
    """
    fire = pd.read_csv(os.path.join("Datasets", 'forestfires.csv'))

    month_to_num = { "jan" : 1, "feb" : 2, "mar" : 3, "apr" : 4, "may" : 5, "jun" : 6, "jul" : 7, "aug" : 8, "sep" : 9, "oct" : 10, "nov" : 11, "dec" : 12}

    # Translate every row's month once, then derive both partitions from a
    # single boolean mask; row order within each partition is preserved.
    month_nums = np.array([month_to_num[m] for m in fire["month"]], dtype=int)
    is_test = month_nums >= month_to_num[split_month]

    values = fire.values
    train, test = values[~is_test, :], values[is_test, :]
    y_train, X_train = train[:, -1], np.hstack((train[:, 0:2], train[:, 4:-1]))
    y_test, X_test = test[:, -1], np.hstack((test[:, 0:2], test[:, 4:-1]))

    # Scaling statistics come from the training set only.
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    return X_train.astype(float), y_train.astype(float), X_test.astype(float), y_test.astype(float)