# MinneMUDAC Data Science Challenge - Functions

I'll keep all of the functions used for the challenge here for book-keeping.

In [None]:
def test_train_sample(X,y,r):
    """Randomly split paired samples into training and testing subsets.

    Inputs:
    1. X: feature matrix, indexed by row position (X[j] is the j-th sample)
    2. y: output sequence with one entry per row of X
    3. r: fraction of the rows used for training (e.g. r=0.75 takes 3/4 of
       the data for training); exactly int(r*len(y)) rows are drawn
    Outputs:
    The list [X_train, y_train, X_test, y_test]. Each element is a plain
    list and keeps the rows in their original relative order.
    Raises:
    ValueError (from random.sample) when int(r*len(y)) is negative or
    larger than len(y).
    """
    import random

    n_rows = len(y)
    # A set gives O(1) membership tests in the loop below; testing against
    # the sampled list made the split quadratic in the number of rows.
    train_rows = set(random.sample(range(0, n_rows), int(r * n_rows)))

    X_train, y_train = [], []
    X_test, y_test = [], []
    # Walking the indices in increasing order preserves the original row order
    # in both subsets, so the sampled indices never need sorting.
    for j in range(n_rows):
        if j in train_rows:
            X_train.append(X[j])
            y_train.append(y[j])
        else:
            X_test.append(X[j])
            y_test.append(y[j])

    return [X_train, y_train, X_test, y_test]


# The name starts with "test_", so pytest would otherwise try to collect this
# helper as a test case whenever it is imported into a test module.
test_train_sample.__test__ = False

In [None]:
def plot_ols(X,y,r=0.75,n=10,supress=False):
    """Estimate the error of an ordinary least squares fit over random splits.

    For each of n iterations the data is split with test_train_sample, a
    LinearRegression model is fitted on the training part, and the mean
    absolute error on the testing part is recorded. The mean of those errors
    is printed; nothing is returned.

    Inputs:
    X is the feature matrix (one row of features per sample)
    y is the output vector (one value per sample)
    r is the fraction of samples used for training, 0.75 by default
    n is the number of random split iterations, 10 by default
    supress: plots are drawn only while this compares equal to False
    """
    from sklearn import linear_model

    model = linear_model.LinearRegression()
    split_errors = []
    for run in range(n):
        X_train, y_train, X_test, y_test = test_train_sample(X, y, r)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        # Mean absolute error on the held-out rows of this split
        residuals = np.array(y_test) - np.array(predicted)
        split_errors.append(np.mean(abs(residuals)))

        if supress == False:
            # One figure per iteration: predicted vs. actual on the test rows
            positions = range(0, len(predicted))
            plt.figure(run, figsize=(10, 8))
            plt.ylabel('Percent turnout')
            plt.plot(positions, predicted, color='orange', label='Predicted')
            plt.plot(positions, y_test, color='blue', label='Actual')
            plt.legend()

    print('The average error for OLS is', np.mean(split_errors))

In [None]:
def plot_lasso(X,y,r=0.75,n=10,supress=False):
    """Estimate the error of a lasso regression over random splits.

    For each of n iterations the data is split with test_train_sample, a
    Lasso(alpha=0.1) model is fitted on the training part, and the mean
    absolute error on the testing part is recorded. The mean of those errors
    is printed; nothing is returned.

    Inputs:
    X is the feature matrix (one row of features per sample)
    y is the output vector (one value per sample)
    r is the fraction of samples used for training, 0.75 by default
    n is the number of random split iterations, 10 by default
    if supress is true, no plots are made and no selected features are printed

    NOTE(review): the selected-feature report reads column names from the
    global data frame df_ml, so it assumes the columns of X line up with
    df_ml.columns - confirm against the caller.
    """
    # Imported once here instead of on every loop iteration
    from sklearn import linear_model

    errors = []
    for i in range(n):
        X_train, y_train, X_test, y_test = test_train_sample(X, y, r)

        clf = linear_model.Lasso(alpha=0.1)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Mean absolute error on the held-out rows of this split
        errors.append(np.mean(abs(np.array(y_test) - np.array(y_pred))))

        if not supress:
            # Quick plot
            plt.figure(i, figsize=(10, 8))
            plt.ylabel('Percent turnout')
            plt.plot(range(0, len(y_pred)), y_pred, color='orange', label='Predicted')
            plt.plot(range(0, len(y_pred)), y_test, color='blue', label='Actual')
            plt.legend()

            # Features kept by the lasso are those with a non-zero coefficient.
            # A separate index name is used so the outer loop counter i is not
            # overwritten.
            selected = [k for k, coef in enumerate(clf.coef_) if coef != 0]
            print('Features selected:', df_ml.columns[selected])

    error = np.mean(errors)
    print('The average error for Lasso is', error)

In [None]:
def plot_decision_tree(X,y,r=0.75,n=10,d=2,supress=False):
    """Estimate the error of a decision tree regressor over random splits.

    For each of n iterations the data is split with test_train_sample, a
    DecisionTreeRegressor of maximum depth d is fitted on the training part,
    and the mean absolute error on the testing part is recorded. The mean of
    those errors is printed; nothing is returned.

    Inputs:
    X is the feature matrix (one row of features per sample)
    y is the output vector (one value per sample)
    r is the fraction of samples used for training, 0.75 by default
    n is the number of random split iterations, 10 by default
    d is the max depth of the tree, set to 2 if not given
    if supress is true, no plots are made
    """
    from sklearn.tree import DecisionTreeRegressor

    errors = []
    for i in range(n):
        X_train, y_train, X_test, y_test = test_train_sample(X, y, r)

        # Fit on the training split only. Fitting on the full (X, y) would let
        # the tree see the very rows it is scored on and understate the error.
        clf = DecisionTreeRegressor(max_depth=d)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Mean absolute error on the held-out rows of this split
        errors.append(np.mean(abs(np.array(y_test) - np.array(y_pred))))

        if not supress:
            # Quick plot
            plt.figure(i, figsize=(10, 8))
            plt.ylabel('Percent turnout')
            plt.plot(range(0, len(y_pred)), y_pred, color='orange', label='Predicted')
            plt.plot(range(0, len(y_pred)), y_test, color='blue', label='Actual')
            plt.legend()

    error = np.mean(errors)
    print('The average error for decision tree is', error)

In [None]:
def plot_svr_select(X,y,ker,r=0.75,n=10,m=10,gam=0,deg=0,supress=False):
    """Estimate the error of support vector regression on the m best features.

    The m highest-scoring features under f_regression are selected first.
    Then, for each of n iterations, the reduced data is split with
    test_train_sample, an SVR with C=1e3 is fitted on the training part, and
    the mean absolute error on the testing part is recorded. The mean of
    those errors is printed; nothing is returned.

    Inputs:
    X is the feature matrix (one row of features per sample)
    y is the output vector (one value per sample)
    ker is the SVR kernel, e.g. 'rbf', 'linear', or 'poly'
    r is the fraction of samples used for training, 0.75 by default
    n is the number of random split iterations, 10 by default
    m is the number of best features to be selected
    gam is the gamma passed to SVR when ker is 'rbf'; a value is needed then
    deg is the degree passed to SVR when ker is 'poly'; a value is needed then
    if supress is true, no plots are made

    NOTE(review): features are selected using all rows before the data is
    split, so the held-out rows influence which features are kept - confirm
    this is intended.
    """
    from sklearn.feature_selection import SelectKBest, f_regression
    from sklearn import svm

    # Fit the selector once; it was previously fitted twice on the same data.
    # selector.get_support(True) gives the indices of the chosen columns.
    selector = SelectKBest(f_regression, k=m)
    X_new = selector.fit_transform(np.asarray(X), y)

    # The kernel choice does not change between iterations, so the keyword
    # arguments are settled once here.
    svr_kwargs = {'kernel': ker, 'C': 1e3}
    if ker == 'rbf':
        svr_kwargs['gamma'] = gam
    elif ker == 'poly':
        svr_kwargs['degree'] = deg

    errors = []
    for i in range(n):
        X_train, y_train, X_test, y_test = test_train_sample(X_new, y, r)

        clf = svm.SVR(**svr_kwargs)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Mean absolute error on the held-out rows of this split
        errors.append(np.mean(abs(np.array(y_test) - np.array(y_pred))))

        if not supress:
            # Quick plot
            plt.figure(i, figsize=(10, 8))
            plt.ylabel('Percent turnout')
            plt.plot(range(0, len(y_pred)), y_pred, color='orange', label='Predicted')
            plt.plot(range(0, len(y_pred)), y_test, color='blue', label='Actual')
            plt.legend()

    error = np.mean(errors)
    print('The average error for SVR with %d features is'%(m),error)