In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime

#Set Working Directory
os.chdir(r"/Users/glinn/Documents/CSCI5622-machine-learning/project/csci5622project")

# Import Data sets, after splitting into 75,10,15 train, development, test. 

In [3]:
# https://stackoverflow.com/questions/50781562/stratified-splitting-of-pandas-dataframe-in-training-validation-and-test-set
# this should be 75%, 10%, 15%
test_full = pd.read_csv(r"data/2018-test.csv").dropna()
train_full = pd.read_csv(r"data/2018-train.csv").dropna()
validate_full = pd.read_csv(r"data/2018-validate.csv").dropna()

# Clean the Data

We agreed to define success as simply pledged >= Goal, and to drop projects with a duration under 1 day, considered live at the time of acquiring the data, and projects that were cancelled. 

In [4]:
#Organize Data by deadline, launched, success, and duraction
def preprocess_df(df):
    df["deadline"] = pd.to_datetime(df["deadline"])
    df["launched"] = pd.to_datetime(df["launched"])
    #Define Success as pledged>=goal
    df["success"] = df["pledged"] >= df["goal"]
    df["duration"] = df["deadline"] - df["launched"]
    
    return df
#This drops data with a duration that is too short, live, or cancelled.
def clean_df(df):
    df = df.drop(df.loc[df["duration"] < datetime.timedelta(days=1)].index)
    df = df.drop(df.loc[df["state"] == "live"].index)
    df = df.drop(df.loc[df["state"] == "canceled"].index)
    
    return df

## Now apply these functions to the datasets:
test_organized = preprocess_df(test_full)
train_organized = preprocess_df(train_full)
validate_organized = preprocess_df(validate_full)
test = clean_df(test_organized)
train = clean_df(train_organized)
validate = clean_df(validate_organized)

# Create the Baseline from training Data

As our baseline, we will learn the percent of success and failures. We will use these probabilites to guess the outcome of each Kickstarter project. That is, we will take the expectation value of the confusion matrix using the probability of each outcome (pass/fail) learned from the training data. 

In [12]:
# First we find the fraction of success and failures in the training data
NTrain = len(train.success)
PercSuc = len(train.loc[train.success == True])/NTrain
PercFail = 1 - PercSuc

# This Function will make a Confusion matrix, given a percent success rate learned from the training data
def CMBaseline(df,PercentSuccessOfTrain):
    PS = PercentSuccessOfTrain #Percent of training data that succeeded
    PF = 1 - PS #Percent of training data that failed
    NSuccess = len(df.loc[df.success == True]) #Number of successes in the given dataset
    NFail = len(df.loc[df.success == False]) #Number of failures in the given dataset
    CM = [[r"Pred\Acutal","Success","Fail"],["Success",NSuccess*PS,NFail*PS],["Fail",NSuccess*PF,NFail*PF]] #Expectation value of the Confusion matrix
    return CM

def Metrics(CM):
    CMnp = np.array([[CM[1][1],CM[2][1]],[CM[2][1],CM[2][2]]])
    Acc = np.trace(CMnp)/np.sum(CMnp)
    TPR = CMnp[0,0]/(CMnp[0,0]+CMnp[1,0])
    FNR = 1-TPR
    TNR = CMnp[1,1]/(CMnp[0,1]+CMnp[1,1])
    FPR = 1-TNR
    Precision = CMnp[0,0]/(CMnp[0,0]+CMnp[0,1])
    NPV = CMnp[1,1]/(CMnp[1,0]+CMnp[1,1])
    F1 = 2*CMnp[0,0]/(2*CMnp[0,0]+CMnp[0,1]+CMnp[1,0])
    Metric = {}
    Metric["Accuracy"]=Acc
    Metric["True_Positive_Rate"] = TPR
    Metric["False Negative Rate"] = FNR
    Metric["True Negative Rate"] = TNR
    Metric["False Positive Rate"] = FPR
    Metric["Precision"] = Precision
    Metric["Negative Predictive Value"] = NPV
    Metric["F1"] = F1
    return Metric

def ShowCMPercent(CM):
    CMnp = np.array([[CM[1][1],CM[2][1]],[CM[2][1],CM[2][2]]])
    Norm = np.sum(CMnp,0)
    Norm = Norm.reshape(1,2)
    CMnp = CMnp/Norm
    print(Norm.shape)
    CM[1][1] = CMnp[0,0]
    CM[1][2] = CMnp[0,1]
    CM[2][1] = CMnp[1,0]
    CM[2][2] = CMnp[1,1]
    return pd.DataFrame(CM)
def AccuracyMajority(df):
    NFail = len(df.loc[df.success == False])
    NData = len(df["success"])
    Accuracy = NFail/NData
    return Accuracy

##### Let's create the expectation value of the confusion matrix for the Validation dataset #########

In [6]:
CMVal = CMBaseline(validate,PercSuc)
MetricVal = Metrics(CMVal)
print(MetricVal)
pd.DataFrame(CMVal) #This Line shows the CM in traditional form
#ShowCMPercent(CMVal) #This shows the CM in percent form

{'Accuracy': 0.5213595021886327, 'True_Positive_Rate': 0.402446830642193, 'False Negative Rate': 0.597553169357807, 'True Negative Rate': 0.600800024060874, 'False Positive Rate': 0.39919997593912604, 'Precision': 0.402446830642193, 'Negative Predictive Value': 0.600800024060874, 'F1': 0.402446830642193}


Unnamed: 0,0,1,2
0,Pred\Acutal,Success,Fail
1,Success,5341.68,8039.28
2,Fail,7931.32,11936.7


##### Let's create the expectation value of the confusion matrix for the Test dataset #########

In [7]:
CMTest = CMBaseline(test,PercSuc)
MetricTest = Metrics(CMTest)
print(MetricTest)
pd.DataFrame(CMTest) #This Line shows the CM in traditional form
#ShowCMPercent(CMTest) #This shows the CM in percent form

{'Accuracy': 0.5173827589338706, 'True_Positive_Rate': 0.402446830642193, 'False Negative Rate': 0.597553169357807, 'True Negative Rate': 0.5952366668666387, 'False Positive Rate': 0.4047633331333613, 'Precision': 0.402446830642193, 'Negative Predictive Value': 0.5952366668666387, 'F1': 0.402446830642193}


Unnamed: 0,0,1,2
0,Pred\Acutal,Success,Fail
1,Success,8145.93,11979.2
2,Fail,12095.1,17786.8


In [10]:
NumSucTrain = len(train.loc[train.success == True])
NumTrain = len(train["success"])
NumFailTrain = NumTrain - NumSucTrain
print(NumFailTrain/NumTrain)

0.597553169357807


Therefore, we choose to assign all data as failures for a majority baseline

In [14]:
Acc_Val = AccuracyMajority(validate)
Acc_Test = AccuracyMajority(test)
print(f"The accuracy of a majority vote baseline on the validation data is {Acc_Val}")
print(f"The accuracy of a majority vote baseline on the test data is {Acc_Test}")

The accuracy of a majority vote baseline on the validation data is 0.600800024060874
The accuracy of a majority vote baseline on the test data is 0.5952366668666387
