In [66]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score, roc_auc_score, r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, BaggingRegressor, RandomForestRegressor
from IPython.display import Image
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
import pydot

class Data(object):
    #initializer for data with attributes data, y series, X dataframe, X-train dataframe, X_test dataframe
    #y_train series, y_test series
    def __init__(self, df, tarCol, size):
        self.dat = pd.read_csv(df, index_col=0)
        self.y = self.dat[tarCol]
        self.X = self.dat.drop(tarCol, axis = 1)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=size, random_state=42)
    
    #turns a column into an int by category
    def label_enc(self, col):
        le= LabelEncoder()
        col = le.fit_transform(col)
        return col
    
class Model(object):
    
    #initializer for model, name is a string, metric is a dictionary of metric names and values,
    #params initially set by user and transformed to best params by grid search,
    #model is a model function by sklearn, fit is the fit of the model on X_test and Y_test,
    #predict is the predicted y values, image is a png image of a decision tree
    def __init__(self, name):
        self.name = name
        self.metric = {}
        self.params = {}
        self.model= None
        self.fit = None
        self.predict = None
        self.image = None
    
    #sets the model to be a linear Regression
    def set_model_lin(self):
        self.model = LinearRegression()
    
    #sets the model as KNearest-Neighbors
    def set_model_knn(self):
        self.model = KNeighborsClassifier()
    
    #sets the model as Knearest-Regression
    def set_model_knr(self):
        self.model = KNeighborsRegressor()
    
    #sets the model as Logistic Regression
    def set_model_log(self):
        self.model = LogisticRegression()
    
    #sets the model as Decision Tree Classifier
    def set_model_DT_Class(self):
        self.model = DecisionTreeClassifier()
    
    #sets the model as Decision Tree Regressor
    def set_model_DT_Reg(self):
        self.model = DecisionTreeRegressor()
    
    #sets the model as Random Forest Regressor
    def set_model_RF_Reg(self):
        self.model = RandomForestRegressor()
    
    #sets the model as Random Forest Classifier
    def set_model_RF_Class(self):
        self.model = RandomForestClassifier()
    
    #fits the model based on X_train and y_train
    def set_model_fit(self, data):
        self.fit = self.model.fit(data.X_train, data.y_train)
    
    #predicts y_values based on the model fit and X_test
    def set_model_pred(self, data):
        self.predict = self.model.predict(data.X_test)
    
    #sets the params for grid_search 
    def set_model_params(self, dictiona):
        self.params = dictiona
    
    #conducts grid search for any sort of model and sets the fit and predict and saves best params
    def grid_search(self, data):
        model = self.model
        clf = GridSearchCV(model, self.params)
        self.model = clf
        self.params.clear()
        self.fit = clf.fit(data.X_train, data.y_train)
        self.predict = clf.predict(data.X_test)
        self.params = clf.best_params_
    
    #finds accuracy score and saves it to metric dictionary
    def acc_score(self, data):
        self.metric['accuracy_score'] = accuracy_score(data.y_test, self.predict)
    
    #finds r^2 and saves it to metric dictionary
    def rsq(self, data):
        self.metric['r^2'] = r2_score(data.y_test, self.predict)
    
    #finds mse and saves it to metric dictionary
    def mse(self, data):
        self.metric['mse'] = mean_squared_error(data.y_test, self.predict)
    
    #finds roc_auc_score and saves it to metric dictionary
    def roc_score(self, data):
        prob = self.fit.predict_proba(data.X_test)
        self.metric['roc_auc_score']=roc_auc_score(data.y_test, prob[:, 1])
 
    #saves a decision tree image to the folder the program is being run in
    def img_dec_tree(self, data):
        dot_data = StringIO()  
        export_graphviz(self.model, out_file=dot_data, 
                        feature_names=data.X.columns, filled=True, rounded=True,
                        special_characters=True)  
        graph = pydot.graph_from_dot_data(dot_data.getvalue())
        graph.write_png('DT.png')

    
if __name__ == "__main__":
    dataName = input('Please input the file name: ')
    tarCol = input('What is the name of the target column: ')
    cat = input('Is this a categorical variable?[y/n]')
    dt = input('Would you like to build a decision tree[y/n]')
#     if dt == 'y':
#         en = input('Would you like to run ensemble methods?[y/n]')
#         if en =='y':
#             bgrr = input('Would you like Random Forest[1] or Bagging[2]')
    size = input('What do you want the split size to be: ')
#     dataName = 'used_cars_clean.csv'
#     tarCol = 'price'
#     cat = 'n'
#     dt = 'y'
#     en ='y'
#     bgrr='y'
#     size = .33
    
    data = Data(dataName, tarCol, size)
    models = []

    #if this is a regression problem
    if cat == 'n':
        
        #Linear Regression
        print 'Running Linear Regression'
        linreg = Model('Linear Regression')
        linreg.set_model_lin()
        gs = input('Do you want to conduct grid search for LinReg[y/n]:')
        if gs == 'y':
            params = input('Set the params for grid search for Linear Regression: ')
            linreg.set_model_params(params)
            linreg.grid_search(data)
            linreg.rsq(data)
            linreg.mse(data)
            models.append(linreg)
            print linreg.metric
        else:
            linreg.set_model_fit(data)
            linreg.set_model_pred(data)
            linreg.rsq(data)
            linreg.mse(data)
            models.append(linreg)
            print linreg.metric
            
        #KNeighbors Regression
        print 'Running K-Neighbors Regression'
        knr = Model('K-Nearest Regression')
        knr.set_model_knr()
        gs = input('Do you want to conduct grid search for K-Neighbors Regression[y/n]')
        if gs =='y':
            params = input('Set the params for the grid search for K-Neighbors Regression:')
            knr.set_model_params(params)
            knr.grid_search(data)
            knr.acc_score(data)
            models.append(knr)
            print knr.metric
        else:
            knr.set_model_fit(data)
            knr.set_model_pred(data)
            knr.acc_score(data)
            models.append(knr)
            print knr.metric
        
        #Decision Tree Modelling
        if dt == 'y':
            DTReg = Model('Decision Tree Regression')
            DTReg.set_model_DT_Reg()
            gs = input('Do you want to conduct grid search for the DT[y/n]')
            if gs == 'y':
                params = input('Set params for grid search for Decision Tree Regressor: ')
                DTReg.set_model_params(params)
                DTReg.grid_search(data)
                DTReg.mse(data)
                DTReg.img_dec_tree(data)
                models.append(DTReg)
                print DTReg.metric
            else:
                DTReg.set_model_fit(data)
                DTReg.set_model_pred(data)
                DTReg.mse(data)
                DTReg.img_dec_tree(data)
                models.append(DTReg)
                print DTReg.metric
                
            #running Random Forrest
            RFReg = Model('Random Forrest Regression')
            RFReg.set_model_RF_Reg()
            RFReg.set_model_fit(data)
            RFReg.set_model_pred(data)
            RFReg.mse(data)
            models.append(RFReg)
            print RFReg.metric

    #is this a categorical problem
    if cat == 'y':
        
        #run Knn if there are more than two values in Target Column
        print 'Running KNN'
        knn = Model('K-Nearest Neighbors')
        knn.set_model_knn()
        gs = input('Do you want to conduct grid search for KNN[y/n]: ')
        if gs == 'y':
            params = input('Set the params for grid search for KNN: ')
            knn.set_model_params(params)
            knn.grid_search(data, 5)
            knn.acc_score(data)
            models.append(knn)
            print knn.metric
        else:
            knn.set_model_fit(data)
            knn.set_model_pred(data)
            knn.acc_score(data)
            models.append(knn)
            print knn.metric
            
        if len(set(data.dat[tarCol])) < 3:
            #run logistic regression for two values in target Column
            print 'Running Logistic Regression'
            logreg = Model('Logistic Regression')
            logreg.set_model_log()
            gs = input('Do you want to conduct grid search for LogReg[y/n]')
            if gs == 'y':
                params = input('Set params for grid search for LogReg: ')
                logreg.set_model_params(params)
                logreg.grid_search(data)
                logreg.acc_score(data)
                logreg.roc_score(data)
                print logreg.metric
            else:
                logreg.set_model_fit(data)
                logreg.set_model_pred(data)
                logreg.acc_score(data)
                logreg.roc_score(data)
                print logreg.metric
                
        #Decision Tree Modelling
        if dt =='y':
            print 'Running Decision Tree Classifier'
            DTClass = Model('Decision Tree Classifier')
            DTClass.set_model_DT_Class()
            gs = input('Do you want to conduct grid search for the DT[y/n]')
            if gs == 'y':
                params = input('Set params for grid search for Decision Tree Classifier: ')
                DTClass.set_model_params(params)
                DTClass.grid_search(data)
                DTClass.acc_score(data)
                DTClass.img_dec_tree(data)
                models.append(DTClass)
                print DTClass.metric
            else:
                DTClass.set_model_fit(data)
                DTClass.set_model_pred(data)
                DTClass.acc_score(data)
                DTClass.img_dec_tree(data)
                models.append(DTClass)
                print DTClass.metric
            #Random Forrest Classification
            print 'Running Random Forrest Classifier'
            RFClass = Model('Random Forrest Classifier')
            RFClass.set_model_RF_Class()
            RFClass.set_model_fit(data)
            RFClass.set_model_pred(data)
            RFClass.mse(data)
            models.append(RFClass)
            print RFClass.metric  

Please input the file name: 'titanic_clean.csv'
What is the name of the target column: 'Survived'
Is this a categorical variable?[y/n]'y'
Would you like to build a decision tree[y/n]'y'
What do you want the split size to be: .33
Running KNN
Do you want to conduct grid search for KNN[y/n]: 'n'
{'accuracy_score': 0.60301507537688437}
Running Logistic Regression
Do you want to conduct grid search for LogReg[y/n]'n'
{'accuracy_score': 0.76884422110552764, 'roc_auc_score': 0.81649989576818838}
Running Decision Tree Classifier
Do you want to conduct grid search for the DT[y/n]'n'
{'accuracy_score': 0.7386934673366834}
Running Random Forrest Classifier
{'mse': 0.21105527638190955}


In [None]:

#     TODO NULL VALUES
#     if df.isnull().values.any():
#         dr = 0
#         while (dr != 1) or (dr !=2)
#             dr = input('Null values were detected. do you want to drop(1) or replace with mean(2) or zeros(3)')
#             if dr == 1:
#                 data.dat = data.dat.dropna(inplace = True)
#             elif dr == 2:
            
            
    
    #if user decides its a categorical problem and label encoded the target column
#     if cat == 'y':
#         data.dat[tarCol] = data.label_enc(data.dat[tarCol])
#         print 'Encoded target column as Label'
    
    
    #this goes through X columns and converts them to categories or not
#     for i in data.X.columns:
#         if (str == data.dat[i].dtype):
#             enc = input('Column %s seems to be a string. Turn this into a category?[y/n]' % (i))
#             if enc == 'y':
#                 data.dat=data.label_enc(data.dat[i])
#         elif len(set(data.dat[i])) < 6:
#             enc=input('Column %s seems to have less than 6 unique values. Turn into a category? [y/n]' % (i))
#             if enc == 'y':
#                 data.dat=data.label_enc(data.dat[i])
#     print data.dat.head()
    