In [1]:
%matplotlib inline
from __future__ import division
import pandas as pd
import numpy as np
import math
import csv
import xgboost as xgb
import collections
import matplotlib.pyplot as plt
from sklearn.decomposition import RandomizedPCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from collections import OrderedDict
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import BernoulliRBM
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import linear_model
from sklearn import ensemble
from sklearn import tree

In [33]:
def perform_naive_bayes(features_train, labels_train, features_test, labels_test):
    """Fit a Gaussian naive Bayes classifier and score it on the test split.

    Returns a ``(classifier, accuracy, f1score, predictions)`` tuple, where the
    last three values come from ``perform_operations``.
    """
    classifier = GaussianNB()
    acc, f1, predicted = perform_operations(
        features_train, labels_train, features_test, labels_test, classifier)
    return (classifier, acc, f1, predicted)

In [2]:
def perform_logistic_regression(features_train, labels_train, features_test, labels_test):
    """Fit a default ``LogisticRegression`` and score it on the test split.

    Returns a ``(classifier, accuracy, f1score, predictions)`` tuple, where the
    last three values come from ``perform_operations``.
    """
    # linear_model is already imported at the top of the notebook, so no
    # function-local import is needed.
    classifier = linear_model.LogisticRegression()
    acc, f1, predicted = perform_operations(
        features_train, labels_train, features_test, labels_test, classifier)
    return (classifier, acc, f1, predicted)



In [3]:
def perform_linear_regression(features_train,labels_train,features_test,labels_test):
    
    clf = linear_model.Lasso(alpha=0.1)
    clf.fit(features_train,labels_train)
    pred = clf.predict(features_test)
    pred_floor = [int (math.floor(x)) for x in pred]
    print "accuracy on test set is ", accuracy_score(labels_test, pred_floor)
    
    return pred_floor

In [4]:
def perform_decision_tree(features_train, labels_train, features_test, labels_test):
    """Fit a default ``DecisionTreeClassifier`` and score it on the test split.

    Returns a ``(classifier, accuracy, f1score, predictions)`` tuple, where the
    last three values come from ``perform_operations``.
    """
    classifier = tree.DecisionTreeClassifier()
    acc, f1, predicted = perform_operations(
        features_train, labels_train, features_test, labels_test, classifier)
    return (classifier, acc, f1, predicted)

In [5]:
def perform_svm_SVC(features_train, labels_train, features_test, labels_test):
    """Fit a linear support vector classifier and score it on the test split.

    Despite the function name, the estimator is ``svm.LinearSVC`` with default
    settings.

    Returns a ``(classifier, accuracy, f1score, predictions)`` tuple, where the
    last three values come from ``perform_operations``.
    """
    classifier = svm.LinearSVC()
    acc, f1, predicted = perform_operations(
        features_train, labels_train, features_test, labels_test, classifier)
    return (classifier, acc, f1, predicted)

In [6]:
def perform_gradient_boosting(features_train, labels_train, features_test, labels_test):
    """Fit a ``GradientBoostingClassifier`` and score it on the test split.

    The number of boosting stages equals the number of columns in
    ``features_train``, which therefore has to expose ``.columns``.

    Returns a ``(classifier, accuracy, f1score, predictions)`` tuple, where the
    last three values come from ``perform_operations``.
    """
    stage_count = len(features_train.columns)
    booster = ensemble.GradientBoostingClassifier(n_estimators=stage_count)
    acc, f1, predicted = perform_operations(
        features_train, labels_train, features_test, labels_test, booster)
    return (booster, acc, f1, predicted)

In [7]:
def perform_random_forest(features_train, labels_train, features_test, labels_test):
    """Fit a ``RandomForestClassifier`` and score it on the test split.

    The number of trees equals the number of columns in ``features_train``,
    which therefore has to expose ``.columns``.

    Returns a ``(classifier, accuracy, f1score, predictions)`` tuple, where the
    last three values come from ``perform_operations``.
    """
    tree_count = len(features_train.columns)
    forest = ensemble.RandomForestClassifier(n_estimators=tree_count)
    acc, f1, predicted = perform_operations(
        features_train, labels_train, features_test, labels_test, forest)
    return (forest, acc, f1, predicted)

In [8]:
def perform_neural_networks(features_train, labels_train, features_test, labels_test):
    """Fit an RBM feature extractor followed by logistic regression and score it.

    ``BernoulliRBM`` on its own is an unsupervised transformer and has no
    ``predict`` method, so handing it straight to ``perform_operations`` fails.
    It is therefore chained with a logistic-regression classifier in a
    ``Pipeline``; the pipeline exposes ``fit`` and ``predict``.

    Returns a ``(pipeline, accuracy, f1score, predictions)`` tuple, where the
    last three values come from ``perform_operations``.
    """
    from sklearn.pipeline import Pipeline

    # NOTE(review): BernoulliRBM is documented for inputs in [0, 1]; the
    # features are passed through unscaled here, as before -- confirm whether
    # a scaler should precede the RBM.
    clf = Pipeline([
        ('rbm', BernoulliRBM(n_components=2)),
        ('logistic', linear_model.LogisticRegression()),
    ])
    accuracy,f1score, pred = perform_operations(features_train, labels_train, features_test, labels_test,clf)
    return (clf,accuracy,f1score,pred)

In [9]:
def perform_operations(features_train, labels_train, features_test, labels_test,clf):
    """Fit ``clf`` on the training split and evaluate it on the test split.

    ``clf`` must provide ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns ``(accuracy, weighted F1 score, predictions)``.
    """
    clf.fit(features_train, labels_train)
    predicted = clf.predict(features_test)

    return (
        accuracy_score(labels_test, predicted),
        f1_score(labels_test, predicted, average = 'weighted'),
        predicted,
    )

In [10]:
def xgboost_train(features_train,labels_train,features_test, labels_test, num_class):
    """Train a multi-class xgboost model with early stopping and score it.

    The training data is split again by ``break_training_data_set``: one part
    is used for fitting, the other as the validation set watched for early
    stopping (120 rounds without improvement, at most 500 rounds).

    Parameters
    ----------
    features_train, labels_train : training features and labels.
    features_test, labels_test : held-out data used only for scoring.
    num_class : number of distinct classes, passed to xgboost as ``num_class``.

    Returns
    -------
    ``(booster, accuracy, weighted F1 score, predicted labels)`` where the
    predicted labels are the column index of the highest score per row.
    """
    params = {}
    params["objective"] = "multi:softprob"
    params["eta"] =  0.01
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.8
    params["num_class"]=num_class
#     params["scale_pos_weight"] = 1
    params["silent"] = 1
    params["max_depth"] = 15
    params["eval_metric"] = 'mlogloss'

    plst = list(params.items())
    num_rounds = 500

    ftrs_train, lbls_train, ftrs_vld, lbls_vld = break_training_data_set(features_train, labels_train)
    xgtrain = xgb.DMatrix(ftrs_train, lbls_train)
    xgval = xgb.DMatrix(ftrs_vld, lbls_vld)
    watchlist = [(xgtrain, 'train'),(xgval, 'val')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=120,verbose_eval=True)

    # best_iteration is a 0-based round index, so passing it as ntree_limit
    # drops the best round itself (and 0 would mean "use every tree").
    # Prefer the booster's own best_ntree_limit when the installed xgboost
    # provides it, otherwise derive the count from best_iteration.
    best_ntree_limit = getattr(model, 'best_ntree_limit', None)
    if best_ntree_limit is None:
        best_ntree_limit = model.best_iteration + 1

    xgtest = xgb.DMatrix(features_test, labels_test)
    pred = model.predict(xgtest,ntree_limit=best_ntree_limit)
    # One column per class; pick the column with the highest score per row.
    pred_xgb = pd.DataFrame(pred).idxmax(axis=1)

    accuracy = accuracy_score(labels_test, pred_xgb)
    f1score = f1_score(labels_test, pred_xgb, average = 'weighted')

    return (model,accuracy,f1score,pred_xgb)

In [11]:
def get_cat(x):
    """Return the index label at which ``x`` (a pandas Series) is largest."""
    return x.idxmax()

In [35]:
def apply_algorithms(features_train,labels_train,features_test,labels_test):
    # applying logistic regression for classification purposes 
    print "entered applied algorithms function"
    
    with open('algorithms_results.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        
        spamwriter.writerow(['Method','Accuracy','F1score'])
        predictions = []
        models = []
    
        clf, accuracy,f1Score,pred = perform_naive_bayes(features_train, labels_train, features_test, labels_test)
        predictions.append(pred)
        models.append(clf)
        print " accuracy in naive bayes is ", accuracy
        print " calculated f1Score is ", f1Score
        print "-----------------------------------------------------------------------------------------------"
        spamwriter.writerow(['Naive Bayes',accuracy,f1Score])
        
#         clf, accuracy,f1Score,pred = perform_logistic_regression(features_train, labels_train, features_test, labels_test)
#         predictions.append(pred)
#         models.append(clf)
#         print " accuracy in logistic regression is ", accuracy
#         print " calculated f1Score is ", f1Score
#         print "-----------------------------------------------------------------------------------------------"
#         spamwriter.writerow(['Logistic',accuracy,f1Score])
    
#         clf, accuracy,f1Score,pred = perform_svm_SVC(features_train, labels_train, features_test, labels_test)
#         predictions.append(pred)
#         models.append(clf)
#         print " accuracy in SVM regression is ", accuracy
#         print " calculated f1Score is ", f1Score
#         print "-----------------------------------------------------------------------------------------------"
#         spamwriter.writerow(['SVM',accuracy,f1Score])
        
#         clf, accuracy,f1Score,pred = perform_gradient_boosting(features_train, labels_train, features_test, labels_test)
#         predictions.append(pred)
#         models.append(clf)
#         print " accuracy in gradient boosting  is ", accuracy
#         print " calculated f1Score is ", f1Score
#         print "-----------------------------------------------------------------------------------------------"
#         spamwriter.writerow(['gradient',accuracy,f1Score])
    
#         clf, accuracy,f1Score,pred = perform_decision_tree(features_train, labels_train, features_test, labels_test)
#         predictions.append(pred)
#         models.append(clf)
#         print "accuracy in decision tree", accuracy
#         print "calculated f1Score is ", f1Score
#         print "-----------------------------------------------------------------------------------------------"  
#         spamwriter.writerow(['decision',accuracy,f1Score])
    
#         clf, accuracy,f1Score,pred = perform_random_forest(features_train, labels_train, features_test, labels_test)
#         predictions.append(pred)
#         models.append(clf)
#         print "accuracy in random forest", accuracy
#         print "calculated f1Score is ", f1Score
#         print "-----------------------------------------------------------------------------------------------"
#         spamwriter.writerow(['random_forest',accuracy,f1Score])
    
#         clf, accuracy,f1Score,pred = xgboost_train(features_train, labels_train, features_test, labels_test,
#                                                                               num_class=len(labels_train.unique()))
#         predictions.append(pred)
#         models.append(clf)
#         print " accuracy in xgboost regression is ", accuracy
#         print " calculated f1Score is ", f1Score
#         print "-----------------------------------------------------------------------------------------------"
#         spamwriter.writerow(['xgboost',accuracy,f1Score])
        
#         clf, accuracy,f1Score,pred = perform_neural_networks(features_train, labels_train, features_test, labels_test)
#         predictions.append(pred)
#         models.append(clf)
#         print " accuracy in neural network is ", accuracy
#         print " calculated f1Score is ", f1Score
#         print "-----------------------------------------------------------------------------------------------"
#         spamwriter.writerow(['neural_network',accuracy,f1Score])
    
    return (models,predictions)

In [13]:
def break_training_data_set(training_features, training_labels, train_fraction=0.8, seed=None):
    """Randomly split features and labels into a training and a validation part.

    Every row is assigned independently to the training part with probability
    ``train_fraction``, so the split sizes are only approximately 80/20 by
    default.

    Parameters
    ----------
    training_features, training_labels : equally long, boolean-mask indexable
        containers (the same mask is applied to both).
    train_fraction : probability that a row lands in the training part.
    seed : optional int. When given, the split is reproducible; when ``None``
        (the default) numpy's global random state is used, as before.

    Returns
    -------
    ``(ftrs_train, lbls_train, ftrs_vld, lbls_vld)``
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    msk = rng.rand(len(training_features)) < train_fraction

    ftrs_train = training_features[msk]
    lbls_train = training_labels[msk]

    ftrs_vld = training_features[~msk]
    lbls_vld = training_labels[~msk]

    return (ftrs_train, lbls_train, ftrs_vld, lbls_vld)

In [14]:
def process_data_set(fileName):
    """Load a csv file and derive the feature frame used for modelling.

    Missing values are replaced by -1. The feature frame is the loaded frame
    without the "Id" and "Response" columns and with "Product_Info_2"
    label-encoded to integers.

    Returns ``(full_frame, feature_frame)``.
    """
    frame = pd.read_csv(fileName).fillna(-1)

    feature_frame = frame.drop(["Id","Response"], axis=1)
    encoder = LabelEncoder()
    feature_frame["Product_Info_2"] = encoder.fit_transform(feature_frame["Product_Info_2"])

    return frame, feature_frame

In [15]:
def create_dummy_variable(series):
    """One-hot encode ``series`` into a list of 0/1 rows.

    The distinct values are sorted and each one gets its own column, so the
    values do not have to be the contiguous integers 1..k (indexing with
    ``val - 1`` raised IndexError or hit the wrong column for anything else).
    For values that are exactly 1..k the encoding is unchanged.

    The value counts are printed as a side effect.

    Returns ``(rows, categories)`` where ``rows`` holds one indicator list per
    element of ``series`` and ``categories`` is the number of distinct values.
    """
    counter = collections.Counter(series)
    print(counter)

    # Column position of every distinct value, in sorted order.
    position = dict((value, index) for index, value in enumerate(sorted(counter)))
    categories = len(position)

    columns = []
    for val in series:
        column = [0] * categories
        column[position[val]] = 1
        columns.append(column)
    return columns,categories

In [16]:
def get_bernoulli_model(features_train,n_component):
    """Fit a ``BernoulliRBM`` with ``n_component`` components on the features.

    Returns whatever ``fit`` returns (the fitted estimator).
    """
    rbm = BernoulliRBM(n_components=n_component)
    return rbm.fit(features_train)

In [17]:
def get_pca_model(features_train,n_component):
    """Fit a whitening ``RandomizedPCA`` with ``n_component`` components.

    Returns whatever ``fit`` returns (the fitted estimator).
    """
    reducer = RandomizedPCA(n_components=n_component, whiten=True)
    return reducer.fit(features_train)

In [36]:
## xgboost with all the features
def primary_evaluation_script(train_df,labels, test_df):
    print "starting process"

    
    features_train, labels_train, features_test, labels_test = break_training_data_set(train_df, labels)
    models, predictions = apply_algorithms(features_train,labels_train,features_test,labels_test)

    print "starting to process test data "

    
#     model_sequence = ['logistic','svm','gradient','decisiontree','randomforest','xgboost']
    model_sequence =['naivebayes']
    predictions = []
    for model,name in zip (models,model_sequence):
        pred = write_file(model,test_df,name,test_id)
        predictions.append(pred)
        
    print "process completed "
    return predictions

In [19]:
def boost(trainFileName, testFileName):
    print "starting process"

    train_df, features = process_data_set(trainFileName)
    labels=train_df["Response"].astype("category")
    labels=labels.cat.rename_categories(range(8))

    features_train, labels_train, features_test, labels_test = break_training_data_set(features, labels)
    n_components = [10,20,30,40,50,60,70,80,90,100,110,120]
    
    while(n_components):
        n_component = n_components[0]
        n_components.remove(n_component)
        
#         model = get_bernoulli_model(features_train, n_component)
        model = get_pca_model(features_train,n_component)
        accuracy, f1Score, pred = boost_with_model(features_train,labels_train,features_test,labels_test,
                                                                       model,n_component)
        print " accuracy with bernoulli RBM is ", accuracy
        print " calculated f1Score is ", f1Score
        print "-----------------------------------------------------------------------------------------------"
    

In [20]:
def boost_with_model(features_train,labels_train,features_test,labels_test,model,n_component):
    """Score a random forest trained on features transformed by ``model``.

    ``model`` must already be fitted and provide ``transform``. The forest uses
    ``n_component`` trees.

    Returns ``(accuracy, weighted F1 score, predictions)`` for the test split.
    """
    reduced_train = model.transform(features_train)
    reduced_test = model.transform(features_test)

    forest = ensemble.RandomForestClassifier(n_estimators = n_component)
    forest = forest.fit(reduced_train, labels_train)
    predicted = forest.predict(reduced_test)

    return (
        accuracy_score(labels_test, predicted),
        f1_score(labels_test, predicted, average = 'weighted'),
        predicted,
    )

In [21]:
def feature_selection_withXGBoost(trainFileName):
    print "starting process"

    train_df, features = process_data_set(trainFileName)
    labels=train_df["Response"].astype("category")
    labels=labels.cat.rename_categories(range(8))

    features_train, labels_train, features_test, labels_test = break_training_data_set(features, labels)
    
    xgboost_model, accuracy,f1Score,pred = xgboost_train(features_train.as_matrix(), labels_train.as_matrix(), 
                                                     features_test.as_matrix(), labels_test.as_matrix(), 
                                                         num_class= len(labels_train.unique()))

    xgboost_features = xgboost_model.get_fscore()
    
    return xgboost_features,features

In [22]:
def boost_with_gbm(features,feature_dict):
    
    d_descending = sorted(feature_dict.items(), key=lambda x: (-x[1], x[0]))
    f1Score_old = 0
    ftrs_selected = 10
    while True:
        if ftrs_selected > len(features.columns):
            break
        top_10_descending = d_descending[1:ftrs_selected]
        col_nos = [int(col[0][1:]) for col in top_10_descending]

        new_train_data = features[col_nos]

        new_features_train, new_labels_train, new_features_test, new_labels_test = break_training_data_set(new_train_data, labels)

        clf, accuracy,f1Score,pred = perform_gradient_boosting(new_features_train, new_labels_train, 
                                                                           new_features_test, new_labels_test)

        print " accuracy in gradient boosting  is ", accuracy
        print " calculated f1Score is ", f1Score
        print "-----------------------------------------------------------------------------------------------"
    
    
    
        f1Score_old = f1Score
        ftrs_selected += 10
    
        if f1Score > f1Score_old:
            test_df=pd.read_csv("test.csv")
            test_df=test_df.fillna(-1)
            test_id = test_df['Id']
            features_sub=test_df.drop(["Id"],axis=1)
            features_sub["Product_Info_2"]=le.transform(features_sub["Product_Info_2"])
            new_features_sub = features_sub[col_nos]
            write_file(clf,new_features_sub,'gradient',test_id)

In [23]:
def write_file(clf,features_test,typeofSet,test_id):
    print "predicting for ---->", typeofSet
    if typeofSet == 'xgboost':
        xgtest=xgb.DMatrix(features_test)
        pred = clf.predict(xgtest,ntree_limit=clf.best_iteration)
        pred_df = pd.DataFrame(pred,columns=range(1,9))
        pred_df = pred_df.apply(get_cat,axis=1)
    else:
        pred = clf.predict(features_test)
        pred_df = [x+1 for x in pred]
    
    
    with open('%s_results.csv' % typeofSet, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(['Id','Response'])

        for ids,label in zip(test_id,pred_df):
            spamwriter.writerow([ids,label])

        print "file written",typeofSet
    
    return pred_df  

In [24]:
def plot_graphs(dataFrame, features_of_interest):
    """Save a 2x2 figure with a placeholder line plot to ``req_plots.png``.

    ``dataFrame`` and ``features_of_interest`` are currently unused: only a
    fixed demo line is drawn in the top-left panel.
    """
#     dataFrame[features_of_interest[0]].hist()

    fig, axes = plt.subplots(nrows=2,ncols=2)
    # With a 2x2 grid ``axes`` is a 2-D array, so ``axes[0]`` is a row of axes
    # (no ``plot`` method); address the top-left panel explicitly.
    axes[0, 0].plot([0,1,2], [2,3,4])
#     count =0
#     for ax in axes:
#         col_name = features_of_interest[count]
#         ax.plot([0,1,2], [2,3,4])
#         count +=1
    fig.savefig('req_plots.png')
    # Close this figure specifically rather than whichever one is current.
    plt.close(fig)

In [25]:
## methods defined below this point are specific to the problem
def add_dummy_variable(dataset, variable_list):
    """Replace the columns in ``variable_list`` with one-hot indicator columns.

    The listed columns are removed from ``dataset`` and, for each of them, one
    indicator column per distinct value is appended, named
    ``<column>_<value>``. Row order and the remaining columns are preserved;
    the input frame is not modified.

    The frames are joined with ``pd.concat``: ``DataFrame.add`` performs
    element-wise arithmetic on aligned labels and therefore produced NaN
    instead of the combined frame.

    Progress and frame dimensions are printed along the way.

    Returns the new DataFrame.
    """
    def _say(*parts):
        # Same text as the Python 2 ``print a, b`` statement, and also valid
        # under the Python 3 print function.
        print(" ".join(str(part) for part in parts))

    _say("dimensions of dataset before processing", dataset.shape)

    categorical_df = dataset[variable_list]
    _say("dimensions of categorical dataset ", categorical_df.shape)

    dataset = dataset.drop(variable_list, axis=1)
    _say("dimensions after dropping categorical variables", dataset.shape)
    _say("---------------------------------------------------------------------------------")

    dummy_frames = []
    for name in variable_list:
        _say("processing", name)
        # Keep the original index (no ``.values``) so the rows line up, and
        # prefix with the column name so indicator columns stay unique.
        dummy_frames.append(pd.get_dummies(categorical_df[name], prefix=name))

    dataset = pd.concat([dataset] + dummy_frames, axis=1)
    _say("final dimensions", dataset.shape)
    _say("---------------------------------------------------------------------------------")

    return dataset



In [26]:
## performing analyis with dummy variables 
def create_dummy_datasets(dataset,fileName):
    categorical_variables = ['Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 
                          'Product_Info_7', 'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5', 'InsuredInfo_1', 
                          'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 
                          'InsuredInfo_7', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 
                          'Insurance_History_4', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 
                          'Family_Hist_1', 'Medical_History_2', 'Medical_History_3', 'Medical_History_4', 'Medical_History_5',
                          'Medical_History_6','Medical_History_7', 'Medical_History_8', 'Medical_History_9', 
                          'Medical_History_10', 'Medical_History_11','Medical_History_12', 'Medical_History_13', 
                          'Medical_History_14', 'Medical_History_16', 'Medical_History_17','Medical_History_18', 
                          'Medical_History_19', 'Medical_History_20', 'Medical_History_21', 'Medical_History_22',
                          'Medical_History_23', 'Medical_History_25', 'Medical_History_26', 'Medical_History_27', 
                          'Medical_History_28','Medical_History_29', 'Medical_History_30', 'Medical_History_31', 
                          'Medical_History_33', 'Medical_History_34','Medical_History_35', 'Medical_History_36', 
                          'Medical_History_37', 'Medical_History_38', 'Medical_History_39','Medical_History_40', 
                          'Medical_History_41']

    features = add_dummy_variable(dataset,categorical_variables)
    features = features.fillna(-1)

    
    features.to_csv(fileName+'_dataset.csv', sep =',')
    print "file written", fileName
    print "-----------------------------------------------------------------------------"
   


In [40]:
# Run this cell to execute the whole pipeline: load train/test data, encode
# features and labels, evaluate the enabled algorithms and write submissions.
# The names `le`, `labels` and `test_id` defined here are also read as globals
# by functions defined in earlier cells, so they must keep these names.

train_df = pd.read_csv('train.csv')
print "dimensions of training data", train_df.shape

# Missing values become -1; "Id" and the target "Response" are not features.
train_df = train_df.fillna(-1)
features_train=train_df.drop(["Id","Response"],axis=1)
# "Product_Info_2" is label-encoded; the same fitted encoder is reused for the
# test set below.
le=LabelEncoder()
features_train["Product_Info_2"]=le.fit_transform(features_train["Product_Info_2"])
    
# Convert the target to a categorical and rename its categories to 0..7.
labels=train_df["Response"].astype("category")
labels=labels.cat.rename_categories(range(8))

test_df = pd.read_csv('test.csv')
print "dimension of test data", test_df.shape

# Keep the ids for the submission files before dropping the column.
test_id = test_df['Id']
test_df=test_df.fillna(-1)
features_test=test_df.drop(["Id"],axis=1)
features_test["Product_Info_2"]=le.transform(features_test["Product_Info_2"])


predictions = primary_evaluation_script(features_train,labels,features_test)

# pick_modes(predictions)





dimensions of training data (59381, 128)
dimension of test data (19765, 127)
starting process
entered applied algorithms function
 accuracy in naive bayes is  0.3445069942
 calculated f1Score is  0.303525010061
-----------------------------------------------------------------------------------------------
starting to process test data 
predicting for ----> naivebayes
file written naivebayes
process completed 


In [38]:
def pick_modes(predictions):
    new_features = pd.DataFrame(pd.Series(pred) for pred in predictions)
    new_features = new_features.transpose()

# Taking mode of all predictions 
    pred_df= [max(row) for row in new_features.as_matrix()]
    print len(pred_df)

    with open('modes_results.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(['Id','Response'])

        for ids,label in zip(test_id,pred_df):
            spamwriter.writerow([ids,label])

    print "file written modes_results.csv" 



In [48]:
# Exploratory cell: print summary statistics (DataFrame.describe) for each
# group of categorical columns in the raw training data.
# NOTE: this rebinds the notebook-level name `df`.
df = pd.read_csv('train.csv')
# labels = ['Ht','Ins_Age','Wt','BMI']
# for label in labels:
#     ylabels = df[label]
#     xlabels = [0.0,0.2,0.4,0.6,0.8,1.0]
#     plt.hist(ylabels, 50, facecolor='b', alpha=0.7)
#     plt.xlabel('Normalized '+label)
#     plt.ylabel('Counts')
#     plt.title('Distribution of '+ label)
#     plt.savefig('../images/'+label+'.jpeg')
#     plt.close()

# Full list of categorical columns. It is not referenced again in this cell;
# the per-group lists below are what is actually described.
categorical_variables = ['Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 
                          'Product_Info_7', 'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5', 'InsuredInfo_1', 
                          'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 
                          'InsuredInfo_7', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 
                          'Insurance_History_4', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 
                          'Family_Hist_1', 'Medical_History_2', 'Medical_History_3', 'Medical_History_4', 'Medical_History_5',
                          'Medical_History_6','Medical_History_7', 'Medical_History_8', 'Medical_History_9', 
                          'Medical_History_10', 'Medical_History_11','Medical_History_12', 'Medical_History_13', 
                          'Medical_History_14', 'Medical_History_16', 'Medical_History_17','Medical_History_18', 
                          'Medical_History_19', 'Medical_History_20', 'Medical_History_21', 'Medical_History_22',
                          'Medical_History_23', 'Medical_History_25', 'Medical_History_26', 'Medical_History_27', 
                          'Medical_History_28','Medical_History_29', 'Medical_History_30', 'Medical_History_31', 
                          'Medical_History_33', 'Medical_History_34','Medical_History_35', 'Medical_History_36', 
                          'Medical_History_37', 'Medical_History_38', 'Medical_History_39','Medical_History_40', 
                          'Medical_History_41']

# Product information columns. 'Product_Info_2' is left out of this group
# (presumably because it is non-numeric in the raw file -- it is label-encoded
# elsewhere in the notebook).
prod_info_var = ['Product_Info_1', 'Product_Info_3','Product_Info_5', 'Product_Info_6',
                 'Product_Info_7']
product_info = df[prod_info_var]
print product_info.describe()

# Employment information columns.
emp_info_var = ['Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5']
emp_info = df[emp_info_var]
print emp_info.describe()

# Insured-person information columns.
ins_info_var = ['InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 
                          'InsuredInfo_7']
ins_info = df[ins_info_var]
print ins_info.describe()

# Insurance history columns.
ins_hist_var = ['Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 
                          'Insurance_History_4', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9']
ins_hist = df[ins_hist_var]
print ins_hist.describe()

# Medical history columns (same entries as in categorical_variables above).
med_hist_var = ['Medical_History_2', 'Medical_History_3', 'Medical_History_4', 'Medical_History_5',
                          'Medical_History_6','Medical_History_7', 'Medical_History_8', 'Medical_History_9', 
                          'Medical_History_10', 'Medical_History_11','Medical_History_12', 'Medical_History_13', 
                          'Medical_History_14', 'Medical_History_16', 'Medical_History_17','Medical_History_18', 
                          'Medical_History_19', 'Medical_History_20', 'Medical_History_21', 'Medical_History_22',
                          'Medical_History_23', 'Medical_History_25', 'Medical_History_26', 'Medical_History_27', 
                          'Medical_History_28','Medical_History_29', 'Medical_History_30', 'Medical_History_31', 
                          'Medical_History_33', 'Medical_History_34','Medical_History_35', 'Medical_History_36', 
                          'Medical_History_37', 'Medical_History_38', 'Medical_History_39','Medical_History_40', 
                          'Medical_History_41']
med_hist= df[med_hist_var]
print med_hist.describe()


    
    
    

# df.hist(column = 'Ht', figsize = (8,8), color = "blue", bins = 50).savefig('/images/plot_ht.jpeg')

# fig = plot_Ht.get_figure()
# fig.savefig('/images/plot_Ht.jpeg')
# df['Ht'].plot(kind = 'density', figsize =(8,8))

       Product_Info_1  Product_Info_3  Product_Info_5  Product_Info_6  \
count    59381.000000    59381.000000    59381.000000    59381.000000   
mean         1.026355       24.415655        2.006955        2.673599   
std          0.160191        5.072885        0.083107        0.739103   
min          1.000000        1.000000        2.000000        1.000000   
25%          1.000000       26.000000        2.000000        3.000000   
50%          1.000000       26.000000        2.000000        3.000000   
75%          1.000000       26.000000        2.000000        3.000000   
max          2.000000       38.000000        3.000000        3.000000   

       Product_Info_7  
count    59381.000000  
mean         1.043583  
std          0.291949  
min          1.000000  
25%          1.000000  
50%          1.000000  
75%          1.000000  
max          3.000000  
       Employment_Info_2  Employment_Info_3  Employment_Info_5
count       59381.000000       59381.000000       59381.000000


In [None]:
## Plotting pie chart distribution of labels of all results tried. 
# For every result file, draw its predicted-label distribution next to the
# distribution of the training labels.
# NOTE: plot_results_pie_chart must already be defined when this cell runs.
fileNames = ['decisiontree_results.csv','randomforest_results.csv','svm_results.csv',
            'logistic_results.csv','gradient_results.csv','xgboost_results.csv']
df = pd.read_csv('train.csv')
initial_distribution = df['Response']
counter = collections.Counter(initial_distribution)
# Take the counts in label order 1..8 explicitly: the pie slices are labelled
# 1..8 by position, and Counter iteration order is not guaranteed to be
# sorted. A label that never occurs contributes a count of 0.
initial_frac = [counter[label] for label in range(1, 9)]

for name in fileNames:
    content = name.split("_")
    plot_results_pie_chart(name,content[0]+"_label_distribution.jpeg",content[0]+" labels",initial_frac)

In [None]:
def plot_pie_chart(labels,fracs,fileName,title,initial_frac):
    """Save two side-by-side pie charts: ``fracs`` and ``initial_frac``.

    Parameters
    ----------
    labels : slice labels shared by both pies.
    fracs : slice sizes for the left pie, titled ``title``.
    fileName : path the figure is saved to.
    title : title of the left pie.
    initial_frac : slice sizes for the right pie, titled 'training labels'.

    Both pies use an 8-entry ``explode`` tuple, so ``labels``, ``fracs`` and
    ``initial_frac`` are expected to hold 8 entries each.
    """
    explode= (0, 0, 0, 0, 0, 0, 0, 0.05)
    fig,(ax1,ax2) = plt.subplots(1,2)
    ax1.pie(fracs,explode=explode, labels=labels,
                autopct='%1.1f%%', shadow=True, startangle=90)
    ax2.pie(initial_frac,explode=explode, labels=labels,
                autopct='%1.1f%%', shadow=True, startangle=90)
    ax1.set_title(title, bbox={'facecolor':'0.8', 'pad':5})
    ax2.set_title('training labels', bbox={'facecolor':'0.8', 'pad':5})
#     fig.title(title, bbox={'facecolor':'0.8', 'pad':5})
    fig.savefig(fileName)
    # Close the figure that was drawn. Previously a new empty 20x20 figure was
    # created after saving and that one was closed, leaking ``fig`` on every
    # call.
    plt.close(fig)

In [None]:
def plot_results_pie_chart(fileName,imageName,title,initial_frac):
    """Plot the label distribution of a results csv against the training labels.

    The second column of ``fileName`` (after the header row) is read as the
    predicted label, counted per label 1..8 and passed to ``plot_pie_chart``,
    which saves the figure to ``../images/<imageName>``.

    Parameters
    ----------
    fileName : results csv with a header row and the label in column 2.
    imageName : file name of the image inside ``../images/``.
    title : title for the pie of predicted labels.
    initial_frac : per-label counts of the training data, in label order.
    """
    with open(fileName, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        response = [row[1] for row in spamreader ]
    response= response[1:]
    # csv yields strings; normalise to int so '3' and '3.0' count as label 3.
    counter = collections.Counter(int(float(value)) for value in response)
    labels = list(range(1, 9))
    # Look the counts up per label: iterating the Counter gives an arbitrary
    # order and skips labels that were never predicted, which mislabelled the
    # slices and broke the fixed 8-slice layout.
    fracs = [counter[label] for label in labels]
    plot_pie_chart(labels,fracs,'../images/'+imageName,title,initial_frac)

In [None]:
def process_features(df_num):
    """Keep the float/int columns of ``df_num`` and mean-impute their gaps.

    Missing values are replaced by the mean of their column. The input frame
    is not modified.

    Returns ``(numeric_frame, number_of_numeric_columns)``.
    """
    # ``float``/``int`` are the objects the removed ``np.float``/``np.int``
    # aliases pointed to, so the selection is unchanged.
    numeric = df_num.select_dtypes(include=[float, int])
    total_columns = len(numeric.columns)
    # The former ``.ix[:, 0:total_columns]`` slice selected every column (a
    # no-op) and ``.ix`` no longer exists in current pandas, so it is dropped.
    numeric = numeric.fillna(numeric.mean())

    return (numeric, total_columns)

In [None]:
## Plotting some graphs with the result files 
# Accuracy and F1 score of gradient boosting against the number of features
# used, read from 'gbm_accu_ftr_variation.csv' (columns: feature, accuracy,
# f1Score). NOTE: this rebinds the notebook-level names `df`, `accuracy`,
# `f1Score`, `fig` and `ax`.

df = pd.read_csv('gbm_accu_ftr_variation.csv')
feature_count = df['feature']
accuracy = df['accuracy']
f1Score = df['f1Score']

fig, ax = plt.subplots()  # create figure & 1 axis
ax.plot(feature_count,accuracy,'g+',label='accuracy vs features')  # green plus markers
ax.plot(feature_count, f1Score,'bs', label = 'f1Score vs features')  # blue square markers
ax.legend(loc=4)  # loc=4 is the lower-right corner
ax.set_xlabel('features used for prediction')
ax.set_ylabel('scores')
ax.set_title('gradient boosting by feature selection from xgboost')
fig.show()
fig.savefig('gbm_accu_ftr_variation.png')   # save the figure to file

plt.close(fig)    # close the figure

In [None]:
# Plot the accuracy of every method listed in 'method_accu_f1Score.csv'
# (columns: Method, accuracy, f1Score), one marker per method.
df = pd.read_csv('method_accu_f1Score.csv')
methods = df['Method']
accuracy = df['accuracy']
f1Score = df['f1Score']

# Create the axes first: the colour cycle used to be set on whatever `ax` was
# left over from a previously run cell (or failed on a fresh kernel).
fig, ax = plt.subplots()
# NOTE(review): set_color_cycle is deprecated in newer matplotlib in favour of
# set_prop_cycle; kept to match the matplotlib version this notebook targets.
ax.set_color_cycle(['red', 'blue', 'yellow','orange','green','black'])
# Only valid matplotlib markers: '!' and '/' are rejected as format strings,
# and '-' (a line style) draws nothing for a single point.
symbols =['o','+','^','*','s','x']
for i in xrange(len(accuracy)):
    # Wrap around so more than six methods do not raise IndexError.
    ax.plot(accuracy[i],symbols[i % len(symbols)],label= methods[i])
#     ax.plot(f1Score[i],label=methods[i])

ax.legend(loc='best')
fig.show()
fig.savefig('method_accu_f1Score.png')
plt.close(fig)