In [1]:
%pwd

u'C:\\Users\\mohit\\Desktop\\analytics'

In [2]:
%%file datacleaning.py
"""
This module cleans the data, handles missing values and outliers,
and returns 5 bootstrap samples plus a held-out test split.
"""
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import scipy.stats as stats
from matplotlib.backends.backend_pdf import PdfPages
#from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
from sklearn.model_selection import train_test_split
        
        
def getBootstrapSample(df,Y):
    """
    Clean ``df`` and return five bootstrap training samples plus a test split.

    Numeric columns (as selected by ``seperate_num_vars``) are passed through
    ``Missing_imputation`` and then ``outlier_capping``. The ``department`` and
    ``salary`` columns are dummy-encoded with ``create_dummies``. The combined
    frame is split 70/30 with a fixed seed, and the training part is resampled
    with replacement five times using fixed seeds.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain ``department`` and ``salary`` columns.
    Y : object
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    list of pandas.DataFrame
        Six frames: the five bootstrap samples followed by the test split.

    Notes
    -----
    NOTE(review): every numeric column is treated, which includes a numeric
    target column if one is present in ``df`` - confirm this is intended.
    """
    categorical_cols = ['department', 'salary']
    bootstrap_seeds = (256, 257, 258, 259, 251)

    num_col = seperate_num_vars(df)
    # Reassign after each step so the treated values are the ones carried
    # forward into the split; otherwise the cleaning has no effect.
    df_num = df[num_col].apply(Missing_imputation)
    df_num = df_num.apply(outlier_capping)

    # Copy before assigning so the caller's frame is left untouched and pandas
    # does not warn about writing into a slice.
    df_cat = df[categorical_cols].copy()
    for c_feature in categorical_cols:
        df_cat[c_feature] = df_cat[c_feature].astype('category')
        df_cat = create_dummies(df_cat, c_feature)

    df_new = pd.concat([df_num, df_cat], axis=1)
    train, test = train_test_split(df_new, test_size=0.3, random_state=123)

    # One resample (same size as train, with replacement) per fixed seed.
    data = [train.sample(frac=1, replace=True, random_state=seed)
            for seed in bootstrap_seeds]
    data.append(test)
    return data
    
    
def seperate_cat_vars(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``."""
    dtype_by_column = dict(df.dtypes)
    wanted = ['object']
    return [name for name, kind in dtype_by_column.items() if kind in wanted]


def seperate_num_vars(df):
    """Return the names of the columns of ``df`` with a 32/64-bit int or float dtype."""
    dtype_by_column = dict(df.dtypes)
    wanted = ['float64', 'int64', 'float32', 'int32']
    return [name for name, kind in dtype_by_column.items() if kind in wanted]

def Missing_imputation(x):
    """Return a copy of series ``x`` with missing values replaced by its mean."""
    column_mean = x.mean()
    return x.fillna(column_mean)
    
def outlier_capping(x):
    """
    Cap series ``x`` at its 99th percentile, then floor it at the 1st percentile.

    Uses ``Series.clip`` because ``clip_upper`` / ``clip_lower`` were removed
    from pandas in 1.0. The two steps stay sequential so the lower bound is
    still computed on the already-capped values.

    Parameters
    ----------
    x : pandas.Series
        Numeric series to treat.

    Returns
    -------
    pandas.Series
        New series with extreme values replaced by the percentile bounds; the
        input series is not modified.
    """
    x = x.clip(upper=x.quantile(0.99))
    x = x.clip(lower=x.quantile(0.01))
    return x

def create_dummies( df, colname ):
    """
    Replace column ``colname`` of ``df`` by its dummy indicator columns.

    Indicators are named ``<colname>_<level>``; the first one is left out.
    A new frame is returned; ``df`` itself is not modified.
    """
    indicators = pd.get_dummies(df[colname], prefix=colname)
    baseline = indicators.columns[0]
    indicators = indicators.drop(columns=[baseline])
    widened = pd.concat([df, indicators], axis=1)
    return widened.drop(columns=[colname])
    

Overwriting datacleaning.py


In [3]:
%%file decisiontree.py
"""
This module builds two decision-tree models in separate functions, one using gini and one using entropy.
"""
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import scipy.stats as stats
from matplotlib.backends.backend_pdf import PdfPages
#from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
from sklearn.model_selection import train_test_split
import sklearn.tree as dt
import sklearn.ensemble as en
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export
from sklearn.model_selection import GridSearchCV
        
        
def getDecisionTreeByGini(bs):
    """
    Fit a decision tree (default criterion, i.e. gini) on one bootstrap sample.

    ``max_depth`` (3..11) and ``max_features`` (3..7) are tuned with a 10-fold
    grid search; a fresh tree is then fitted on the whole sample with the
    winning values and returned.

    Parameters
    ----------
    bs : pandas.DataFrame
        Sample containing the ``left`` target column; every other column is
        used as a feature.

    Returns
    -------
    sklearn.tree.DecisionTreeClassifier
        Tree fitted on ``bs`` with the selected hyper-parameters.
    """
    # Features: every column except the target, in the sorted order that
    # Index.difference produces.
    bs_x = bs[bs.columns.difference(['left'])]
    # Target: the single ``left`` column as a 1-D series. Selecting it by name
    # keeps dummy-encoded feature columns out of the target.
    bs_y = bs['left']

    param_grid = {'max_depth': np.arange(3, 12),
                  'max_features': np.arange(3, 8)}
    tree = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 10)
    tree.fit(bs_x, bs_y)
    best_param = tree.best_params_

    clf_tree = DecisionTreeClassifier(max_depth = best_param["max_depth"],
                                      max_features = best_param["max_features"])
    clf_tree.fit(bs_x, bs_y)
    return clf_tree

def getDecisionTreeByEntropy(bs):
    """
    Fit an entropy-criterion decision tree on one bootstrap sample.

    ``max_depth`` (3..11) and ``max_features`` (3..7) are tuned with a 10-fold
    grid search; a fresh tree is then fitted on the whole sample with the
    winning values and returned.

    Parameters
    ----------
    bs : pandas.DataFrame
        Sample containing the ``left`` target column; every other column is
        used as a feature.

    Returns
    -------
    sklearn.tree.DecisionTreeClassifier
        Tree fitted on ``bs`` with ``criterion='entropy'`` and the selected
        hyper-parameters.
    """
    # Features: every column except the target, in the sorted order that
    # Index.difference produces.
    bs_x = bs[bs.columns.difference(['left'])]
    # Target: the single ``left`` column as a 1-D series. Selecting it by name
    # keeps dummy-encoded feature columns out of the target.
    bs_y = bs['left']

    param_grid = {'max_depth': np.arange(3, 12),
                  'max_features': np.arange(3, 8)}
    tree = GridSearchCV(DecisionTreeClassifier(criterion='entropy'), param_grid, cv = 10)
    tree.fit(bs_x, bs_y)
    best_param = tree.best_params_

    clf_tree = DecisionTreeClassifier(max_depth = best_param["max_depth"],
                                      max_features = best_param["max_features"],
                                      criterion='entropy')
    clf_tree.fit(bs_x, bs_y)
    return clf_tree
    
    
    

Overwriting decisiontree.py


In [4]:
%%file randomforesttree.py
"""
This module builds the random forest model, or returns its predictions when a fitted model is supplied.
"""
import pandas as pd
import numpy as np
import sklearn.tree as dt
import sklearn.ensemble as en
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
        
        
def getRandomForest(bs,decision_gini_model=None,decision_entropy_model=None,random_forest_model=None):
    """
    Train a random forest on ``bs``, or predict with an already fitted one.

    When no model argument is given, a 100-tree forest with out-of-bag scoring
    is fitted on ``bs`` and returned. Otherwise ``random_forest_model`` is used
    to predict on the feature columns of ``bs``.

    Parameters
    ----------
    bs : pandas.DataFrame
        Sample whose columns other than ``left`` are the features. The ``left``
        column is only required when training.
    decision_gini_model, decision_entropy_model : object, optional
        Accepted for caller compatibility. Passing either switches the function
        to prediction mode, but their predictions are not part of the result.
    random_forest_model : fitted classifier, optional
        Model used for prediction.

    Returns
    -------
    sklearn.ensemble.RandomForestClassifier or pandas.DataFrame
        The fitted forest in training mode; otherwise a frame holding the
        predictions of ``random_forest_model``.

    Raises
    ------
    ValueError
        If prediction mode is selected but ``random_forest_model`` is missing.
    """
    training_mode = (decision_gini_model is None
                     and decision_entropy_model is None
                     and random_forest_model is None)

    # Features: every column except the target, in the sorted order that
    # Index.difference produces.
    bs_x = bs[bs.columns.difference(['left'])]

    if training_mode:
        radm_clf = RandomForestClassifier(oob_score=True, n_estimators=100)
        # Target is the single ``left`` column as a 1-D series; selecting it by
        # name keeps dummy-encoded feature columns out of the target.
        radm_clf.fit(bs_x, bs['left'])
        return radm_clf

    if random_forest_model is None:
        # Fail with a clear message rather than an AttributeError on None.
        raise ValueError("random_forest_model is required when other models are supplied")

    return pd.DataFrame(random_forest_model.predict(bs_x))

    

    
    
    

Overwriting randomforesttree.py


In [5]:
%%file gradientboosting.py
"""
This module builds the gradient boosting model and returns the fitted grid search.
"""
import pandas as pd
import numpy as np
import sklearn.tree as dt
import sklearn.ensemble as en
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
        
        
def getGradientBoosting(bs):
    """
    Grid-search a gradient boosting classifier on one bootstrap sample.

    ``n_estimators`` and ``learning_rate`` are tuned with 5-fold
    cross-validation using all available cores.

    Parameters
    ----------
    bs : pandas.DataFrame
        Sample containing the ``left`` target column; every other column is
        used as a feature.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted grid-search object (not the bare estimator).
    """
    # Features: every column except the target, in the sorted order that
    # Index.difference produces.
    bs_x = bs[bs.columns.difference(['left'])]
    # Target: the single ``left`` column as a 1-D series, which is the shape
    # GradientBoostingClassifier expects.
    bs_y = bs['left']

    pargrid_ada = {'n_estimators': [100, 200, 400, 600, 800],
                   'learning_rate': [10 ** x for x in range(-3, 3)]}
    grad_ada = GridSearchCV(estimator=GradientBoostingClassifier(),
                            param_grid=pargrid_ada,
                            cv=5,
                            verbose=True, n_jobs=-1)
    grad_ada.fit(bs_x, bs_y)
    return grad_ada
    
    


Overwriting gradientboosting.py


In [6]:
%%file bagging.py
"""
This module builds the bagging classifier and returns the fitted model.
"""
import pandas as pd
import numpy as np
import sklearn.tree as dt
import sklearn.ensemble as en
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
        
        
def getBaggingClassifier(bs):
    """
    Fit a 100-estimator bagging classifier with out-of-bag scoring on ``bs``.

    Parameters
    ----------
    bs : pandas.DataFrame
        Sample containing the ``left`` target column; every other column is
        used as a feature.

    Returns
    -------
    sklearn.ensemble.BaggingClassifier
        The fitted classifier.
    """
    # Features: every column except the target, in the sorted order that
    # Index.difference produces.
    bs_x = bs[bs.columns.difference(['left'])]
    # Target: the single ``left`` column as a 1-D series; selecting it by name
    # keeps dummy-encoded feature columns out of the target.
    bs_y = bs['left']

    bagclm = BaggingClassifier(oob_score=True, n_estimators=100)
    bagclm.fit(bs_x, bs_y)
    return bagclm
    
    


Overwriting bagging.py
