In [2]:
# Required Python Packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import pdb

# Location of the Excel workbook that main() loads
INPUT_PATH = "D:/abc/Tima_data_clean.xlsx"

# Column names used from the workbook.
# Every entry except the last is a feature column; the last entry is the target column.
HEADERS = [
    'RateCity',
    'RateDistrict-New',
    'Gender-New',
    'LivingTime-New',
    'TypeOfOwnershipName-New',
    'IsResidential-New',
    'CompanyName-New',
    'CompanyPhone-New',
    'CityName_Company-New',
    'RateCity_Company-New',
    'RateDistrict_Company-New',
    'Salary',
    'ReceiveYourIncome-New',
    'RelativeFamily-New',
    'FullNameFamily-New',
    'AddressHouseHold-New',
    'Assets-New',
    'Facebook-New',
    'IsMarried-New',
    'NumberBaby-New',
    'IsLivingTogether-New',
    'RankCompany-New',
    'CompanyTaxCode-New',
    'BH-New',
    'CD-New',
    'Label',
]

def split_dataset(dataset, train_percentage, feature_headers, target_header, random_state=None):
    """
    Split the dataset into a train part and a test part.

    :param dataset: table indexable by column name (main() passes a pandas DataFrame)
    :param train_percentage: value forwarded as train_size to train_test_split
                             (main() passes 0.7, i.e. 70% of the rows go to train)
    :param feature_headers: list of the feature column names
    :param target_header: name of the target column
    :param random_state: optional seed forwarded to train_test_split so the split
                         can be reproduced; the default None keeps the previous
                         unseeded behaviour
    :return: train_x, test_x, train_y, test_y
    """
    features = dataset[feature_headers]
    target = dataset[target_header]

    # train_test_split randomly arranges the rows and then divides them into two parts
    train_x, test_x, train_y, test_y = train_test_split(
        features,
        target,
        train_size=train_percentage,
        random_state=random_state,
    )
    return train_x, test_x, train_y, test_y

def random_forest_classifier(features, target, number_estimators):
    """
    Build a random forest classifier and fit it on the given data.

    :param features: training feature values, passed to fit()
    :param target: training target values, passed to fit()
    :param number_estimators: forwarded as n_estimators to RandomForestClassifier
    :return: the fitted RandomForestClassifier
    """
    # fit() hands back the estimator itself, so construction and training chain
    return RandomForestClassifier(n_estimators=number_estimators).fit(features, target)

def main():
    """
    Main function: load the data, train a random forest and print its accuracy.

    Reads the Excel file at INPUT_PATH, splits it 70/30 into train and test
    parts, trains a classifier with 15 estimators, prints up to ten
    actual/predicted pairs from the test part and finally the train and test
    accuracy.
    :return: None
    """
    # Load the excel file into a pandas dataframe
    dataset = pd.read_excel(INPUT_PATH)
    print("Dataset Shape :: ", dataset.shape, "\n")

    # HEADERS[:-1] holds all the feature header names (everything but the last entry)
    # HEADERS[-1] is the last header name, used as the target
    train_x, test_x, train_y, test_y = split_dataset(dataset, 0.7, HEADERS[:-1], HEADERS[-1])

    # Train and Test dataset size details
    print ("Train_x Shape :: ", train_x.shape)
    print ("Train_y Shape :: ", train_y.shape)
    print ("Test_x Shape :: ", test_x.shape)
    print ("Test_y Shape :: ", test_y.shape)
    print ("\n")

    # Create and train the random forest classifier instance
    trained_model = random_forest_classifier(train_x, train_y, 15)
    print ("Trained model ::" , trained_model)
    print ("\n")

    # Predict the target for all the test features (test_x) using the trained classifier.
    # The result is reused below for both the sample listing and the test accuracy.
    predictions = trained_model.predict(test_x)

    print("Predict target for few test_x:")
    # Slicing caps the listing at ten rows without raising IndexError when the
    # test part is smaller, and avoids rebuilding list(test_y) on every iteration.
    for actual, predicted in zip(test_y.iloc[:10], predictions[:10]):
        print ("Actual outcome :: {} and Predicted outcome :: {}".format(actual, predicted))
    print("\n")

    # Train and Test Accuracy
    # The accuracy is calculated with scikit-learn's accuracy_score function.
    print("Train Accuracy :: ", accuracy_score(train_y, trained_model.predict(train_x)))
    print("Test Accuracy :: ", accuracy_score(test_y, predictions))
    
# Run the pipeline only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()

Dataset Shape ::  (1297, 26) 

Train_x Shape ::  (907, 25)
Train_y Shape ::  (907,)
Test_x Shape ::  (390, 25)
Test_y Shape ::  (390,)


Trained model :: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


Predict target for few test_x:
Actual outcome :: 1 and Predicted outcome :: 1
Actual outcome :: 1 and Predicted outcome :: 0
Actual outcome :: 1 and Predicted outcome :: 1
Actual outcome :: 1 and Predicted outcome :: 1
Actual outcome :: 0 and Predicted outcome :: 1
Actual outcome :: 1 and Predicted outcome :: 0
Actual outcome :: 1 and Predicted outcome :: 1
Actual outcome :: 1 and Predicted outcome :: 1
Actual outcome :

