In [176]:
import numpy as np
import pandas as pd
import re
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.preprocessing import Imputer

In [177]:
data_train = pd.read_excel("C:/Users/LENOVO/Desktop/Final_Train.xlsx")
data_test = pd.read_excel("C:/Users/LENOVO/Desktop/Final_Test.xlsx")

In [178]:
#Function to clean column Experience to retain only numbers.
def exp_in_years(input_data):
    return int(str([int(s) for s in str(input_data).split() if s.isdigit()]).replace("[","").replace("]",""))

In [179]:
#Function to clean column Place to retain only City.
def split_place(input_data):
    return str(str(input_data).split(",")[1:2]).replace("[","").replace("]","").replace(" ","").replace("'","")

In [180]:
#Function to convert column Rating to scale number ranging from 0 to 10 (10 including)
def convert_rating(input_data):
    Rating = str(input_data).replace("%","")
    if Rating.isdigit():
        return np.round(int(Rating)/10,0)
    else:
        return '0'

In [181]:
Generalist = ['BHMS','BAMS','BUMS','BSAM','MBBS','BDS','BEMS','BIMS','BAc','BSc','LCEH','GAMS']

In [182]:
Specialist = ['MD','MS','M','MD','MOrth','MDS','Masters','DNB','MA','MBA','MPH','MSc','MMed','MA','DMD','HMD','SC','MDEH','MOI']

In [183]:
Super_Specialist = ['DM','MCh']

In [184]:
PhD = ['PhD','Ph']

In [185]:
Diploma = ['DHMS','DIPLOMA','DLO','DCP','DAA','DDVL','DDV','DGO','DVD','DYA','DPM','DORL','IDCC','DNHE','Dip','NMD',
           'PGDE','DDVCPS','PGDHA','PGD','DEMS','DDV','DD','DSM','DIH','DICOI','DAT','DRCOG']

In [186]:
Membership = ['MFDS','MRCS','MRCP','MRCPS','MNAMS','MRSH','MRCPCH','MRCGP','MNAMS','MCIP','M','MRCEM','MAMS']

In [187]:
Fellowship = ['Fellowship','FCGP','FAGE','FICP','FRCP','IBCLC','AFIH','FAMS','FRGUHS','FNB','FCSI','FSCAI','FRACS','FCAH',
              'FIAMS','FRHS','FDSRCS','Fellow','FCPS','FFDRCSI','FICD','FICOI','FCCM','FCCP','FACE','FCIP','FCD','FACC',
              'FSRH','FAAD','FCPS','FICS','FICA','FIAMS']

In [188]:
Others = ['certification','certificate','GCEH','certified','CGO','CSD','CCDR','CCMTD','CCEDM','SCE','Course','Training'
          ,'CCEBDM','Externship','ATLS','ACLS','BLS','CCST','PCAD','AFIH']

In [189]:
#Function to explode column Qualification
def explode_qualification(input_data):
    input_data['Generalist'] = input_data['Specialist'] = input_data['Super_Spec'] = input_data['PhD'] = input_data['Diploma'] = input_data['Membership'] = input_data['Fellowship']= input_data['Others']=pd.to_numeric(0)
    
    for i in range(0,len(input_data)):
        temp = str(re.sub(r'[^\w]',' ',input_data.loc[i,'Qualification'])).split()
        input_data.loc[i,'Generalist'] = max([1 if X in Generalist else 0 for X in temp])
        input_data.loc[i,'Specialist'] = max([1 if X in Specialist else 0 for X in temp ])
        input_data.loc[i,'Super_Spec'] = max([1 if X in Super_Specialist else 0 for X in temp ])
        input_data.loc[i,'PhD']        = max([1 if X in PhD else 0 for X in temp ]) 
        input_data.loc[i,'Diploma']    = max([1 if X in Diploma else 0 for X in temp ]) 
        input_data.loc[i,'Membership'] = max([1 if X in Membership else 0 for X in temp ]) 
        input_data.loc[i,'Fellowship'] = max([1 if X in Fellowship else 0 for X in temp ]) 
        input_data.loc[i,'Others']     = max([1 if X in Others else 0 for X in temp ])
        
    return input_data

In [190]:
data_train.isnull().sum()

Qualification            0
Experience               0
Rating                3302
Place                   25
Profile                  0
Miscellaneous_Info    2620
Fees                     0
dtype: int64

In [191]:
#Function to one hot encode City and Profile Columns.
def one_hot(input_data):
    return pd.get_dummies(input_data['City']), pd.get_dummies(input_data['Profile'])    

In [192]:
data_train['Exp'] = data_train['Experience'].apply(exp_in_years)
data_test['Exp'] = data_test['Experience'].apply(exp_in_years)

In [193]:
data_train['City'] = data_train['Place'].apply(split_place)
data_train['City'][data_train['City'] == 'Sector5'] = 'Delhi'
data_train['City'][data_train['City'] == ''] = 'Bangalore'

data_test['City'] = data_test['Place'].apply(split_place)
data_test['City'][data_test['City'] == 'Sector5'] = 'Delhi'
data_test['City'][data_test['City'] == ''] = 'Bangalore'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [23]:
'''
corrupt_data = data_train[data_train['City'] =='Unknown']
data_train = data_train.drop(corrupt_data.index,axis=0)
data_train.reset_index(inplace=True)
--
corrupt_data = data_test[data_test['City'] =='Unknown']
data_test = data_test.drop(corrupt_data.index,axis=0)
data_test.reset_index(inplace=True)
'''

In [194]:
data_train['Rating'] = data_train['Rating'].apply(convert_rating)
data_test['Rating'] = data_test['Rating'].apply(convert_rating)

In [195]:
data_train = explode_qualification(data_train)
data_test = explode_qualification(data_test)

In [196]:
data_train.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees,Exp,City,Generalist,Specialist,Super_Spec,PhD,Diploma,Membership,Fellowship,Others
0,"BHMS, MD - Homeopathy",24 years experience,10,"Kakkanad, Ernakulam",Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100,24,Ernakulam,1,1,0,0,0,0,0,0
1,"BAMS, MD - Ayurveda Medicine",12 years experience,9,"Whitefield, Bangalore",Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350,12,Bangalore,1,1,0,0,0,0,0,0
2,"MBBS, MS - Otorhinolaryngology",9 years experience,0,"Mathikere - BEL, Bangalore",ENT Specialist,,300,9,Bangalore,1,1,0,0,0,0,0,0
3,"BSc - Zoology, BAMS",12 years experience,0,"Bannerghatta Road, Bangalore",Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250,12,Bangalore,1,0,0,0,0,0,0,0
4,BAMS,20 years experience,10,"Keelkattalai, Chennai",Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250,20,Chennai,1,0,0,0,0,0,0,0


In [197]:
data_train = data_train.join(one_hot(data_train))

In [198]:
data_test = data_test.join(one_hot(data_test))

In [199]:
data_train = data_train.drop(['Qualification','Experience','Rating','Place','Profile','Miscellaneous_Info','City'], axis=1)
data_test = data_test.drop(['Qualification','Experience','Rating','Place','Profile','Miscellaneous_Info','City'], axis=1)

In [200]:
train_Feature = data_train[[x for x in data_train.columns if (x != 'Fees' and x != 'index')]]
train_Target = data_train['Fees']

In [201]:
X_train, X_test, y_train, y_test = train_test_split(train_Feature, train_Target, test_size=0.30, random_state=101)

In [202]:
data_test.columns
#data_test.drop(['index'], axis=1, inplace=True)

Index([               u'Exp',         u'Generalist',         u'Specialist',
               u'Super_Spec',                u'PhD',            u'Diploma',
               u'Membership',         u'Fellowship',             u'Others',
                u'Bangalore',            u'Chennai',         u'Coimbatore',
                    u'Delhi',          u'Ernakulam',          u'Hyderabad',
                   u'Mumbai', u'Thiruvananthapuram',           u'Ayurveda',
                  u'Dentist',     u'Dermatologists',     u'ENT Specialist',
         u'General Medicine',          u'Homeopath'],
      dtype='object')

In [203]:
len(train_Feature)

5961

In [204]:
#Model_1
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

y_pred_test = regr.predict(X_test)
y_pred_train = regr.predict(X_train)
test_mse = sqrt(mean_squared_error(y_test, y_pred_test))
test_msle = sqrt(mean_squared_log_error(y_test, y_pred_test))

train_mse = sqrt(mean_squared_error(y_train, y_pred_train))
train_msle = sqrt(mean_squared_log_error(y_train, y_pred_train))

print ("Test-MLE/MSLE: ", test_mse, test_msle)
print ("Train-MLE/MSLE: ", train_mse, train_msle)

predictions = pd.DataFrame(np.round(regr.predict(data_test),0),columns=['Fees'])
predictions.to_excel("C:/Users/LENOVO/Desktop/Basic_LR.xlsx",index=False)

('Test-MLE/MSLE: ', 169.65475305756186, 0.6635604144071413)
('Train-MLE/MSLE: ', 174.65031198033626, 0.6506883241281446)


In [205]:
#Model_2
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42, max_depth=15)
rf.fit(X_train, y_train)

y_pred_test = rf.predict(X_test)
y_pred_train = rf.predict(X_train)
test_mse = sqrt(mean_squared_error(y_test, y_pred_test))
test_msle = sqrt(mean_squared_log_error(y_test, y_pred_test))

train_mse = sqrt(mean_squared_error(y_train, y_pred_train))
train_msle = sqrt(mean_squared_log_error(y_train, y_pred_train))

print ("Test-MLE/MSLE: ", test_mse, test_msle)
print ("Train-MLE/MSLE: ", train_mse, train_msle)

predictions = pd.DataFrame(np.round(rf.predict(data_test),0),columns=['Fees'])
predictions.to_excel("C:/Users/LENOVO/Desktop/RandomForest.xlsx",index=False)

('Test-MLE/MSLE: ', 177.70953323625955, 0.6648195119343969)
('Train-MLE/MSLE: ', 132.47335244433887, 0.5115458732686278)


In [208]:
regr = linear_model.LinearRegression()
rfe = RFE(regr,15)
fit = rfe.fit(X_train, y_train)

print("Num Features: %d") % fit.n_features_
print("Selected Features: %s") % fit.support_
print("Feature Ranking: %s") % fit.ranking_
print (zip(X_train.columns, fit.ranking_))

Num Features: 15
Selected Features: [False False False  True False False False False False  True  True  True
  True  True  True  True  True  True  True  True  True  True  True]
Feature Ranking: [9 3 2 1 7 8 4 5 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[('Exp', 9), ('Generalist', 3), ('Specialist', 2), ('Super_Spec', 1), ('PhD', 7), ('Diploma', 8), ('Membership', 4), ('Fellowship', 5), ('Others', 6), ('Bangalore', 1), ('Chennai', 1), ('Coimbatore', 1), ('Delhi', 1), ('Ernakulam', 1), ('Hyderabad', 1), ('Mumbai', 1), ('Thiruvananthapuram', 1), (u'Ayurveda', 1), (u'Dentist', 1), (u'Dermatologists', 1), (u'ENT Specialist', 1), (u'General Medicine', 1), (u'Homeopath', 1)]


In [209]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [210]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 33.6min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'bootstrap': [True, False], 'min_samples_leaf': [1, 2, 4], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'min_samples_split': [2, 5, 10], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [211]:
rf_random.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=90,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [212]:
y_pred_test = rf_random.predict(X_test)
y_pred_train = rf_random.predict(X_train)
test_mse = sqrt(mean_squared_error(y_test, y_pred_test))
test_msle = sqrt(mean_squared_log_error(y_test, y_pred_test))

train_mse = sqrt(mean_squared_error(y_train, y_pred_train))
train_msle = sqrt(mean_squared_log_error(y_train, y_pred_train))

print ("Test-MLE/MSLE: ", test_mse, test_msle)
print ("Train-MLE/MSLE: ", train_mse, train_msle)

predictions = pd.DataFrame(np.round(rf_random.predict(data_test),0),columns=['Fees'])
predictions.to_excel("C:/Users/LENOVO/Desktop/RandomSearch.xlsx",index=False)

('Test-MLE/MSLE: ', 167.426300913272, 0.6489040886651221)
('Train-MLE/MSLE: ', 163.3495524066283, 0.6140572015828599)


In [None]:
#Model 3 Randomforest with GridSearch Cross validation.

In [213]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [10,20,30,40,50],
    'max_features': [2,3],
    'min_samples_leaf': [3,4,5],
    'min_samples_split': [8,10,12],
    'n_estimators': [100,200,400,600,800]
}

In [214]:
rf = RandomForestRegressor()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 450 candidates, totalling 1350 fits


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   53.3s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 26.3min
[Parallel(n_jobs=-1)]: Done 1350 out of 1350 | elapsed: 35.0min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [True], 'min_samples_leaf': [3, 4, 5], 'n_estimators': [100, 200, 400, 600, 800], 'min_samples_split': [8, 10, 12], 'max_features': [2, 3], 'max_depth': [10, 20, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [215]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=3,
           min_samples_split=12, min_weight_fraction_leaf=0.0,
           n_estimators=600, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [216]:
y_pred_test = grid_search.predict(X_test)
y_pred_train = grid_search.predict(X_train)
test_mse = sqrt(mean_squared_error(y_test, y_pred_test))
test_msle = sqrt(mean_squared_log_error(y_test, y_pred_test))

train_mse = sqrt(mean_squared_error(y_train, y_pred_train))
train_msle = sqrt(mean_squared_log_error(y_train, y_pred_train))

print ("Test-MLE/MSLE: ", test_mse, test_msle)
print ("Train-MLE/MSLE: ", train_mse, train_msle)

predictions = pd.DataFrame(np.round(grid_search.predict(data_test),0),columns=['Fees'])
predictions.to_excel("C:/Users/LENOVO/Desktop/GridSearch.xlsx",index=False)

('Test-MLE/MSLE: ', 167.34642592345756, 0.6486736535649416)
('Train-MLE/MSLE: ', 162.43438164524466, 0.6115685430267621)


In [218]:
#Model 4 GridSearch with more parameters.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [40,50,60,70,80],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_leaf': [3,4,5],
    'min_samples_split': [6,7,8],
    'n_estimators': [500,600,700,800,900,1000]
}

In [219]:
rf = RandomForestRegressor()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1620 candidates, totalling 4860 fits


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 40.8min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 57.9min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 81.4min
[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed: 112.1min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 150.2min
[Parallel(n_jobs=-1)]: Done 3277 tasks      | elapsed: 201.9min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 257.5min
[Parallel(n_jobs=-1)]: Done 4860 out of 4860 | elapsed: 308.4min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [True, False], 'min_samples_leaf': [3, 4, 5], 'n_estimators': [500, 600, 700, 800, 900, 1000], 'min_samples_split': [6, 7, 8], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [40, 50, 60, 70, 80]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [220]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=80,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=8,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [221]:
y_pred_test = grid_search.predict(X_test)
y_pred_train = grid_search.predict(X_train)
test_mse = sqrt(mean_squared_error(y_test, y_pred_test))
test_msle = sqrt(mean_squared_log_error(y_test, y_pred_test))

train_mse = sqrt(mean_squared_error(y_train, y_pred_train))
train_msle = sqrt(mean_squared_log_error(y_train, y_pred_train))

print ("Test-MLE/MSLE: ", test_mse, test_msle)
print ("Train-MLE/MSLE: ", train_mse, train_msle)

predictions = pd.DataFrame(np.round(grid_search.predict(data_test),0),columns=['Fees'])
predictions.to_excel("C:/Users/LENOVO/Desktop/GridSearch_v1.xlsx",index=False)

('Test-MLE/MSLE: ', 167.12934229284443, 0.6480875111068343)
('Train-MLE/MSLE: ', 164.7960783318647, 0.6186863000480827)
