In [11]:
import pandas as pd
import pyodbc
import pickle
import datetime
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest           #Used to identify most impactful features towards an outcome
from sklearn.feature_selection import f_classif             #Score function for SelectKBest
#Models
from xgboost import XGBClassifier                           #https://towardsdatascience.com/https-medium-com-vishalmorde-xgboost-algorithm-long-she-may-rein-edd9f99be63d
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics                                 #Provide accuracy and confidence scores & confusion matrix (see output template below) | https://wiki.pathmind.com/accuracy-precision-recall-f1
#Confusion Matrix Output:
# [True positives, False negatives]
# [False positives ,True negatives]

from lime import lime_tabular                               #Used to explain model results (i.e. show WHY a prediction was made as it had been)
import matplotlib.pyplot as plt

import os
import glob

In [12]:
#Demo flood data | dataset is used to determine the most common months for flooding based on historical data for past 100+ years
#Note: The models score well - often with 100% accuracy and confidence.
data_folder = os.path.join(os.getcwd(), 'data')
sourceType = "csv"
sourceLocation = glob.glob(os.path.join(data_folder,"./kerala.csv"))
colOutcome = "FLOODS"
display(sourceLocation[0])
colDrops = ["SUBDIVISION"]

'c:\\Users\\maf74\\source\\repos\\miked-generic-ml\\data\\./kerala.csv'

In [13]:
df = pd.read_csv(sourceLocation[0])        #Read file from source
df = df.drop(colDrops, axis=1)          #Drop any columns noted as not needed
Y = df[colOutcome]                      #Create series from this dataframe column (outcomes)

#Perform data cleanup (str to int if needed) and set X (features) dataframe
listNumCols = [col for col in df.columns if df[col].dtype == 'int64' or         #Get list of all int/float columns.  These do not need to be changed
               df[col].dtype == 'float64']
listOtherCols = [col for col in df[df.columns.difference(listNumCols)]]         #Get listing of all other columns.  These will be mapped to int representing str value
for col in listOtherCols:                                                       #Loop through column list
    mapper = {k: 0 + i for i, k in enumerate(set(df[col]))}                     #Create mapping object of unique values
    df[col] = df[col].map(mapper)                                               #Apply mapped values back to original dataframe to encode character data
float64_cols = list(df.select_dtypes(include='float64'))                        #Select columns with 'float64' dtype (convert tp 32 on line below.  needed fpr SKLearn)
df[float64_cols] = df[float64_cols].astype('float32')                           #The same code again calling the columns

X = df.drop([colOutcome], axis=1)                                               #Create features frame (also remove column noting outcome)
print("Entries with "+colOutcome+":    " + str(len(df[df[colOutcome] == 1])))   #Number of items indicating positive outcome
print("Entries without "+colOutcome+": " + str(len(df[df[colOutcome] == 0])))   #Number of items indicating negative outcome
print("Feature columns and data types:")
print(X.info())
print("-------------------------------------------------------------")

Entries with FLOODS:    60
Entries without FLOODS: 58
Feature columns and data types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   YEAR              118 non-null    int64  
 1   JAN               118 non-null    float32
 2   FEB               118 non-null    float32
 3   MAR               118 non-null    float32
 4   APR               118 non-null    float32
 5   MAY               118 non-null    float32
 6   JUN               118 non-null    float32
 7   JUL               118 non-null    float32
 8   AUG               118 non-null    float32
 9   SEP               118 non-null    float32
 10  OCT               118 non-null    float32
 11  NOV               118 non-null    float32
 12  DEC               118 non-null    float32
 13   ANNUAL RAINFALL  118 non-null    float32
dtypes: float32(13), int64(1)
memory usage: 7.0 KB
None
-

In [14]:
#Determine the features potentially most impacting
bestFeatures = SelectKBest(score_func=f_classif, k=3).fit(X, Y)                 #https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
df_scores = pd.DataFrame(bestFeatures.scores_)                                  #This section will determine which features appear to have the most impact in the outcome
df_columns = pd.DataFrame(X.columns)                                            # Note that the score_func value can be changed (see link above) to change the methodology
featuresScores = pd.concat([df_columns, df_scores], axis=1)                     # and sampling of the data to determine impact of features.
featuresScores.columns = ["Features", "Score"]
print("Relevance of features to outcome (lowest to highest):")
print(featuresScores.sort_values(by="Score"))                                   #Print listing of features sorted by their impact (greatest at bottom)
dfMax = featuresScores[featuresScores.Score == featuresScores.Score.max()]      #Get row with highest scored feature
maxFeature = dfMax["Features"].iloc[0]                                          #Get name of highest scored feature

#Split into train test sets (20% set aside to test the model)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)        #Create test & train sets with set test_size

Relevance of features to outcome (lowest to highest):
            Features       Score
4                APR    0.137368
2                FEB    0.148280
12               DEC    0.343369
10               OCT    0.420337
3                MAR    0.879362
1                JAN    2.502272
0               YEAR    4.962598
11               NOV    7.018248
5                MAY    7.267663
8                AUG   10.387935
9                SEP   19.145486
6                JUN   28.248747
7                JUL   28.287433
13   ANNUAL RAINFALL  175.849458


In [15]:
modelRF = RandomForestClassifier(random_state=1, n_estimators=100)              #Create model
modelRF.fit(X_train, Y_train)                                                   #Train model on data
#Make and evaluate predictions
modelPredictionsRF = modelRF.predict(X_test)                                    #Test model and record outcomes
accRF = metrics.accuracy_score(Y_test, modelPredictionsRF)                      #Check model's accuracy
print("Random Forest Accuracy: ", str(round(accRF, 2)*100), "%")

#Confusion matrix identifies results outcomes by true/false positive/negative
cm_modelRF = metrics.confusion_matrix(Y_test, modelPredictionsRF)               #Build confusion matrix for model
F1_scoreRF = metrics.f1_score(Y_test, modelPredictionsRF)                       #Determine confidence in model
print("Confidence Score (F1): ", str(round(F1_scoreRF, 2)*100), "%")
print("Confusion Matrix Below:")
print(cm_modelRF)

Random Forest Accuracy:  96.0 %
Confidence Score (F1):  97.0 %
Confusion Matrix Below:
[[ 7  1]
 [ 0 16]]


In [16]:
modelXGB = XGBClassifier()                                                      #Create model
modelXGB.fit(X_train, Y_train)                                                  #Train model on data
#Make and evaluate predictions
modelPredictionsXGB = modelXGB.predict(X_test)                                  #Test model and record outcomes
accXGB = metrics.accuracy_score(Y_test, modelPredictionsXGB)                    #Check model's accuracy
print("XGBoost Accuracy: ", str(round(accXGB, 2)*100), "%")

#Confusion matrix identifies results outcomes by true/false positive/negative
cm_modelXGB = metrics.confusion_matrix(Y_test, modelPredictionsXGB)             #Build confusion matrix for model
F1_scoreXGB = metrics.f1_score(Y_test, modelPredictionsXGB)                     #Determine confidence in model
print("Confidence Score (F1): ", str(round(F1_scoreXGB, 2)*100), "%")
print("Confusion Matrix Below:")
print(cm_modelXGB)

XGBoost Accuracy:  96.0 %
Confidence Score (F1):  97.0 %
Confusion Matrix Below:
[[ 7  1]
 [ 0 16]]


In [17]:
# dirModels = os.path.join(os.getcwd(), 'models')
# modelPath = os.path.join(dirModels,"./DemoML_" + colOutcome.replace(" ", "_") + ".pymod")
os.makedirs('outputs', exist_ok=True)

pickle.dump(modelXGB, open("outputs/kerala.pymod", 'wb'))

In [10]:
#Simulate "new" data to run through model (which has now been built and saved).  Below will load model and display results including prediction, probability and reason why
dfS = df.sample(n=1)                                                                                        #Using original dataframe, randomly select rows (simulates "new" data to run through model)
dfN = dfS.drop([colOutcome], axis=1, errors='ignore')                                                       #Drop stated outcome column as that wasn't in model's data and shouldn't be in "new" data
modelXGB = pickle.load(open("outputs/kerala.pymod", 'rb'))         #Recreate model based on saved file
pred = modelXGB.predict(dfN)                                                                                #Make prediction on data using model (drop outcome column if it exists)
prob = modelXGB.predict_proba(dfN)                                                                          #Make probability on prediction [Neg Pos] (drop outcome column if it exists)

#Add columns to dataset to summarize model's prediction
listProb = []                                                                                               #Create list to hold probability
for i, p in enumerate(pred):                                                                                #Loop through predictions [%False %True]
    listProb.append(round(prob[i][p], 2)*100)                                                               #Get probability for corresponding outcome (either false or true)

dfS["Prediction"] = pred.tolist()                                                                           #Apply prediction back to original data
dfS["Probability"] = listProb                                                                               #Apply probability back to original data
dfS["Model Accuracy"] = round(accXGB, 2)*100                                                                #Apply model accuracy
dfS["Model Confidence"] = round(F1_scoreXGB, 2)*100                                                         #Apply model confidence
print(dfS)                                                                                                  #View sampled data

print("*****************************************************************")
#Explain results (graphed and saved as HTML)
explainer = lime_tabular.LimeTabularExplainer(X_train.to_numpy(), mode="regression", feature_names=dfN.columns.values.tolist(), feature_selection="auto")
explanation = explainer.explain_instance(dfN.values.flatten(), modelXGB.predict)
dfE = pd.DataFrame(explanation.as_list(label=1), columns=['Feature', 'Value'])
print(dfE)

    YEAR        JAN        FEB        MAR        APR         MAY         JUN  \
62  1963  30.200001  24.799999  69.800003  96.300003  157.100006  393.299988   

           JUL    AUG         SEP         OCT        NOV        DEC  \
62  720.200012  511.0  223.899994  282.600006  93.400002  48.400002   

     ANNUAL RAINFALL  
62       2651.100098  
    YEAR        JAN        FEB        MAR        APR         MAY         JUN  \
62  1963  30.200001  24.799999  69.800003  96.300003  157.100006  393.299988   

           JUL    AUG         SEP         OCT        NOV        DEC  \
62  720.200012  511.0  223.899994  282.600006  93.400002  48.400002   

     ANNUAL RAINFALL  FLOODS  Prediction  Probability  Model Accuracy  \
62       2651.100098       0           0    98.000002           100.0   

    Model Confidence  
62             100.0  
*****************************************************************
                                 Feature     Value
0  2603.15 <  ANNUAL RAINFALL <= 293

In [None]:
## new section, simply want to infer from everything above

