In [1]:
import pandas as pd
import sklearn
from sklearn import tree

#

In [2]:
# Read the balance-scale file; header=None means its first line is treated as data,
# so the column names are supplied explicitly.
column_names = ['C', 'LW', 'LD', 'RW', 'RD']
df = pd.read_csv('balance-scale.data', names=column_names, header=None)
df

#

Unnamed: 0,C,LW,LD,RW,RD
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5
...,...,...,...,...,...
620,L,5,5,5,1
621,L,5,5,5,2
622,L,5,5,5,3
623,L,5,5,5,4


In [3]:
# Results log: every experiment appends one record shaped ['description', mean, std].
recording_list = list()
#

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

X = df[['LW', 'LD', 'RW', 'RD']]
y = df[['C']]

# Cross-validate a fresh default decision tree with 3, 5 and 10 folds,
# printing and recording the mean and standard deviation of the fold scores.
for n_folds in (3, 5, 10):
    clf = DecisionTreeClassifier()
    scores = cross_val_score(clf, X, y, cv=n_folds)
    fold_mean, fold_std = scores.mean(), scores.std()
    print('With {} cross_validation'.format(n_folds), 'Mean:', fold_mean, fold_std)
    recording_list.append(
        ['With {} cross_validation DecisionTreeClassifier'.format(n_folds), fold_mean, fold_std]
    )


#

With 3 cross_validation Mean: 0.5633357870199975 0.1084098622894239
With 5 cross_validation Mean: 0.5824 0.12189438051034183
With 10 cross_validation Mean: 0.6721454173067076 0.1064649731893019


It looks like the mean cross-validation score is affected by the number of folds. With more folds, each model is trained on a larger share of the data, which suggests that the decision tree becomes more accurate as it is fed more training data.

Let's look at the importance of the features.



In [5]:
# Fit a default decision tree on the whole dataset and show how much each feature contributes.
clf = DecisionTreeClassifier().fit(X, y)
clf.feature_importances_

#

array([0.24962917, 0.26291426, 0.22783845, 0.25961812])

It looks like the importances of the attributes are well balanced, differing by only about 1 to 3 percentage points. In other words, the features are practically equally important.

Even though the balanced class (i.e. 'B') makes up only 8% of the dataset, the model still seems to struggle to perform well on average, even though it was trained on a large number of 'L' and 'R' samples.

I need to look at other machine learning algorithms to see whether they perform better.



In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import numpy as np

# Flatten the single-column label frame and encode the class names as integers.
flat_labels = np.ravel(y.to_numpy())
enc_y = LabelEncoder()
encoded_y = enc_y.fit_transform(flat_labels)

# 10-fold cross-validation of LogisticRegression; the result is added to recording_list.
run_label = 'With LogisticRegression'
clf = LogisticRegression(random_state=0)
scores = cross_val_score(clf, X, encoded_y, cv=10)
print(run_label, 'Mean:', scores.mean(), scores.std())
recording_list.append([run_label, scores.mean(), scores.std()])

#

With LogisticRegression Mean: 0.8512544802867383 0.05183392985486847


In [7]:
from sklearn.linear_model import SGDClassifier

# 10-fold cross-validation of SGDClassifier; the result is added to recording_list.
run_label = 'With SGDClassifier'
clf = SGDClassifier(tol=1e-3, max_iter=1000)
scores = cross_val_score(clf, X, encoded_y, cv=10)
mean_score, std_score = scores.mean(), scores.std()
print(run_label, 'Mean:', mean_score, std_score)
recording_list.append([run_label, mean_score, std_score])

#

With SGDClassifier Mean: 0.8574244751664107 0.09700179741186786


In [8]:
from sklearn.svm import SVC

# 10-fold cross-validation of SVC (gamma='auto'); the result is added to recording_list.
run_label = 'With SVC'
clf = SVC(gamma='auto')
scores = cross_val_score(clf, X, encoded_y, cv=10)
mean_score, std_score = scores.mean(), scores.std()
print(run_label, 'Mean:', mean_score, std_score)
recording_list.append([run_label, mean_score, std_score])

#

With SVC Mean: 0.857731694828469 0.06931390559131817


In [9]:
# The import line previously began with stray "scikit-learn" text, which made the
# whole cell a syntax error.
from sklearn.svm import LinearSVC

# 10-fold cross-validation of LinearSVC; max_iter is set to 5000 instead of the library default.
clf = LinearSVC(max_iter=5000)
scores = cross_val_score(clf, X, encoded_y, cv=10)
print('With LinearSVC', 'Mean:', scores.mean(), scores.std())
recording_list.append(['With LinearSVC', scores.mean(), scores.std()])

#

With LinearSVC Mean: 0.8496671786994368 0.05562250420421758


In [10]:
from sklearn.ensemble import RandomForestClassifier

# 10-fold cross-validation of a 200-tree random forest; the result is added to recording_list.
run_label = 'With RandomForestClassifier'
clf = RandomForestClassifier(n_estimators=200)
scores = cross_val_score(clf, X, encoded_y, cv=10)
mean_score, std_score = scores.mean(), scores.std()
print(run_label, 'Mean:', mean_score, std_score)
recording_list.append([run_label, mean_score, std_score])

#

With RandomForestClassifier Mean: 0.6880184331797234 0.10553635102729655


In [11]:
from sklearn.naive_bayes import CategoricalNB

# 10-fold cross-validation of CategoricalNB; the result is added to recording_list.
run_label = 'With CategoricalNB'
clf = CategoricalNB()
scores = cross_val_score(clf, X, encoded_y, cv=10)
mean_score, std_score = scores.mean(), scores.std()
print(run_label, 'Mean:', mean_score, std_score)
recording_list.append([run_label, mean_score, std_score])

#

With CategoricalNB Mean: 0.6940860215053762 0.10213202128757502


In [12]:
from sklearn.neighbors import NearestCentroid

# 10-fold cross-validation of NearestCentroid; the result is added to recording_list.
run_label = 'With NearestCentroid'
clf = NearestCentroid()
scores = cross_val_score(clf, X, encoded_y, cv=10)
mean_score, std_score = scores.mean(), scores.std()
print(run_label, 'Mean:', mean_score, std_score)
recording_list.append([run_label, mean_score, std_score])

#

With NearestCentroid Mean: 0.7250640040962621 0.12521517465521045


In [13]:
from sklearn.neighbors import KNeighborsClassifier

# 10-fold cross-validation of KNeighborsClassifier with default settings;
# the result is added to recording_list.
run_label = 'With KNeighborsClassifier'
clf = KNeighborsClassifier()
scores = cross_val_score(clf, X, encoded_y, cv=10)
mean_score, std_score = scores.mean(), scores.std()
print(run_label, 'Mean:', mean_score, std_score)
recording_list.append([run_label, mean_score, std_score])

#

With KNeighborsClassifier Mean: 0.7360727086533538 0.0644228441968043


In [14]:
from sklearn.neighbors import RadiusNeighborsClassifier

# 10-fold cross-validation of RadiusNeighborsClassifier with radius 2.0;
# the result is added to recording_list.
run_label = 'With RadiusNeighborsClassifier'
clf = RadiusNeighborsClassifier(radius=2.0)
scores = cross_val_score(clf, X, encoded_y, cv=10)
mean_score, std_score = scores.mean(), scores.std()
print(run_label, 'Mean:', mean_score, std_score)
recording_list.append([run_label, mean_score, std_score])

#

With RadiusNeighborsClassifier Mean: 0.8672811059907835 0.04806896995866881


In [15]:
from sklearn.neural_network import MLPClassifier

# 10-fold cross-validation of MLPClassifier (up to 1200 training iterations);
# the result is added to recording_list.
run_label = 'With MLPClassifier'
clf = MLPClassifier(max_iter=1200)
scores = cross_val_score(clf, X, encoded_y, cv=10)
mean_score, std_score = scores.mean(), scores.std()
print(run_label, 'Mean:', mean_score, std_score)
recording_list.append([run_label, mean_score, std_score])

#

With MLPClassifier Mean: 0.9744495647721454 0.02860650605943155


In [16]:
# Rank every recorded experiment by its mean score, best first, and print one line per record.
sorted_list = sorted(recording_list, key=lambda record: record[1], reverse=True)
for record in sorted_list:
    classifier, mean, std = record
    print(classifier, 'Mean:', mean, 'std:', std)

#

With MLPClassifier Mean: 0.9744495647721454 std: 0.02860650605943155
With RadiusNeighborsClassifier Mean: 0.8672811059907835 std: 0.04806896995866881
With SVC Mean: 0.857731694828469 std: 0.06931390559131817
With SGDClassifier Mean: 0.8574244751664107 std: 0.09700179741186786
With LogisticRegression Mean: 0.8512544802867383 std: 0.05183392985486847
With LinearSVC Mean: 0.8496671786994368 std: 0.05562250420421758
With KNeighborsClassifier Mean: 0.7360727086533538 std: 0.0644228441968043
With NearestCentroid Mean: 0.7250640040962621 std: 0.12521517465521045
With CategoricalNB Mean: 0.6940860215053762 std: 0.10213202128757502
With RandomForestClassifier Mean: 0.6880184331797234 std: 0.10553635102729655
With 10 cross_validation DecisionTreeClassifier Mean: 0.6721454173067076 std: 0.1064649731893019
With 5 cross_validation DecisionTreeClassifier Mean: 0.5824 std: 0.12189438051034183
With 3 cross_validation DecisionTreeClassifier Mean: 0.5633357870199975 std: 0.1084098622894239


After experimenting with 11 algorithms, it looks like a neural network type classifier wins the testing round.

Now I need to perform two things:
    - I will need to delve further into algorithms to understand how they work as to have an idea why they perform either poorly or brilliantly. Algorithms I decided to learn about are:
        1. MLPClassifier -- to understand neural networks in general
        2. SGDClassifier -- to understand the stochastic gradient descent classifier since it's one of the best performers
        3. Decision Tree -- to understand decision trees since it's the worst model to train from the get go.
    - During testing, I have been wondering about the current dataset and I think I have noticed a few discrepancies I didn't see before. I will elaborate on this later.




```     ______                  __  _                ___
    /  _/ /____  _________ _/ /_(_)___  ____     |__ \
    / // __/ _ \/ ___/ __ `/ __/ / __ \/ __ \    __/ /
  _/ // /_/  __/ /  / /_/ / /_/ / /_/ / / / /   / __/
 /___/\__/\___/_/   \__,_/\__/_/\____/_/ /_/   /____/
```



The purpose of this iteration is to see whether the decision tree algorithm can be improved from its poor accuracy.

The plan is to test different set of configurations and then do the same thing for feature engineered features.

After reading about complex decisions trees that can fail to generalise a problem [Link](https://scikit-learn.org/stable/modules/tree.html#tree),
the following has been advised to attempt to reduce the chances of such issue:
- Set a required minimum samples at leaf nodes.
- Set a depth level number to say how far the tree can go.




In [17]:
# Grid search over tree depth and minimum leaf size, using the original four features.
from sklearn.model_selection import GridSearchCV

X = df[['LW', 'LD', 'RW', 'RD']]
y = df[['C']]

parameters = {
    'max_depth': list(range(1, 11)) + [None],
    'min_samples_leaf': [0.2, 0.4, 0.5, 1],
}
decision_tree_clf = DecisionTreeClassifier()
clf = GridSearchCV(decision_tree_clf, parameters, cv=10)
clf.fit(X, y)

# Show the five best-ranked parameter combinations.
shown_columns = ['param_max_depth', 'param_min_samples_leaf', 'mean_test_score', 'std_test_score', 'rank_test_score']
pd.DataFrame(clf.cv_results_)[shown_columns].sort_values(by=['rank_test_score']).head()

#

Unnamed: 0,param_max_depth,param_min_samples_leaf,mean_test_score,std_test_score,rank_test_score
15,4.0,1,0.69319,0.086092,1
43,,1,0.668894,0.098979,2
31,8.0,1,0.668843,0.105015,3
39,10.0,1,0.667281,0.098959,4
35,9.0,1,0.665719,0.099081,5


In [18]:
# Creating columns for the calculated weights.
# Each side's value is the product of its two columns (LW * LD and RW * RD).
# The multiplication is done column-wise instead of looping over rows in Python.
df['L_calc'] = df['LW'] * df['LD']
df['R_calc'] = df['RW'] * df['RD']
df.head()

#

Unnamed: 0,C,LW,LD,RW,RD,L_calc,R_calc
0,B,1,1,1,1,1,1
1,R,1,1,1,2,1,2
2,R,1,1,1,3,1,3
3,R,1,1,1,4,1,4
4,R,1,1,1,5,1,5


In [19]:
# Grid search using the original features plus the per-side calculated columns.

X = df[['LW', 'LD', 'RW', 'RD', 'L_calc', 'R_calc']]
y = df[['C']]

parameters = {
    'max_depth': list(range(1, 11)) + [None],
    'min_samples_leaf': [0.2, 0.4, 0.5, 1],
}
decision_tree_clf = DecisionTreeClassifier()
clf = GridSearchCV(decision_tree_clf, parameters, cv=10)
clf.fit(X, y)

# Show the five best-ranked parameter combinations.
shown_columns = ['param_max_depth', 'param_min_samples_leaf', 'mean_test_score', 'std_test_score', 'rank_test_score']
pd.DataFrame(clf.cv_results_)[shown_columns].sort_values(by=['rank_test_score']).head()

#

Unnamed: 0,param_max_depth,param_min_samples_leaf,mean_test_score,std_test_score,rank_test_score
31,8.0,1,0.895571,0.089734,1
35,9.0,1,0.892396,0.086711,2
39,10.0,1,0.884511,0.090592,3
43,,1,0.88446,0.098327,4
27,7.0,1,0.881285,0.087815,5


In [20]:
# Grid search using only the two calculated columns (L_calc and R_calc).
X = df[['L_calc', 'R_calc']]
y = df[['C']]

parameters = {
    'max_depth': list(range(1, 11)) + [None],
    'min_samples_leaf': [0.2, 0.4, 0.5, 1],
}
decision_tree_clf = DecisionTreeClassifier()
clf = GridSearchCV(decision_tree_clf, parameters, cv=10)
clf.fit(X, y)

# Show the five best-ranked parameter combinations.
shown_columns = ['param_max_depth', 'param_min_samples_leaf', 'mean_test_score', 'std_test_score', 'rank_test_score']
pd.DataFrame(clf.cv_results_)[shown_columns].sort_values(by=['rank_test_score']).head()


#

Unnamed: 0,param_max_depth,param_min_samples_leaf,mean_test_score,std_test_score,rank_test_score
43,,1,0.977573,0.023943,1
39,10.0,1,0.977573,0.023943,1
35,9.0,1,0.977573,0.023943,1
31,8.0,1,0.977573,0.023943,1
27,7.0,1,0.974347,0.030634,5


It looks like introducing the calculations of weights and heights for each side helped the decision tree tremendously, especially when providing only the calculations on their own.

I suppose that such calculations is kinda like a cheat, because I think it makes it easier for the model to 'sense' the algorithmic logic.

I am very interested to see what will happen if I introduce boolean flag features (like the one-hot encoded features (I think) that are used for neural networks).



In [21]:
# Creating one-hot style boolean feature columns, one per class.
# The flags compare the left product (LW * LD) with the right product (RW * RD),
# so exactly one of the three is True on every row.
# The comparison is done column-wise instead of looping over rows in Python.
left_product = df['LW'] * df['LD']
right_product = df['RW'] * df['RD']
df['left_flag'] = left_product > right_product
df['right_flag'] = left_product < right_product
df['balanced_flag'] = left_product == right_product
df.head()

#

Unnamed: 0,C,LW,LD,RW,RD,L_calc,R_calc,left_flag,right_flag,balanced_flag
0,B,1,1,1,1,1,1,False,False,True
1,R,1,1,1,2,1,2,False,True,False
2,R,1,1,1,3,1,3,False,True,False
3,R,1,1,1,4,1,4,False,True,False
4,R,1,1,1,5,1,5,False,True,False


In [22]:
# Grid search using the original features together with the three boolean flag columns.
X = df[['LW', 'LD', 'RW', 'RD', 'left_flag', 'balanced_flag', 'right_flag']]
y = df[['C']]

parameters = {
    'max_depth': list(range(1, 11)) + [None],
    'min_samples_leaf': [0.2, 0.4, 0.5, 1],
}
decision_tree_clf = DecisionTreeClassifier()
clf = GridSearchCV(decision_tree_clf, parameters, cv=10)
clf.fit(X, y)

# Show the five best-ranked parameter combinations.
shown_columns = ['param_max_depth', 'param_min_samples_leaf', 'mean_test_score', 'std_test_score', 'rank_test_score']
pd.DataFrame(clf.cv_results_)[shown_columns].sort_values(by=['rank_test_score']).head()

#

Unnamed: 0,param_max_depth,param_min_samples_leaf,mean_test_score,std_test_score,rank_test_score
43,,1,1.0,0.0,1
15,4.0,1,1.0,0.0,1
27,7.0,1,1.0,0.0,1
11,3.0,1,1.0,0.0,1
31,8.0,1,1.0,0.0,1


In [23]:
# Grid search using only the three boolean flag columns.
X = df[['left_flag', 'balanced_flag', 'right_flag']]
y = df[['C']]

parameters = {
    'max_depth': list(range(1, 11)) + [None],
    'min_samples_leaf': [0.2, 0.4, 0.5, 1],
}
decision_tree_clf = DecisionTreeClassifier()
clf = GridSearchCV(decision_tree_clf, parameters, cv=10)
clf.fit(X, y)

# Show the five best-ranked parameter combinations.
shown_columns = ['param_max_depth', 'param_min_samples_leaf', 'mean_test_score', 'std_test_score', 'rank_test_score']
pd.DataFrame(clf.cv_results_)[shown_columns].sort_values(by=['rank_test_score']).head()

#

Unnamed: 0,param_max_depth,param_min_samples_leaf,mean_test_score,std_test_score,rank_test_score
43,,1,1.0,0.0,1
15,4.0,1,1.0,0.0,1
27,7.0,1,1.0,0.0,1
11,3.0,1,1.0,0.0,1
31,8.0,1,1.0,0.0,1


Conclusions after trying feature engineering:
- Calculated weights on each side:
  - with the rest of features, it's on average 90%
  - on their own, it's staggering 97%
- Boolean flags
  - With or without the rest of the features, it's pretty much 100%.
    - In other words, I have already done the job for the algorithm :smiley:.

Thus, it seems that the calculated weights of the features helps to make better guesses about the incoming values whereas the boolean flags are cheats.

The next experiment will see whether I can make a model from balanced dataset where there are equal number of samples for each class.

Before I do that though, I need to look into overfitting problems of the algorithm. During experiments, I noticed that the decision tree's accuracy would decrease
if I fed it less training data when I performed different cross-validations (i.e. k-folds).
What I would like to see is whether samples that sit at either end of the range (e.g. on values 1 and 5 rather than in between) affect the accuracy, using only the original features and the calculated weights.
I will keep the ratios of class samples balanced so as to be consistent with the mentioned experiment.

ToDo Overfitting Experiments:
- [x] Only upper values of the range (i.e. 5 in weights and distance no matter what in other values as long as conditions are met)
    - ~[ ] With original features~
    - [x] With original features and the calculated weights
- [x] Only lower values of the range (i.e. 1 in weights and distance no matter what in other values as long as conditions are met)
    - ~[ ] With original features~
    - [x] With original features and the calculated weights
- [x] Upper and lower values of the range (i.e. both of the prior conditions)
    - ~[ ] With original features~
    - [x] With original features and the calculated weights

Update: just did some look up at the tree algorithm structures in 'looking_at_decision_trees_structre.py' file and its generated pdf documents.



In [24]:
# Rows where at least one weight/distance column sits at the top (5) or the
# bottom (1) of the value range.
measure_columns = ['LD', 'LW', 'RD', 'RW']
upper_values_df = df[(df[measure_columns] == 5).any(axis=1)]
lower_values_df = df[(df[measure_columns] == 1).any(axis=1)]

# 15 rows per class are drawn from each subset. The number was chosen after
# inspecting the per-class counts (e.g. lower_values_df.groupby('C').count());
# those inspection calls were dropped from this cell because bare expressions in
# the middle of a cell are never displayed.
SEED = 1111
SAMPLES_PER_CLASS = 15
class_labels = ['B', 'L', 'R']

# Sample each class with the same seed and stack the results in B, L, R order.
lower_samples_df = pd.concat([
    lower_values_df[lower_values_df.C == label].sample(n=SAMPLES_PER_CLASS, random_state=SEED)
    for label in class_labels
])
upper_samples_df = pd.concat([
    upper_values_df[upper_values_df.C == label].sample(n=SAMPLES_PER_CLASS, random_state=SEED)
    for label in class_labels
])

# The two subsets overlap (a row can contain both a 1 and a 5), so the combined
# frame may hold the same row twice.
both_samples_df = pd.concat([lower_samples_df, upper_samples_df])

#

In [25]:
# For dataset description purpose: summary statistics of the numeric columns
# of the sampled lower-value rows.
lower_samples_df.describe()

#

Unnamed: 0,LW,LD,RW,RD,L_calc,R_calc
count,45.0,45.0,45.0,45.0,45.0,45.0
mean,2.533333,2.422222,2.488889,2.488889,5.533333,5.577778
std,1.501514,1.469213,1.546583,1.618392,4.957089,5.136717
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,3.0,2.0
50%,2.0,2.0,2.0,2.0,4.0,4.0
75%,4.0,4.0,4.0,4.0,5.0,5.0
max,5.0,5.0,5.0,5.0,25.0,25.0


In [26]:
# Summary statistics of the numeric columns of the sampled upper-value rows.
upper_samples_df.describe()

#

Unnamed: 0,LW,LD,RW,RD,L_calc,R_calc
count,45.0,45.0,45.0,45.0,45.0,45.0
mean,3.466667,3.2,3.511111,3.511111,10.444444,11.977778
std,1.47093,1.546256,1.440048,1.561209,6.276057,6.923244
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,2.0,2.0,2.0,5.0,5.0
50%,4.0,3.0,4.0,4.0,10.0,10.0
75%,5.0,5.0,5.0,5.0,15.0,15.0
max,5.0,5.0,5.0,5.0,25.0,25.0


In [27]:

def separate_dataframe_from_training_one(original_df, training_df):
    '''Return the rows of ``original_df`` that are not part of ``training_df``.

    Rows are matched by index label, so ``training_df`` is expected to be a
    selection of rows taken from ``original_df`` (e.g. via ``sample``).

    Parameters:
        original_df: the full dataframe.
        training_df: dataframe whose index labels mark the rows to exclude.

    Returns:
        A dataframe with the non-training rows, in their original order.
    '''
    # Select by label, not by position: passing index labels to .iloc only
    # worked when the index happened to be the default 0..n-1 range, and the
    # row order depended on set iteration order.
    is_training_row = original_df.index.isin(training_df.index)
    return original_df.loc[~is_training_row]

#

In [28]:
from sklearn.metrics import accuracy_score

# Train a default tree on the sampled lower-value rows only, then measure its
# accuracy on every row of df that was not sampled.
separated_df = separate_dataframe_from_training_one(df, lower_samples_df)

decision_tree_clf = DecisionTreeClassifier()
list_of_features = ['LW', 'LD', 'RW', 'RD', 'L_calc', 'R_calc']
X = lower_samples_df.loc[:, list_of_features]
y = lower_samples_df.loc[:, ['C']]
decision_tree_clf.fit(X, y)
X = separated_df.loc[:, list_of_features]
y = separated_df.loc[:, ['C']]
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
accuracy_score(y, decision_tree_clf.predict(X))

#

0.7051724137931035

In [29]:
# Storing tree into the dot format for observation
# tree.export_graphviz(decision_tree_clf, out_file='lower_1_tree.dot')

#

In [30]:
# Train a default tree on the sampled upper-value rows only, then measure its
# accuracy on every row of df that was not sampled.
separated_df = separate_dataframe_from_training_one(df, upper_samples_df)

decision_tree_clf = DecisionTreeClassifier()
list_of_features = ['LW', 'LD', 'RW', 'RD', 'L_calc', 'R_calc']
# list_of_features = ['LW', 'LD', 'RW', 'RD']
X = upper_samples_df.loc[:, list_of_features]
y = upper_samples_df.loc[:, ['C']]
decision_tree_clf.fit(X, y)
X = separated_df.loc[:, list_of_features]
y = separated_df.loc[:, ['C']]
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
accuracy_score(y, decision_tree_clf.predict(X))

#

0.8155172413793104

In [31]:
# Train a default tree on the combined lower- and upper-value samples, then
# measure its accuracy on every row of df that was not sampled.
separated_df = separate_dataframe_from_training_one(df, both_samples_df)

decision_tree_clf = DecisionTreeClassifier()
list_of_features = ['LW', 'LD', 'RW', 'RD', 'L_calc', 'R_calc']
# list_of_features = ['LW', 'LD', 'RW', 'RD']
X = both_samples_df.loc[:, list_of_features]
y = both_samples_df.loc[:, ['C']]
decision_tree_clf.fit(X, y)
X = separated_df.loc[:, list_of_features]
y = separated_df.loc[:, ['C']]
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
accuracy_score(y, decision_tree_clf.predict(X))

#

0.8521256931608133

It looks like the models still perform well even though I was expecting some overfitting problems, even with only a few samples to train from for particular boundaries
(e.g. on average 70% accuracy for lower values and 80% on average for upper values).

I believe it is because the samples have a wide range of values for weights and distances that still managed to make well-performed models
(e.g. lower-values-only samples without the calculated weights would produce on average 59%).

Thus, I shall make samples that only contain measurements that meet certain range criteria (i.e. maximum and minimum values).

I believe this is because there's a wide range of the calculated weights for the classes and thus it helped the model
to determine classes accurately. Like you can see below.



In [32]:
# Checking out the calculated weights
# NOTE: a notebook cell renders only its last expression, so of everything below
# only the final nunique() value is displayed; run the lines individually to see
# the others.
upper_samples_df.loc[:, ['L_calc', 'R_calc']].describe()
lower_samples_df.loc[:, ['L_calc', 'R_calc']].describe()
both_samples_df.loc[:, ['L_calc', 'R_calc']].describe()

# Number of distinct calculated-weight values in each sample set.
lower_samples_df.R_calc.nunique()
lower_samples_df.L_calc.nunique()
both_samples_df.L_calc.nunique()
both_samples_df.R_calc.nunique()

#

14

Thus, I shall build dataframes with a narrow range of calculated weight values to see whether I can achieve overfitting.



In [33]:
# Rows where every weight/distance column is in the top of the range (>= 4) or
# in the bottom of the range (<= 2).
measure_columns = ['LD', 'LW', 'RD', 'RW']
upper_values_df = df[(df[measure_columns] >= 4).all(axis=1)]
lower_values_df = df[(df[measure_columns] <= 2).all(axis=1)]

# Ok, it looks like there are very few samples for training purposes which is kinda ideal for overfitting.
# But what's more ideal is the range of calculated weights that hopefully will introduce overfitting.
# It looks like there are only maximum 6 samples, I shall pick just 4 for each class and value boundary.
# (The per-class counts/describe() calls used for that inspection were dropped from
# this cell because bare expressions in the middle of a cell are never displayed.)
SEED = 1111
SAMPLES_PER_CLASS = 4
class_labels = ['B', 'L', 'R']

# Sample each class with the same seed and stack the results in B, L, R order.
lower_samples_df = pd.concat([
    lower_values_df[lower_values_df.C == label].sample(n=SAMPLES_PER_CLASS, random_state=SEED)
    for label in class_labels
])
upper_samples_df = pd.concat([
    upper_values_df[upper_values_df.C == label].sample(n=SAMPLES_PER_CLASS, random_state=SEED)
    for label in class_labels
])

both_samples_df = pd.concat([lower_samples_df, upper_samples_df])

#

In [34]:
# For dataset description purpose: summary statistics of the numeric columns
# of the narrow-range lower samples.
lower_samples_df.describe()

#

Unnamed: 0,LW,LD,RW,RD,L_calc,R_calc
count,12.0,12.0,12.0,12.0,12.0,12.0
mean,1.416667,1.5,1.416667,1.5,2.166667,2.166667
std,0.514929,0.522233,0.514929,0.522233,1.193416,1.193416
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0,1.0
50%,1.0,1.5,1.0,1.5,2.0,2.0
75%,2.0,2.0,2.0,2.0,2.5,2.5
max,2.0,2.0,2.0,2.0,4.0,4.0


In [35]:
# Summary statistics of the numeric columns of the narrow-range upper samples.
upper_samples_df.describe()

#

Unnamed: 0,LW,LD,RW,RD,L_calc,R_calc
count,12.0,12.0,12.0,12.0,12.0,12.0
mean,4.416667,4.5,4.416667,4.5,19.916667,19.916667
std,0.514929,0.522233,0.514929,0.522233,3.553701,3.553701
min,4.0,4.0,4.0,4.0,16.0,16.0
25%,4.0,4.0,4.0,4.0,16.0,16.0
50%,4.0,4.5,4.0,4.5,20.0,20.0
75%,5.0,5.0,5.0,5.0,21.25,21.25
max,5.0,5.0,5.0,5.0,25.0,25.0


In [36]:
from sklearn.metrics import accuracy_score

# Train a default tree on the narrow-range lower samples only, then measure its
# accuracy on every row of df that was not sampled.
separated_df = separate_dataframe_from_training_one(df, lower_samples_df)

decision_tree_clf = DecisionTreeClassifier()
list_of_features = ['LW', 'LD', 'RW', 'RD', 'L_calc', 'R_calc']
X = lower_samples_df.loc[:, list_of_features]
y = lower_samples_df.loc[:, ['C']]
decision_tree_clf.fit(X, y)
X = separated_df.loc[:, list_of_features]
y = separated_df.loc[:, ['C']]
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
accuracy_score(y, decision_tree_clf.predict(X))

#

0.399673735725938

In [37]:
# Storing tree into the dot format for observation
# tree.export_graphviz(decision_tree_clf, out_file='lower_2_tree.dot')

#

In [38]:
# Train a default tree on the narrow-range upper samples only, then measure its
# accuracy on every row of df that was not sampled.
separated_df = separate_dataframe_from_training_one(df, upper_samples_df)

decision_tree_clf = DecisionTreeClassifier()
list_of_features = ['LW', 'LD', 'RW', 'RD', 'L_calc', 'R_calc']
# list_of_features = ['LW', 'LD', 'RW', 'RD']
X = upper_samples_df.loc[:, list_of_features]
y = upper_samples_df.loc[:, ['C']]
decision_tree_clf.fit(X, y)
X = separated_df.loc[:, list_of_features]
y = separated_df.loc[:, ['C']]
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
accuracy_score(y, decision_tree_clf.predict(X))

#

0.27895595432300163

In [39]:
# Train a default tree on the combined narrow-range samples, then measure its
# accuracy on every row of df that was not sampled.
separated_df = separate_dataframe_from_training_one(df, both_samples_df)

decision_tree_clf = DecisionTreeClassifier()
list_of_features = ['LW', 'LD', 'RW', 'RD', 'L_calc', 'R_calc']
# list_of_features = ['LW', 'LD', 'RW', 'RD']
X = both_samples_df.loc[:, list_of_features]
y = both_samples_df.loc[:, ['C']]
decision_tree_clf.fit(X, y)
X = separated_df.loc[:, list_of_features]
y = separated_df.loc[:, ['C']]
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
accuracy_score(y, decision_tree_clf.predict(X))

#

0.6339434276206323

Ok, I can see that the model is under performing for three categories of datasets,
I will pick the lower tier for comparison since it looks like it

Update: just did some look up at the tree algorithm structures in 'looking_at_decision_trees_structre.py' file and its generated pdf documents.

At this point, I will have a look at SGDClassifier algorithm.