## Collect Data

In [None]:
import pandas as pd
# Load the feature table from disk; the displayed output shows a 'difficulty'
# column alongside the feature columns.
# NOTE(review): the 'Unnamed: 0' column in the output suggests the CSV was saved
# with its index -- consider whether it should be read as the index; confirm.
data = pd.read_csv('sudoku_data.csv')

Unnamed: 0,NoC,stddev,Var,r0,r1,r2,Pair Red2,Pair Diff2,Poss,r3,...,Pair Red,Pair Diff,Pairs,Passes AvgPoss,Pairs Poss,NoC after pass,NoC after 2pass,NoC after passes,NoC diff,difficulty
0,32,0.955814,3,12,16,4,73,41,2.775510,16,...,56,24,23.0,0.000000,-79,81,81,81,49,0
1,34,1.030402,3,12,22,0,48,14,2.872340,16,...,42,8,18.0,0.000000,-37,37,37,37,3,0
2,28,1.196703,3,14,12,2,33,5,3.377358,10,...,31,3,3.0,3.306122,-21,30,30,30,2,0
3,33,1.632993,5,8,24,1,51,18,3.125000,18,...,47,14,24.0,0.000000,-54,46,46,46,13,0
4,28,0.737028,2,12,12,4,43,15,3.056604,14,...,37,9,6.0,2.052632,-37,34,34,34,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,29,0.916246,3,12,12,5,29,15,3.173077,12,...,29,6,1.0,3.235294,0,29,29,29,0,2
5996,27,0.816497,2,16,8,3,33,23,3.666667,10,...,32,13,4.0,3.260870,-35,27,33,33,6,2
5997,28,1.369870,5,8,20,0,30,53,3.566038,12,...,30,33,2.0,3.162791,-16,28,30,30,2,2
5998,28,0.866025,5,10,16,2,31,32,3.452830,14,...,30,16,3.0,1.142857,-12,28,28,28,0,2


## Features
Clue Count
1.   NoC - counts the number of clues
2.   Poss - calculates the average possibilities per empty cell

Clue Variation
1.   stddev - calculates the standard deviation of the number of each clue
2.   Var - subtracts the frequency of the least common clue from the frequency of the most common clue

Clue Placement
1.   r0 - counts the number of clues in corner boxes
2.   r1 - counts the number of clues in edge boxes
3.   r2 - counts the number of clues in center box
4.   r3 - counts the number of clues on the outside of the puzzle
5.   Clusters - counts the number of blocks of adjacent clues

Solving Strategy
1.   NoC after pass - counts the number of clues after a **"first pass"**
2.   NoC after 2pass - counts the number of clues after 2 passes
3.   NoC after passes - counts the number of clues after as many passes are made as possible (limited at 40 passes)
4.   Pairs - counts the number of pairs or triples
5.   Pair Red - counts the number of clues after possibilities reduced by pair solving
6.   Pair Red2 - counts the number of clues after possibilities reduced twice by pair solving
7.   NoC diff - counts the increase in the number of clues after 2 first passes are made (NoC after 2pass minus NoC)
8.   Pair Diff2 - counts the difference in the number of clues after 2 pair solves
9.   Pairs Poss - subtracts the total number of possibilities (including that of filled clues) after 2 pair solves from the original total number of possibilities

## What is a First Pass?

A first pass solves every cell in the sudoku that has only one possible value once the numbers already present in the cell's box, column, and row have been eliminated. It also solves any cell that has a possible value which appears nowhere else in its column, row, or box.

## Random Forest Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Train a random forest with a given number of estimators and score it on held-out data.
def predict_rf(features, random_state, e):
  """Fit a RandomForestClassifier on the chosen columns and return its test accuracy.

  Each of the module-level frames data_1, data_2 and data_3 is split 80/20 on
  its own (with the same random_state) and the pieces are concatenated in that
  order, so every frame contributes the same train/test proportion.
  NOTE(review): data_1/data_2/data_3 are not defined in the visible part of the
  notebook -- presumably one frame per difficulty class; confirm.

  Args:
    features: list of column names used as model inputs.
    random_state: seed passed to every train_test_split call.
    e: number of trees (n_estimators) in the forest.

  Returns:
    Accuracy of the fitted classifier on the concatenated test split.
  """
  train_X, test_X, train_y, test_y = [], [], [], []
  for frame in (data_1, data_2, data_3):
    X_tr, X_te, y_tr, y_te = train_test_split(
        frame[features], frame['difficulty'],
        test_size=0.2, random_state=random_state)
    train_X.append(X_tr)
    test_X.append(X_te)
    train_y.append(y_tr)
    test_y.append(y_te)

  X_train = pd.concat(train_X, ignore_index=True)
  X_test = pd.concat(test_X, ignore_index=True)
  y_train = pd.concat(train_y, ignore_index=True)
  y_test = pd.concat(test_y, ignore_index=True)

  # The forest's own seed is fixed at 53; only the split seed and size vary.
  classifier = RandomForestClassifier(n_estimators=e, random_state=53)
  # X_train already holds exactly the `features` columns, so no re-indexing is needed.
  classifier.fit(X_train, y_train)
  # Training accuracy is intentionally not computed: it was never returned, and
  # predicting on the whole training set for every estimator count was wasted work.
  return accuracy_score(y_test, classifier.predict(X_test))

In [None]:
# Sweep the forest size (5, 10, ..., 295 trees) and remember the best-scoring one.
# NOTE(review): the score being maximised is the accuracy predict_rf returns for
# the test split, so the forest size is tuned on the same split that is reported.
features = [
    'NoC after 2pass', 'NoC after passes', 'Poss', 'Pairs', 'NoC', 'stddev',
    'Pair Red', 'Clusters', 'r1', 'Pair Red2', 'Pair Diff2', 'Pairs Poss', 'Var',
]
estimator, accuracy = 0, 0
for n_trees in range(5, 300, 5):
  score = predict_rf(features, 50, n_trees)
  # strict '>' keeps the smallest forest when several sizes tie
  if score > accuracy:
    accuracy, estimator = score, n_trees
print(accuracy)
print(estimator)

0.7766666666666666
160


In [None]:
# Train a random forest on the given features, print its accuracies, return test predictions.
def predict_rf(features, random_state, n_estimators=160):
  """Fit a RandomForestClassifier, print train/test accuracy, and return test predictions.

  Each of the module-level frames data_1, data_2 and data_3 is split 80/20 on
  its own (with the same random_state) and the pieces are concatenated in that
  order, so every frame contributes the same train/test proportion.
  NOTE(review): data_1/data_2/data_3 are not defined in the visible part of the
  notebook -- presumably one frame per difficulty class; confirm.

  NOTE: this definition re-uses the name of the earlier three-argument
  predict_rf in this notebook and replaces it once this cell has run; unlike
  that version it returns predictions rather than an accuracy.

  Args:
    features: list of column names used as model inputs.
    random_state: seed passed to every train_test_split call.
    n_estimators: number of trees in the forest. Defaults to 160, the value
      this function previously hard-coded.

  Returns:
    Predicted 'difficulty' labels for the concatenated test split.
  """
  train_X, test_X, train_y, test_y = [], [], [], []
  for frame in (data_1, data_2, data_3):
    X_tr, X_te, y_tr, y_te = train_test_split(
        frame[features], frame['difficulty'],
        test_size=0.2, random_state=random_state)
    train_X.append(X_tr)
    test_X.append(X_te)
    train_y.append(y_tr)
    test_y.append(y_te)

  X_train = pd.concat(train_X, ignore_index=True)
  X_test = pd.concat(test_X, ignore_index=True)
  y_train = pd.concat(train_y, ignore_index=True)
  y_test = pd.concat(test_y, ignore_index=True)

  # The forest's own seed is fixed at 53; only the split seed varies between calls.
  classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=53)
  # X_train already holds exactly the `features` columns, so no re-indexing is needed.
  classifier.fit(X_train, y_train)
  y_pred = classifier.predict(X_test)
  training_accuracy = accuracy_score(y_train, classifier.predict(X_train))
  testing_accuracy = accuracy_score(y_test, y_pred)
  print("Training Accuracy:", training_accuracy)
  print("Testing Accuracy:", testing_accuracy)
  return y_pred

In [None]:
# Final model: the full selected feature set, evaluated with split seed 50.
features = [
    'NoC after 2pass', 'NoC after passes', 'Poss', 'Pairs', 'NoC', 'stddev',
    'Pair Red', 'Clusters', 'r1', 'Pair Red2', 'Pair Diff2', 'Pairs Poss', 'Var',
]
y_pred = predict_rf(features, random_state=50)


Training Accuracy: 1.0
Testing Accuracy: 0.7766666666666666


In [None]:
# Baseline without any solving-strategy columns: clue count, clue variation
# and clue placement features only.
features = [
    'NoC', 'Poss', 'stddev', 'Var',
    'r0', 'r1', 'r2', 'r3', 'Clusters',
]
y_pred = predict_rf(features, random_state=50)

Training Accuracy: 0.9989583333333333
Testing Accuracy: 0.44


In [None]:
# Add each solving-strategy column, one at a time, to the non-strategy baseline
# and print the resulting train/test accuracy.
baseline = ['NoC', 'Poss', 'stddev', 'Var', 'r0', 'r1', 'r2', 'r3', 'Clusters']
variables = [
    'NoC after pass', 'NoC after 2pass', 'NoC diff', 'NoC after passes',
    'Pairs', 'Pair Red', 'Pair Red2', 'Pair Diff2', 'Pairs Poss',
]
for strategy in variables:
  features = baseline + [strategy]
  print(f"results for {strategy} feature")
  y_pred_3 = predict_rf(features, 50)
  print(" ")

results for NoC after pass feature
Training Accuracy: 0.9997916666666666
Testing Accuracy: 0.48333333333333334
 
results for NoC after 2pass feature
Training Accuracy: 1.0
Testing Accuracy: 0.48583333333333334
 
results for NoC diff feature
Training Accuracy: 1.0
Testing Accuracy: 0.4925
 
results for NoC after passes feature
Training Accuracy: 1.0
Testing Accuracy: 0.48583333333333334
 
results for Pairs feature
Training Accuracy: 0.9997916666666666
Testing Accuracy: 0.49083333333333334
 
results for Pair Red feature
Training Accuracy: 1.0
Testing Accuracy: 0.49083333333333334
 
results for Pair Red2 feature
Training Accuracy: 0.9997916666666666
Testing Accuracy: 0.5116666666666667
 
results for Pair Diff2 feature
Training Accuracy: 0.9997916666666666
Testing Accuracy: 0.6641666666666667
 
results for Pairs Poss feature
Training Accuracy: 1.0
Testing Accuracy: 0.5175
 


In [None]:
# Fit a model on every column by itself to see how much each one carries alone.
variables = [
    # clue count / variation / placement
    'NoC', 'Poss', 'stddev', 'Var', 'r0', 'r1', 'r2', 'r3', 'Clusters',
    # solving strategy
    'NoC after pass', 'NoC after 2pass', 'NoC diff', 'NoC after passes',
    'Pairs', 'Pair Red', 'Pair Red2', 'Pair Diff2', 'Pairs Poss',
]
for column in variables:
  features = [column]
  print(f"results for {column} feature")
  y_pred_3 = predict_rf(features, 50)
  print(" ")

results for NoC feature
Training Accuracy: 0.43395833333333333
Testing Accuracy: 0.42
 
results for Poss feature
Training Accuracy: 0.5275
Testing Accuracy: 0.42083333333333334
 
results for stddev feature
Training Accuracy: 0.42520833333333335
Testing Accuracy: 0.375
 
results for Var feature
Training Accuracy: 0.3485416666666667
Testing Accuracy: 0.3258333333333333
 
results for r0 feature
Training Accuracy: 0.3877083333333333
Testing Accuracy: 0.3525
 
results for r1 feature
Training Accuracy: 0.3614583333333333
Testing Accuracy: 0.3441666666666667
 
results for r2 feature
Training Accuracy: 0.37625
Testing Accuracy: 0.3475
 
results for r3 feature
Training Accuracy: 0.379375
Testing Accuracy: 0.36583333333333334
 
results for Clusters feature
Training Accuracy: 0.3777083333333333
Testing Accuracy: 0.3641666666666667
 
results for NoC after pass feature
Training Accuracy: 0.48833333333333334
Testing Accuracy: 0.4425
 
results for NoC after 2pass feature
Training Accuracy: 0.5
Testin