In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
# import svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# import knn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the image-ID -> class-label table. The first row of the file is skipped
# and explicit column names are supplied instead; 'ID' then becomes the index.
images = pd.read_csv(
    'images.csv',
    sep=';',
    skiprows=1,
    header=None,
    names=['ID', 'Label'],
).set_index('ID')
images.head()

Unnamed: 0_level_0,Label
ID,Unnamed: 1_level_1
1,binocular
2,chair
3,tick
4,minaret
5,Faces


In [3]:
# Report the (rows, columns) dimensions of the label table.
images_shape = images.shape
print("Images file Shape: ", images_shape)

Images file Shape:  (9144, 1)


In [4]:
# Load the edge-histogram features. The first row of the file is skipped and
# the columns are numbered positionally; column 0 becomes the index and is
# renamed to 'ID'.
edges = (
    pd.read_csv('EdgeHistogram.csv', sep=';', skiprows=1, header=None)
    .set_index(0)
    .rename_axis('ID')
)
print(edges.head())

    1   2   3   4   5   6   7   8   9   10  ...  71  72  73  74  75  76  77  \
ID                                          ...                               
1    1   1   1   2   2   2   4   4   2   2  ...   1   2   2   3   3   1   1   
2    2   2   1   0   3   0   4   2   2   4  ...   1   4   3   3   5   2   2   
3    5   1   6   2   6   3   2   7   1   6  ...   3   4   7   0   6   3   2   
4    0   0   0   0   0   0   0   1   0   1  ...   0   0   1   0   1   0   0   
5    1   6   4   2   2   0   6   6   5   4  ...   6   2   6   3   3   6   1   

    78  79  80  
ID              
1    1   1   2  
2    1   0   4  
3    7   3   6  
4    1   0   1  
5    2   5   6  

[5 rows x 80 columns]


In [5]:
# Combine labels and edge-histogram features, matching rows on the shared 'ID' index.
df = images.merge(edges, on='ID')
print(df.head())

        Label  1  2  3  4  5  6  7  8  9  ...  71  72  73  74  75  76  77  78  \
ID                                        ...                                   
1   binocular  1  1  1  2  2  2  4  4  2  ...   1   2   2   3   3   1   1   1   
2       chair  2  2  1  0  3  0  4  2  2  ...   1   4   3   3   5   2   2   1   
3        tick  5  1  6  2  6  3  2  7  1  ...   3   4   7   0   6   3   2   7   
4     minaret  0  0  0  0  0  0  0  1  0  ...   0   0   1   0   1   0   0   1   
5       Faces  1  6  4  2  2  0  6  6  5  ...   6   2   6   3   3   6   1   2   

    79  80  
ID          
1    1   2  
2    0   4  
3    3   6  
4    0   1  
5    5   6  

[5 rows x 81 columns]


In [6]:
# Separate the target column from the feature columns, then hold out 20% of the
# rows as a test set (fixed seed so the split is repeatable).
y = df['Label']
X = df.drop(columns='Label')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("X_train Shape: ", X_train.shape)
print("X_test Shape: ", X_test.shape)

X_train Shape:  (7315, 80)
X_test Shape:  (1829, 80)


In [7]:
# Candidate classifiers, keyed by a short name. The same keys are used in
# `params` below so each model can be paired with its search grid.
# The random forest is seeded (same value as the train/test split) so that the
# grid search and the files derived from it are reproducible between runs.
models = {'RandomForest': RandomForestClassifier(random_state=42),
          'SVM': SVC(),
          'KNN': KNeighborsClassifier()}

# Hyper-parameter grids explored by the grid search, one per model key.
# Every combination of the listed values is tried.
params = {'RandomForest': {'n_estimators': [100, 200, 300, 400],
                        'max_depth': [1, 2, 3, 4, 5, 6],
                        'criterion': ['gini', 'entropy']},
          'SVM': {'C': [0.1, 1, 2, 3, 4],
                  'gamma': [1, 0.1, 0.01, 0.001],
                  'kernel': ['rbf', 'linear']},
          'KNN': {'n_neighbors': [3, 5, 7, 9],
                  'weights': ['uniform', 'distance'],
                  'metric': ['euclidean', 'manhattan']}}

In [8]:
# For every candidate model: tune it with a 5-fold cross-validated grid search,
# then write two CSV files, numbered by the model's position in `models`:
#   group022_parameters<N>.csv - classifier name, library and best hyper-parameters
#   group022_result<N>.csv     - confusion matrix on the test split; the first row
#                                and first column hold the class labels, sorted
#                                alphabetically (ignoring case). Rows are the
#                                actual classes, columns the predicted classes.
for file_no, (model_key, model) in enumerate(models.items(), start=1):
    grid = GridSearchCV(model, params[model_key], cv=5)
    grid.fit(X_train, y_train)
    grid_pred = grid.predict(X_test)

    classifier_name = type(model).__name__
    # best_score_ is the mean cross-validated score of the best parameter
    # combination on the training split, not the score on X_test.
    print("Accuracy for model {} is: {}".format(classifier_name, grid.best_score_))

    best_params = grid.best_params_
    hyper_values = {'classifier_name': classifier_name, 'library': 'sklearn'}
    hyper_values.update(best_params)
    pd.DataFrame.from_dict(data=hyper_values, orient='index').to_csv(
        'group022_parameters' + str(file_no) + '.csv', header=False)

    # Decide the label order up front and pass it to confusion_matrix, so the
    # matrix rows/columns are actually computed in that order. Relabelling the
    # axes afterwards (without reordering the data) would attach counts to the
    # wrong class names whenever the case-insensitive order differs from the
    # default code-point order. Using the union of actual and predicted labels
    # also keeps the header the same size as the matrix when a model predicts
    # a class that does not occur in y_test.
    labels = sorted(np.union1d(y_test, grid_pred), key=str.lower)
    grid_cm = confusion_matrix(y_test, grid_pred, labels=labels)
    grid_cm_df = pd.DataFrame(grid_cm, index=labels, columns=labels)
    # save the labelled confusion matrix to csv
    grid_cm_df.to_csv('group022_result' + str(file_no) + '.csv')

Accuracy for model RandomForestClassifier is: 0.3513328776486671
Accuracy for model SVC is: 0.5680109364319892
Accuracy for model KNeighborsClassifier is: 0.49719753930280247
