In [1]:
#IMPORT NECESSARY LIBRARIES
import pandas as pd

In [2]:
import numpy as np

In [3]:
#TO CONCATENATE CSV FILES
import os, glob

In [4]:
#TASK 1: RETRIEVING AND PREPARING THE DATA 

In [12]:
#COLLECT ALL CSV FILES IN THE WORKING DIRECTORY
# glob returns paths in arbitrary, OS-dependent order; sorting fixes the row
# order of the merged frame so the seeded train/test split stays reproducible.
# NOTE(review): "*.csv" matches every CSV in this folder - confirm no derived/exported CSV lives here.
all_files = sorted(glob.glob("*.csv"))

In [14]:
# EMPTY CONTAINER THAT WILL HOLD ONE DATAFRAME PER CSV FILE
dataframes = list()

In [15]:
# READ EVERY CSV INTO ITS OWN DATAFRAME
# header=None: the first line of each file is read as data and columns are numbered 0..k.
# The list is rebuilt here (not appended to), so re-running this cell
# does not add every file to `dataframes` a second time.
dataframes = [
    pd.read_csv(filename, index_col=None, sep=',', decimal='.', header=None)
    for filename in all_files
]

In [16]:
# STACK THE PER-FILE FRAMES ROW-WISE AND RENUMBER THE INDEX FROM 0
mice = pd.concat(objs=dataframes, ignore_index=True, axis="index")

In [11]:
#CHECK FILES WERE MERGED PROPERLY - SHOW THE FIRST 5 ROWS
mice.head(n=5)

Unnamed: 0,0,1,2,3,4
0,0.0,1502,2215,2153,1
1,1.0,1667,2072,2047,1
2,2.0,1611,1957,1906,1
3,3.0,1601,1939,1831,1
4,4.0,1643,1965,1879,1


In [None]:
#SHOW THE LAST 5 ROWS OF THE MERGED DATA
mice.tail(n=5)

In [None]:
#CHECK FOR MISSING VALUES - COUNT OF NaN ENTRIES PER COLUMN
mice.isna().sum()

In [None]:
#INSERT HEADERS FOR EACH COLUMN
# One name per column, in file order; the assignment fails if the column count differs.
column_names = [
    'Sequential No.',
    'X Acceleration',
    'Y Acceleration',
    'Z Acceleration',
    'Activity',
]
mice.columns = column_names

In [None]:
#CHECK HEADERS WERE ADDED - SHOW THE FIRST 5 ROWS
mice.head(n=5)

In [None]:
#DELETE "SEQUENTIAL NO." COLUMN
# drop() returns a new frame without the column; `mice` is rebound to it.
mice = mice.drop("Sequential No.", axis="columns")

In [None]:
#CHECK THAT "SEQUENTIAL NO." COLUMN IS DELETED - SHOW THE FIRST 5 ROWS
mice.head(n=5)

In [None]:
#SUMMARY STATISTICS FOR DATASET 
# include='all' asks describe() to summarise every column, not only the numeric ones.
mice.describe(include = 'all') 

In [None]:
#CHECK THE NO. OF ACTIVITIES (SHOULD ONLY BE 7) - SANITY CHECK
# Number of rows recorded under each Activity label.
activity_counts = mice.groupby('Activity').size()
print(activity_counts)

In [None]:
#FILTER ACTIVITY LABELLED "1 - 7"
# Keep only the rows whose Activity label is not 0.
is_labelled = mice['Activity'] != 0
mice = mice[is_labelled]

In [None]:
#CHECK THAT ACTIVITY LABELLED "0" HAS BEEN DROPPED
# The assertion enforces the check instead of relying on reading the printout.
activity_counts = mice.groupby('Activity').size()
assert 0 not in activity_counts.index, "rows with Activity 0 are still present"
print(activity_counts)

In [None]:
#TASK 2: DATA EXPLORATION

In [None]:
#GROUP BY ACTIVITY & X ACCELERATION - SUMMARY STATISTICS
# One row of summary statistics for X Acceleration per Activity label.
mice['X Acceleration'].groupby(mice['Activity']).describe()

In [None]:
#GROUP BY ACTIVITY & Y ACCELERATION - SUMMARY STATISTICS
# One row of summary statistics for Y Acceleration per Activity label.
mice['Y Acceleration'].groupby(mice['Activity']).describe()

In [None]:
#GROUP BY ACTIVITY & Z ACCELERATION - SUMMARY STATISTICS
# One row of summary statistics for Z Acceleration per Activity label.
mice['Z Acceleration'].groupby(mice['Activity']).describe()

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
#USE SEABORN'S "whitegrid" STYLE FOR THE PLOTS CREATED AFTER THIS CELL
sns.set(style="whitegrid")

In [None]:
#BOXPLOT ACTIVITY & X ACCELERATION 
# Draw on an explicitly created Axes instead of the implicit current figure.
fig, ax = plt.subplots()
ax = sns.boxplot(data=mice, x="Activity", y="X Acceleration", ax=ax)

In [None]:
#BOXPLOT ACTIVITY & Y ACCELERATION 
# Draw on an explicitly created Axes instead of the implicit current figure.
fig, ay = plt.subplots()
ay = sns.boxplot(data=mice, x="Activity", y="Y Acceleration", ax=ay)

In [None]:
#BOXPLOT ACTIVITY & Z ACCELERATION 
# Draw on an explicitly created Axes instead of the implicit current figure.
fig, az = plt.subplots()
az = sns.boxplot(data=mice, x="Activity", y="Z Acceleration", ax=az)

In [None]:
#AVERAGE FOR EACH ACCELERATION AND ACTIVITY
# Mean of each acceleration axis within every Activity label.
acceleration_columns = ['X Acceleration', 'Y Acceleration', 'Z Acceleration']
mice.groupby('Activity')[acceleration_columns].agg('mean')

In [None]:
# PLOT AVERAGE FOR EACH ACCELERATION VS ACTIVITY
# The per-activity means are computed from `mice` instead of being typed in by
# hand, so the chart cannot drift from the data when the input files change.
activity_means = mice.groupby('Activity')[['X Acceleration', 'Y Acceleration', 'Z Acceleration']].mean()

N = len(activity_means)
X_mean = activity_means['X Acceleration'].values
Y_mean = activity_means['Y Acceleration'].values
Z_mean = activity_means['Z Acceleration'].values

ind = np.arange(N)
width = 0.25

plt.figure()
plt.bar(ind, X_mean, width, label='X Acceleration', color='dodgerblue')
plt.bar(ind + width, Y_mean, width, label = 'Y Acceleration', color='orangered')
plt.bar(ind + width*2, Z_mean, width, label = 'Z Acceleration', color='gold')

plt.xlabel('Activity')
plt.ylabel('Acceleration')
plt.title('Average Acceleration (X, Y, Z) for each Activity')

# Bars sit at ind, ind + width and ind + 2*width, so each group is centred on ind + width.
# Tick labels come from the group index, i.e. the Activity labels actually present.
plt.xticks(ind + width, [str(activity) for activity in activity_means.index])
plt.legend(loc="lower right")
plt.show()

In [None]:
#TASK 3: DATA MODELLING - CLASSIFICATION

In [None]:
#CREATE FEATURE AND TARGET VARIABLES(X and Y variables)
Acceleration = ['X Acceleration', 'Y Acceleration', 'Z Acceleration']

# Feature variables: the three acceleration columns
X = mice.loc[:, Acceleration]
# Target variable: the Activity label of each row
Y = mice['Activity']

In [None]:
#CHECK FEATURE VARIABLES
# Prints the plain-text form of the feature frame X.
print(X)

In [None]:
#CHECK TARGET VARIABLE
# Prints the plain-text form of the target Series Y.
print(Y)

In [None]:
#SPLIT DATA INTO TRAINING AND TESTING SET (80/20)
from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the rows; random_state=0 makes the shuffle repeatable.
split_parts = train_test_split(X, Y, random_state=0, test_size=0.2)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
#SHAPE OF THE HELD-OUT FEATURE SET
X_test.shape

In [None]:
#CHECK CLASS BALANCE OF THE TRAINING SET
# Bar chart of how many training rows fall under each Activity label.
train_labels = Y_train.value_counts()
fig, freq_ax = plt.subplots(figsize=(12, 8))
freq_ax.bar(train_labels.index.values, train_labels)
freq_ax.set_xticks(train_labels.index.values)
freq_ax.set_xlabel('Activity')
freq_ax.set_ylabel('Frequency')
freq_ax.set_title('FREQUENCY FOR EACH ACTIVITY')

In [None]:
#SCALE FEATURES 
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#DECISION TREE - DEFAULT PARAMETERS 
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#CREATE DECISION TREE CLASSIFIER OBJECT
# random_state is pinned (as for the train/test split) so repeated runs report
# the same scores; all other parameters keep their defaults.
clf = tree.DecisionTreeClassifier(random_state=0)

#TRAIN DECISION TREE CLASSIFER
clf = clf.fit(X_train, Y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}%'.format(clf.score(X_train, Y_train)*100))
print('Accuracy of Decision Tree classifer on test set: {:.2f}%'.format(clf.score(X_test, Y_test)*100))

In [None]:
#CONFUSION MATRIX AND CLASSIFICATION REPORT FOR DECISION TREE - DEFAULT PARAMETERS
from sklearn.metrics import classification_report, confusion_matrix

# Predictions of the currently fitted tree on the held-out set.
Y_pre = clf.predict(X_test)

cm = confusion_matrix(Y_test, Y_pre)
print(cm)

report = classification_report(Y_test, Y_pre)
print(report)

In [None]:
#TEST DIFFERENT PARAMETERS - CHECK 'GINI' AND INCREASE PARAMETERS
from sklearn.metrics import accuracy_score

# max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for a
# classification tree it meant sqrt(n_features), so "sqrt" keeps the same behaviour.
# random_state pins the random feature sub-sampling so the score is reproducible.
clf = DecisionTreeClassifier(criterion="gini", max_depth=None, min_samples_split=4,
                            min_samples_leaf=4, max_features="sqrt", max_leaf_nodes=None,
                            random_state=0)

clf = clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

print('Accuracy of Decision Tree classifer on test set: {:.2f}%'.format(accuracy_score(Y_test, Y_pred)*100))

In [None]:
#TEST DIFFERENT PARAMETERS = CHECK 'ENTROPY' AND INCREASE PARAMETERS
from sklearn.metrics import accuracy_score

# max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for a
# classification tree it meant sqrt(n_features), so "sqrt" keeps the same behaviour.
# random_state pins the random feature sub-sampling so the scores are reproducible.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=None, min_samples_split=4,
                            min_samples_leaf=4, max_features="sqrt", max_leaf_nodes=None,
                            random_state=0)

clf = clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

print('Accuracy of Decision Tree classifier on training set: {:.2f}%'.format(clf.score(X_train, Y_train)*100))
print('Accuracy of Decision Tree classifer on test set: {:.2f}%'.format(accuracy_score(Y_test, Y_pred)*100))

In [None]:
#TEST DIFFERENT PARAMETERS - CHECK 'GINI' AND FURTHER INCREASE PARAMETERS
from sklearn.metrics import accuracy_score

# Shallow, strongly constrained tree: depth <= 5 and at most 5 leaves.
# random_state is pinned so repeated runs report the same score.
clf = DecisionTreeClassifier(criterion="gini", max_depth=5, min_samples_split=10,
                            min_samples_leaf=5, max_features=None, max_leaf_nodes=5,
                            random_state=0)

clf = clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

print('Accuracy of Decision Tree classifer on test set: {:.2f}%'.format(accuracy_score(Y_test, Y_pred)*100))

In [None]:
#TEST DIFFERENT PARAMETERS - CHECK 'ENTROPY' AND FURTHER INCREASE PARAMETERS
from sklearn.metrics import accuracy_score

# max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for a
# classification tree it meant sqrt(n_features), so "sqrt" keeps the same behaviour.
# random_state pins the random feature sub-sampling so the score is reproducible.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=10, min_samples_split=20,
                            min_samples_leaf=10, max_features="sqrt", max_leaf_nodes=10,
                            random_state=0)


clf = clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

print('Accuracy of Decision Tree classifer on test set: {:.2f}%'.format(accuracy_score(Y_test, Y_pred)*100))

In [None]:
#DECISION TREE - TUNED PARAMETERS
from sklearn.metrics import accuracy_score

# max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for a
# classification tree it meant sqrt(n_features), so "sqrt" keeps the same behaviour.
# random_state pins the random feature sub-sampling so the scores are reproducible.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=15, min_samples_split=10,
                            min_samples_leaf=10, max_features="sqrt", max_leaf_nodes=None,
                            random_state=0)

clf = clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

print('Accuracy of Decision Tree classifier on training set: {:.2f}%'.format(clf.score(X_train, Y_train)*100))
print('Accuracy of Decision Tree classifer on test set: {:.2f}%'.format(accuracy_score(Y_test, Y_pred)*100))

In [None]:
#CONFUSION MATRIX AND CLASSIFICATION REPORT FOR DECISION TREE - TUNED PARAMETERS
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Predict with the current `clf` inside this cell and report on that same
# result, rather than on a Y_pred left over from whichever cell ran last.
Y_pred = clf.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)

print(classification_report(Y_test, Y_pred))

In [None]:
#K-NEAREST NEIGHBOURS - DEFAULT PARAMETERS (MINMAX SCALER)
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import make_pipeline

#RESCALE FEATURES - MINMAX SCALER
from sklearn.preprocessing import MinMaxScaler

# Scaler and classifier are fitted together as one pipeline: the scaler is
# learned from the training data inside fit() and applied automatically in
# score()/predict(), so X_train and X_test are not overwritten by this cell.
knn = make_pipeline(MinMaxScaler(), KNeighborsClassifier())
knn.fit(X_train, Y_train)

print('Accuracy of K-NN classifier on training set: {:.2f}%'.format(knn.score(X_train, Y_train)*100))
print('Accuracy of K-NN classifier on test set: {:.2f}%'.format(knn.score(X_test, Y_test)*100))

In [None]:
#K-NEAREST NEIGHBOURS - DEFAULT PARAMETERS (STANDARD SCALER)
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import make_pipeline

#RESCALE FEATURES - STANDARD SCALER
from sklearn.preprocessing import StandardScaler

# Scaler and classifier are fitted together as one pipeline: the scaler is
# learned from the training data inside fit() and applied automatically in
# score()/predict(), so X_train and X_test are not overwritten by this cell.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn.fit(X_train, Y_train)

print('Accuracy of K-NN classifier on training set: {:.2f}%'.format(knn.score(X_train, Y_train)*100))
print('Accuracy of K-NN classifier on test set: {:.2f}%'.format(knn.score(X_test, Y_test)*100))

In [None]:
#CONFUSION MATRIX AND CLASSIFICATION REPORT FOR KNN
from sklearn.metrics import classification_report, confusion_matrix

# Predictions of the most recently fitted `knn` on the held-out set.
pred = knn.predict(X_test)

knn_matrix = confusion_matrix(Y_test, pred)
print(knn_matrix)

knn_report = classification_report(Y_test, pred)
print(knn_report)

In [None]:
#K-NEAREST NEIGHBOURS - TUNED PARAMETERS (NO. OF NEIGHBOURS SET TO 10, p=1)
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import make_pipeline

#RESCALE FEATURES - STANDARD SCALER
from sklearn.preprocessing import StandardScaler

# Scaler and classifier are fitted together as one pipeline: the scaler is
# learned from the training data inside fit() and applied automatically in
# score()/predict(), so X_train and X_test are not overwritten by this cell.
# p=1 selects the Manhattan distance (the default p=2 is Euclidean).
knn = make_pipeline(StandardScaler(),
                    KNeighborsClassifier(n_neighbors=10, weights='uniform', p=1))
knn.fit(X_train, Y_train)

print('Accuracy of K-NN classifier on training set: {:.2f}%'.format(knn.score(X_train, Y_train)*100))
print('Accuracy of K-NN classifier on test set: {:.2f}%'.format(knn.score(X_test, Y_test)*100))

In [None]:
#K-NEAREST NEIGHBOURS - TUNED PARAMETERS (NO. OF NEIGHBOURS SET TO 30, p=1)
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import make_pipeline

#RESCALE FEATURES - STANDARD SCALER
from sklearn.preprocessing import StandardScaler

# Scaler and classifier are fitted together as one pipeline: the scaler is
# learned from the training data inside fit() and applied automatically in
# score()/predict(), so X_train and X_test are not overwritten by this cell.
knn = make_pipeline(StandardScaler(),
                    KNeighborsClassifier(n_neighbors=30, weights='uniform', p=1))
knn.fit(X_train, Y_train)

print('Accuracy of K-NN classifier on training set: {:.2f}%'.format(knn.score(X_train, Y_train)*100))
print('Accuracy of K-NN classifier on test set: {:.2f}%'.format(knn.score(X_test, Y_test)*100))

In [None]:
#K-NEAREST NEIGHBOURS - COMPARE NEIGHBOUR WEIGHTING (weights='distance', 30 NEIGHBOURS, p=1)
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import make_pipeline

#RESCALE FEATURES - STANDARD SCALER
from sklearn.preprocessing import StandardScaler

# Scaler and classifier are fitted together as one pipeline: the scaler is
# learned from the training data inside fit() and applied automatically in
# score()/predict(), so X_train and X_test are not overwritten by this cell.
# weights='distance' lets closer neighbours count for more in the vote;
# the distance metric itself (p=1) is the same as in the uniform-weight run.
knn = make_pipeline(StandardScaler(),
                    KNeighborsClassifier(n_neighbors=30, weights='distance', p=1))
knn.fit(X_train, Y_train)

print('Accuracy of K-NN classifier on training set: {:.2f}%'.format(knn.score(X_train, Y_train)*100))
print('Accuracy of K-NN classifier on test set: {:.2f}%'.format(knn.score(X_test, Y_test)*100))

In [None]:
#K-NEAREST NEIGHBOURS - COMPARE NO. OF NEIGHBOURS (40 NEIGHBOURS, weights='distance', p=1)
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import make_pipeline

#RESCALE FEATURES - STANDARD SCALER
from sklearn.preprocessing import StandardScaler

# Scaler and classifier are fitted together as one pipeline: the scaler is
# learned from the training data inside fit() and applied automatically in
# score()/predict(), so X_train and X_test are not overwritten by this cell.
# Only n_neighbors changes here (40); p stays at 1.
knn = make_pipeline(StandardScaler(),
                    KNeighborsClassifier(n_neighbors=40, weights='distance', p=1))
knn.fit(X_train, Y_train)

print('Accuracy of K-NN classifier on training set: {:.2f}%'.format(knn.score(X_train, Y_train)*100))
print('Accuracy of K-NN classifier on test set: {:.2f}%'.format(knn.score(X_test, Y_test)*100))

In [None]:
#K-NEAREST NEIGHBOURS  - COMPARE P (p=2, 40 NEIGHBOURS, weights='uniform')
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import make_pipeline

#RESCALE FEATURES - STANDARD SCALER
from sklearn.preprocessing import StandardScaler

# Scaler and classifier are fitted together as one pipeline: the scaler is
# learned from the training data inside fit() and applied automatically in
# score()/predict(), so X_train and X_test are not overwritten by this cell.
# p=2 is the Euclidean distance; note that weights is also 'uniform' here,
# so two settings differ from the 40-neighbour run with weights='distance', p=1.
knn = make_pipeline(StandardScaler(),
                    KNeighborsClassifier(n_neighbors=40, weights='uniform', p=2))
knn.fit(X_train, Y_train)

print('Accuracy of K-NN classifier on training set: {:.2f}%'.format(knn.score(X_train, Y_train)*100))
print('Accuracy of K-NN classifier on test set: {:.2f}%'.format(knn.score(X_test, Y_test)*100))

In [None]:
#K-NEAREST NEIGHBOURS - COMPARE NO. OF NEIGHBOURS (50 NEIGHBOURS, weights='uniform', p=2)
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import make_pipeline

#RESCALE FEATURES - STANDARD SCALER
from sklearn.preprocessing import StandardScaler

# Scaler and classifier are fitted together as one pipeline: the scaler is
# learned from the training data inside fit() and applied automatically in
# score()/predict(), so X_train and X_test are not overwritten by this cell.
knn = make_pipeline(StandardScaler(),
                    KNeighborsClassifier(n_neighbors=50, weights='uniform', p=2))
knn.fit(X_train, Y_train)

print('Accuracy of K-NN classifier on training set: {:.2f}%'.format(knn.score(X_train, Y_train)*100))
print('Accuracy of K-NN classifier on test set: {:.2f}%'.format(knn.score(X_test, Y_test)*100))

In [None]:
#CONFUSION MATRIX AND CLASSIFICATION REPORT FOR KNN
from sklearn.metrics import classification_report, confusion_matrix

# Predictions of the most recently fitted `knn` on the held-out set.
pred = knn.predict(X_test)

knn_matrix = confusion_matrix(Y_test, pred)
print(knn_matrix)

knn_report = classification_report(Y_test, pred)
print(knn_report)