# Getting and Cleaning Data Course Project

## Brief Explanation

This script does the following.

1. Merges the training and the test sets to create one data set.
2. Extracts only the measurements on the mean and standard deviation for each measurement.
3. Uses descriptive activity names to name the activities in the data set.
4. Appropriately labels the data set with descriptive variable names.
5. From the data set in step 4, creates a second, independent tidy data set with the average of each variable for each activity and each subject.

## Reading the files

### 1. Reading the training files

In [8]:
# Load the three training tables: activity codes (y), feature measurements (X)
# and the subject recorded for each observation. The files have no header row.
train_activity <- read.table(file = "./UCI HAR Dataset/train/y_train.txt", header = FALSE)
train_features <- read.table(file = "./UCI HAR Dataset/train/X_train.txt", header = FALSE)
train_subject <- read.table(file = "./UCI HAR Dataset/train/subject_train.txt", header = FALSE)

Checking the dimensions of the data frames read from the datasets

In [10]:
# Print (rows, columns) for each training table, in the order
# activity, features, subject.
invisible(lapply(
  list(train_activity, train_features, train_subject),
  function(tbl) print(dim(tbl))
))

[1] 7352    1
[1] 7352  561
[1] 7352    1


### 2. Reading the test files 

In [11]:
# Load the three test tables: activity codes (y), feature measurements (X)
# and the subject recorded for each observation. The files have no header row.
test_activity <- read.table(file = "./UCI HAR Dataset/test/y_test.txt", header = FALSE)
test_features <- read.table(file = "./UCI HAR Dataset/test/X_test.txt", header = FALSE)
test_subject <- read.table(file = "./UCI HAR Dataset/test/subject_test.txt", header = FALSE)

Checking the dimensions of the data frames read from the datasets

In [12]:
# Print (rows, columns) for each test table, in the order
# activity, features, subject.
invisible(lapply(
  list(test_activity, test_features, test_subject),
  function(tbl) print(dim(tbl))
))

[1] 2947    1
[1] 2947  561
[1] 2947    1


---

## Merging the dataset

In [74]:
# Stack the training rows on top of the test rows for each of the three tables.
subject_data <- rbind(train_subject, test_subject)
activity_data <- rbind(train_activity, test_activity)
features_data <- rbind(train_features, test_features)

# The subject and activity tables each hold a single column; name it.
names(subject_data) <- "Subject"
names(activity_data) <- "Activity"

Checking the overall dimension of the merged data

In [63]:
# Sanity checks on the merge: for every table the merged row count must equal
# train rows + test rows, and the column count must match the training table.
cat(
  "Is the dimension of merged features data is equal to the sum of individual data?",
  nrow(features_data) == nrow(train_features) + nrow(test_features) &&
    ncol(features_data) == ncol(train_features),
  "\n"
)

cat(
  "Is the dimension of merged activity data is equal to the sum of individual data?",
  nrow(activity_data) == nrow(train_activity) + nrow(test_activity) &&
    ncol(activity_data) == ncol(train_activity),
  "\n"
)

cat(
  "Is the dimension of merged subject data is equal tto the sum of individual data?",
  nrow(subject_data) == nrow(train_subject) + nrow(test_subject) &&
    ncol(subject_data) == ncol(train_subject),
  "\n"
)

Is the dimension of merged features data is equal to the sum of individual data? TRUE 
Is the dimension of merged activity data is equal to the sum of individual data? TRUE 
Is the dimension of merged subject data is equal tto the sum of individual data? TRUE 


In [64]:
# Preview the first six rows of the merged feature measurements.
head(features_data, n = 6L)

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V552,V553,V554,V555,V556,V557,V558,V559,V560,V561
0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,-0.9951121,-0.9831846,-0.923527,-0.9347238,...,-0.07432303,-0.2986764,-0.7103041,-0.11275434,0.030400372,-0.4647614,-0.01844588,-0.8412468,0.1799406,-0.05862692
0.2784188,-0.01641057,-0.1235202,-0.9982453,-0.9753002,-0.960322,-0.9988072,-0.9749144,-0.9576862,-0.9430675,...,0.15807454,-0.5950509,-0.8614993,0.05347695,-0.007434566,-0.7326262,0.70351059,-0.8447876,0.1802889,-0.05431672
0.2796531,-0.01946716,-0.1134617,-0.9953796,-0.967187,-0.978944,-0.9965199,-0.9636684,-0.9774686,-0.9386916,...,0.41450281,-0.3907482,-0.7601037,-0.11855926,0.17789948,0.1006992,0.80852908,-0.8489335,0.1806373,-0.04911782
0.2791739,-0.02620065,-0.1232826,-0.9960915,-0.9834027,-0.9906751,-0.9970995,-0.9827498,-0.9893025,-0.9386916,...,0.40457253,-0.1172902,-0.4828445,-0.03678797,-0.012892494,0.640011,-0.48536645,-0.8486494,0.1819348,-0.04766318
0.2766288,-0.01656965,-0.1153619,-0.9981386,-0.9808173,-0.9904816,-0.9983211,-0.9796719,-0.9904411,-0.9424691,...,0.08775301,-0.3514709,-0.6992052,0.12332005,0.12254196,0.6935783,-0.61597061,-0.8478653,0.1851512,-0.04389225
0.2771988,-0.01009785,-0.1051373,-0.997335,-0.9904868,-0.99542,-0.9976274,-0.9902177,-0.9955489,-0.9424691,...,0.01995331,-0.5454101,-0.8446193,0.08263215,-0.14343901,0.2750408,-0.36822404,-0.8496316,0.1848225,-0.04212638


In [65]:
# Preview the first six rows of the merged activity codes.
head(activity_data, n = 6L)

Activity
5
5
5
5
5
5


In [66]:
# Preview the first six rows of the merged subject identifiers.
head(subject_data, n = 6L)

Subject
1
1
1
1
1
1


---

## Extraction

### Reading the feature description file

In [48]:
# features.txt lists one feature per row: column 1 is the feature's position
# in the X_* tables, column 2 is the feature's name.
features_names <- read.table(file = './UCI HAR Dataset/features.txt', header = FALSE)
head(features_names, 10)

V1,V2
1,tBodyAcc-mean()-X
2,tBodyAcc-mean()-Y
3,tBodyAcc-mean()-Z
4,tBodyAcc-std()-X
5,tBodyAcc-std()-Y
6,tBodyAcc-std()-Z
7,tBodyAcc-mad()-X
8,tBodyAcc-mad()-Y
9,tBodyAcc-mad()-Z
10,tBodyAcc-max()-X


### Extracting the required features

In [49]:
# Keep the rows of the feature list whose name (second column) contains
# "mean" or "std". Note that this pattern also matches the meanFreq() features.
extract_feat_names <- features_names[grepl("mean|std", features_names[, 2]), ]
dim(extract_feat_names)

In [54]:
# Column 1 of extract_feat_names holds the positions of the selected features;
# use those positions to keep only the matching columns of the merged data.
extract_features_data <- features_data[, extract_feat_names[[1]]]
# Confirm the shape of the reduced table.
dim(extract_features_data)

In [55]:
# Replace the default V<n> column names with the original feature names
# (second column of the selected feature list).
names(extract_features_data) <- extract_feat_names[, 2]
head(extract_features_data, n = 6L)

tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tGravityAcc-mean()-X,tGravityAcc-mean()-Y,tGravityAcc-mean()-Z,tGravityAcc-std()-X,...,fBodyAccMag-meanFreq(),fBodyBodyAccJerkMag-mean(),fBodyBodyAccJerkMag-std(),fBodyBodyAccJerkMag-meanFreq(),fBodyBodyGyroMag-mean(),fBodyBodyGyroMag-std(),fBodyBodyGyroMag-meanFreq(),fBodyBodyGyroJerkMag-mean(),fBodyBodyGyroJerkMag-std(),fBodyBodyGyroJerkMag-meanFreq()
0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,0.9633961,-0.1408397,0.11537494,-0.9852497,...,-0.08843612,-0.9937257,-0.993755,0.3469885,-0.9801349,-0.9613094,-0.1289889,-0.9919904,-0.9906975,-0.07432303
0.2784188,-0.01641057,-0.1235202,-0.9982453,-0.9753002,-0.960322,0.9665611,-0.1415513,0.10937881,-0.9974113,...,-0.04414989,-0.9903355,-0.9919603,0.5320605,-0.9882956,-0.9833219,-0.2719585,-0.9958539,-0.9963995,0.15807454
0.2796531,-0.01946716,-0.1134617,-0.9953796,-0.967187,-0.978944,0.9668781,-0.1420098,0.10188392,-0.999574,...,0.25789914,-0.9892801,-0.9908667,0.660795,-0.9892548,-0.9860277,-0.2127279,-0.9950305,-0.9951274,0.41450281
0.2791739,-0.02620065,-0.1232826,-0.9960915,-0.9834027,-0.9906751,0.9676152,-0.1439765,0.09985014,-0.9966456,...,0.0735815,-0.9927689,-0.9916998,0.6789213,-0.9894128,-0.9878358,-0.0356842,-0.9952207,-0.9952369,0.40457253
0.2766288,-0.01656965,-0.1153619,-0.9981386,-0.9808173,-0.9904816,0.9682244,-0.1487502,0.0944859,-0.9984293,...,0.39431033,-0.9955228,-0.994389,0.5590577,-0.991433,-0.9890594,-0.273582,-0.9950928,-0.9954648,0.08775301
0.2771988,-0.01009785,-0.1051373,-0.997335,-0.9904868,-0.99542,0.9679482,-0.14821,0.09190972,-0.9989793,...,0.43796212,-0.9947329,-0.9951562,0.2469096,-0.9905,-0.9858609,-0.2973291,-0.9951433,-0.9952387,0.01995331


---

## Activity Naming

### Reading the activity description file

In [52]:
# activity_labels.txt: its second column supplies the activity labels used
# when naming the activities.
activity_names <- read.table(file = "./UCI HAR Dataset/activity_labels.txt", header = FALSE)

### Naming the activities

In [75]:
# Convert the numeric activity codes into a factor with descriptive labels.
# The level codes are taken from the first column of activity_labels.txt
# rather than hard-coded as 1:6, so each code is always paired with the label
# found on the same row of that file, whatever the row order or row count.
activity_data$Activity <- factor(
  activity_data$Activity,
  levels = activity_names[, 1],
  labels = activity_names[, 2]
)
head(activity_data)

Activity
STANDING
STANDING
STANDING
STANDING
STANDING
STANDING


---

## Labeling

In [103]:
# Take a working copy of the current feature column names.
col_names <- names(extract_features_data)

In [104]:
# Rewrite the abbreviated feature names into descriptive ones, one
# substitution at a time.

# Remove the literal "()" after each function name. fixed = TRUE matches the
# two characters verbatim instead of relying on an unbalanced ")" in a regex.
col_names <- gsub("()", "", col_names, fixed = TRUE)

# Replace "-mean" with "Mean" (so "-meanFreq" becomes "MeanFreq")
col_names <- gsub("-mean", "Mean", col_names)

# Replace "-std" with "StandardDeviation"
col_names <- gsub("-std", "StandardDeviation", col_names)

# Collapse the duplicated "BodyBody" into a single "Body"
col_names <- gsub("[Bb]ody[Bb]ody", "Body", col_names)

# Replace "Acc" with "Acceleration"
col_names <- gsub("[Aa]cc", "Acceleration", col_names)

# Replace "Mag" with "Magnitude"
col_names <- gsub("[Mm]ag", "Magnitude", col_names)

# Expand the leading "t" / "f" prefix into "TimeDomain" / "FrequencyDomain".
# The "t" rule runs first; its replacement starts with an upper-case "T", so
# the following "^f" rule cannot touch names that were already expanded.
col_names <- gsub("^t", "TimeDomain", col_names)
col_names <- gsub("^f", "FrequencyDomain", col_names)

# Replace "Gyro" with "Gyroscope"
col_names <- gsub("[Gg]yro", "Gyroscope", col_names)

In [105]:
# Install the descriptive names on the feature table and preview the result.
names(extract_features_data) <- col_names
head(extract_features_data, n = 6L)

TimeDomainBodyAccelerationMean-X,TimeDomainBodyAccelerationMean-Y,TimeDomainBodyAccelerationMean-Z,TimeDomainBodyAccelerationStandardDeviation-X,TimeDomainBodyAccelerationStandardDeviation-Y,TimeDomainBodyAccelerationStandardDeviation-Z,TimeDomainGravityAccelerationMean-X,TimeDomainGravityAccelerationMean-Y,TimeDomainGravityAccelerationMean-Z,TimeDomainGravityAccelerationStandardDeviation-X,...,FrequencyDomainBodyAccelerationMagnitudeMeanFreq,FrequencyDomainBodyAccelerationJerkMagnitudeMean,FrequencyDomainBodyAccelerationJerkMagnitudeStandardDeviation,FrequencyDomainBodyAccelerationJerkMagnitudeMeanFreq,FrequencyDomainBodyGyroscopeMagnitudeMean,FrequencyDomainBodyGyroscopeMagnitudeStandardDeviation,FrequencyDomainBodyGyroscopeMagnitudeMeanFreq,FrequencyDomainBodyGyroscopeJerkMagnitudeMean,FrequencyDomainBodyGyroscopeJerkMagnitudeStandardDeviation,FrequencyDomainBodyGyroscopeJerkMagnitudeMeanFreq
0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,0.9633961,-0.1408397,0.11537494,-0.9852497,...,-0.08843612,-0.9937257,-0.993755,0.3469885,-0.9801349,-0.9613094,-0.1289889,-0.9919904,-0.9906975,-0.07432303
0.2784188,-0.01641057,-0.1235202,-0.9982453,-0.9753002,-0.960322,0.9665611,-0.1415513,0.10937881,-0.9974113,...,-0.04414989,-0.9903355,-0.9919603,0.5320605,-0.9882956,-0.9833219,-0.2719585,-0.9958539,-0.9963995,0.15807454
0.2796531,-0.01946716,-0.1134617,-0.9953796,-0.967187,-0.978944,0.9668781,-0.1420098,0.10188392,-0.999574,...,0.25789914,-0.9892801,-0.9908667,0.660795,-0.9892548,-0.9860277,-0.2127279,-0.9950305,-0.9951274,0.41450281
0.2791739,-0.02620065,-0.1232826,-0.9960915,-0.9834027,-0.9906751,0.9676152,-0.1439765,0.09985014,-0.9966456,...,0.0735815,-0.9927689,-0.9916998,0.6789213,-0.9894128,-0.9878358,-0.0356842,-0.9952207,-0.9952369,0.40457253
0.2766288,-0.01656965,-0.1153619,-0.9981386,-0.9808173,-0.9904816,0.9682244,-0.1487502,0.0944859,-0.9984293,...,0.39431033,-0.9955228,-0.994389,0.5590577,-0.991433,-0.9890594,-0.273582,-0.9950928,-0.9954648,0.08775301
0.2771988,-0.01009785,-0.1051373,-0.997335,-0.9904868,-0.99542,0.9679482,-0.14821,0.09190972,-0.9989793,...,0.43796212,-0.9947329,-0.9951562,0.2469096,-0.9905,-0.9858609,-0.2973291,-0.9951433,-0.9952387,0.01995331


---

## Independent data set

### Merging all the data and creating tables

In [114]:
library(data.table)
# Bind subject, activity and the selected measurements side by side (the three
# tables share the same row order) and convert the result to a data.table.
overall_data <- data.table(
  cbind(subject_data, activity_data, extract_features_data)
)
head(overall_data, n = 6L)

Subject,Activity,TimeDomainBodyAccelerationMean-X,TimeDomainBodyAccelerationMean-Y,TimeDomainBodyAccelerationMean-Z,TimeDomainBodyAccelerationStandardDeviation-X,TimeDomainBodyAccelerationStandardDeviation-Y,TimeDomainBodyAccelerationStandardDeviation-Z,TimeDomainGravityAccelerationMean-X,TimeDomainGravityAccelerationMean-Y,...,FrequencyDomainBodyAccelerationMagnitudeMeanFreq,FrequencyDomainBodyAccelerationJerkMagnitudeMean,FrequencyDomainBodyAccelerationJerkMagnitudeStandardDeviation,FrequencyDomainBodyAccelerationJerkMagnitudeMeanFreq,FrequencyDomainBodyGyroscopeMagnitudeMean,FrequencyDomainBodyGyroscopeMagnitudeStandardDeviation,FrequencyDomainBodyGyroscopeMagnitudeMeanFreq,FrequencyDomainBodyGyroscopeJerkMagnitudeMean,FrequencyDomainBodyGyroscopeJerkMagnitudeStandardDeviation,FrequencyDomainBodyGyroscopeJerkMagnitudeMeanFreq
1,STANDING,0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,0.9633961,-0.1408397,...,-0.08843612,-0.9937257,-0.993755,0.3469885,-0.9801349,-0.9613094,-0.1289889,-0.9919904,-0.9906975,-0.07432303
1,STANDING,0.2784188,-0.01641057,-0.1235202,-0.9982453,-0.9753002,-0.960322,0.9665611,-0.1415513,...,-0.04414989,-0.9903355,-0.9919603,0.5320605,-0.9882956,-0.9833219,-0.2719585,-0.9958539,-0.9963995,0.15807454
1,STANDING,0.2796531,-0.01946716,-0.1134617,-0.9953796,-0.967187,-0.978944,0.9668781,-0.1420098,...,0.25789914,-0.9892801,-0.9908667,0.660795,-0.9892548,-0.9860277,-0.2127279,-0.9950305,-0.9951274,0.41450281
1,STANDING,0.2791739,-0.02620065,-0.1232826,-0.9960915,-0.9834027,-0.9906751,0.9676152,-0.1439765,...,0.0735815,-0.9927689,-0.9916998,0.6789213,-0.9894128,-0.9878358,-0.0356842,-0.9952207,-0.9952369,0.40457253
1,STANDING,0.2766288,-0.01656965,-0.1153619,-0.9981386,-0.9808173,-0.9904816,0.9682244,-0.1487502,...,0.39431033,-0.9955228,-0.994389,0.5590577,-0.991433,-0.9890594,-0.273582,-0.9950928,-0.9954648,0.08775301
1,STANDING,0.2771988,-0.01009785,-0.1051373,-0.997335,-0.9904868,-0.99542,0.9679482,-0.14821,...,0.43796212,-0.9947329,-0.9951562,0.2469096,-0.9905,-0.9858609,-0.2973291,-0.9951433,-0.9952387,0.01995331


### Average of each variable for each activity and each subject

In [120]:
library(dplyr)
# Average every measurement column within each Subject/Activity pair.
# mean is passed directly because funs() is deprecated in dplyr; ungroup()
# drops the grouping that summarise would otherwise leave on the result.
tidy_data <- overall_data %>%
  group_by(Subject, Activity) %>%
  summarise_all(mean) %>%
  ungroup()

In [121]:
# Preview the first ten Subject/Activity averages.
head(tidy_data, 10)

Subject,Activity,TimeDomainBodyAccelerationMean-X,TimeDomainBodyAccelerationMean-Y,TimeDomainBodyAccelerationMean-Z,TimeDomainBodyAccelerationStandardDeviation-X,TimeDomainBodyAccelerationStandardDeviation-Y,TimeDomainBodyAccelerationStandardDeviation-Z,TimeDomainGravityAccelerationMean-X,TimeDomainGravityAccelerationMean-Y,...,FrequencyDomainBodyAccelerationMagnitudeMeanFreq,FrequencyDomainBodyAccelerationJerkMagnitudeMean,FrequencyDomainBodyAccelerationJerkMagnitudeStandardDeviation,FrequencyDomainBodyAccelerationJerkMagnitudeMeanFreq,FrequencyDomainBodyGyroscopeMagnitudeMean,FrequencyDomainBodyGyroscopeMagnitudeStandardDeviation,FrequencyDomainBodyGyroscopeMagnitudeMeanFreq,FrequencyDomainBodyGyroscopeJerkMagnitudeMean,FrequencyDomainBodyGyroscopeJerkMagnitudeStandardDeviation,FrequencyDomainBodyGyroscopeJerkMagnitudeMeanFreq
1,WALKING,0.2773308,-0.017383819,-0.1111481,-0.28374026,0.114461337,-0.2600279,0.9352232,-0.282165,...,0.19064372,-0.0571194,-0.1034924,0.09382218,-0.1992526,-0.321018,0.2688443675,-0.3193086,-0.3816019,0.19066345
1,WALKING_UPSTAIRS,0.2554617,-0.023953149,-0.097302,-0.35470803,-0.002320265,-0.01947924,0.8933511,-0.3621534,...,-0.09774335,-0.44265216,-0.5330599,0.08535241,-0.3259615,-0.1829855,-0.2193033761,-0.6346651,-0.6939305,0.11427734
1,WALKING_DOWNSTAIRS,0.2891883,-0.009918505,-0.1075662,0.03003534,-0.031935943,-0.23043421,0.9318744,-0.2666103,...,0.11918714,0.02621849,-0.1040523,0.07649155,-0.1857203,-0.3983504,0.3496138955,-0.2819634,-0.3919199,0.19000071
1,SITTING,0.2612376,-0.001308288,-0.1045442,-0.97722901,-0.922618642,-0.93958629,0.8315099,0.2044116,...,0.23665501,-0.98526213,-0.9816062,0.3518522,-0.9584356,-0.9321984,-0.0002621867,-0.9897975,-0.9870496,0.18477593
1,STANDING,0.2789176,-0.01613759,-0.1106018,-0.9957599,-0.973190056,-0.97977588,0.942952,-0.2729838,...,0.28455529,-0.99254248,-0.992536,0.4222201,-0.9846176,-0.9784661,-0.0286057725,-0.9948154,-0.9946711,0.33449873
1,LAYING,0.2215982,-0.040513953,-0.1132036,-0.92805647,-0.836827406,-0.8260614,-0.2488818,0.7055498,...,0.08640856,-0.93330036,-0.921804,0.26639115,-0.8621902,-0.8243194,-0.1397750127,-0.9423669,-0.9326607,0.17648591
2,WALKING,0.2764266,-0.01859492,-0.1055004,-0.42364284,-0.078091253,-0.42525752,0.9130173,-0.3466071,...,0.39320621,-0.16906435,-0.164092,0.20750093,-0.5307048,-0.6517928,0.3052838253,-0.5832493,-0.5581046,0.12634461
2,WALKING_UPSTAIRS,0.2471648,-0.021412113,-0.1525139,-0.30437641,0.10802728,-0.11212102,0.7907174,-0.4162149,...,0.1076804,-0.18951114,-0.2604238,-0.01191682,-0.4506122,-0.4386204,-0.022752769,-0.6007985,-0.6218202,0.0467434
2,WALKING_DOWNSTAIRS,0.2776153,-0.022661416,-0.1168129,0.04636668,0.262881789,-0.10283791,0.8618313,-0.3257801,...,0.09383213,0.22224741,0.2274807,0.01883551,-0.3208385,-0.3725768,0.1039712417,-0.3801753,-0.343699,-0.05124796
2,SITTING,0.2770874,-0.015687994,-0.1092183,-0.98682228,-0.950704499,-0.95982817,0.9404773,-0.10563,...,0.1309342,-0.9838747,-0.9841242,0.28006242,-0.9718406,-0.9613857,-0.1063589092,-0.989862,-0.9896329,0.19472215


### Write the tidy data to the csv file

In [123]:
# Save the averaged data set as a comma-separated file without row names.
write.table(
  x = tidy_data,
  file = "./tidy_data.csv",
  sep = ",",
  row.names = FALSE
)