# Data Cleaning of the FER2013 .csv Data

### Imports

In [1]:
import pandas as pd
import warnings
# Suppress every warning in the UserWarning category for the rest of the session.
# NOTE(review): the specific warning that motivated this filter is not visible
# in this notebook -- confirm it is still needed.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Loading the raw FER2013 csv (path is relative to this notebook) into a DataFrame
fer2013_data = pd.read_csv(filepath_or_buffer="../data/fer2013.csv")

In [3]:
# previewing the first five rows of the raw data
fer2013_data.head(5)

Unnamed: 0,emotion,pixels,Usage
0,0,70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...,Training
1,0,151 150 147 155 148 133 111 140 170 174 182 15...,Training
2,2,231 212 156 164 174 138 161 173 182 200 106 38...,Training
3,4,24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...,Training
4,6,4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...,Training


In [4]:
# counting rows per 'Usage' label to see how the dataset is split
# between training and testing
fer2013_data.loc[:, 'Usage'].value_counts()

Usage
Training       28709
PublicTest      3589
PrivateTest     3589
Name: count, dtype: int64

In [5]:
# filtering out the 'PrivateTest' rows so the dataset keeps a single
# training split and a single testing split
fer2013_data = fer2013_data.loc[fer2013_data['Usage'].ne('PrivateTest')]

In [6]:
# re-counting the 'Usage' labels to confirm 'PrivateTest' is gone
fer2013_data.loc[:, 'Usage'].value_counts()

Usage
Training      28709
PublicTest     3589
Name: count, dtype: int64

In [7]:
# carving the training rows out of the broader dataset
train_data = fer2013_data.loc[fer2013_data['Usage'].eq('Training')]

train_data.head(n=10)

Unnamed: 0,emotion,pixels,Usage
0,0,70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...,Training
1,0,151 150 147 155 148 133 111 140 170 174 182 15...,Training
2,2,231 212 156 164 174 138 161 173 182 200 106 38...,Training
3,4,24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...,Training
4,6,4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...,Training
5,2,55 55 55 55 55 54 60 68 54 85 151 163 170 179 ...,Training
6,4,20 17 19 21 25 38 42 42 46 54 56 62 63 66 82 1...,Training
7,3,77 78 79 79 78 75 60 55 47 48 58 73 77 79 57 5...,Training
8,3,85 84 90 121 101 102 133 153 153 169 177 189 1...,Training
9,2,255 254 255 254 254 179 122 107 95 124 149 150...,Training


In [None]:
# Dropping the 'Usage' column, which is no longer needed once only training
# rows remain.
# The result is reassigned instead of using inplace=True: train_data was
# produced by boolean-filtering fer2013_data, and mutating such a selection in
# place makes pandas emit SettingWithCopyWarning.
# errors='ignore' keeps this cell safe to re-run after the column is gone.
train_data = train_data.drop(columns=['Usage'], errors='ignore')

In [9]:
# showing the training frame now that the 'Usage' column has been dropped
train_data

Unnamed: 0,emotion,pixels
0,0,70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...
1,0,151 150 147 155 148 133 111 140 170 174 182 15...
2,2,231 212 156 164 174 138 161 173 182 200 106 38...
3,4,24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...
4,6,4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...
...,...,...
28704,2,84 85 85 85 85 85 85 85 86 86 86 87 86 86 91 9...
28705,0,114 112 113 113 111 111 112 113 115 113 114 11...
28706,4,74 81 87 89 95 100 98 93 105 120 127 133 146 1...
28707,0,222 227 203 90 86 90 84 77 94 87 99 119 134 14...


In [22]:
# looking at the share of each emotion class in the training set
# (normalize=True returns proportions rather than raw counts)
train_data.loc[:, 'emotion'].value_counts(normalize=True)

emotion
3    0.251315
6    0.172942
4    0.168240
2    0.142708
0    0.139155
5    0.110453
1    0.015187
Name: proportion, dtype: float64

In [14]:
# checking the 'Usage' labels still present in the broader dataset
# before pulling out the testing rows
fer2013_data.loc[:, 'Usage'].value_counts()

Usage
Training      28709
PublicTest     3589
Name: count, dtype: int64

In [16]:
# selecting the 'PublicTest' rows of the broader dataset as the testing set
test_data = fer2013_data.loc[fer2013_data['Usage'].eq('PublicTest')]

test_data.head(n=5)

Unnamed: 0,emotion,pixels,Usage
28709,0,254 254 254 254 254 249 255 160 2 58 53 70 77 ...,PublicTest
28710,1,156 184 198 202 204 207 210 212 213 214 215 21...,PublicTest
28711,4,69 118 61 60 96 121 103 87 103 88 70 90 115 12...,PublicTest
28712,6,205 203 236 157 83 158 120 116 94 86 155 180 2...,PublicTest
28713,3,87 79 74 66 74 96 77 80 80 84 83 89 102 91 84 ...,PublicTest


In [None]:
# Dropping the 'Usage' column from the testing set.
# The result is reassigned instead of using inplace=True: test_data was
# produced by boolean-filtering fer2013_data, and mutating such a selection in
# place makes pandas emit SettingWithCopyWarning.
# errors='ignore' keeps this cell safe to re-run after the column is gone.
test_data = test_data.drop(columns=['Usage'], errors='ignore')

test_data.head()

In [20]:
# writing the training set to 'train_data.csv' without the index column
train_data.to_csv(path_or_buf='train_data.csv', index=False)

In [21]:
# writing the testing set to 'test_data.csv' without the index column
test_data.to_csv(path_or_buf='test_data.csv', index=False)