In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory once and print the full path of every file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/balance-fer2013-data-image/__results__.html
/kaggle/input/balance-fer2013-data-image/balanced_emotions_dataset.csv
/kaggle/input/balance-fer2013-data-image/__notebook__.ipynb
/kaggle/input/balance-fer2013-data-image/__output__.json
/kaggle/input/balance-fer2013-data-image/custom.css
/kaggle/input/balance-fer2013-data-image/__results___files/__results___8_0.png
/kaggle/input/balance-fer2013-data-image/__results___files/__results___11_0.png
/kaggle/input/balance-fer2013-data-image/__results___files/__results___22_0.png


In [2]:
import numpy as np
import pandas as pd
import cv2  # For image processing
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [3]:
# Read the whole dataset CSV into memory and preview its first five rows
dataset_csv = "/kaggle/input/balance-fer2013-data-image/balanced_emotions_dataset.csv"
data = pd.read_csv(dataset_csv)
data.head(n=5)

Unnamed: 0,emotion,pixels,Usage,emotion_text
0,0,143 144 145 145 146 145 151 149 142 135 128 13...,Training,anger
1,0,182 167 154 146 134 84 30 10 3 17 27 10 7 11 6...,Training,anger
2,0,207 208 208 209 208 214 176 139 116 66 124 135...,Training,anger
3,0,193 193 196 203 216 229 233 243 189 114 87 95 ...,Training,anger
4,0,168 195 207 208 209 207 208 204 207 207 208 20...,Training,anger


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple
data.shape

(28000, 4)

In [5]:
# Count the whitespace-separated values stored in the first row's pixel string
first_row_values = data['pixels'][0].split()
print(f"Length of pixel data: {len(first_row_values)}")

Length of pixel data: 2304


In [6]:
def preprocess_pixels(img_array):
    """Build a 3-channel 299x299 image, scaled by 1/255, from a flat pixel array.

    Parameters
    ----------
    img_array : numpy.ndarray
        Numeric array holding exactly 48 * 48 = 2304 values (anything else
        makes the reshape fail). It must already be a NumPy array; no string
        parsing happens here.

    Returns
    -------
    numpy.ndarray
        The resized image with every value divided by 255.0.
    """
    # View the flat values as a 48x48 grayscale picture
    gray = img_array.reshape(48, 48)
    # Append a channel axis and copy it three times (3-channel input for Xception)
    three_channel = np.repeat(gray[..., np.newaxis], 3, axis=-1)
    # Upscale to 299x299 before normalising
    resized = cv2.resize(three_channel, (299, 299))
    return resized / 255.0


In [7]:
# Process data in chunks
def preprocess_chunk(chunk):
    """Decode the pixel strings of a chunk and attach a preprocessed image per row.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame with a ``pixels`` column whose cells are whitespace-separated
        numeric strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns, where ``pixels`` holds
        float32 NumPy arrays and an added ``image`` column holds the result of
        ``preprocess_pixels`` for each row.

    Notes
    -----
    The input frame is not modified. Overwriting ``chunk['pixels']`` in place
    would leave arrays where strings are expected, so calling this function a
    second time on the same chunk (e.g. re-running the notebook cell) would
    fail on ``.split()``.
    """
    # Parse each pixel string into a float32 vector
    pixel_arrays = chunk['pixels'].apply(lambda x: np.array(x.split(), dtype='float32'))
    # assign() returns a fresh DataFrame: `pixels` is replaced in its original
    # position and `image` is appended as the last column.
    return chunk.assign(
        pixels=pixel_arrays,
        image=pixel_arrays.apply(preprocess_pixels),
    )

In [8]:
# Read the dataset CSV piece by piece (chunk_size rows at a time) and keep every piece
chunk_size = 1000
chunk_reader = pd.read_csv(
    "/kaggle/input/balance-fer2013-data-image/balanced_emotions_dataset.csv",
    chunksize=chunk_size,
)
chunks = [piece for piece in chunk_reader]

In [9]:
# Run preprocess_chunk over every loaded chunk, then stitch the results
# into a single frame with a fresh 0..n-1 index
processed_chunks = [preprocess_chunk(piece) for piece in chunks]
final_data = pd.concat(processed_chunks, ignore_index=True)

In [10]:
# Display the concatenated frame, which now carries the added `image` column
final_data

Unnamed: 0,emotion,pixels,Usage,emotion_text,image
0,0,"[143.0, 144.0, 145.0, 145.0, 146.0, 145.0, 151...",Training,anger,"[[[0.56078434, 0.56078434, 0.56078434], [0.560..."
1,0,"[182.0, 167.0, 154.0, 146.0, 134.0, 84.0, 30.0...",Training,anger,"[[[0.7137255, 0.7137255, 0.7137255], [0.713725..."
2,0,"[207.0, 208.0, 208.0, 209.0, 208.0, 214.0, 176...",Training,anger,"[[[0.8117647, 0.8117647, 0.8117647], [0.811764..."
3,0,"[193.0, 193.0, 196.0, 203.0, 216.0, 229.0, 233...",Training,anger,"[[[0.75686276, 0.75686276, 0.75686276], [0.756..."
4,0,"[168.0, 195.0, 207.0, 208.0, 209.0, 207.0, 208...",Training,anger,"[[[0.65882355, 0.65882355, 0.65882355], [0.658..."
...,...,...,...,...,...
27995,1,"[48.0, 34.0, 21.0, 18.0, 16.0, 21.0, 26.0, 36....",PrivateTest,disgust,"[[[0.1882353, 0.1882353, 0.1882353], [0.188235..."
27996,1,"[98.0, 103.0, 107.0, 105.0, 100.0, 103.0, 108....",PrivateTest,disgust,"[[[0.38431373, 0.38431373, 0.38431373], [0.384..."
27997,1,"[247.0, 247.0, 247.0, 246.0, 252.0, 224.0, 150...",PrivateTest,disgust,"[[[0.96862745, 0.96862745, 0.96862745], [0.968..."
27998,1,"[186.0, 146.0, 50.0, 42.0, 43.0, 35.0, 48.0, 9...",PrivateTest,disgust,"[[[0.7294118, 0.7294118, 0.7294118], [0.729411..."


In [11]:
# Confirm what kind of object is stored in the first cell of the `image` column
type(final_data.loc[0, 'image'])

numpy.ndarray

In [12]:
# Size of the third axis (channels) of the first stored image
final_data.loc[0, 'image'].shape[2]

3

In [13]:
# Save the preprocessed data as CSV.
# DataFrame.to_csv() writes array-valued cells through str(), and NumPy
# abbreviates arrays larger than its print threshold (1000 elements by default)
# with "...", so ndarray columns cannot be recovered from a plain to_csv() dump.
#   - `pixels` is therefore written back as a complete space-separated string.
#   - `image` is left out of the file: it is derived from `pixels` by
#     preprocess_pixels and can be rebuilt after loading.
export_data = final_data.drop(columns=['image']).assign(
    pixels=final_data['pixels'].apply(
        lambda arr: ' '.join(f"{value:g}" for value in arr.tolist())
    )
)
export_data.to_csv("/kaggle/working/preprocessed_train_dataset.csv", index=False)
