In [18]:
# import statements
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import glob
import os

In [65]:
# Glob pattern that matches every recording CSV in the letters folder.
folder_path = 'csv_letters/*.csv'
# Feature and label containers; they are populated by the pairing loop in a later cell.
x = []
y = []

# Class names "letterA" .. "letterY"; the list contains no "letterJ", giving 24 classes.
class_names = [f"letter{letter}" for letter in "ABCDEFGHIKLMNOPQRSTUVWXY"]

# Map each class name to its position in class_names (letterA -> 0, ..., letterY -> 23).
class_to_label = {class_name: label for label, class_name in enumerate(class_names)}

# glob returns matches in a filesystem-dependent order, while the code below (and the
# pairing loop in the next cell) relies on adjacent list positions. Sorting makes the
# order, and therefore the pairing, deterministic across machines.
csv_files = sorted(glob.glob(folder_path))

# Initialize an empty list to store the combined DataFrames
combined_dataframes = []

# Longest recording (in rows) among the scanned files; used later as the padded length.
# NOTE(review): the scan starts at index 1 and visits every second file, so the files at
# even indices are never measured -- confirm that this is intended.
max_len = 0
for i in range(1, len(csv_files), 2):
    # Semicolon-separated file with decimal commas; columns 0-1 and 3-366 are read
    # (column 2 is skipped).
    df1 = pd.read_csv(csv_files[i], header=0, delimiter=";", usecols=list(range(0,2))+list(range(3,367)), decimal=',')
    max_len = max(max_len, len(df1))
print(f"max length is {max_len}")

max length is 315


In [66]:

# Build one fixed-size sample from each pair of consecutive CSV files.
#
# The accumulators are reset here so that re-running this cell on its own does not
# append every sample a second time.
y = []
combined_dataframes = []

# Column selection shared by both files of a pair: columns 0-1 and 3-366 (column 2 skipped).
usecols = list(range(0, 2)) + list(range(3, 367))

# Iterate over the files two by two
for i in range(1, len(csv_files), 2):
    # The class is the first 7 characters of the file's base name ("letterX").
    # os.path.basename handles the path separator of the running OS, unlike a
    # hard-coded split on a backslash.
    classtype = os.path.basename(csv_files[i])[:7]
    numerical_label = class_to_label.get(classtype, -1)  # -1 if not found
    if numerical_label == -1:
        # Skip the whole sample: appending features without a label would leave
        # x and y with different lengths.
        print("failed to add to y: " + classtype)
        continue

    # Read the first file of the pair into a DataFrame
    df1 = pd.read_csv(csv_files[i], header=0, delimiter=";", usecols=usecols, decimal=',')

    # If there is a next file, read it; otherwise use an empty frame with df1's columns
    if i + 1 < len(csv_files):
        df2 = pd.read_csv(csv_files[i + 1], header=0, delimiter=";", usecols=usecols, decimal=',')
    else:
        df2 = pd.DataFrame(columns=df1.columns)
    # Prefix in both cases so the combined frame never has duplicate column labels.
    df2 = df2.add_prefix('right_')

    # Give both frames the same number of rows; the shorter one is padded with NaN
    if len(df1) > len(df2):
        df2 = df2.reindex(df1.index, fill_value=np.nan)
    elif len(df2) > len(df1):
        df1 = df1.reindex(df2.index, fill_value=np.nan)

    # Place the two frames side by side, then force exactly max_len rows:
    # missing rows and NaN cells become 0, rows beyond max_len are dropped.
    combined_df = pd.concat([df1, df2], axis=1)
    combined_df = combined_df.reindex(range(max_len)).fillna(0)
    print(f"Shape of combined_df: {combined_df.shape}")

    # Features and label are appended together so they stay aligned.
    combined_dataframes.append(combined_df)
    y.append(numerical_label)

# Convert the list of DataFrames to a NumPy array
x = np.array(combined_dataframes)
x = np.asarray(x).astype('float32') # meant to fix an error when training the model
y = np.array(y)

# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# print(x)
# print(y)



Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of combined_df: (315, 732)
Shape of c

In [67]:
# 1-D convolutional classifier.
# The input shape and the number of output units are taken from the data prepared in
# the earlier cells (x and class_names) instead of being hard-coded, so the model keeps
# matching the dataset if the padded length, the column count or the class list changes.
model = models.Sequential()
# Three Conv1D layers (kernel size 3, ReLU), with max pooling (size 2) after the first two.
model.add(layers.Conv1D(64, 3, activation='relu', input_shape=x.shape[1:]))
model.add(layers.MaxPooling1D(2))
model.add(layers.Conv1D(128, 3, activation='relu'))
model.add(layers.MaxPooling1D(2))
model.add(layers.Conv1D(128, 3, activation='relu'))
# Flatten the convolutional features and classify with two dense ReLU layers.
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
# One softmax output per entry in class_names.
model.add(layers.Dense(len(class_names), activation='softmax'))

In [68]:
# Print the model's layers with their output shapes and parameter counts.
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_18 (Conv1D)          (None, 313, 64)           140608    
                                                                 
 max_pooling1d_12 (MaxPoolin  (None, 156, 64)          0         
 g1D)                                                            
                                                                 
 conv1d_19 (Conv1D)          (None, 154, 128)          24704     
                                                                 
 max_pooling1d_13 (MaxPoolin  (None, 77, 128)          0         
 g1D)                                                            
                                                                 
 conv1d_20 (Conv1D)          (None, 75, 128)           49280     
                                                                 
 flatten_6 (Flatten)         (None, 9600)             

In [69]:
# Training setup: Adam optimizer, sparse categorical cross-entropy (the labels in y are
# integer class indices), and accuracy reported as the tracked metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Sanity check: total number of elements in the feature array and in the label array.
print(x.size, y.size, sep="\n")

# Fit on the training split for 50 epochs and keep the returned object in training_data.
training_data = model.fit(x=x_train, y=y_train, epochs=50)

58797900
255
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [70]:
# Evaluate the trained model on the held-out test split and keep the returned value in
# result (presumably the loss followed by the compiled accuracy metric -- confirm against
# the Keras version in use).
result = model.evaluate(x_test,y_test)



In [None]:
import os
import pandas as pd

# Define a function to get the label based on the filename
def get_label(filename):
    """Map a recording file name to its integer class label.

    The letter is read from character index 6 of ``filename`` (the character
    right after the ``letter`` prefix in names such as
    ``letterC005_mijn35_L.csv``) and compared case-insensitively.

    Labels are contiguous: letters before 'j' map to their alphabet position
    (a -> 0, ..., i -> 8) and letters after 'j' are shifted down by one
    (k -> 9, ..., y -> 23). This matches the numbering used by
    ``letter_to_label`` and by the class list elsewhere in this notebook,
    instead of leaving label 9 unused.

    Args:
        filename: File name (not a full path) that has the letter at index 6.

    Returns:
        The integer label, or ``None`` when the letter is 'j' (no class).

    Raises:
        IndexError: If ``filename`` has fewer than 7 characters.
    """
    letter = filename[6].lower()
    if letter == 'j':
        return None  # 'j' has no class; callers are expected to skip the file
    position = ord(letter) - ord('a')
    # Close the gap left by the missing 'j' so the labels stay contiguous.
    return position if letter < 'j' else position - 1

# Specify the folder path where your CSV files are located
folder_path = 'csv_letters_filtered'

# Labelled per-file frames; they are concatenated once after the loop, which avoids
# re-copying the growing result on every iteration.
labelled_frames = []

# Columns 3-366 of each file are read (364 columns in total).
cols_to_use = list(range(3, 367))

# Iterate through each CSV file in the folder
for filename in os.listdir(folder_path):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, filename)

    try:
        # Read at most 50 rows; values use ';' as separator and ',' as decimal mark.
        # NOTE(review): skiprows=1 drops the file's first line before header=0 selects
        # the header row -- confirm the files really have a leading line to discard.
        df = pd.read_csv(file_path, header=0, nrows=50, delimiter=';', usecols=cols_to_use, decimal=',', dtype=float, skiprows=1)
    except ValueError:
        print(f"Skipping file '{filename}' as it couldn't be read due to a ValueError.")
        continue

    # Check if the DataFrame has the right number of columns
    if len(df.columns) != 364:  # Modify this number based on your expected number of columns
        print(f"Skipping file '{filename}' as it doesn't have the right number of columns.")
        continue

    label = get_label(filename)
    if label is None:
        # No class for this file: leave it out instead of adding rows without a label,
        # which would end up as missing values in the 'Label' column.
        continue

    # Put the label in the first column and keep the frame for the final concat.
    df.insert(0, 'Label', label)
    labelled_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was kept.
combined_df = pd.concat(labelled_frames, ignore_index=True) if labelled_frames else pd.DataFrame()

# Save the combined data to a new CSV file
combined_df.to_csv('combined_data.csv', index=False)

print("Current working directory:", os.getcwd())

Skipping file 'letterC005_mijn35_L.csv' as it couldn't be read due to a ValueError.
Skipping file 'letterD004_mijn35_L.csv' as it couldn't be read due to a ValueError.
Skipping file 'letterS006_mijn35_L.csv' as it couldn't be read due to a ValueError.
Skipping file 'letterT004_mijn35_L.csv' as it couldn't be read due to a ValueError.


In [13]:
# Read the combined dataset back (comma-separated, '.' as decimal mark) and show its
# (rows, columns) shape as a quick sanity check.
df = pd.read_csv('combined_data.csv', sep=",", decimal=".")
print(df.shape)

(12178, 365)


In [24]:
import pandas as pd
import numpy as np

# Load the two comma-separated datasets as NumPy arrays.
# The first file's first line is skipped; the second file is read from its first line.
df1 = np.loadtxt('data_glove/data_martijn_zus.csv', delimiter=',', skiprows=1)
df2 = np.loadtxt('data_glove/Data_joren.csv', delimiter=',')

print(df1.shape)
print(df2.shape)

# Rows can only be stacked when both arrays have the same number of columns.
if df1.shape[1] != df2.shape[1]:
    print("Number of columns in both arrays must be the same for concatenation.")
else:
    combined_array = np.vstack((df1, df2))
    print("Combined shape:", combined_array.shape)

(2400, 365)
(9600, 365)
Combined shape: (12000, 365)


In [3]:
import pandas as pd
import os

# Function to convert letter to corresponding label
def letter_to_label(letter):
    """Return the integer label for an uppercase letter.

    Letters that sort before 'J' map to their offset from 'A' (A -> 0, ..., I -> 8).
    Every other letter maps to that offset minus one (K -> 9, ..., Y -> 23), so the
    labels carry no gap where 'J' would be.
    """
    offset = ord(letter) - ord('A')
    return offset if letter < 'J' else offset - 1

# Folder holding the CSV recordings to convert
folder_path = 'csv_letters_martijn_moeder'

# One labelled DataFrame per CSV file
dfs = []

# Walk over the folder and process every .csv file
for filename in os.listdir(folder_path):
    if not filename.endswith(".csv"):
        continue

    # Semicolon-separated file with decimal commas; only columns 3-366 are read
    csv_path = os.path.join(folder_path, filename)
    df = pd.read_csv(csv_path, delimiter=";", decimal=',', usecols=list(range(3, 367)))
    # Keep at most the first 50 rows
    df = df.head(50)
    print(df.shape)

    # The letter is the 4th character from the end of the part before the first '_'
    letter = filename.partition('_')[0][-4]
    print(letter)

    # Attach the label and move it to the front of the column order
    df['label'] = letter_to_label(letter)
    feature_cols = [col for col in df.columns if col != 'label']
    df = df[['label'] + feature_cols]

    dfs.append(df)

# Stack all per-file frames into one frame
result_df = pd.concat(dfs)
print(result_df.shape)

# Store the labelled dataset without the index column
result_df.to_csv('data_glove/data_martijn_moeder.csv', index=False)

(50, 364)
A
(50, 364)
A
(50, 364)
B
(50, 364)
B
(50, 364)
C
(50, 364)
C
(50, 364)
D
(50, 364)
D
(50, 364)
E
(50, 364)
E
(50, 364)
F
(50, 364)
F
(50, 364)
G
(50, 364)
G
(50, 364)
H
(50, 364)
H
(50, 364)
I
(50, 364)
I
(50, 364)
K
(50, 364)
K
(50, 364)
L
(50, 364)
L
(50, 364)
M
(50, 364)
M
(50, 364)
N
(50, 364)
N
(50, 364)
O
(50, 364)
O
(50, 364)
P
(50, 364)
P
(50, 364)
Q
(50, 364)
Q
(50, 364)
R
(50, 364)
R
(50, 364)
S
(50, 364)
S
(50, 364)
T
(50, 364)
T
(50, 364)
U
(50, 364)
U
(50, 364)
V
(50, 364)
V
(50, 364)
W
(50, 364)
W
(50, 364)
X
(50, 364)
X
(50, 364)
Y
(50, 364)
Y
(2400, 365)
